/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package java.text; import java.util.Locale; import libcore.icu.ICU; import libcore.icu.NativeBreakIterator; /** * Locates boundaries in text. This class defines a protocol for objects that * break up a piece of natural-language text according to a set of criteria. * Instances or subclasses of {@code BreakIterator} can be provided, for * example, to break a piece of text into words, sentences, or logical * characters according to the conventions of some language or group of * languages. We provide four built-in types of {@code BreakIterator}: * {@code BreakIterator}'s interface follows an "iterator" model (hence * the name), meaning it has a concept of a "current position" and methods like * {@code first()}, {@code last()}, {@code next()}, and {@code previous()} that * update the current position. All {@code BreakIterator}s uphold the following * invariants: * *

* {@code BreakIterator} accesses the text it analyzes through a * {@link CharacterIterator}, which makes it possible to use {@code * BreakIterator} to analyze text in any text-storage vehicle that provides a * {@code CharacterIterator} interface. *

* Note: Some types of {@code BreakIterator} can take a long time to * create, and instances of {@code BreakIterator} are not currently cached by * the system. For optimal performance, keep instances of {@code BreakIterator} * around as long as it makes sense. For example, when word-wrapping a document, * don't create and destroy a new {@code BreakIterator} for each line. Create * one break iterator for the whole document (or whatever stretch of text you're * wrapping) and use it to do the whole job of wrapping the text. *

* Examples: *

* Creating and using text boundaries: *

* *
 * public static void main(String args[]) {
 *     if (args.length == 1) {
 *         String stringToExamine = args[0];
 *         //print each word in order
 *         BreakIterator boundary = BreakIterator.getWordInstance();
 *         boundary.setText(stringToExamine);
 *         printEachForward(boundary, stringToExamine);
 *         //print each sentence in reverse order
 *         boundary = BreakIterator.getSentenceInstance(Locale.US);
 *         boundary.setText(stringToExamine);
 *         printEachBackward(boundary, stringToExamine);
 *         printFirst(boundary, stringToExamine);
 *         printLast(boundary, stringToExamine);
 *     }
 * }
 * 
* *
*

* Print each element in order: *

* *
 * public static void printEachForward(BreakIterator boundary, String source) {
 *     int start = boundary.first();
 *     for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
 *         System.out.println(source.substring(start, end));
 *     }
 * }
 * 
* *
*

* Print each element in reverse order: *

* *
 * public static void printEachBackward(BreakIterator boundary, String source) {
 *     int end = boundary.last();
 *     for (int start = boundary.previous(); start != BreakIterator.DONE; end = start, start = boundary
 *             .previous()) {
 *         System.out.println(source.substring(start, end));
 *     }
 * }
 * 
* *
*

* Print the first element: *

* *
 * public static void printFirst(BreakIterator boundary, String source) {
 *     int start = boundary.first();
 *     int end = boundary.next();
 *     System.out.println(source.substring(start, end));
 * }
 * 
* *
*

* Print the last element: *

* *
 * public static void printLast(BreakIterator boundary, String source) {
 *     int end = boundary.last();
 *     int start = boundary.previous();
 *     System.out.println(source.substring(start, end));
 * }
 * 
* *
*

* Print the element at a specified position: *

* *
 * public static void printAt(BreakIterator boundary, int pos, String source) {
 *     int end = boundary.following(pos);
 *     int start = boundary.previous();
 *     System.out.println(source.substring(start, end));
 * }
 * 
* *
*

* Find the next word: *

* *
 * public static int nextWordStartAfter(int pos, String text) {
 *     BreakIterator wb = BreakIterator.getWordInstance();
 *     wb.setText(text);
 *     int last = wb.following(pos);
 *     int current = wb.next();
 *     while (current != BreakIterator.DONE) {
 *         for (int p = last; p < current; p++) {
 *             if (Character.isLetter(text.charAt(p)))
 *                 return last;
 *         }
 *         last = current;
 *         current = wb.next();
 *     }
 *     return BreakIterator.DONE;
 * }
 * 
* *
*

* The iterator returned by {@code BreakIterator.getWordInstance()} is unique in
* that the break positions it returns don't represent both the start and end of
* the thing being iterated over. That is, a sentence-break iterator returns
* breaks that each represent the end of one sentence and the beginning of the
* next. With the word-break iterator, the characters between two boundaries
* might be a word, or they might be the punctuation or whitespace between two
* words. The above code uses a simple heuristic to determine which boundary is
* the beginning of a word: If the characters between this boundary and the next
* boundary include at least one letter (this can be an alphabetical letter, a
* CJK ideograph, a Hangul syllable, a Kana character, etc.), then the text
* between this boundary and the next is a word; otherwise, it's the material
* between words.
*
* @see CharacterIterator
*/
public abstract class BreakIterator implements Cloneable {

    /**
     * This constant is returned by iterate methods like {@code previous()} or
     * {@code next()} if they have returned all valid boundaries.
     */
    public static final int DONE = -1;

    /**
     * The wrapped ICU implementation. This is {@code null} for subclasses that
     * were constructed through the protected no-argument constructor; such
     * subclasses must override every method that delegates to this field.
     */
    NativeBreakIterator wrapped;

    /**
     * Default constructor, for use by subclasses. Leaves the wrapped native
     * iterator unset.
     */
    protected BreakIterator() {
    }

    /*
     * wrapping constructor
     */
    BreakIterator(NativeBreakIterator iterator) {
        wrapped = iterator;
    }

    /**
     * Returns an array of locales for which custom {@code BreakIterator} instances
     * are available.
     * <p>Note that Android does not support user-supplied locale service providers.
     *
     * @return an array of the available locales.
     */
    public static Locale[] getAvailableLocales() {
        return ICU.getAvailableBreakIteratorLocales();
    }

    /**
     * Returns a new instance of {@code BreakIterator} to iterate over
     * characters using the user's default locale.
     * See "Be wary of the default locale".
     *
     * @return a new instance of {@code BreakIterator} using the default locale.
     */
    public static BreakIterator getCharacterInstance() {
        return getCharacterInstance(Locale.getDefault());
    }

    /**
     * Returns a new instance of {@code BreakIterator} to iterate over
     * characters using the given locale.
     *
     * @param where
     *            the given locale.
     * @return a new instance of {@code BreakIterator} using the given locale.
     * @throws NullPointerException if {@code where} is {@code null}.
     */
    public static BreakIterator getCharacterInstance(Locale where) {
        return new RuleBasedBreakIterator(NativeBreakIterator.getCharacterInstance(where));
    }

    /**
     * Returns a new instance of {@code BreakIterator} to iterate over
     * line breaks using the user's default locale.
     * See "Be wary of the default locale".
     *
     * @return a new instance of {@code BreakIterator} using the default locale.
     */
    public static BreakIterator getLineInstance() {
        return getLineInstance(Locale.getDefault());
    }

    /**
     * Returns a new instance of {@code BreakIterator} to iterate over
     * line breaks using the given locale.
     *
     * @param where
     *            the given locale.
     * @return a new instance of {@code BreakIterator} using the given locale.
     * @throws NullPointerException if {@code where} is {@code null}.
     */
    public static BreakIterator getLineInstance(Locale where) {
        return new RuleBasedBreakIterator(NativeBreakIterator.getLineInstance(where));
    }

    /**
     * Returns a new instance of {@code BreakIterator} to iterate over
     * sentence-breaks using the default locale.
     * See "Be wary of the default locale".
     *
     * @return a new instance of {@code BreakIterator} using the default locale.
     */
    public static BreakIterator getSentenceInstance() {
        return getSentenceInstance(Locale.getDefault());
    }

    /**
     * Returns a new instance of {@code BreakIterator} to iterate over
     * sentence-breaks using the given locale.
     *
     * @param where
     *            the given locale.
     * @return a new instance of {@code BreakIterator} using the given locale.
     * @throws NullPointerException if {@code where} is {@code null}.
     */
    public static BreakIterator getSentenceInstance(Locale where) {
        return new RuleBasedBreakIterator(NativeBreakIterator.getSentenceInstance(where));
    }

    /**
     * Returns a new instance of {@code BreakIterator} to iterate over
     * word-breaks using the default locale.
     * See "Be wary of the default locale".
     *
     * @return a new instance of {@code BreakIterator} using the default locale.
     */
    public static BreakIterator getWordInstance() {
        return getWordInstance(Locale.getDefault());
    }

    /**
     * Returns a new instance of {@code BreakIterator} to iterate over
     * word-breaks using the given locale.
     *
     * @param where
     *            the given locale.
     * @return a new instance of {@code BreakIterator} using the given locale.
     * @throws NullPointerException if {@code where} is {@code null}.
     */
    public static BreakIterator getWordInstance(Locale where) {
        return new RuleBasedBreakIterator(NativeBreakIterator.getWordInstance(where));
    }

    /**
     * Indicates whether the given offset is a boundary position. If this method
     * returns true, the current iteration position is set to the given
     * position; if the function returns false, the current iteration position
     * is set as though {@link #following(int)} had been called.
     * <p>This implementation delegates to the wrapped native iterator.
     *
     * @param offset
     *            the given offset to check.
     * @return {@code true} if the given offset is a boundary position; {@code
     *         false} otherwise.
     */
    public boolean isBoundary(int offset) {
        return wrapped.isBoundary(offset);
    }

    /**
     * Returns the position of last boundary preceding the given offset, and
     * sets the current position to the returned value, or {@code DONE} if the
     * given offset specifies the starting position.
     * <p>This implementation delegates to the wrapped native iterator.
     *
     * @param offset
     *            the given start position to be searched for.
     * @return the position of the last boundary preceding the given offset.
     * @throws IllegalArgumentException
     *            if the offset is invalid.
     */
    public int preceding(int offset) {
        return wrapped.preceding(offset);
    }

    /**
     * Sets the new text string to be analyzed, the current position will be
     * reset to the beginning of this new string, and the old string will be
     * lost.
     * <p>This implementation delegates to the wrapped native iterator.
     *
     * @param newText
     *            the new text string to be analyzed.
     */
    public void setText(String newText) {
        wrapped.setText(newText);
    }

    /**
     * Returns this iterator's current position.
     *
     * @return this iterator's current position.
     */
    public abstract int current();

    /**
     * Sets this iterator's current position to the first boundary and returns
     * that position.
     *
     * @return the position of the first boundary.
     */
    public abstract int first();

    /**
     * Sets this iterator's current position to the first boundary following
     * the given offset and returns that position. Returns {@code DONE} if
     * there is no boundary after the given offset.
     *
     * @param offset
     *            the given position to be searched for.
     * @return the position of the first boundary following the given offset.
     * @throws IllegalArgumentException
     *            if the offset is invalid.
     */
    public abstract int following(int offset);

    /**
     * Returns a {@code CharacterIterator} which represents the text being
     * analyzed. Please note that the returned value is probably the internal
     * iterator used by this object. If the invoker wants to modify the status
     * of the returned iterator, it is recommended to first create a clone of
     * the iterator returned.
     *
     * @return a {@code CharacterIterator} which represents the text being
     *         analyzed.
     */
    public abstract CharacterIterator getText();

    /**
     * Sets this iterator's current position to the last boundary and returns
     * that position.
     *
     * @return the position of last boundary.
     */
    public abstract int last();

    /**
     * Sets this iterator's current position to the next boundary after the
     * current position, and returns this position. Returns {@code DONE} if no
     * boundary was found after the current position.
     *
     * @return the position of the next boundary, or {@code DONE}.
     */
    public abstract int next();

    /**
     * Moves this iterator's current position by {@code n} boundaries relative
     * to the current position and returns the resulting position. Positive
     * values move towards later boundaries, negative values move towards
     * earlier boundaries, and {@code 0} leaves the position unchanged. Returns
     * {@code DONE} if the first or last boundary is reached before {@code n}
     * boundaries have been passed.
     *
     * @param n
     *            the number of boundaries to move by.
     * @return the position of the boundary reached, or {@code DONE}.
     */
    public abstract int next(int n);

    /**
     * Sets this iterator's current position to the previous boundary before the
     * current position and returns that position. Returns {@code DONE} if
     * no boundary was found before the current position.
     *
     * @return the position of the previous boundary, or {@code DONE}.
     */
    public abstract int previous();

    /**
     * Sets the new text to be analyzed by the given {@code CharacterIterator}.
     * The position will be reset to the beginning of the new text, and other
     * status information of this iterator will be kept.
     *
     * @param newText
     *            the {@code CharacterIterator} referring to the text to be
     *            analyzed.
     */
    public abstract void setText(CharacterIterator newText);

    /**
     * Returns a copy of this iterator. The wrapped native iterator, when
     * present, is cloned as well so that the copy iterates independently.
     *
     * @return a copy of this iterator.
     */
    @Override
    public Object clone() {
        try {
            BreakIterator cloned = (BreakIterator) super.clone();
            // Subclasses built with the protected no-argument constructor have
            // no wrapped iterator; cloning them must not throw.
            if (wrapped != null) {
                cloned.wrapped = (NativeBreakIterator) wrapped.clone();
            }
            return cloned;
        } catch (CloneNotSupportedException e) {
            throw new AssertionError(e);
        }
    }
}