/* * Copyright (C) 2014 The Android Open Source Project * Copyright (c) 1999, 2013, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this * particular file as subject to the "Classpath" exception as provided * by Oracle in the LICENSE file that accompanied this code. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ package java.util.regex; import libcore.util.NativeAllocationRegistry; import java.util.Iterator; import java.util.ArrayList; import java.util.NoSuchElementException; import java.util.Spliterator; import java.util.Spliterators; import java.util.function.Predicate; import java.util.stream.Stream; import java.util.stream.StreamSupport; import libcore.util.EmptyArray; /** * A compiled representation of a regular expression. * *

A regular expression, specified as a string, must first be compiled into * an instance of this class. The resulting pattern can then be used to create * a {@link Matcher} object that can match arbitrary {@link * java.lang.CharSequence character sequences} against the regular * expression. All of the state involved in performing a match resides in the * matcher, so many matchers can share the same pattern. * *

A typical invocation sequence is thus * *

 * Pattern p = Pattern.{@link #compile compile}("a*b");
 * Matcher m = p.{@link #matcher matcher}("aaaaab");
 * boolean b = m.{@link Matcher#matches matches}();
* *

A {@link #matches matches} method is defined by this class as a * convenience for when a regular expression is used just once. This method * compiles an expression and matches an input sequence against it in a single * invocation. The statement * *

 * boolean b = Pattern.matches("a*b", "aaaaab");
* * is equivalent to the three statements above, though for repeated matches it * is less efficient since it does not allow the compiled pattern to be reused. * *

Instances of this class are immutable and are safe for use by multiple * concurrent threads. Instances of the {@link Matcher} class are not safe for * such use. * * * *

Summary of regular-expression constructs

* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
ConstructMatches
 
Characters
xThe character x
\\The backslash character
\0nThe character with octal value 0n * (0 <= n <= 7)
\0nnThe character with octal value 0nn * (0 <= n <= 7)
\0mnnThe character with octal value 0mnn * (0 <= m <= 3, * 0 <= n <= 7)
\xhhThe character with hexadecimal value 0xhh
\uhhhhThe character with hexadecimal value 0xhhhh
\x{h...h}The character with hexadecimal value 0xh...h * ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT} *  <= 0xh...h <=  * {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT})
\tThe tab character ('\u0009')
\nThe newline (line feed) character ('\u000A')
\rThe carriage-return character ('\u000D')
\fThe form-feed character ('\u000C')
\aThe alert (bell) character ('\u0007')
\eThe escape character ('\u001B')
\cxThe control character corresponding to x
 
Character classes
[abc]a, b, or c (simple class)
[^abc]Any character except a, b, or c (negation)
[a-zA-Z]a through z * or A through Z, inclusive (range)
[a-d[m-p]]a through d, * or m through p: [a-dm-p] (union)
[a-z&&[def]]d, e, or f (intersection)
[a-z&&[^bc]]a through z, * except for b and c: [ad-z] (subtraction)
[a-z&&[^m-p]]a through z, * and not m through p: [a-lq-z] (subtraction)
 
Predefined character classes
.Any character (may or may not match line terminators)
\dA digit: [0-9]
\DA non-digit: [^0-9]
\sA whitespace character: [ \t\n\x0B\f\r]
\SA non-whitespace character: [^\s]
\wA word character: [a-zA-Z_0-9]
\WA non-word character: [^\w]
 
POSIX character classes (US-ASCII only)
\p{Lower}A lower-case alphabetic character: [a-z]
\p{Upper}An upper-case alphabetic character:[A-Z]
\p{ASCII}All ASCII:[\x00-\x7F]
\p{Alpha}An alphabetic character:[\p{Lower}\p{Upper}]
\p{Digit}A decimal digit: [0-9]
\p{Alnum}An alphanumeric character:[\p{Alpha}\p{Digit}]
\p{Punct}Punctuation: One of !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
\p{Graph}A visible character: [\p{Alnum}\p{Punct}]
\p{Print}A printable character: [\p{Graph}\x20]
\p{Blank}A space or a tab: [ \t]
\p{Cntrl}A control character: [\x00-\x1F\x7F]
\p{XDigit}A hexadecimal digit: [0-9a-fA-F]
\p{Space}A whitespace character: [ \t\n\x0B\f\r]
 
java.lang.Character classes (simple java character type)
\p{javaLowerCase}Equivalent to java.lang.Character.isLowerCase()
\p{javaUpperCase}Equivalent to java.lang.Character.isUpperCase()
\p{javaWhitespace}Equivalent to java.lang.Character.isWhitespace()
\p{javaMirrored}Equivalent to java.lang.Character.isMirrored()
 
Classes for Unicode scripts, blocks, categories and binary properties
\p{IsLatin}A Latin script character (script)
\p{InGreek}A character in the Greek block (block)
\p{Lu}An uppercase letter (category)
\p{IsAlphabetic}An alphabetic character (binary property)
\p{Sc}A currency symbol
\P{InGreek}Any character except one in the Greek block (negation)
[\p{L}&&[^\p{Lu}]] Any letter except an uppercase letter (subtraction)
 
Boundary matchers
^The beginning of a line
$The end of a line
\bA word boundary
\BA non-word boundary
\AThe beginning of the input
\GThe end of the previous match
\ZThe end of the input but for the final * terminator, if any
\zThe end of the input
 
Greedy quantifiers
X?X, once or not at all
X*X, zero or more times
X+X, one or more times
X{n}X, exactly n times
X{n,}X, at least n times
X{n,m}X, at least n but not more than m times
 
Reluctant quantifiers
X??X, once or not at all
X*?X, zero or more times
X+?X, one or more times
X{n}?X, exactly n times
X{n,}?X, at least n times
X{n,m}?X, at least n but not more than m times
 
Possessive quantifiers
X?+X, once or not at all
X*+X, zero or more times
X++X, one or more times
X{n}+X, exactly n times
X{n,}+X, at least n times
X{n,m}+X, at least n but not more than m times
 
Logical operators
XYX followed by Y
X|YEither X or Y
(X)X, as a capturing group
 
Back references
\nWhatever the nth * capturing group matched
\k<name>Whatever the * named-capturing group "name" matched
 
Quotation
\Nothing, but quotes the following character
\QNothing, but quotes all characters until \E
\ENothing, but ends quoting started by \Q
 
Special constructs (named-capturing and non-capturing)
(?<name>X)X, as a named-capturing group
(?:X)X, as a non-capturing group
(?idmsuxU-idmsuxU) Nothing, but turns match flags i * d m s * u x U * on - off
(?idmsux-idmsux:X)  X, as a non-capturing group with the * given flags i d * m s u * x on - off
(?=X)X, via zero-width positive lookahead
(?!X)X, via zero-width negative lookahead
(?<=X)X, via zero-width positive lookbehind
(?<!X)X, via zero-width negative lookbehind
(?>X)X, as an independent, non-capturing group
* *
* * *
*

Backslashes, escapes, and quoting

* *

The backslash character ('\') serves to introduce escaped * constructs, as defined in the table above, as well as to quote characters * that otherwise would be interpreted as unescaped constructs. Thus the * expression \\ matches a single backslash and \{ matches a * left brace. * *

It is an error to use a backslash prior to any alphabetic character that * does not denote an escaped construct; these are reserved for future * extensions to the regular-expression language. A backslash may be used * prior to a non-alphabetic character regardless of whether that character is * part of an unescaped construct. * *

Backslashes within string literals in Java source code are interpreted * as required by * The Java™ Language Specification * as either Unicode escapes (section 3.3) or other character escapes (section 3.10.6) * It is therefore necessary to double backslashes in string * literals that represent regular expressions to protect them from * interpretation by the Java bytecode compiler. The string literal * "\b", for example, matches a single backspace character when * interpreted as a regular expression, while "\\b" matches a * word boundary. The string literal "\(hello\)" is illegal * and leads to a compile-time error; in order to match the string * (hello) the string literal "\\(hello\\)" * must be used. * * *

Character Classes

* *

Character classes may appear within other character classes, and * may be composed by the union operator (implicit) and the intersection * operator (&&). * The union operator denotes a class that contains every character that is * in at least one of its operand classes. The intersection operator * denotes a class that contains every character that is in both of its * operand classes. * *

The precedence of character-class operators is as follows, from * highest to lowest: * *

* * * * * * * * * * * * * * * *
1    Literal escape    \x
2    Grouping[...]
3    Rangea-z
4    Union[a-e][i-u]
5    Intersection[a-z&&[aeiou]]
* *

Note that a different set of metacharacters are in effect inside * a character class than outside a character class. For instance, the * regular expression . loses its special meaning inside a * character class, while the expression - becomes a range * forming metacharacter. * * *

Line terminators

* *

A line terminator is a one- or two-character sequence that marks * the end of a line of the input character sequence. The following are * recognized as line terminators: * *

*

If {@link #UNIX_LINES} mode is activated, then the only line terminators * recognized are newline characters. * *

The regular expression . matches any character except a line * terminator unless the {@link #DOTALL} flag is specified. * *

By default, the regular expressions ^ and $ ignore * line terminators and only match at the beginning and the end, respectively, * of the entire input sequence. If {@link #MULTILINE} mode is activated then * ^ matches at the beginning of input and after any line terminator * except at the end of input. When in {@link #MULTILINE} mode $ * matches just before a line terminator or the end of the input sequence. * * *

Groups and capturing

* * *
Group number
*

Capturing groups are numbered by counting their opening parentheses from * left to right. In the expression ((A)(B(C))), for example, there * are four such groups:

* *
* * * * * * * * *
1    ((A)(B(C)))
2    (A)
3    (B(C))
4    (C)
* *

Group zero always stands for the entire expression. * *

Capturing groups are so named because, during a match, each subsequence * of the input sequence that matches such a group is saved. The captured * subsequence may be used later in the expression, via a back reference, and * may also be retrieved from the matcher once the match operation is complete. * * *

Group name
*

A capturing group can also be assigned a "name", a named-capturing group, * and then be back-referenced later by the "name". Group names are composed of * the following characters. The first character must be a letter. * *

* *

A named-capturing group is still numbered as described in * Group number. * *

The captured input associated with a group is always the subsequence * that the group most recently matched. If a group is evaluated a second time * because of quantification then its previously-captured value, if any, will * be retained if the second evaluation fails. Matching the string * "aba" against the expression (a(b)?)+, for example, leaves * group two set to "b". All captured input is discarded at the * beginning of each match. * *

Groups beginning with (? are either pure, non-capturing groups * that do not capture text and do not count towards the group total, or * named-capturing group. * *

Unicode support

* *

This class is in conformance with Level 1 of Unicode Technical * Standard #18: Unicode Regular Expression, plus RL2.1 * Canonical Equivalents. *

* Unicode escape sequences such as \u2014 in Java source code * are processed as described in section 3.3 of * The Java™ Language Specification. * Such escape sequences are also implemented directly by the regular-expression * parser so that Unicode escapes can be used in expressions that are read from * files or from the keyboard. Thus the strings "\u2014" and * "\\u2014", while not equal, compile into the same pattern, which * matches the character with hexadecimal value 0x2014. *

* A Unicode character can also be represented in a regular-expression by * using its Hex notation(hexadecimal code point value) directly as described in construct * \x{...}, for example a supplementary character U+2011F * can be specified as \x{2011F}, instead of two consecutive * Unicode escape sequences of the surrogate pair * \uD840\uDD1F. *

* Unicode scripts, blocks, categories and binary properties are written with * the \p and \P constructs as in Perl. * \p{prop} matches if * the input has the property prop, while \P{prop} * does not match if the input has that property. *

* Scripts, blocks, categories and binary properties can be used both inside * and outside of a character class. * *

* Scripts are specified either with the prefix {@code Is}, as in * {@code IsHiragana}, or by using the {@code script} keyword (or its short * form {@code sc})as in {@code script=Hiragana} or {@code sc=Hiragana}. *

* The script names supported by Pattern are the valid script names * accepted and defined by * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}. * *

* Blocks are specified with the prefix {@code In}, as in * {@code InMongolian}, or by using the keyword {@code block} (or its short * form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}. *

* The block names supported by Pattern are the valid block names * accepted and defined by * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}. *

* * Categories may be specified with the optional prefix {@code Is}: * Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode * letters. Same as scripts and blocks, categories can also be specified * by using the keyword {@code general_category} (or its short form * {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}. *

* The supported categories are those of * * The Unicode Standard in the version specified by the * {@link java.lang.Character Character} class. The category names are those * defined in the Standard, both normative and informative. *

* * Binary properties are specified with the prefix {@code Is}, as in * {@code IsAlphabetic}. The supported binary properties by Pattern * are *

*

* Predefined Character classes and POSIX character classes are in * conformance with the recommendation of Annex C: Compatibility Properties * of Unicode Regular Expression * . *

* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
ClassesMatches
\p{Lower}A lowercase character:\p{IsLowercase}
\p{Upper}An uppercase character:\p{IsUppercase}
\p{ASCII}All ASCII:[\x00-\x7F]
\p{Alpha}An alphabetic character:\p{IsAlphabetic}
\p{Digit}A decimal digit character:\p{IsDigit}
\p{Alnum}An alphanumeric character:[\p{IsAlphabetic}\p{IsDigit}]
\p{Punct}A punctuation character:\p{IsPunctuation}
\p{Graph}A visible character: [^\p{IsWhite_Space}\p{gc=Cc}\p{gc=Cs}\p{gc=Cn}]
\p{Print}A printable character: [\p{Graph}\p{Blank}&&[^\p{Cntrl}]]
\p{Blank}A space or a tab: [\p{IsWhite_Space}&&[^\p{gc=Zl}\p{gc=Zp}\x0a\x0b\x0c\x0d\x85]]
\p{Cntrl}A control character: \p{gc=Cc}
\p{XDigit}A hexadecimal digit: [\p{gc=Nd}\p{IsHex_Digit}]
\p{Space}A whitespace character:\p{IsWhite_Space}
\dA digit: \p{IsDigit}
\DA non-digit: [^\d]
\sA whitespace character: \p{IsWhite_Space}
\SA non-whitespace character: [^\s]
\wA word character: [\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}]
\WA non-word character: [^\w]
*

* * Categories that behave like the java.lang.Character * boolean ismethodname methods (except for the deprecated ones) are * available through the same \p{prop} syntax where * the specified property has the name javamethodname. * *

Comparison to Perl 5

* *

The Pattern engine performs traditional NFA-based matching * with ordered alternation as occurs in Perl 5. * *

Perl constructs not supported by this class:

* *
* *

Constructs supported by this class but not by Perl:

* * * *

Notable differences from Perl:

* * * * *

For a more precise description of the behavior of regular expression * constructs, please see * Mastering Regular Expressions, 3rd Edition, Jeffrey E. F. Friedl, * O'Reilly and Associates, 2006. *

* * @see java.lang.String#split(String, int) * @see java.lang.String#split(String) * * @author Mike McCloskey * @author Mark Reinhold * @author JSR-51 Expert Group * @since 1.4 * @spec JSR-51 */ public final class Pattern implements java.io.Serializable { /** * Regular expression modifier values. Instead of being passed as * arguments, they can also be passed as inline modifiers. * For example, the following statements have the same effect. *
     * Pattern p1 = Pattern.compile("abc", Pattern.CASE_INSENSITIVE|Pattern.MULTILINE);
     * Pattern p2 = Pattern.compile("(?im)abc", 0);
     * 
* * The flags are duplicated so that the familiar Perl match flag * names are available. */

    /**
     * Enables Unix lines mode.
     *
     * <p> In this mode, only the {@code '\n'} line terminator is recognized
     * in the behavior of {@code .}, {@code ^}, and {@code $}.
     *
     * <p> Unix lines mode can also be enabled via the embedded flag
     * expression {@code (?d)}.
     */
    public static final int UNIX_LINES = 0x01;

    /**
     * Enables case-insensitive matching.
     *
     * <p> By default, case-insensitive matching assumes that only characters
     * in the US-ASCII charset are being matched.  Unicode-aware
     * case-insensitive matching can be enabled by specifying the {@link
     * #UNICODE_CASE} flag in conjunction with this flag.
     *
     * <p> Case-insensitive matching can also be enabled via the embedded flag
     * expression {@code (?i)}.
     *
     * <p> Specifying this flag may impose a slight performance penalty.
     */
    public static final int CASE_INSENSITIVE = 0x02;

    /**
     * Permits whitespace and comments in pattern.
     *
     * <p> In this mode, whitespace is ignored, and embedded comments starting
     * with {@code #} are ignored until the end of a line.
     *
     * <p> Comments mode can also be enabled via the embedded flag
     * expression {@code (?x)}.
     */
    public static final int COMMENTS = 0x04;

    /**
     * Enables multiline mode.
     *
     * <p> In multiline mode the expressions {@code ^} and {@code $} match
     * just after or just before, respectively, a line terminator or the end of
     * the input sequence.  By default these expressions only match at the
     * beginning and the end of the entire input sequence.
     *
     * <p> Multiline mode can also be enabled via the embedded flag
     * expression {@code (?m)}.
     */
    public static final int MULTILINE = 0x08;

    /**
     * Enables literal parsing of the pattern.
     *
     * <p> When this flag is specified then the input string that specifies
     * the pattern is treated as a sequence of literal characters.
     * Metacharacters or escape sequences in the input sequence will be
     * given no special meaning.
     *
     * <p> The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact on
     * matching when used in conjunction with this flag. The other flags
     * become superfluous.
     *
     * <p> There is no embedded flag character for enabling literal parsing.
     *
     * @since 1.5
     */
    public static final int LITERAL = 0x10;

    /**
     * Enables dotall mode.
     *
     * <p> In dotall mode, the expression {@code .} matches any character,
     * including a line terminator.  By default this expression does not match
     * line terminators.
     *
     * <p> Dotall mode can also be enabled via the embedded flag
     * expression {@code (?s)}.  (The {@code s} is a mnemonic for
     * "single-line" mode, which is what this is called in Perl.)
     */
    public static final int DOTALL = 0x20;

    /**
     * Enables Unicode-aware case folding.
     *
     * <p> When this flag is specified then case-insensitive matching, when
     * enabled by the {@link #CASE_INSENSITIVE} flag, is done in a manner
     * consistent with the Unicode Standard.  By default, case-insensitive
     * matching assumes that only characters in the US-ASCII charset are being
     * matched.
     *
     * <p> Unicode-aware case folding can also be enabled via the embedded flag
     * expression {@code (?u)}.
     *
     * <p> Specifying this flag may impose a performance penalty.
     *
     * <p> NOTE(review): this class's private {@code compile()} masks this bit
     * out before calling the native compiler, so the flag is stored but not
     * forwarded; presumably the native engine always folds case in a
     * Unicode-aware way -- confirm.
     */
    public static final int UNICODE_CASE = 0x40;

    /**
     * Enables canonical equivalence.
     *
     * <p> When this flag is specified then two characters will be considered
     * to match if, and only if, their full canonical decompositions match.
     * The expression {@code "a\u030A"}, for example, will match the
     * string {@code "\u00E5"} when this flag is specified.  By default,
     * matching does not take canonical equivalence into account.
     *
     * <p> There is no embedded flag character for enabling canonical
     * equivalence.
     *
     * <p> Specifying this flag may impose a performance penalty.
     *
     * <p> In this implementation the constructor rejects this flag with an
     * {@link UnsupportedOperationException}.
     */
    public static final int CANON_EQ = 0x80;

    /**
     * Enables the Unicode version of <i>Predefined character classes</i> and
     * <i>POSIX character classes</i> as defined by <i>Unicode Technical
     * Standard #18: Unicode Regular Expression</i>,
     * <i>Annex C: Compatibility Properties</i>.
     *
     * <p> This flag has no effect on Android, unicode character classes are always
     * used.
     *
     * @since 1.7
     */
    public static final int UNICODE_CHARACTER_CLASS = 0x100;

    /* Pattern has only two serialized components: The pattern string
     * and the flags, which are all that is needed to recompile the pattern
     * when it is deserialized.
     */

    /** use serialVersionUID from Merlin b59 for interoperability */
    private static final long serialVersionUID = 5073258162644648461L;

    /**
     * The original regular-expression pattern string.
     *
     * @serial
     */
    private final String pattern;

    /**
     * The original pattern flags.
     *
     * @serial
     */
    private final int flags;

    // Value returned by compileImpl() for this pattern; assigned in compile().
    // Marked transient, so it is not serialized and is recomputed by
    // readObject() through compile().
    transient long address;

    // Registry with which compile() registers every (Pattern, address) pair.
    // Presumably this lets the native allocation be released once the Pattern
    // is unreachable -- confirm against NativeAllocationRegistry.
    private static final NativeAllocationRegistry registry = new NativeAllocationRegistry(
            Pattern.class.getClassLoader(), getNativeFinalizer(), nativeSize());

    /**
     * Compiles the given regular expression into a pattern.

*
     * <p> The pattern is compiled with no match flags set.
     *
     * @param  regex
     *         The expression to be compiled
     *
     * @throws  PatternSyntaxException
     *          If the expression's syntax is invalid
     */
    public static Pattern compile(String regex) {
        // Zero means "no match flags".
        final int noFlags = 0;
        return new Pattern(regex, noFlags);
    }

    /**
     * Compiles the given regular expression into a pattern with the given
     * flags.

* * @param regex * The expression to be compiled * * @param flags * Match flags, a bit mask that may include * {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL}, * {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES}, * {@link #LITERAL}, {@link #UNICODE_CHARACTER_CLASS} * and {@link #COMMENTS} * * @throws IllegalArgumentException * If bit values other than those corresponding to the defined * match flags are set in flags * * @throws PatternSyntaxException * If the expression's syntax is invalid */ public static Pattern compile(String regex, int flags) throws PatternSyntaxException { return new Pattern(regex, flags); } /** * Returns the regular expression from which this pattern was compiled. *

*
     * @return  The source of this pattern, exactly as it was supplied at
     *          compile time
     */
    public String pattern() {
        return this.pattern;
    }

    /**
     *

Returns the string representation of this pattern. This * is the regular expression from which this pattern was * compiled.

*
     * @return  The string representation of this pattern
     * @since 1.5
     */
    public String toString() {
        // Same value as pattern(): the source regular expression.
        return this.pattern;
    }

    /**
     * Creates a matcher that will match the given input against this pattern.
     *

*
     * @param  input
     *         The character sequence to be matched
     *
     * @return  A new matcher for this pattern
     */
    public Matcher matcher(CharSequence input) {
        // Every call hands out a fresh Matcher bound to this pattern.
        return new Matcher(this, input);
    }

    /**
     * Returns this pattern's match flags.

*
     * @return  The match flags specified when this pattern was compiled
     */
    public int flags() {
        return this.flags;
    }

    /**
     * Compiles the given regular expression and attempts to match the given
     * input against it.
     *
     *

An invocation of this convenience method of the form * *

     * Pattern.matches(regex, input);
* * behaves in exactly the same way as the expression * *
     * Pattern.compile(regex).matcher(input).matches()
* *

If a pattern is to be used multiple times, compiling it once and reusing * it will be more efficient than invoking this method each time.

*
     * @param  regex
     *         The expression to be compiled
     *
     * @param  input
     *         The character sequence to be matched
     *
     * @return  whether the entire input sequence matches the expression
     *
     * @throws  PatternSyntaxException
     *          If the expression's syntax is invalid
     */
    public static boolean matches(String regex, CharSequence input) {
        // Compile, bind to the input and test the whole region in one go.
        return Pattern.compile(regex).matcher(input).matches();
    }

    /**
     * Splits the given input sequence around matches of this pattern.
     *
     *

The array returned by this method contains each substring of the * input sequence that is terminated by another subsequence that matches * this pattern or is terminated by the end of the input sequence. The * substrings in the array are in the order in which they occur in the * input. If this pattern does not match any subsequence of the input then * the resulting array has just one element, namely the input sequence in * string form. * *

The limit parameter controls the number of times the * pattern is applied and therefore affects the length of the resulting * array. If the limit n is greater than zero then the pattern * will be applied at most n - 1 times, the array's * length will be no greater than n, and the array's last entry * will contain all input beyond the last matched delimiter. If n * is non-positive then the pattern will be applied as many times as * possible and the array can have any length. If n is zero then * the pattern will be applied as many times as possible, the array can * have any length, and trailing empty strings will be discarded. * *

The input "boo:and:foo", for example, yields the following * results with these parameters: * *

* * * * * * * * * * * * * * * * * * * * * *

Regex    

Limit    

Result    

:2{ "boo", "and:foo" }
:5{ "boo", "and", "foo" }
:-2{ "boo", "and", "foo" }
o5{ "b", "", ":and:f", "", "" }
o-2{ "b", "", ":and:f", "", "" }
o0{ "b", "", ":and:f" }
* * * @param input * The character sequence to be split * * @param limit * The result threshold, as described above * * @return The array of strings computed by splitting the input * around matches of this pattern */ public String[] split(CharSequence input, int limit) { String[] fast = fastSplit(pattern, input.toString(), limit); if (fast != null) { return fast; } int index = 0; boolean matchLimited = limit > 0; ArrayList matchList = new ArrayList<>(); Matcher m = matcher(input); // Add segments before each match found while(m.find()) { if (!matchLimited || matchList.size() < limit - 1) { String match = input.subSequence(index, m.start()).toString(); matchList.add(match); index = m.end(); } else if (matchList.size() == limit - 1) { // last one String match = input.subSequence(index, input.length()).toString(); matchList.add(match); index = m.end(); } } // If no match was found, return this if (index == 0) return new String[] {input.toString()}; // Add remaining segment if (!matchLimited || matchList.size() < limit) matchList.add(input.subSequence(index, input.length()).toString()); // Construct result int resultSize = matchList.size(); if (limit == 0) while (resultSize > 0 && matchList.get(resultSize-1).equals("")) resultSize--; String[] result = new String[resultSize]; return matchList.subList(0, resultSize).toArray(result); } private static final String FASTSPLIT_METACHARACTERS = "\\?*+[](){}^$.|"; /** * Returns a result equivalent to {@code s.split(separator, limit)} if it's able * to compute it more cheaply than native impl, or null if the caller should fall back to * using native impl. * * fastpath will work if the regex is a * (1)one-char String and this character is not one of the * RegEx's meta characters ".$|()[{^?*+\\", or * (2)two-char String and the first char is the backslash and * the second is one of regEx's meta characters ".$|()[{^?*+\\". * @hide */ public static String[] fastSplit(String re, String input, int limit) { // Can we do it cheaply? 
int len = re.length(); if (len == 0) { return null; } char ch = re.charAt(0); if (len == 1 && FASTSPLIT_METACHARACTERS.indexOf(ch) == -1) { // We're looking for a single non-metacharacter. Easy. } else if (len == 2 && ch == '\\') { // We're looking for a quoted character. // Quoted metacharacters are effectively single non-metacharacters. ch = re.charAt(1); if (FASTSPLIT_METACHARACTERS.indexOf(ch) == -1) { return null; } } else { return null; } // We can do this cheaply... // Unlike Perl, which considers the result of splitting the empty string to be the empty // array, Java returns an array containing the empty string. if (input.isEmpty()) { return new String[] { "" }; } // Count separators int separatorCount = 0; int begin = 0; int end; while (separatorCount + 1 != limit && (end = input.indexOf(ch, begin)) != -1) { ++separatorCount; begin = end + 1; } int lastPartEnd = input.length(); if (limit == 0 && begin == lastPartEnd) { // Last part is empty for limit == 0, remove all trailing empty matches. if (separatorCount == lastPartEnd) { // Input contains only separators. return EmptyArray.STRING; } // Find the beginning of trailing separators. do { --begin; } while (input.charAt(begin - 1) == ch); // Reduce separatorCount and fix lastPartEnd. separatorCount -= input.length() - begin; lastPartEnd = begin; } // Collect the result parts. String[] result = new String[separatorCount + 1]; begin = 0; for (int i = 0; i != separatorCount; ++i) { end = input.indexOf(ch, begin); result[i] = input.substring(begin, end); begin = end + 1; } // Add last part. result[separatorCount] = input.substring(begin, lastPartEnd); return result; } /** * Splits the given input sequence around matches of this pattern. * *

This method works as if by invoking the two-argument {@link * #split(java.lang.CharSequence, int) split} method with the given input * sequence and a limit argument of zero. Trailing empty strings are * therefore not included in the resulting array.

* *

The input "boo:and:foo", for example, yields the following * results with these expressions: * *

* * * * * * *

Regex    

Result

:{ "boo", "and", "foo" }
o{ "b", "", ":and:f" }
*
     * @param  input
     *         The character sequence to be split
     *
     * @return  The array of strings computed by splitting the input
     *          around matches of this pattern
     */
    public String[] split(CharSequence input) {
        // A limit of zero: unlimited splits, trailing empty strings dropped.
        final int unlimitedDropTrailing = 0;
        return split(input, unlimitedDropTrailing);
    }

    /**
     * Returns a literal pattern String for the specified
     * String.
     *
     *

This method produces a String that can be used to * create a Pattern that would match the string * s as if it were a literal pattern.

Metacharacters * or escape sequences in the input sequence will be given no special * meaning. * * @param s The string to be literalized * @return A literal string replacement * @since 1.5 */ public static String quote(String s) { int slashEIndex = s.indexOf("\\E"); if (slashEIndex == -1) return "\\Q" + s + "\\E"; StringBuilder sb = new StringBuilder(s.length() * 2); sb.append("\\Q"); slashEIndex = 0; int current = 0; while ((slashEIndex = s.indexOf("\\E", current)) != -1) { sb.append(s.substring(current, slashEIndex)); current = slashEIndex + 2; sb.append("\\E\\\\E\\Q"); } sb.append(s.substring(current, s.length())); sb.append("\\E"); return sb.toString(); } /** * Recompile the Pattern instance from a stream. The original pattern * string is read in and the object tree is recompiled from it. */ private void readObject(java.io.ObjectInputStream s) throws java.io.IOException, ClassNotFoundException { // Read in all fields s.defaultReadObject(); compile(); } /** * This private constructor is used to create all Patterns. The pattern * string and match flags are all that is needed to completely describe * a Pattern. */ private Pattern(String p, int f) { if ((f & CANON_EQ) != 0) { throw new UnsupportedOperationException("CANON_EQ flag not supported"); } int supportedFlags = CASE_INSENSITIVE | COMMENTS | DOTALL | LITERAL | MULTILINE | UNICODE_CASE | UNIX_LINES; if ((f & ~supportedFlags) != 0) { throw new IllegalArgumentException("Unsupported flags: " + (f & ~supportedFlags)); } this.pattern = p; this.flags = f; compile(); } private void compile() throws PatternSyntaxException { if (pattern == null) { throw new NullPointerException("pattern == null"); } String icuPattern = pattern; if ((flags & LITERAL) != 0) { icuPattern = quote(pattern); } // These are the flags natively supported by ICU. // They even have the same value in native code. 
int icuFlags = flags & (CASE_INSENSITIVE | COMMENTS | MULTILINE | DOTALL | UNIX_LINES); address = compileImpl(icuPattern, icuFlags); registry.registerNativeAllocation(this, address); } private static native long compileImpl(String regex, int flags); private static native long getNativeFinalizer(); private static native int nativeSize(); /** * Creates a predicate which can be used to match a string. * * @return The predicate which can be used for matching on a string * @since 1.8 */ public Predicate asPredicate() { return s -> matcher(s).find(); } /** * Creates a stream from the given input sequence around matches of this * pattern. * *

<p> The stream returned by this method contains each substring of the
     * input sequence that is terminated by another subsequence that matches
     * this pattern or is terminated by the end of the input sequence.  The
     * substrings in the stream are in the order in which they occur in the
     * input. Trailing empty strings will be discarded and not encountered in
     * the stream.
     *
     *

<p> If this pattern does not match any subsequence of the input then
     * the resulting stream has just one element, namely the input sequence in
     * string form.
     *
     *

<p> When there is a positive-width match at the beginning of the input
     * sequence then an empty leading substring is included at the beginning
     * of the stream. A zero-width match at the beginning, however, never
     * produces such an empty leading substring.
     *
     *

If the input sequence is mutable, it must remain constant during the * execution of the terminal stream operation. Otherwise, the result of the * terminal stream operation is undefined. * * @param input * The character sequence to be split * * @return The stream of strings computed by splitting the input * around matches of this pattern * @see #split(CharSequence) * @since 1.8 */ public Stream splitAsStream(final CharSequence input) { class MatcherIterator implements Iterator { private final Matcher matcher; // The start position of the next sub-sequence of input // when current == input.length there are no more elements private int current; // null if the next element, if any, needs to obtained private String nextElement; // > 0 if there are N next empty elements private int emptyElementCount; MatcherIterator() { this.matcher = matcher(input); } public String next() { if (!hasNext()) throw new NoSuchElementException(); if (emptyElementCount == 0) { String n = nextElement; nextElement = null; return n; } else { emptyElementCount--; return ""; } } public boolean hasNext() { if (nextElement != null || emptyElementCount > 0) return true; if (current == input.length()) return false; // Consume the next matching element // Count sequence of matching empty elements while (matcher.find()) { nextElement = input.subSequence(current, matcher.start()).toString(); current = matcher.end(); if (!nextElement.isEmpty()) { return true; } else if (current > 0) { // no empty leading substring for zero-width // match at the beginning of the input emptyElementCount++; } } // Consume last matching element nextElement = input.subSequence(current, input.length()).toString(); current = input.length(); if (!nextElement.isEmpty()) { return true; } else { // Ignore a terminal sequence of matching empty elements emptyElementCount = 0; nextElement = null; return false; } } } return StreamSupport.stream(Spliterators.spliteratorUnknownSize( new MatcherIterator(), Spliterator.ORDERED | 
Spliterator.NONNULL), false); } }