mirror of
https://github.com/openjdk/jdk.git
synced 2026-03-09 15:42:10 +00:00
6328855: String: Matches hangs at short and easy Strings containing \r \n
6192895: java.util.regex.Matcher: Performance issue
6345469: java.util.regex.Matcher utilizes 100% of the CPU
6988218: RegEx matcher loops
6693451: RegEx matcher goes into infinite delay
7006761: Matcher.matches() has infinite loop
8140212: Slow performance of Matcher.find
8151481: j.u.regex.Pattern cleanup
6609854: Regex does not match correctly for negative nested character classes
4916384: CANON_EQ supports only combining character sequences with non-spacing marks
4867170: Pattern doesn't work with composite character in CANON_EQ mode
6995635: CANON_EQ pattern flag is buggy
6728861: ExceptionInInitializerError is caught when the pattern has precomposed character
6736245: A character in Composition Exclusion Table does not match itself
7080302: the normalization in java regex pattern may have flaw
Reviewed-by: rriggs, okutsu, alanb
This commit is contained in:
parent
772322c6fa
commit
b45ea8903e
@ -139,8 +139,6 @@ public class ProtectionDomain {
|
||||
*/
|
||||
final Key key = new Key();
|
||||
|
||||
private static final Debug debug = Debug.getInstance("domain");
|
||||
|
||||
/**
|
||||
* Creates a new ProtectionDomain with the given CodeSource and
|
||||
* Permissions. If the permissions object is not null, then
|
||||
@ -338,6 +336,13 @@ public class ProtectionDomain {
|
||||
" "+pc+"\n";
|
||||
}
|
||||
|
||||
/*
|
||||
* Holder class for the static field "debug". Keeping the field in a nested class delays the Debug.getInstance("domain") call until DebugHolder.debug is first referenced.
|
||||
*/
|
||||
private static class DebugHolder {
|
||||
private static final Debug debug = Debug.getInstance("domain");
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true (merge policy permissions) in the following cases:
|
||||
*
|
||||
@ -359,7 +364,7 @@ public class ProtectionDomain {
|
||||
if (sm == null) {
|
||||
return true;
|
||||
} else {
|
||||
if (debug != null) {
|
||||
if (DebugHolder.debug != null) {
|
||||
if (sm.getClass().getClassLoader() == null &&
|
||||
Policy.getPolicyNoCheck().getClass().getClassLoader()
|
||||
== null) {
|
||||
|
||||
@ -62,8 +62,6 @@ public class SecureClassLoader extends ClassLoader {
|
||||
private final Map<CodeSourceKey, ProtectionDomain> pdcache
|
||||
= new ConcurrentHashMap<>(11);
|
||||
|
||||
private static final Debug debug = Debug.getInstance("scl");
|
||||
|
||||
static {
|
||||
ClassLoader.registerAsParallelCapable();
|
||||
}
|
||||
@ -202,6 +200,13 @@ public class SecureClassLoader extends ClassLoader {
|
||||
return new Permissions(); // ProtectionDomain defers the binding
|
||||
}
|
||||
|
||||
/*
|
||||
* Holder class for the static field "debug". Keeping the field in a nested class delays the Debug.getInstance("scl") call until DebugHolder.debug is first referenced.
|
||||
*/
|
||||
private static class DebugHolder {
|
||||
private static final Debug debug = Debug.getInstance("scl");
|
||||
}
|
||||
|
||||
/*
|
||||
* Returned cached ProtectionDomain for the specified CodeSource.
|
||||
*/
|
||||
@ -222,9 +227,9 @@ public class SecureClassLoader extends ClassLoader {
|
||||
= SecureClassLoader.this.getPermissions(cs);
|
||||
ProtectionDomain pd = new ProtectionDomain(
|
||||
cs, perms, SecureClassLoader.this, null);
|
||||
if (debug != null) {
|
||||
debug.println(" getPermissions " + pd);
|
||||
debug.println("");
|
||||
if (DebugHolder.debug != null) {
|
||||
DebugHolder.debug.println(" getPermissions " + pd);
|
||||
DebugHolder.debug.println("");
|
||||
}
|
||||
return pd;
|
||||
}
|
||||
|
||||
@ -0,0 +1,375 @@
|
||||
/*
|
||||
* Copyright (c) 2011, 2016, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
package java.util.regex;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Locale;
|
||||
import java.util.regex.Pattern.CharPredicate;
|
||||
import java.util.regex.Pattern.BmpCharPredicate;
|
||||
|
||||
class CharPredicates {
|
||||
|
||||
static final CharPredicate ALPHABETIC = Character::isAlphabetic;
|
||||
|
||||
// \p{gc=Decimal_Number}
|
||||
static final CharPredicate DIGIT = Character::isDigit;
|
||||
|
||||
static final CharPredicate LETTER = Character::isLetter;
|
||||
|
||||
static final CharPredicate IDEOGRAPHIC = Character::isIdeographic;
|
||||
|
||||
static final CharPredicate LOWERCASE = Character::isLowerCase;
|
||||
|
||||
static final CharPredicate UPPERCASE = Character::isUpperCase;
|
||||
|
||||
static final CharPredicate TITLECASE = Character::isTitleCase;
|
||||
|
||||
// \p{Whitespace}
|
||||
static final CharPredicate WHITE_SPACE = ch ->
|
||||
((((1 << Character.SPACE_SEPARATOR) |
|
||||
(1 << Character.LINE_SEPARATOR) |
|
||||
(1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1)
|
||||
!= 0 || (ch >= 0x9 && ch <= 0xd) || (ch == 0x85);
|
||||
|
||||
// \p{gc=Control}
|
||||
static final CharPredicate CONTROL = ch ->
|
||||
Character.getType(ch) == Character.CONTROL;
|
||||
|
||||
// \p{gc=Punctuation}
|
||||
static final CharPredicate PUNCTUATION = ch ->
|
||||
((((1 << Character.CONNECTOR_PUNCTUATION) |
|
||||
(1 << Character.DASH_PUNCTUATION) |
|
||||
(1 << Character.START_PUNCTUATION) |
|
||||
(1 << Character.END_PUNCTUATION) |
|
||||
(1 << Character.OTHER_PUNCTUATION) |
|
||||
(1 << Character.INITIAL_QUOTE_PUNCTUATION) |
|
||||
(1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1)
|
||||
!= 0;
|
||||
|
||||
// \p{gc=Decimal_Number}
|
||||
// \p{Hex_Digit} -> PropList.txt: Hex_Digit
|
||||
static final CharPredicate HEX_DIGIT = DIGIT.union(
|
||||
ch -> (ch >= 0x0030 && ch <= 0x0039) ||
|
||||
(ch >= 0x0041 && ch <= 0x0046) ||
|
||||
(ch >= 0x0061 && ch <= 0x0066) ||
|
||||
(ch >= 0xFF10 && ch <= 0xFF19) ||
|
||||
(ch >= 0xFF21 && ch <= 0xFF26) ||
|
||||
(ch >= 0xFF41 && ch <= 0xFF46));
|
||||
|
||||
static final CharPredicate ASSIGNED = ch ->
|
||||
Character.getType(ch) != Character.UNASSIGNED;
|
||||
|
||||
// PropList.txt:Noncharacter_Code_Point
|
||||
static final CharPredicate NONCHARACTER_CODE_POINT = ch ->
|
||||
(ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef);
|
||||
|
||||
// \p{alpha}
|
||||
// \p{digit}
|
||||
static final CharPredicate ALNUM = ALPHABETIC.union(DIGIT);
|
||||
|
||||
// \p{Whitespace} --
|
||||
// [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL} -> 0xa, 0xb, 0xc, 0xd, 0x85
|
||||
// \p{gc=Line_Separator}
|
||||
// \p{gc=Paragraph_Separator}]
|
||||
static final CharPredicate BLANK = ch ->
|
||||
Character.getType(ch) == Character.SPACE_SEPARATOR ||
|
||||
ch == 0x9; // \N{HT}
|
||||
|
||||
// [^
|
||||
// \p{space}
|
||||
// \p{gc=Control}
|
||||
// \p{gc=Surrogate}
|
||||
// \p{gc=Unassigned}]
|
||||
static final CharPredicate GRAPH = ch ->
|
||||
((((1 << Character.SPACE_SEPARATOR) |
|
||||
(1 << Character.LINE_SEPARATOR) |
|
||||
(1 << Character.PARAGRAPH_SEPARATOR) |
|
||||
(1 << Character.CONTROL) |
|
||||
(1 << Character.SURROGATE) |
|
||||
(1 << Character.UNASSIGNED)) >> Character.getType(ch)) & 1)
|
||||
== 0;
|
||||
|
||||
// \p{graph}
|
||||
// \p{blank}
|
||||
// -- \p{cntrl}
|
||||
static final CharPredicate PRINT = GRAPH.union(BLANK).and(CONTROL.negate());
|
||||
|
||||
// 200C..200D PropList.txt:Join_Control
|
||||
static final CharPredicate JOIN_CONTROL = ch -> ch == 0x200C || ch == 0x200D;
|
||||
|
||||
// \p{alpha}
|
||||
// \p{gc=Mark}
|
||||
// \p{digit}
|
||||
// \p{gc=Connector_Punctuation}
|
||||
// \p{Join_Control} 200C..200D
|
||||
static final CharPredicate WORD =
|
||||
ALPHABETIC.union(ch -> ((((1 << Character.NON_SPACING_MARK) |
|
||||
(1 << Character.ENCLOSING_MARK) |
|
||||
(1 << Character.COMBINING_SPACING_MARK) |
|
||||
(1 << Character.DECIMAL_DIGIT_NUMBER) |
|
||||
(1 << Character.CONNECTOR_PUNCTUATION))
|
||||
>> Character.getType(ch)) & 1) != 0,
|
||||
JOIN_CONTROL);
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
private static final HashMap<String, CharPredicate> posix = new HashMap<>(12);
|
||||
private static final HashMap<String, CharPredicate> uprops = new HashMap<>(18);
|
||||
|
||||
private static void defPosix(String name, CharPredicate p) {
|
||||
posix.put(name, p);
|
||||
}
|
||||
private static void defUProp(String name, CharPredicate p) {
|
||||
uprops.put(name, p);
|
||||
}
|
||||
|
||||
static {
|
||||
defPosix("ALPHA", ALPHABETIC);
|
||||
defPosix("LOWER", LOWERCASE);
|
||||
defPosix("UPPER", UPPERCASE);
|
||||
defPosix("SPACE", WHITE_SPACE);
|
||||
defPosix("PUNCT", PUNCTUATION);
|
||||
defPosix("XDIGIT",HEX_DIGIT);
|
||||
defPosix("ALNUM", ALNUM);
|
||||
defPosix("CNTRL", CONTROL);
|
||||
defPosix("DIGIT", DIGIT);
|
||||
defPosix("BLANK", BLANK);
|
||||
defPosix("GRAPH", GRAPH);
|
||||
defPosix("PRINT", PRINT);
|
||||
|
||||
defUProp("ALPHABETIC", ALPHABETIC);
|
||||
defUProp("ASSIGNED", ASSIGNED);
|
||||
defUProp("CONTROL", CONTROL);
|
||||
defUProp("HEXDIGIT", HEX_DIGIT);
|
||||
defUProp("IDEOGRAPHIC", IDEOGRAPHIC);
|
||||
defUProp("JOINCONTROL", JOIN_CONTROL);
|
||||
defUProp("LETTER", LETTER);
|
||||
defUProp("LOWERCASE", LOWERCASE);
|
||||
defUProp("NONCHARACTERCODEPOINT", NONCHARACTER_CODE_POINT);
|
||||
defUProp("TITLECASE", TITLECASE);
|
||||
defUProp("PUNCTUATION", PUNCTUATION);
|
||||
defUProp("UPPERCASE", UPPERCASE);
|
||||
defUProp("WHITESPACE", WHITE_SPACE);
|
||||
defUProp("WORD", WORD);
|
||||
defUProp("WHITE_SPACE", WHITE_SPACE);
|
||||
defUProp("HEX_DIGIT", HEX_DIGIT);
|
||||
defUProp("NONCHARACTER_CODE_POINT", NONCHARACTER_CODE_POINT);
|
||||
defUProp("JOIN_CONTROL", JOIN_CONTROL);
|
||||
}
|
||||
|
||||
public static CharPredicate forUnicodeProperty(String propName) {
|
||||
propName = propName.toUpperCase(Locale.ROOT);
|
||||
CharPredicate p = uprops.get(propName);
|
||||
if (p != null)
|
||||
return p;
|
||||
return posix.get(propName);
|
||||
}
|
||||
|
||||
public static CharPredicate forPOSIXName(String propName) {
|
||||
return posix.get(propName.toUpperCase(Locale.ENGLISH));
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/**
|
||||
* Returns a predicate matching all characters belong to a named
|
||||
* UnicodeScript.
|
||||
*/
|
||||
static CharPredicate forUnicodeScript(String name) {
|
||||
final Character.UnicodeScript script;
|
||||
try {
|
||||
script = Character.UnicodeScript.forName(name);
|
||||
return ch -> script == Character.UnicodeScript.of(ch);
|
||||
} catch (IllegalArgumentException iae) {}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a predicate matching all characters in a UnicodeBlock.
|
||||
*/
|
||||
static CharPredicate forUnicodeBlock(String name) {
|
||||
final Character.UnicodeBlock block;
|
||||
try {
|
||||
block = Character.UnicodeBlock.forName(name);
|
||||
return ch -> block == Character.UnicodeBlock.of(ch);
|
||||
} catch (IllegalArgumentException iae) {}
|
||||
return null;
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// unicode categories, aliases, properties, java methods ...
|
||||
|
||||
private static final HashMap<String, CharPredicate> props = new HashMap<>(128);
|
||||
|
||||
/**
|
||||
* Returns a predicate matching all characters in a named property.
|
||||
*/
|
||||
static CharPredicate forProperty(String name) {
|
||||
return props.get(name);
|
||||
}
|
||||
|
||||
private static void defProp(String name, CharPredicate p) {
|
||||
props.put(name, p);
|
||||
}
|
||||
|
||||
private static void defCategory(String name, final int typeMask) {
|
||||
CharPredicate p = ch -> (typeMask & (1 << Character.getType(ch))) != 0;
|
||||
props.put(name, p);
|
||||
}
|
||||
|
||||
private static void defRange(String name, final int lower, final int upper) {
|
||||
BmpCharPredicate p = ch -> lower <= ch && ch <= upper;
|
||||
props.put(name, p);
|
||||
}
|
||||
|
||||
private static void defCtype(String name, final int ctype) {
|
||||
BmpCharPredicate p = ch -> ch < 128 && ASCII.isType(ch, ctype);
|
||||
// PrintPattern.pmap.put(p, name);
|
||||
props.put(name, p);
|
||||
}
|
||||
|
||||
static {
|
||||
// Unicode character property aliases, defined in
|
||||
// http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
|
||||
defCategory("Cn", 1<<Character.UNASSIGNED);
|
||||
defCategory("Lu", 1<<Character.UPPERCASE_LETTER);
|
||||
defCategory("Ll", 1<<Character.LOWERCASE_LETTER);
|
||||
defCategory("Lt", 1<<Character.TITLECASE_LETTER);
|
||||
defCategory("Lm", 1<<Character.MODIFIER_LETTER);
|
||||
defCategory("Lo", 1<<Character.OTHER_LETTER);
|
||||
defCategory("Mn", 1<<Character.NON_SPACING_MARK);
|
||||
defCategory("Me", 1<<Character.ENCLOSING_MARK);
|
||||
defCategory("Mc", 1<<Character.COMBINING_SPACING_MARK);
|
||||
defCategory("Nd", 1<<Character.DECIMAL_DIGIT_NUMBER);
|
||||
defCategory("Nl", 1<<Character.LETTER_NUMBER);
|
||||
defCategory("No", 1<<Character.OTHER_NUMBER);
|
||||
defCategory("Zs", 1<<Character.SPACE_SEPARATOR);
|
||||
defCategory("Zl", 1<<Character.LINE_SEPARATOR);
|
||||
defCategory("Zp", 1<<Character.PARAGRAPH_SEPARATOR);
|
||||
defCategory("Cc", 1<<Character.CONTROL);
|
||||
defCategory("Cf", 1<<Character.FORMAT);
|
||||
defCategory("Co", 1<<Character.PRIVATE_USE);
|
||||
defCategory("Cs", 1<<Character.SURROGATE);
|
||||
defCategory("Pd", 1<<Character.DASH_PUNCTUATION);
|
||||
defCategory("Ps", 1<<Character.START_PUNCTUATION);
|
||||
defCategory("Pe", 1<<Character.END_PUNCTUATION);
|
||||
defCategory("Pc", 1<<Character.CONNECTOR_PUNCTUATION);
|
||||
defCategory("Po", 1<<Character.OTHER_PUNCTUATION);
|
||||
defCategory("Sm", 1<<Character.MATH_SYMBOL);
|
||||
defCategory("Sc", 1<<Character.CURRENCY_SYMBOL);
|
||||
defCategory("Sk", 1<<Character.MODIFIER_SYMBOL);
|
||||
defCategory("So", 1<<Character.OTHER_SYMBOL);
|
||||
defCategory("Pi", 1<<Character.INITIAL_QUOTE_PUNCTUATION);
|
||||
defCategory("Pf", 1<<Character.FINAL_QUOTE_PUNCTUATION);
|
||||
defCategory("L", ((1<<Character.UPPERCASE_LETTER) |
|
||||
(1<<Character.LOWERCASE_LETTER) |
|
||||
(1<<Character.TITLECASE_LETTER) |
|
||||
(1<<Character.MODIFIER_LETTER) |
|
||||
(1<<Character.OTHER_LETTER)));
|
||||
defCategory("M", ((1<<Character.NON_SPACING_MARK) |
|
||||
(1<<Character.ENCLOSING_MARK) |
|
||||
(1<<Character.COMBINING_SPACING_MARK)));
|
||||
defCategory("N", ((1<<Character.DECIMAL_DIGIT_NUMBER) |
|
||||
(1<<Character.LETTER_NUMBER) |
|
||||
(1<<Character.OTHER_NUMBER)));
|
||||
defCategory("Z", ((1<<Character.SPACE_SEPARATOR) |
|
||||
(1<<Character.LINE_SEPARATOR) |
|
||||
(1<<Character.PARAGRAPH_SEPARATOR)));
|
||||
defCategory("C", ((1<<Character.CONTROL) |
|
||||
(1<<Character.FORMAT) |
|
||||
(1<<Character.PRIVATE_USE) |
|
||||
(1<<Character.SURROGATE))); // Other
|
||||
defCategory("P", ((1<<Character.DASH_PUNCTUATION) |
|
||||
(1<<Character.START_PUNCTUATION) |
|
||||
(1<<Character.END_PUNCTUATION) |
|
||||
(1<<Character.CONNECTOR_PUNCTUATION) |
|
||||
(1<<Character.OTHER_PUNCTUATION) |
|
||||
(1<<Character.INITIAL_QUOTE_PUNCTUATION) |
|
||||
(1<<Character.FINAL_QUOTE_PUNCTUATION)));
|
||||
defCategory("S", ((1<<Character.MATH_SYMBOL) |
|
||||
(1<<Character.CURRENCY_SYMBOL) |
|
||||
(1<<Character.MODIFIER_SYMBOL) |
|
||||
(1<<Character.OTHER_SYMBOL)));
|
||||
defCategory("LC", ((1<<Character.UPPERCASE_LETTER) |
|
||||
(1<<Character.LOWERCASE_LETTER) |
|
||||
(1<<Character.TITLECASE_LETTER)));
|
||||
defCategory("LD", ((1<<Character.UPPERCASE_LETTER) |
|
||||
(1<<Character.LOWERCASE_LETTER) |
|
||||
(1<<Character.TITLECASE_LETTER) |
|
||||
(1<<Character.MODIFIER_LETTER) |
|
||||
(1<<Character.OTHER_LETTER) |
|
||||
(1<<Character.DECIMAL_DIGIT_NUMBER)));
|
||||
defRange("L1", 0x00, 0xFF); // Latin-1
|
||||
props.put("all", ch -> true);
|
||||
|
||||
// Posix regular expression character classes, defined in
|
||||
// http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html
|
||||
defRange("ASCII", 0x00, 0x7F); // ASCII
|
||||
defCtype("Alnum", ASCII.ALNUM); // Alphanumeric characters
|
||||
defCtype("Alpha", ASCII.ALPHA); // Alphabetic characters
|
||||
defCtype("Blank", ASCII.BLANK); // Space and tab characters
|
||||
defCtype("Cntrl", ASCII.CNTRL); // Control characters
|
||||
defRange("Digit", '0', '9'); // Numeric characters
|
||||
defCtype("Graph", ASCII.GRAPH); // printable and visible
|
||||
defRange("Lower", 'a', 'z'); // Lower-case alphabetic
|
||||
defRange("Print", 0x20, 0x7E); // Printable characters
|
||||
defCtype("Punct", ASCII.PUNCT); // Punctuation characters
|
||||
defCtype("Space", ASCII.SPACE); // Space characters
|
||||
defRange("Upper", 'A', 'Z'); // Upper-case alphabetic
|
||||
defCtype("XDigit",ASCII.XDIGIT); // hexadecimal digits
|
||||
|
||||
// Java character properties, defined by methods in Character.java
|
||||
defProp("javaLowerCase", java.lang.Character::isLowerCase);
|
||||
defProp("javaUpperCase", Character::isUpperCase);
|
||||
defProp("javaAlphabetic", java.lang.Character::isAlphabetic);
|
||||
defProp("javaIdeographic", java.lang.Character::isIdeographic);
|
||||
defProp("javaTitleCase", java.lang.Character::isTitleCase);
|
||||
defProp("javaDigit", java.lang.Character::isDigit);
|
||||
defProp("javaDefined", java.lang.Character::isDefined);
|
||||
defProp("javaLetter", java.lang.Character::isLetter);
|
||||
defProp("javaLetterOrDigit", java.lang.Character::isLetterOrDigit);
|
||||
defProp("javaJavaIdentifierStart", java.lang.Character::isJavaIdentifierStart);
|
||||
defProp("javaJavaIdentifierPart", java.lang.Character::isJavaIdentifierPart);
|
||||
defProp("javaUnicodeIdentifierStart", java.lang.Character::isUnicodeIdentifierStart);
|
||||
defProp("javaUnicodeIdentifierPart", java.lang.Character::isUnicodeIdentifierPart);
|
||||
defProp("javaIdentifierIgnorable", java.lang.Character::isIdentifierIgnorable);
|
||||
defProp("javaSpaceChar", java.lang.Character::isSpaceChar);
|
||||
defProp("javaWhitespace", java.lang.Character::isWhitespace);
|
||||
defProp("javaISOControl", java.lang.Character::isISOControl);
|
||||
defProp("javaMirrored", java.lang.Character::isMirrored);
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/**
|
||||
* Posix ASCII variants, not in the lookup map
|
||||
*/
|
||||
static final BmpCharPredicate ASCII_DIGIT = ch -> ch < 128 && ASCII.isDigit(ch);
|
||||
static final BmpCharPredicate ASCII_WORD = ch -> ch < 128 && ASCII.isWord(ch);
|
||||
static final BmpCharPredicate ASCII_SPACE = ch -> ch < 128 && ASCII.isSpace(ch);
|
||||
|
||||
}
|
||||
@ -0,0 +1,98 @@
|
||||
/*
|
||||
* Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
package java.util.regex;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
 * A lightweight hash set for primitive {@code int} values. Not safe for
 * concurrent access.
 *
 * <p>Layout: {@code entries} stores (value, next) pairs back to back, so
 * each added value occupies two consecutive slots. {@code hashes[b]} holds
 * the index in {@code entries} of the most recently added pair of bucket
 * {@code b}, and the "next" slot of a pair holds the index of the pair that
 * was previously the head of the same bucket. -1 marks an empty bucket and
 * the end of a chain.
 *
 * <p>Buckets are selected with {@link Math#floorMod(int, int)}, so negative
 * values are accepted as well; for non-negative values the bucket is the
 * same as {@code value % hashes.length}.
 */
class IntHashSet {
    private int[] entries;   // (value, next-index) pairs
    private int[] hashes;    // bucket -> index of chain head in entries, or -1
    private int pos = 0;     // next free slot in entries

    /** Creates an empty set with room for 16 values before the first expansion. */
    public IntHashSet() {
        this.entries = new int[16 << 1];      // initCapacity = 16;
        this.hashes = new int[(16 / 2) | 1];  // odd -> fewer collisions
        Arrays.fill(this.entries, -1);
        Arrays.fill(this.hashes, -1);
    }

    /**
     * Returns {@code true} if {@code i} has been added since the set was
     * created or last cleared.
     */
    public boolean contains(int i) {
        // floorMod keeps the bucket index non-negative for negative i
        int h = hashes[Math.floorMod(i, hashes.length)];
        while (h != -1) {
            if (entries[h] == i)
                return true;
            h = entries[h + 1];
        }
        return false;
    }

    /** Adds {@code i} to the set; does nothing if it is already present. */
    public void add(int i) {
        int h0 = Math.floorMod(i, hashes.length);
        int next = hashes[h0];
        //  if invoker guarantees contains(i) checked before add(i)
        //  the following check is not needed.
        int next0 = next;
        while (next0 != -1) {
            if (entries[next0] == i)
                return;
            next0 = entries[next0 + 1];
        }
        // the new pair becomes the chain head and links to the old head
        hashes[h0] = pos;
        entries[pos++] = i;
        entries[pos++] = next;
        if (pos == entries.length)
            expand();
    }

    /** Removes all values; the arrays keep their current size. */
    public void clear() {
        Arrays.fill(this.entries, -1);
        Arrays.fill(this.hashes, -1);
        pos = 0;
    }

    /**
     * Doubles the size of {@code entries}, sizes {@code hashes} to match
     * and re-links every stored value into its new bucket. Each value keeps
     * its index in {@code entries}.
     */
    private void expand() {
        int[] old = entries;
        int[] es = new int[old.length << 1];
        int hlen = (old.length / 2) | 1;
        int[] hs = new int[hlen];
        Arrays.fill(es, -1);
        Arrays.fill(hs, -1);
        for (int n = 0; n < pos;) {  // re-hashing
            int i = old[n];
            int hsh = Math.floorMod(i, hlen);
            int next = hs[hsh];
            hs[hsh] = n;
            es[n++] = i;
            es[n++] = next;
        }
        this.entries = es;
        this.hashes = hs;
    }
}
|
||||
@ -177,6 +177,14 @@ public final class Matcher implements MatchResult {
|
||||
*/
|
||||
int[] locals;
|
||||
|
||||
/**
|
||||
* Storage used by top greedy Loop node to store a specific hash set to
|
||||
* keep the beginning index of the failed repetition match. The nodes
|
||||
* themselves are stateless, so they rely on this field to hold state
|
||||
* during a match.
|
||||
*/
|
||||
IntHashSet[] localsPos;
|
||||
|
||||
/**
|
||||
* Boolean indicating whether or not more input could change
|
||||
* the results of the last match.
|
||||
@ -239,6 +247,7 @@ public final class Matcher implements MatchResult {
|
||||
int parentGroupCount = Math.max(parent.capturingGroupCount, 10);
|
||||
groups = new int[parentGroupCount * 2];
|
||||
locals = new int[parent.localCount];
|
||||
localsPos = new IntHashSet[parent.localTCNCount];
|
||||
|
||||
// Put fields into initial states
|
||||
reset();
|
||||
@ -375,6 +384,7 @@ public final class Matcher implements MatchResult {
|
||||
groups[i] = -1;
|
||||
for (int i = 0; i < locals.length; i++)
|
||||
locals[i] = -1;
|
||||
localsPos = new IntHashSet[parentPattern.localTCNCount];
|
||||
modCount++;
|
||||
return this;
|
||||
}
|
||||
@ -397,6 +407,10 @@ public final class Matcher implements MatchResult {
|
||||
groups[i] = -1;
|
||||
for(int i=0; i<locals.length; i++)
|
||||
locals[i] = -1;
|
||||
for (int i = 0; i < localsPos.length; i++) {
|
||||
if (localsPos[i] != null)
|
||||
localsPos[i].clear();
|
||||
}
|
||||
lastAppendPosition = 0;
|
||||
from = 0;
|
||||
to = getTextLength();
|
||||
@ -1706,6 +1720,10 @@ public final class Matcher implements MatchResult {
|
||||
this.oldLast = oldLast < 0 ? from : oldLast;
|
||||
for (int i = 0; i < groups.length; i++)
|
||||
groups[i] = -1;
|
||||
for (int i = 0; i < localsPos.length; i++) {
|
||||
if (localsPos[i] != null)
|
||||
localsPos[i].clear();
|
||||
}
|
||||
acceptMode = NOANCHOR;
|
||||
boolean result = parentPattern.root.match(this, from, text);
|
||||
if (!result)
|
||||
@ -1729,6 +1747,10 @@ public final class Matcher implements MatchResult {
|
||||
this.oldLast = oldLast < 0 ? from : oldLast;
|
||||
for (int i = 0; i < groups.length; i++)
|
||||
groups[i] = -1;
|
||||
for (int i = 0; i < localsPos.length; i++) {
|
||||
if (localsPos[i] != null)
|
||||
localsPos[i].clear();
|
||||
}
|
||||
acceptMode = anchor;
|
||||
boolean result = parentPattern.matchRoot.match(this, from, text);
|
||||
if (!result)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,220 @@
|
||||
/*
|
||||
* Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
package java.util.regex;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.regex.Pattern.CharPredicate;
|
||||
import java.util.regex.CharPredicates;
|
||||
import static java.util.regex.ASCII.*;
|
||||
|
||||
/**
|
||||
* A utility class to print out the pattern node tree.
|
||||
*/
|
||||
|
||||
class PrintPattern {
|
||||
|
||||
private static HashMap<Pattern.Node, Integer> ids = new HashMap<>();
|
||||
|
||||
private static void print(Pattern.Node node, String text, int depth) {
|
||||
if (!ids.containsKey(node))
|
||||
ids.put(node, ids.size());
|
||||
print("%6d:%" + (depth==0? "": depth<<1) + "s<%s>", ids.get(node), "", text);
|
||||
if (ids.containsKey(node.next))
|
||||
print(" (=>%d)", ids.get(node.next));
|
||||
print("%n");
|
||||
}
|
||||
|
||||
private static void print(String s, int depth) {
|
||||
print(" %" + (depth==0?"":depth<<1) + "s<%s>%n", "", s);
|
||||
}
|
||||
|
||||
private static void print(String fmt, Object ... args) {
|
||||
System.err.printf(fmt, args);
|
||||
}
|
||||
|
||||
private static String toStringCPS(int[] cps) {
|
||||
StringBuilder sb = new StringBuilder(cps.length);
|
||||
for (int cp : cps)
|
||||
sb.append(toStringCP(cp));
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private static String toStringCP(int cp) {
|
||||
return (isPrint(cp) ? "" + (char)cp
|
||||
: "\\u" + Integer.toString(cp, 16));
|
||||
}
|
||||
|
||||
private static String toStringRange(int min, int max) {
|
||||
if (max == Pattern.MAX_REPS) {
|
||||
if (min == 0)
|
||||
return " * ";
|
||||
else if (min == 1)
|
||||
return " + ";
|
||||
return "{" + min + ", max}";
|
||||
}
|
||||
return "{" + min + ", " + max + "}";
|
||||
}
|
||||
|
||||
private static String toStringCtype(int type) {
|
||||
switch(type) {
|
||||
case UPPER: return "ASCII.UPPER";
|
||||
case LOWER: return "ASCII.LOWER";
|
||||
case DIGIT: return "ASCII.DIGIT";
|
||||
case SPACE: return "ASCII.SPACE";
|
||||
case PUNCT: return "ASCII.PUNCT";
|
||||
case CNTRL: return "ASCII.CNTRL";
|
||||
case BLANK: return "ASCII.BLANK";
|
||||
case UNDER: return "ASCII.UNDER";
|
||||
case ASCII: return "ASCII.ASCII";
|
||||
case ALPHA: return "ASCII.ALPHA";
|
||||
case ALNUM: return "ASCII.ALNUM";
|
||||
case GRAPH: return "ASCII.GRAPH";
|
||||
case WORD: return "ASCII.WORD";
|
||||
case XDIGIT: return "ASCII.XDIGIT";
|
||||
default: return "ASCII ?";
|
||||
}
|
||||
}
|
||||
|
||||
private static String toString(Pattern.Node node) {
|
||||
String name = node.getClass().getName();
|
||||
return name.substring(name.lastIndexOf('$') + 1);
|
||||
}
|
||||
|
||||
static HashMap<CharPredicate, String> pmap;
|
||||
static {
|
||||
pmap = new HashMap<>();
|
||||
pmap.put(Pattern.ALL, "All");
|
||||
pmap.put(Pattern.DOT, "Dot");
|
||||
pmap.put(Pattern.UNIXDOT, "UnixDot");
|
||||
pmap.put(Pattern.VertWS, "VertWS");
|
||||
pmap.put(Pattern.HorizWS, "HorizWS");
|
||||
|
||||
pmap.put(CharPredicates.ASCII_DIGIT, "ASCII.DIGIT");
|
||||
pmap.put(CharPredicates.ASCII_WORD, "ASCII.WORD");
|
||||
pmap.put(CharPredicates.ASCII_SPACE, "ASCII.SPACE");
|
||||
}
|
||||
|
||||
static void walk(Pattern.Node node, int depth) {
|
||||
depth++;
|
||||
while(node != null) {
|
||||
String name = toString(node);
|
||||
String str;
|
||||
if (node instanceof Pattern.Prolog) {
|
||||
print(node, name, depth);
|
||||
// print the loop here
|
||||
Pattern.Loop loop = ((Pattern.Prolog)node).loop;
|
||||
name = toString(loop);
|
||||
str = name + " " + toStringRange(loop.cmin, loop.cmax);
|
||||
print(loop, str, depth);
|
||||
walk(loop.body, depth);
|
||||
print("/" + name, depth);
|
||||
node = loop;
|
||||
} else if (node instanceof Pattern.Loop) {
|
||||
return; // stop here, body.next -> loop
|
||||
} else if (node instanceof Pattern.Curly) {
|
||||
Pattern.Curly c = (Pattern.Curly)node;
|
||||
str = "Curly " + c.type + " " + toStringRange(c.cmin, c.cmax);
|
||||
print(node, str, depth);
|
||||
walk(c.atom, depth);
|
||||
print("/Curly", depth);
|
||||
} else if (node instanceof Pattern.GroupCurly) {
|
||||
Pattern.GroupCurly gc = (Pattern.GroupCurly)node;
|
||||
str = "GroupCurly " + gc.groupIndex / 2 +
|
||||
", " + gc.type + " " + toStringRange(gc.cmin, gc.cmax);
|
||||
print(node, str, depth);
|
||||
walk(gc.atom, depth);
|
||||
print("/GroupCurly", depth);
|
||||
} else if (node instanceof Pattern.GroupHead) {
|
||||
Pattern.GroupHead head = (Pattern.GroupHead)node;
|
||||
Pattern.GroupTail tail = head.tail;
|
||||
print(head, "Group.head " + (tail.groupIndex / 2), depth);
|
||||
walk(head.next, depth);
|
||||
print(tail, "/Group.tail " + (tail.groupIndex / 2), depth);
|
||||
node = tail;
|
||||
} else if (node instanceof Pattern.GroupTail) {
|
||||
return; // stopper
|
||||
} else if (node instanceof Pattern.Ques) {
|
||||
print(node, "Ques " + ((Pattern.Ques)node).type, depth);
|
||||
walk(((Pattern.Ques)node).atom, depth);
|
||||
print("/Ques", depth);
|
||||
} else if (node instanceof Pattern.Branch) {
|
||||
Pattern.Branch b = (Pattern.Branch)node;
|
||||
print(b, name, depth);
|
||||
int i = 0;
|
||||
while (true) {
|
||||
if (b.atoms[i] != null) {
|
||||
walk(b.atoms[i], depth);
|
||||
} else {
|
||||
print(" (accepted)", depth);
|
||||
}
|
||||
if (++i == b.size)
|
||||
break;
|
||||
print("-branch.separator-", depth);
|
||||
}
|
||||
node = b.conn;
|
||||
print(node, "/Branch", depth);
|
||||
} else if (node instanceof Pattern.BranchConn) {
|
||||
return;
|
||||
} else if (node instanceof Pattern.CharProperty) {
|
||||
str = pmap.get(((Pattern.CharProperty)node).predicate);
|
||||
if (str == null)
|
||||
str = toString(node);
|
||||
else
|
||||
str = "Single \"" + str + "\"";
|
||||
print(node, str, depth);
|
||||
} else if (node instanceof Pattern.SliceNode) {
|
||||
str = name + " \"" +
|
||||
toStringCPS(((Pattern.SliceNode)node).buffer) + "\"";
|
||||
print(node, str, depth);
|
||||
} else if (node instanceof Pattern.CharPropertyGreedy) {
|
||||
Pattern.CharPropertyGreedy gcp = (Pattern.CharPropertyGreedy)node;
|
||||
String pstr = pmap.get(gcp.predicate);
|
||||
if (pstr == null)
|
||||
pstr = gcp.predicate.toString();
|
||||
else
|
||||
pstr = "Single \"" + pstr + "\"";
|
||||
str = name + " " + pstr + ((gcp.cmin == 0) ? "*" : "+");
|
||||
print(node, str, depth);
|
||||
} else if (node instanceof Pattern.BackRef) {
|
||||
str = "GroupBackRef " + ((Pattern.BackRef)node).groupIndex / 2;
|
||||
print(node, str, depth);
|
||||
} else if (node instanceof Pattern.LastNode) {
|
||||
print(node, "END", depth);
|
||||
} else if (node == Pattern.accept) {
|
||||
return;
|
||||
} else {
|
||||
print(node, name, depth);
|
||||
}
|
||||
node = node.next;
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
Pattern p = Pattern.compile(args[0]);
|
||||
System.out.println(" Pattern: " + p);
|
||||
walk(p.root, 0);
|
||||
}
|
||||
}
|
||||
@ -1,246 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2011, 2013, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
package java.util.regex;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Locale;
|
||||
|
||||
/**
 * Named character properties, each exposed as a code point predicate via
 * {@link #is(int)}. Constants can be looked up by property name with
 * {@link #forName(String)} or by POSIX class name with
 * {@link #forPOSIXName(String)}.
 */
enum UnicodeProp {

    ALPHABETIC {
        @Override
        public boolean is(int ch) {
            return Character.isAlphabetic(ch);
        }
    },

    LETTER {
        @Override
        public boolean is(int ch) {
            return Character.isLetter(ch);
        }
    },

    IDEOGRAPHIC {
        @Override
        public boolean is(int ch) {
            return Character.isIdeographic(ch);
        }
    },

    LOWERCASE {
        @Override
        public boolean is(int ch) {
            return Character.isLowerCase(ch);
        }
    },

    UPPERCASE {
        @Override
        public boolean is(int ch) {
            return Character.isUpperCase(ch);
        }
    },

    TITLECASE {
        @Override
        public boolean is(int ch) {
            return Character.isTitleCase(ch);
        }
    },

    WHITE_SPACE {
        // \p{Whitespace}
        @Override
        public boolean is(int ch) {
            return hasType(ch, SEPARATOR_TYPES)
                || (ch >= 0x9 && ch <= 0xd)
                || (ch == 0x85);
        }
    },

    CONTROL {
        // \p{gc=Control}
        @Override
        public boolean is(int ch) {
            return Character.getType(ch) == Character.CONTROL;
        }
    },

    PUNCTUATION {
        // \p{gc=Punctuation}
        @Override
        public boolean is(int ch) {
            return hasType(ch, PUNCTUATION_TYPES);
        }
    },

    HEX_DIGIT {
        // \p{gc=Decimal_Number}
        // \p{Hex_Digit}    -> PropList.txt: Hex_Digit
        @Override
        public boolean is(int ch) {
            return DIGIT.is(ch)
                || (ch >= 0x0030 && ch <= 0x0039)
                || (ch >= 0x0041 && ch <= 0x0046)
                || (ch >= 0x0061 && ch <= 0x0066)
                || (ch >= 0xFF10 && ch <= 0xFF19)
                || (ch >= 0xFF21 && ch <= 0xFF26)
                || (ch >= 0xFF41 && ch <= 0xFF46);
        }
    },

    ASSIGNED {
        @Override
        public boolean is(int ch) {
            return Character.getType(ch) != Character.UNASSIGNED;
        }
    },

    NONCHARACTER_CODE_POINT {
        // PropList.txt:Noncharacter_Code_Point
        @Override
        public boolean is(int ch) {
            return (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef);
        }
    },

    DIGIT {
        // \p{gc=Decimal_Number}
        @Override
        public boolean is(int ch) {
            return Character.isDigit(ch);
        }
    },

    ALNUM {
        // \p{alpha}
        // \p{digit}
        @Override
        public boolean is(int ch) {
            return ALPHABETIC.is(ch) || DIGIT.is(ch);
        }
    },

    BLANK {
        // \p{Whitespace} --
        // [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL}  -> 0xa, 0xb, 0xc, 0xd, 0x85
        //  \p{gc=Line_Separator}
        //  \p{gc=Paragraph_Separator}]
        @Override
        public boolean is(int ch) {
            return Character.getType(ch) == Character.SPACE_SEPARATOR
                || ch == 0x9; // \N{HT}
        }
    },

    GRAPH {
        // [^
        //  \p{space}
        //  \p{gc=Control}
        //  \p{gc=Surrogate}
        //  \p{gc=Unassigned}]
        @Override
        public boolean is(int ch) {
            return !hasType(ch, NON_GRAPH_TYPES);
        }
    },

    PRINT {
        // \p{graph}
        // \p{blank}
        // -- \p{cntrl}
        @Override
        public boolean is(int ch) {
            if (!GRAPH.is(ch) && !BLANK.is(ch)) {
                return false;
            }
            return !CONTROL.is(ch);
        }
    },

    WORD {
        //  \p{alpha}
        //  \p{gc=Mark}
        //  \p{digit}
        //  \p{gc=Connector_Punctuation}
        //  \p{Join_Control}    200C..200D
        @Override
        public boolean is(int ch) {
            return ALPHABETIC.is(ch)
                || hasType(ch, WORD_TYPES)
                || JOIN_CONTROL.is(ch);
        }
    },

    JOIN_CONTROL {
        //  200C..200D    PropList.txt:Join_Control
        @Override
        public boolean is(int ch) {
            return (ch == 0x200C || ch == 0x200D);
        }
    };

    // Bit masks over Character.getType() values; bit n is set when general
    // category n belongs to the set.
    private static final int SEPARATOR_TYPES =
        (1 << Character.SPACE_SEPARATOR) |
        (1 << Character.LINE_SEPARATOR) |
        (1 << Character.PARAGRAPH_SEPARATOR);

    private static final int PUNCTUATION_TYPES =
        (1 << Character.CONNECTOR_PUNCTUATION) |
        (1 << Character.DASH_PUNCTUATION) |
        (1 << Character.START_PUNCTUATION) |
        (1 << Character.END_PUNCTUATION) |
        (1 << Character.OTHER_PUNCTUATION) |
        (1 << Character.INITIAL_QUOTE_PUNCTUATION) |
        (1 << Character.FINAL_QUOTE_PUNCTUATION);

    private static final int NON_GRAPH_TYPES =
        SEPARATOR_TYPES |
        (1 << Character.CONTROL) |
        (1 << Character.SURROGATE) |
        (1 << Character.UNASSIGNED);

    private static final int WORD_TYPES =
        (1 << Character.NON_SPACING_MARK) |
        (1 << Character.ENCLOSING_MARK) |
        (1 << Character.COMBINING_SPACING_MARK) |
        (1 << Character.DECIMAL_DIGIT_NUMBER) |
        (1 << Character.CONNECTOR_PUNCTUATION);

    // POSIX class name (upper case) -> constant.
    private static final HashMap<String, UnicodeProp> POSIX_CLASSES = new HashMap<>();
    // Alternative spelling (upper case) -> constant name.
    private static final HashMap<String, String> ALIASES = new HashMap<>();

    static {
        POSIX_CLASSES.put("ALPHA", ALPHABETIC);
        POSIX_CLASSES.put("LOWER", LOWERCASE);
        POSIX_CLASSES.put("UPPER", UPPERCASE);
        POSIX_CLASSES.put("SPACE", WHITE_SPACE);
        POSIX_CLASSES.put("PUNCT", PUNCTUATION);
        POSIX_CLASSES.put("XDIGIT", HEX_DIGIT);
        POSIX_CLASSES.put("ALNUM", ALNUM);
        POSIX_CLASSES.put("CNTRL", CONTROL);
        POSIX_CLASSES.put("DIGIT", DIGIT);
        POSIX_CLASSES.put("BLANK", BLANK);
        POSIX_CLASSES.put("GRAPH", GRAPH);
        POSIX_CLASSES.put("PRINT", PRINT);

        ALIASES.put("WHITESPACE", "WHITE_SPACE");
        ALIASES.put("HEXDIGIT", "HEX_DIGIT");
        ALIASES.put("NONCHARACTERCODEPOINT", "NONCHARACTER_CODE_POINT");
        ALIASES.put("JOINCONTROL", "JOIN_CONTROL");
    }

    /**
     * Tells whether the general category of {@code ch}, as reported by
     * {@link Character#getType(int)}, is one of the categories whose bit
     * is set in {@code typeMask}.
     */
    private static boolean hasType(int ch, int typeMask) {
        return ((typeMask >> Character.getType(ch)) & 1) != 0;
    }

    /**
     * Looks up a property by name, ignoring case and accepting the
     * underscore-free aliases.
     *
     * @param propName property name, e.g. {@code "Alphabetic"} or
     *                 {@code "WhiteSpace"}
     * @return the matching constant, or {@code null} if there is none
     */
    public static UnicodeProp forName(String propName) {
        String key = propName.toUpperCase(Locale.ENGLISH);
        String canonical = ALIASES.get(key);
        if (canonical != null) {
            key = canonical;
        }
        try {
            return valueOf(key);
        } catch (IllegalArgumentException ignored) {
            // Unknown names are reported to the caller as null.
            return null;
        }
    }

    /**
     * Looks up a property by POSIX class name, ignoring case.
     *
     * @param propName POSIX class name, e.g. {@code "alpha"} or
     *                 {@code "XDigit"}
     * @return the matching constant, or {@code null} if there is none
     */
    public static UnicodeProp forPOSIXName(String propName) {
        return POSIX_CLASSES.get(propName.toUpperCase(Locale.ENGLISH));
    }

    /**
     * Tells whether the code point {@code ch} has this property.
     *
     * @param ch the code point to test
     * @return {@code true} if {@code ch} has this property
     */
    public abstract boolean is(int ch);
}
|
||||
@ -33,6 +33,9 @@
|
||||
* 6350801 6676425 6878475 6919132 6931676 6948903 6990617 7014645 7039066
|
||||
* 7067045 7014640 7189363 8007395 8013252 8013254 8012646 8023647 6559590
|
||||
* 8027645 8035076 8039124 8035975 8074678 6854417 8143854 8147531 7071819
|
||||
* 8151481 4867170 7080302 6728861 6995635 6736245 4916384
|
||||
* 6328855 6192895 6345469 6988218 6693451 7006761 8140212
|
||||
*
|
||||
* @library /lib/testlibrary
|
||||
* @build jdk.testlibrary.*
|
||||
* @run main RegExTest
|
||||
@ -162,6 +165,7 @@ public class RegExTest {
|
||||
patternAsPredicate();
|
||||
invalidFlags();
|
||||
grapheme();
|
||||
expoBacktracking();
|
||||
|
||||
if (failure) {
|
||||
throw new
|
||||
@ -2659,51 +2663,101 @@ public class RegExTest {
|
||||
check(p, "test\u00e4\u0323\u0300", true);
|
||||
check(p, "test\u00e4\u0300\u0323", true);
|
||||
|
||||
/*
|
||||
* The following canonical equivalence tests don't work. Bug id: 4916384.
|
||||
*
|
||||
// Decomposed hangul (jamos)
|
||||
p = Pattern.compile("\u1100\u1161", Pattern.CANON_EQ);
|
||||
m = p.matcher("\u1100\u1161");
|
||||
if (!m.matches())
|
||||
failCount++;
|
||||
Object[][] data = new Object[][] {
|
||||
|
||||
m.reset("\uac00");
|
||||
if (!m.matches())
|
||||
failCount++;
|
||||
// JDK-4867170
|
||||
{ "[\u1f80-\u1f82]", "ab\u1f80cd", "f", true },
|
||||
{ "[\u1f80-\u1f82]", "ab\u1f81cd", "f", true },
|
||||
{ "[\u1f80-\u1f82]", "ab\u1f82cd", "f", true },
|
||||
{ "[\u1f80-\u1f82]", "ab\u03b1\u0314\u0345cd", "f", true },
|
||||
{ "[\u1f80-\u1f82]", "ab\u03b1\u0345\u0314cd", "f", true },
|
||||
{ "[\u1f80-\u1f82]", "ab\u1f01\u0345cd", "f", true },
|
||||
{ "[\u1f80-\u1f82]", "ab\u1f00\u0345cd", "f", true },
|
||||
|
||||
{ "\\p{IsGreek}", "ab\u1f80cd", "f", true },
|
||||
{ "\\p{IsGreek}", "ab\u1f81cd", "f", true },
|
||||
{ "\\p{IsGreek}", "ab\u1f82cd", "f", true },
|
||||
{ "\\p{IsGreek}", "ab\u03b1\u0314\u0345cd", "f", true },
|
||||
{ "\\p{IsGreek}", "ab\u1f01\u0345cd", "f", true },
|
||||
|
||||
// backtracking, force to match "\u1f80", instead of "\u1f82"
|
||||
{ "ab\\p{IsGreek}\u0300cd", "ab\u03b1\u0313\u0345\u0300cd", "m", true },
|
||||
|
||||
{ "[\\p{IsGreek}]", "\u03b1\u0314\u0345", "m", true },
|
||||
{ "\\p{IsGreek}", "\u03b1\u0314\u0345", "m", true },
|
||||
|
||||
{ "[^\u1f80-\u1f82]","\u1f81", "m", false },
|
||||
{ "[^\u1f80-\u1f82]","\u03b1\u0314\u0345", "m", false },
|
||||
{ "[^\u1f01\u0345]", "\u1f81", "f", false },
|
||||
|
||||
{ "[^\u1f81]+", "\u1f80\u1f82", "f", true },
|
||||
{ "[\u1f80]", "ab\u1f80cd", "f", true },
|
||||
{ "\u1f80", "ab\u1f80cd", "f", true },
|
||||
{ "\u1f00\u0345\u0300", "\u1f82", "m", true },
|
||||
{ "\u1f80", "-\u1f00\u0345\u0300-", "f", true },
|
||||
{ "\u1f82", "\u1f00\u0345\u0300", "m", true },
|
||||
{ "\u1f82", "\u1f80\u0300", "m", true },
|
||||
|
||||
// JDK-7080302 # compile failed
|
||||
{ "a(\u0041\u0301\u0328)", "a\u0041\u0301\u0328", "m", true},
|
||||
|
||||
// JDK-6728861, same cause as above one
|
||||
{ "\u00e9\u00e9n", "e\u0301e\u0301n", "m", true},
|
||||
|
||||
// JDK-6995635
|
||||
{ "(\u00e9)", "e\u0301", "m", true },
|
||||
|
||||
// JDK-6736245
|
||||
// interesting special case: nfc(u2add+u0338) -> u2add+u0338, NOT u2adc
|
||||
{ "\u2ADC", "\u2ADC", "m", true}, // NFC
|
||||
{ "\u2ADC", "\u2ADD\u0338", "m", true}, // NFD
|
||||
|
||||
// 4916384.
|
||||
// Decomposed hangul (jamos) works inside clazz
|
||||
{ "[\u1100\u1161]", "\u1100\u1161", "m", true},
|
||||
{ "[\u1100\u1161]", "\uac00", "m", true},
|
||||
|
||||
{ "[\uac00]", "\u1100\u1161", "m", true},
|
||||
{ "[\uac00]", "\uac00", "m", true},
|
||||
|
||||
// Decomposed hangul (jamos)
|
||||
{ "\u1100\u1161", "\u1100\u1161", "m", true},
|
||||
{ "\u1100\u1161", "\uac00", "m", true},
|
||||
|
||||
// Composed hangul
|
||||
p = Pattern.compile("\uac00", Pattern.CANON_EQ);
|
||||
m = p.matcher("\u1100\u1161");
|
||||
if (!m.matches())
|
||||
failCount++;
|
||||
{ "\uac00", "\u1100\u1161", "m", true },
|
||||
{ "\uac00", "\uac00", "m", true },
|
||||
|
||||
m.reset("\uac00");
|
||||
if (!m.matches())
|
||||
failCount++;
|
||||
/* Need a NFDSlice to nfd the source to solve this issue
|
||||
u+1d1c0 -> nfd: <u+1d1ba><u+1d165><u+1d16f> -> nfc: <u+1d1ba><u+1d165><u+1d16f>
|
||||
u+1d1bc -> nfd: <u+1d1ba><u+1d165> -> nfc: <u+1d1ba><u+1d165>
|
||||
<u+1d1bc><u+1d16f> -> nfd: <u+1d1ba><u+1d165><u+1d16f> -> nfc: <u+1d1ba><u+1d165><u+1d16f>
|
||||
|
||||
// Decomposed supplementary outside char classes
|
||||
p = Pattern.compile("test\ud834\uddbc\ud834\udd6f", Pattern.CANON_EQ);
|
||||
m = p.matcher("test\ud834\uddc0");
|
||||
if (!m.matches())
|
||||
failCount++;
|
||||
|
||||
m.reset("test\ud834\uddbc\ud834\udd6f");
|
||||
if (!m.matches())
|
||||
failCount++;
|
||||
|
||||
// { "test\ud834\uddbc\ud834\udd6f", "test\ud834\uddc0", "m", true },
|
||||
// Composed supplementary outside char classes
|
||||
p = Pattern.compile("test\ud834\uddc0", Pattern.CANON_EQ);
|
||||
m.reset("test\ud834\uddbc\ud834\udd6f");
|
||||
if (!m.matches())
|
||||
failCount++;
|
||||
|
||||
m = p.matcher("test\ud834\uddc0");
|
||||
if (!m.matches())
|
||||
failCount++;
|
||||
|
||||
// { "test\ud834\uddc0", "test\ud834\uddbc\ud834\udd6f", "m", true },
|
||||
*/
|
||||
{ "test\ud834\uddbc\ud834\udd6f", "test\ud834\uddbc\ud834\udd6f", "m", true },
|
||||
{ "test\ud834\uddc0", "test\ud834\uddbc\ud834\udd6f", "m", true },
|
||||
|
||||
{ "test\ud834\uddc0", "test\ud834\uddc0", "m", true },
|
||||
{ "test\ud834\uddbc\ud834\udd6f", "test\ud834\uddc0", "m", true },
|
||||
};
|
||||
|
||||
int failCount = 0;
|
||||
for (Object[] d : data) {
|
||||
String pn = (String)d[0];
|
||||
String tt = (String)d[1];
|
||||
boolean isFind = "f".equals(((String)d[2]));
|
||||
boolean expected = (boolean)d[3];
|
||||
boolean ret = isFind ? Pattern.compile(pn, Pattern.CANON_EQ).matcher(tt).find()
|
||||
: Pattern.compile(pn, Pattern.CANON_EQ).matcher(tt).matches();
|
||||
if (ret != expected) {
|
||||
failCount++;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
report("Canonical Equivalence");
|
||||
}
|
||||
|
||||
@ -3846,7 +3900,6 @@ public class RegExTest {
|
||||
if (!patternString.startsWith("'")) {
|
||||
return Pattern.compile(patternString);
|
||||
}
|
||||
|
||||
int break1 = patternString.lastIndexOf("'");
|
||||
String flagString = patternString.substring(
|
||||
break1+1, patternString.length());
|
||||
@ -4092,10 +4145,11 @@ public class RegExTest {
|
||||
report("NamedGroupCapture");
|
||||
}
|
||||
|
||||
// This is for bug 6969132
|
||||
// This is for bug 6919132
|
||||
private static void nonBmpClassComplementTest() throws Exception {
|
||||
Pattern p = Pattern.compile("\\P{Lu}");
|
||||
Matcher m = p.matcher(new String(new int[] {0x1d400}, 0, 1));
|
||||
|
||||
if (m.find() && m.start() == 1)
|
||||
failCount++;
|
||||
|
||||
@ -4113,6 +4167,11 @@ public class RegExTest {
|
||||
if (m.find() && m.start() == 1)
|
||||
failCount++;
|
||||
|
||||
p = Pattern.compile("\\P{sc=GRANTHA}");
|
||||
m = p.matcher(new String(new int[] {0x11350}, 0, 1));
|
||||
if (m.find() && m.start() == 1)
|
||||
failCount++;
|
||||
|
||||
report("NonBmpClassComplement");
|
||||
}
|
||||
|
||||
@ -4662,4 +4721,92 @@ public class RegExTest {
|
||||
failCount++;
|
||||
report("Unicode extended grapheme cluster");
|
||||
}
|
||||
|
||||
// hangup/timeout if go into exponential backtracking
|
||||
private static void expoBacktracking() throws Exception {
|
||||
|
||||
Object[][] patternMatchers = {
|
||||
// 6328855
|
||||
{ "(.*\n*)*",
|
||||
"this little fine string lets\r\njava.lang.String.matches\r\ncrash\r\n(We don't know why but adding \r* to the regex makes it work again)",
|
||||
false },
|
||||
// 6192895
|
||||
{ " *([a-zA-Z0-9/\\-\\?:\\(\\)\\.,'\\+\\{\\}]+ *)+",
|
||||
"Hello World this is a test this is a test this is a test A",
|
||||
true },
|
||||
{ " *([a-zA-Z0-9/\\-\\?:\\(\\)\\.,'\\+\\{\\}]+ *)+",
|
||||
"Hello World this is a test this is a test this is a test \u4e00 ",
|
||||
false },
|
||||
{ " *([a-z0-9]+ *)+",
|
||||
"hello world this is a test this is a test this is a test A",
|
||||
false },
|
||||
// 4771934 [FIXED] #5013651?
|
||||
{ "^(\\w+([\\.-]?\\w+)*@\\w+([\\.-]?\\w+)*(\\.\\w{2,4})+[,;]?)+$",
|
||||
"abc@efg.abc,efg@abc.abc,abc@xyz.mno;abc@sdfsd.com",
|
||||
true },
|
||||
// 4866249 [FIXED]
|
||||
{ "<\\s*" + "(meta|META)" + "(\\s|[^>])+" + "(CHARSET|charset)=" + "(\\s|[^>])+>",
|
||||
"<META http-equiv=\"Content-Type\" content=\"text/html; charset=ISO-8859-5\">",
|
||||
true },
|
||||
{ "^(\\w+([\\.-]?\\w+)*@\\w+([\\.-]?\\w+)*(\\.\\w{2,4})+[,;]?)+$",
|
||||
"abc@efg.abc,efg@abc.abc,abc@xyz.mno;sdfsd.com",
|
||||
false },
|
||||
// 6345469
|
||||
{ "((<[^>]+>)?(((\\s)?)*(\\ )?)*((\\s)?)*)+",
|
||||
" < br/> < / p> <p> <html> <adfasfdasdf> </p>",
|
||||
true }, // --> matched
|
||||
{ "((<[^>]+>)?(((\\s)?)*(\\ )?)*((\\s)?)*)+",
|
||||
" < br/> < / p> <p> <html> <adfasfdasdf> p </p>",
|
||||
false },
|
||||
// 5026912
|
||||
{ "^\\s*" + "(\\w|\\d|[\\xC0-\\xFF]|/)+" + "\\s+|$",
|
||||
"156580451111112225588087755221111111566969655555555",
|
||||
false},
|
||||
// 6988218
|
||||
{ "^([+-]?((0[xX](\\p{XDigit}+))|(((\\p{Digit}+)(\\.)?((\\p{Digit}+)?)([eE][+-]?(\\p{Digit}+))?)|(\\.((\\p{Digit}+))([eE][+-]?(\\p{Digit}+))?)))|[n|N]?'([^']*(?:'')*[^']*)*')",
|
||||
"'%)) order by ANGEBOT.ID",
|
||||
false}, // find
|
||||
// 6693451
|
||||
{ "^(\\s*foo\\s*)*$",
|
||||
"foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo",
|
||||
true },
|
||||
{ "^(\\s*foo\\s*)*$",
|
||||
"foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo fo",
|
||||
false
|
||||
},
|
||||
// 7006761
|
||||
{ "(([0-9A-Z]+)([_]?+)*)*", "FOOOOO_BAAAR_FOOOOOOOOO_BA_", true},
|
||||
{ "(([0-9A-Z]+)([_]?+)*)*", "FOOOOO_BAAAR_FOOOOOOOOO_BA_ ", false},
|
||||
// 8140212
|
||||
{ "(?<before>.*)\\{(?<reflection>\\w+):(?<innerMethod>\\w+(\\.?\\w+(\\(((?<args>(('[^']*')|((/|\\w)+))(,(('[^']*')|((/|\\w)+)))*))?\\))?)*)\\}(?<after>.*)",
|
||||
"{CeGlobal:getSodCutoff.getGui.getAmqp.getSimpleModeEnabled()",
|
||||
false
|
||||
},
|
||||
{ "^(a+)+$", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", true},
|
||||
{ "^(a+)+$", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa!", false},
|
||||
|
||||
{ "(x+)*y", "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxy", true },
|
||||
{ "(x+)*y", "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxz", false},
|
||||
|
||||
{ "(x+x+)+y", "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxy", true},
|
||||
{ "(x+x+)+y", "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxz", false},
|
||||
|
||||
{ "(([0-9A-Z]+)([_]?+)*)*", "--------------------------------------", false},
|
||||
|
||||
/* not fixed
|
||||
//8132141 ---> second level exponential backtracking
|
||||
{ "(h|h|ih(((i|a|c|c|a|i|i|j|b|a|i|b|a|a|j))+h)ahbfhba|c|i)*",
|
||||
"hchcchicihcchciiicichhcichcihcchiihichiciiiihhcchicchhcihchcihiihciichhccciccichcichiihcchcihhicchcciicchcccihiiihhihihihichicihhcciccchihhhcchichchciihiicihciihcccciciccicciiiiiiiiicihhhiiiihchccchchhhhiiihchihcccchhhiiiiiiiicicichicihcciciihichhhhchihciiihhiccccccciciihhichiccchhicchicihihccichicciihcichccihhiciccccccccichhhhihihhcchchihihiihhihihihicichihiiiihhhhihhhchhichiicihhiiiiihchccccchichci" },
|
||||
*/
|
||||
};
|
||||
|
||||
for (Object[] pm : patternMatchers) {
|
||||
String p = (String)pm[0];
|
||||
String s = (String)pm[1];
|
||||
boolean r = (Boolean)pm[2];
|
||||
if (r != Pattern.compile(p).matcher(s).matches()) {
|
||||
failCount++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -139,6 +139,71 @@ false 0
|
||||
aaabbbcccdefg
|
||||
true defg 0
|
||||
|
||||
// Negation with nested char class and intersection
|
||||
[^[c]]
|
||||
c
|
||||
false 0
|
||||
|
||||
[^[a-z]]
|
||||
e
|
||||
false 0
|
||||
|
||||
[^[a-z][A-Z]]
|
||||
E
|
||||
false 0
|
||||
|
||||
[^a-d[0-9][m-p]]
|
||||
e
|
||||
true e 0
|
||||
|
||||
[^a-d[0-9][m-p]]
|
||||
8
|
||||
false 0
|
||||
|
||||
[^[a-c]&&[d-f]]
|
||||
z
|
||||
true z 0
|
||||
|
||||
[^a-c&&d-f]
|
||||
a
|
||||
true a 0
|
||||
|
||||
[^a-m&&m-z]
|
||||
m
|
||||
false 0
|
||||
|
||||
[^a-m&&m-z&&a-c]
|
||||
m
|
||||
true m 0
|
||||
|
||||
[^a-cd-f&&[d-f]]
|
||||
c
|
||||
true c 0
|
||||
|
||||
[^[a-c][d-f]&&abc]
|
||||
a
|
||||
false 0
|
||||
|
||||
[^[a-c][d-f]&&abc]
|
||||
d
|
||||
true d 0
|
||||
|
||||
[^[a-c][d-f]&&abc[def]]
|
||||
a
|
||||
false 0
|
||||
|
||||
[^[a-c][d-f]&&abc[def]]
|
||||
e
|
||||
false 0
|
||||
|
||||
[^[a-c]&&[b-d]&&[c-e]]
|
||||
a
|
||||
true a 0
|
||||
|
||||
[^[a-c]&&[b-d]&&[c-e]]
|
||||
c
|
||||
false 0
|
||||
|
||||
// Making sure a ^ not in first position matches literal ^
|
||||
[abc^b]
|
||||
b
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user