diff --git a/jdk/src/java.base/share/classes/java/security/ProtectionDomain.java b/jdk/src/java.base/share/classes/java/security/ProtectionDomain.java index c4a03166990..32cb2186a2e 100644 --- a/jdk/src/java.base/share/classes/java/security/ProtectionDomain.java +++ b/jdk/src/java.base/share/classes/java/security/ProtectionDomain.java @@ -139,8 +139,6 @@ public class ProtectionDomain { */ final Key key = new Key(); - private static final Debug debug = Debug.getInstance("domain"); - /** * Creates a new ProtectionDomain with the given CodeSource and * Permissions. If the permissions object is not null, then @@ -338,6 +336,13 @@ public class ProtectionDomain { " "+pc+"\n"; } + /* + * holder class for the static field "debug" to delay its initialization + */ + private static class DebugHolder { + private static final Debug debug = Debug.getInstance("domain"); + } + /** * Return true (merge policy permissions) in the following cases: * @@ -359,7 +364,7 @@ public class ProtectionDomain { if (sm == null) { return true; } else { - if (debug != null) { + if (DebugHolder.debug != null) { if (sm.getClass().getClassLoader() == null && Policy.getPolicyNoCheck().getClass().getClassLoader() == null) { diff --git a/jdk/src/java.base/share/classes/java/security/SecureClassLoader.java b/jdk/src/java.base/share/classes/java/security/SecureClassLoader.java index eee1e0bb7fe..21b071f4061 100644 --- a/jdk/src/java.base/share/classes/java/security/SecureClassLoader.java +++ b/jdk/src/java.base/share/classes/java/security/SecureClassLoader.java @@ -62,8 +62,6 @@ public class SecureClassLoader extends ClassLoader { private final Map pdcache = new ConcurrentHashMap<>(11); - private static final Debug debug = Debug.getInstance("scl"); - static { ClassLoader.registerAsParallelCapable(); } @@ -202,6 +200,13 @@ public class SecureClassLoader extends ClassLoader { return new Permissions(); // ProtectionDomain defers the binding } + /* + * holder class for the static field "debug" to delay its 
initialization + */ + private static class DebugHolder { + private static final Debug debug = Debug.getInstance("scl"); + } + /* * Returned cached ProtectionDomain for the specified CodeSource. */ @@ -222,9 +227,9 @@ public class SecureClassLoader extends ClassLoader { = SecureClassLoader.this.getPermissions(cs); ProtectionDomain pd = new ProtectionDomain( cs, perms, SecureClassLoader.this, null); - if (debug != null) { - debug.println(" getPermissions " + pd); - debug.println(""); + if (DebugHolder.debug != null) { + DebugHolder.debug.println(" getPermissions " + pd); + DebugHolder.debug.println(""); } return pd; } diff --git a/jdk/src/java.base/share/classes/java/util/regex/CharPredicates.java b/jdk/src/java.base/share/classes/java/util/regex/CharPredicates.java new file mode 100644 index 00000000000..0a873c774a9 --- /dev/null +++ b/jdk/src/java.base/share/classes/java/util/regex/CharPredicates.java @@ -0,0 +1,375 @@ +/* + * Copyright (c) 2011, 2016, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package java.util.regex; + +import java.util.HashMap; +import java.util.Locale; +import java.util.regex.Pattern.CharPredicate; +import java.util.regex.Pattern.BmpCharPredicate; + +class CharPredicates { + + static final CharPredicate ALPHABETIC = Character::isAlphabetic; + + // \p{gc=Decimal_Number} + static final CharPredicate DIGIT = Character::isDigit; + + static final CharPredicate LETTER = Character::isLetter; + + static final CharPredicate IDEOGRAPHIC = Character::isIdeographic; + + static final CharPredicate LOWERCASE = Character::isLowerCase; + + static final CharPredicate UPPERCASE = Character::isUpperCase; + + static final CharPredicate TITLECASE = Character::isTitleCase; + + // \p{Whitespace} + static final CharPredicate WHITE_SPACE = ch -> + ((((1 << Character.SPACE_SEPARATOR) | + (1 << Character.LINE_SEPARATOR) | + (1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1) + != 0 || (ch >= 0x9 && ch <= 0xd) || (ch == 0x85); + + // \p{gc=Control} + static final CharPredicate CONTROL = ch -> + Character.getType(ch) == Character.CONTROL; + + // \p{gc=Punctuation} + static final CharPredicate PUNCTUATION = ch -> + ((((1 << Character.CONNECTOR_PUNCTUATION) | + (1 << Character.DASH_PUNCTUATION) | + (1 << Character.START_PUNCTUATION) | + (1 << Character.END_PUNCTUATION) | + (1 << Character.OTHER_PUNCTUATION) | + (1 << Character.INITIAL_QUOTE_PUNCTUATION) | + (1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1) + != 0; + + // \p{gc=Decimal_Number} + // \p{Hex_Digit} -> PropList.txt: Hex_Digit + static final CharPredicate HEX_DIGIT = DIGIT.union( + ch -> (ch >= 0x0030 && ch <= 0x0039) || + (ch >= 0x0041 && ch <= 0x0046) || + (ch >= 0x0061 && ch <= 0x0066) || + (ch >= 0xFF10 && ch <= 0xFF19) || + (ch >= 0xFF21 && ch <= 0xFF26) || + (ch >= 0xFF41 && ch <= 
0xFF46)); + + static final CharPredicate ASSIGNED = ch -> + Character.getType(ch) != Character.UNASSIGNED; + + // PropList.txt:Noncharacter_Code_Point + static final CharPredicate NONCHARACTER_CODE_POINT = ch -> + (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef); + + // \p{alpha} + // \p{digit} + static final CharPredicate ALNUM = ALPHABETIC.union(DIGIT); + + // \p{Whitespace} -- + // [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL} -> 0xa, 0xb, 0xc, 0xd, 0x85 + // \p{gc=Line_Separator} + // \p{gc=Paragraph_Separator}] + static final CharPredicate BLANK = ch -> + Character.getType(ch) == Character.SPACE_SEPARATOR || + ch == 0x9; // \N{HT} + + // [^ + // \p{space} + // \p{gc=Control} + // \p{gc=Surrogate} + // \p{gc=Unassigned}] + static final CharPredicate GRAPH = ch -> + ((((1 << Character.SPACE_SEPARATOR) | + (1 << Character.LINE_SEPARATOR) | + (1 << Character.PARAGRAPH_SEPARATOR) | + (1 << Character.CONTROL) | + (1 << Character.SURROGATE) | + (1 << Character.UNASSIGNED)) >> Character.getType(ch)) & 1) + == 0; + + // \p{graph} + // \p{blank} + // -- \p{cntrl} + static final CharPredicate PRINT = GRAPH.union(BLANK).and(CONTROL.negate()); + + // 200C..200D PropList.txt:Join_Control + static final CharPredicate JOIN_CONTROL = ch -> ch == 0x200C || ch == 0x200D; + + // \p{alpha} + // \p{gc=Mark} + // \p{digit} + // \p{gc=Connector_Punctuation} + // \p{Join_Control} 200C..200D + static final CharPredicate WORD = + ALPHABETIC.union(ch -> ((((1 << Character.NON_SPACING_MARK) | + (1 << Character.ENCLOSING_MARK) | + (1 << Character.COMBINING_SPACING_MARK) | + (1 << Character.DECIMAL_DIGIT_NUMBER) | + (1 << Character.CONNECTOR_PUNCTUATION)) + >> Character.getType(ch)) & 1) != 0, + JOIN_CONTROL); + + ///////////////////////////////////////////////////////////////////////////// + + private static final HashMap posix = new HashMap<>(12); + private static final HashMap uprops = new HashMap<>(18); + + private static void defPosix(String name, CharPredicate p) { + posix.put(name, 
p); + } + private static void defUProp(String name, CharPredicate p) { + uprops.put(name, p); + } + + static { + defPosix("ALPHA", ALPHABETIC); + defPosix("LOWER", LOWERCASE); + defPosix("UPPER", UPPERCASE); + defPosix("SPACE", WHITE_SPACE); + defPosix("PUNCT", PUNCTUATION); + defPosix("XDIGIT",HEX_DIGIT); + defPosix("ALNUM", ALNUM); + defPosix("CNTRL", CONTROL); + defPosix("DIGIT", DIGIT); + defPosix("BLANK", BLANK); + defPosix("GRAPH", GRAPH); + defPosix("PRINT", PRINT); + + defUProp("ALPHABETIC", ALPHABETIC); + defUProp("ASSIGNED", ASSIGNED); + defUProp("CONTROL", CONTROL); + defUProp("HEXDIGIT", HEX_DIGIT); + defUProp("IDEOGRAPHIC", IDEOGRAPHIC); + defUProp("JOINCONTROL", JOIN_CONTROL); + defUProp("LETTER", LETTER); + defUProp("LOWERCASE", LOWERCASE); + defUProp("NONCHARACTERCODEPOINT", NONCHARACTER_CODE_POINT); + defUProp("TITLECASE", TITLECASE); + defUProp("PUNCTUATION", PUNCTUATION); + defUProp("UPPERCASE", UPPERCASE); + defUProp("WHITESPACE", WHITE_SPACE); + defUProp("WORD", WORD); + defUProp("WHITE_SPACE", WHITE_SPACE); + defUProp("HEX_DIGIT", HEX_DIGIT); + defUProp("NONCHARACTER_CODE_POINT", NONCHARACTER_CODE_POINT); + defUProp("JOIN_CONTROL", JOIN_CONTROL); + } + + public static CharPredicate forUnicodeProperty(String propName) { + propName = propName.toUpperCase(Locale.ROOT); + CharPredicate p = uprops.get(propName); + if (p != null) + return p; + return posix.get(propName); + } + + public static CharPredicate forPOSIXName(String propName) { + return posix.get(propName.toUpperCase(Locale.ENGLISH)); + } + + ///////////////////////////////////////////////////////////////////////////// + + /** + * Returns a predicate matching all characters belong to a named + * UnicodeScript. 
+ */ + static CharPredicate forUnicodeScript(String name) { + final Character.UnicodeScript script; + try { + script = Character.UnicodeScript.forName(name); + return ch -> script == Character.UnicodeScript.of(ch); + } catch (IllegalArgumentException iae) {} + return null; + } + + /** + * Returns a predicate matching all characters in a UnicodeBlock. + */ + static CharPredicate forUnicodeBlock(String name) { + final Character.UnicodeBlock block; + try { + block = Character.UnicodeBlock.forName(name); + return ch -> block == Character.UnicodeBlock.of(ch); + } catch (IllegalArgumentException iae) {} + return null; + } + + ///////////////////////////////////////////////////////////////////////////// + + // unicode categories, aliases, properties, java methods ... + + private static final HashMap props = new HashMap<>(128); + + /** + * Returns a predicate matching all characters in a named property. + */ + static CharPredicate forProperty(String name) { + return props.get(name); + } + + private static void defProp(String name, CharPredicate p) { + props.put(name, p); + } + + private static void defCategory(String name, final int typeMask) { + CharPredicate p = ch -> (typeMask & (1 << Character.getType(ch))) != 0; + props.put(name, p); + } + + private static void defRange(String name, final int lower, final int upper) { + BmpCharPredicate p = ch -> lower <= ch && ch <= upper; + props.put(name, p); + } + + private static void defCtype(String name, final int ctype) { + BmpCharPredicate p = ch -> ch < 128 && ASCII.isType(ch, ctype); + // PrintPattern.pmap.put(p, name); + props.put(name, p); + } + + static { + // Unicode character property aliases, defined in + // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt + defCategory("Cn", 1< true); + + // Posix regular expression character classes, defined in + // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html + defRange("ASCII", 0x00, 0x7F); // ASCII + defCtype("Alnum", ASCII.ALNUM); // 
Alphanumeric characters + defCtype("Alpha", ASCII.ALPHA); // Alphabetic characters + defCtype("Blank", ASCII.BLANK); // Space and tab characters + defCtype("Cntrl", ASCII.CNTRL); // Control characters + defRange("Digit", '0', '9'); // Numeric characters + defCtype("Graph", ASCII.GRAPH); // printable and visible + defRange("Lower", 'a', 'z'); // Lower-case alphabetic + defRange("Print", 0x20, 0x7E); // Printable characters + defCtype("Punct", ASCII.PUNCT); // Punctuation characters + defCtype("Space", ASCII.SPACE); // Space characters + defRange("Upper", 'A', 'Z'); // Upper-case alphabetic + defCtype("XDigit",ASCII.XDIGIT); // hexadecimal digits + + // Java character properties, defined by methods in Character.java + defProp("javaLowerCase", java.lang.Character::isLowerCase); + defProp("javaUpperCase", Character::isUpperCase); + defProp("javaAlphabetic", java.lang.Character::isAlphabetic); + defProp("javaIdeographic", java.lang.Character::isIdeographic); + defProp("javaTitleCase", java.lang.Character::isTitleCase); + defProp("javaDigit", java.lang.Character::isDigit); + defProp("javaDefined", java.lang.Character::isDefined); + defProp("javaLetter", java.lang.Character::isLetter); + defProp("javaLetterOrDigit", java.lang.Character::isLetterOrDigit); + defProp("javaJavaIdentifierStart", java.lang.Character::isJavaIdentifierStart); + defProp("javaJavaIdentifierPart", java.lang.Character::isJavaIdentifierPart); + defProp("javaUnicodeIdentifierStart", java.lang.Character::isUnicodeIdentifierStart); + defProp("javaUnicodeIdentifierPart", java.lang.Character::isUnicodeIdentifierPart); + defProp("javaIdentifierIgnorable", java.lang.Character::isIdentifierIgnorable); + defProp("javaSpaceChar", java.lang.Character::isSpaceChar); + defProp("javaWhitespace", java.lang.Character::isWhitespace); + defProp("javaISOControl", java.lang.Character::isISOControl); + defProp("javaMirrored", java.lang.Character::isMirrored); + } + + 
///////////////////////////////////////////////////////////////////////////// + + /** + * Posix ASCII variants, not in the lookup map + */ + static final BmpCharPredicate ASCII_DIGIT = ch -> ch < 128 && ASCII.isDigit(ch); + static final BmpCharPredicate ASCII_WORD = ch -> ch < 128 && ASCII.isWord(ch); + static final BmpCharPredicate ASCII_SPACE = ch -> ch < 128 && ASCII.isSpace(ch); + +} diff --git a/jdk/src/java.base/share/classes/java/util/regex/IntHashSet.java b/jdk/src/java.base/share/classes/java/util/regex/IntHashSet.java new file mode 100644 index 00000000000..d077d2887e8 --- /dev/null +++ b/jdk/src/java.base/share/classes/java/util/regex/IntHashSet.java @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
/**
 * A lightweight hash set of primitive {@code int} values. Values can be
 * added and looked up, and the whole set can be cleared; individual values
 * cannot be removed. Not safe for concurrent access.
 *
 * <p>Buckets are selected with {@link Math#floorMod(int, int)}, so negative
 * values are stored and found like any other value instead of producing a
 * negative array index.
 */
class IntHashSet {

    /** Marks an empty bucket and the end of a collision chain. */
    private static final int NIL = -1;

    /** Number of values the initial entry table can hold before growing. */
    private static final int INITIAL_CAPACITY = 16;

    /**
     * Entry table laid out as consecutive pairs: {@code entries[k]} is a
     * stored value and {@code entries[k + 1]} is the index of the next pair
     * in the same bucket, or {@link #NIL} at the end of the chain.
     */
    private int[] entries;

    /** Bucket heads: index into {@link #entries} of the newest pair, or NIL. */
    private int[] hashes;

    /** Index in {@link #entries} at which the next pair will be written. */
    private int pos;

    /**
     * Creates an empty set.
     */
    public IntHashSet() {
        this.entries = new int[INITIAL_CAPACITY << 1];
        this.hashes = new int[bucketCount(entries.length)];
        Arrays.fill(this.entries, NIL);
        Arrays.fill(this.hashes, NIL);
    }

    /**
     * Tells whether the given value is in this set.
     *
     * @param i the value to look up
     * @return {@code true} if {@code i} was added since the last
     *         {@link #clear()}, otherwise {@code false}
     */
    public boolean contains(int i) {
        return chainContains(hashes[bucketOf(i, hashes.length)], i);
    }

    /**
     * Adds the given value to this set; does nothing if it is already present.
     *
     * @param i the value to add
     */
    public void add(int i) {
        int bucket = bucketOf(i, hashes.length);
        int head = hashes[bucket];
        // Callers that always test contains(i) first do not strictly need
        // this check, but it keeps the set free of duplicates regardless.
        if (chainContains(head, i)) {
            return;
        }
        // Prepend the new pair to the bucket's chain.
        hashes[bucket] = pos;
        entries[pos++] = i;
        entries[pos++] = head;
        if (pos == entries.length) {
            expand();
        }
    }

    /**
     * Removes every value from this set. The backing arrays keep their
     * current size.
     */
    public void clear() {
        Arrays.fill(this.entries, NIL);
        Arrays.fill(this.hashes, NIL);
        pos = 0;
    }

    /**
     * Doubles the entry table and rebuilds all bucket chains for the new,
     * larger bucket array. Pairs keep their index in the entry table.
     */
    private void expand() {
        int[] old = entries;
        int[] grown = new int[old.length << 1];
        int[] heads = new int[bucketCount(grown.length)];
        Arrays.fill(grown, NIL);
        Arrays.fill(heads, NIL);
        for (int n = 0; n < pos; n += 2) {
            int value = old[n];
            int bucket = bucketOf(value, heads.length);
            grown[n] = value;
            grown[n + 1] = heads[bucket];   // link to the previous chain head
            heads[bucket] = n;
        }
        this.entries = grown;
        this.hashes = heads;
    }

    /** Walks the chain starting at {@code head} looking for {@code value}. */
    private boolean chainContains(int head, int value) {
        for (int h = head; h != NIL; h = entries[h + 1]) {
            if (entries[h] == value) {
                return true;
            }
        }
        return false;
    }

    /** Maps a value to a bucket index in {@code [0, buckets)}, for any sign. */
    private static int bucketOf(int value, int buckets) {
        return Math.floorMod(value, buckets);
    }

    /**
     * Bucket array size for an entry table of the given length: half the
     * number of value slots, forced odd to reduce collisions.
     */
    private static int bucketCount(int entriesLength) {
        return (entriesLength / 4) | 1;
    }
}
used by top greedy Loop node to store a specific hash set to + * keep the beginning index of the failed repetition match. The nodes + * themselves are stateless, so they rely on this field to hold state + * during a match. + */ + IntHashSet[] localsPos; + /** * Boolean indicating whether or not more input could change * the results of the last match. @@ -239,6 +247,7 @@ public final class Matcher implements MatchResult { int parentGroupCount = Math.max(parent.capturingGroupCount, 10); groups = new int[parentGroupCount * 2]; locals = new int[parent.localCount]; + localsPos = new IntHashSet[parent.localTCNCount]; // Put fields into initial states reset(); @@ -375,6 +384,7 @@ public final class Matcher implements MatchResult { groups[i] = -1; for (int i = 0; i < locals.length; i++) locals[i] = -1; + localsPos = new IntHashSet[parentPattern.localTCNCount]; modCount++; return this; } @@ -397,6 +407,10 @@ public final class Matcher implements MatchResult { groups[i] = -1; for(int i=0; i topClosureNodes; + + /** + * The number of top greedy closure nodes in this Pattern. Used by + * matchers to allocate storage needed for a IntHashSet to keep the + * beginning pos {@code i} of all failed match. + */ + transient int localTCNCount; + + /* + * Turn off the stop-exponential-backtracking optimization if there + * is a group ref in the pattern. + */ + transient boolean hasGroupRef; + /** * Temporary null terminated code point array used by pattern compiling. */ @@ -1026,7 +1053,7 @@ public final class Pattern * If the Start node might possibly match supplementary characters. 
* It is set to true during compiling if * (1) There is supplementary char in pattern, or - * (2) There is complement node of Category or Block + * (2) There is complement node of a "family" CharProperty */ private transient boolean hasSupplementary; @@ -1338,6 +1365,7 @@ public final class Pattern // Initialize counts capturingGroupCount = 1; localCount = 0; + localTCNCount = 0; // if length > 0, the Pattern is lazily compiled if (pattern.length() == 0) { @@ -1368,6 +1396,7 @@ public final class Pattern // Reset group index count capturingGroupCount = 1; localCount = 0; + localTCNCount = 0; if (pattern.length() > 0) { compile(); @@ -1378,105 +1407,114 @@ public final class Pattern } /** - * The pattern is converted to normalized form ({@linkplain - * java.text.Normalizer.Form.NFD NFD}, canonical decomposition) - * and then a pure group is constructed to match canonical - * equivalences of the characters. + * The pattern is converted to normalized form ({@link + * java.text.Normalizer.Form.NFC NFC}, canonical decomposition, + * followed by canonical composition for the character class + * part, and {@link java.text.Normalizer.Form.NFD NFD}, + * canonical decomposition) for the rest), and then a pure + * group is constructed to match canonical equivalences of the + * characters. 
*/ - private void normalize() { - int lastCodePoint = -1; - - // Convert pattern into normalized form - normalizedPattern = Normalizer.normalize(pattern, Normalizer.Form.NFD); - patternLength = normalizedPattern.length(); - - // Modify pattern to match canonical equivalences - StringBuilder newPattern = new StringBuilder(patternLength); - for(int i=0; i= patternLength) - break; - c = normalizedPattern.codePointAt(i); - sequenceBuffer.appendCodePoint(c); - } - String ea = produceEquivalentAlternation( - sequenceBuffer.toString()); - newPattern.setLength(newPattern.length()-Character.charCount(lastCodePoint)); - newPattern.append("(?:").append(ea).append(")"); - } else if (c == '[' && lastCodePoint != '\\') { - i = normalizeCharClass(newPattern, i); - } else { - newPattern.appendCodePoint(c); + private static String normalize(String pattern) { + int plen = pattern.length(); + StringBuilder pbuf = new StringBuilder(plen); + char last = 0; + int lastStart = 0; + char cc = 0; + for (int i = 0; i < plen;) { + char c = pattern.charAt(i); + if (cc == 0 && // top level + c == '\\' && i + 1 < plen && pattern.charAt(i + 1) == '\\') { + i += 2; last = 0; + continue; } - lastCodePoint = c; - i += Character.charCount(c); + if (c == '[' && last != '\\') { + if (cc == 0) { + if (lastStart < i) + normalizeSlice(pattern, lastStart, i, pbuf); + lastStart = i; + } + cc++; + } else if (c == ']' && last != '\\') { + cc--; + if (cc == 0) { + normalizeClazz(pattern, lastStart, i + 1, pbuf); + lastStart = i + 1; + } + } + last = c; + i++; } - normalizedPattern = newPattern.toString(); + assert (cc == 0); + if (lastStart < plen) + normalizeSlice(pattern, lastStart, plen, pbuf); + return pbuf.toString(); } - /** - * Complete the character class being parsed and add a set - * of alternations to it that will match the canonical equivalences - * of the characters within the class. 
- */ - private int normalizeCharClass(StringBuilder newPattern, int i) { - StringBuilder charClass = new StringBuilder(); - StringBuilder eq = null; - int lastCodePoint = -1; - String result; - - i++; - charClass.append("["); - while(true) { - int c = normalizedPattern.codePointAt(i); - StringBuilder sequenceBuffer; - - if (c == ']' && lastCodePoint != '\\') { - charClass.append((char)c); - break; - } else if (Character.getType(c) == Character.NON_SPACING_MARK) { - sequenceBuffer = new StringBuilder(); - sequenceBuffer.appendCodePoint(lastCodePoint); - while(Character.getType(c) == Character.NON_SPACING_MARK) { - sequenceBuffer.appendCodePoint(c); - i += Character.charCount(c); - if (i >= normalizedPattern.length()) - break; - c = normalizedPattern.codePointAt(i); - } - String ea = produceEquivalentAlternation( - sequenceBuffer.toString()); - - charClass.setLength(charClass.length()-Character.charCount(lastCodePoint)); - if (eq == null) - eq = new StringBuilder(); - eq.append('|'); - eq.append(ea); - } else { - charClass.appendCodePoint(c); - i++; + private static void normalizeSlice(String src, int off, int limit, + StringBuilder dst) + { + int len = src.length(); + int off0 = off; + while (off < limit && ASCII.isAscii(src.charAt(off))) { + off++; + } + if (off == limit) { + dst.append(src, off0, limit); + return; + } + off--; + if (off < off0) + off = off0; + else + dst.append(src, off0, off); + while (off < limit) { + int ch0 = src.codePointAt(off); + if (".$|()[]{}^?*+\\".indexOf(ch0) != -1) { + dst.append((char)ch0); + off++; + continue; } - if (i == normalizedPattern.length()) - throw error("Unclosed character class"); - lastCodePoint = c; + int j = off + Character.charCount(ch0); + int ch1; + while (j < limit) { + ch1 = src.codePointAt(j); + if (Grapheme.isBoundary(ch0, ch1)) + break; + ch0 = ch1; + j += Character.charCount(ch1); + } + String seq = src.substring(off, j); + String nfd = Normalizer.normalize(seq, Normalizer.Form.NFD); + off = j; + if 
(nfd.length() > 1) { + ch0 = nfd.codePointAt(0); + ch1 = nfd.codePointAt(Character.charCount(ch0)); + if (Character.getType(ch1) == Character.NON_SPACING_MARK) { + Set altns = new LinkedHashSet<>(); + altns.add(seq); + produceEquivalentAlternation(nfd, altns); + dst.append("(?:"); + altns.forEach( s -> dst.append(s + "|")); + dst.delete(dst.length() - 1, dst.length()); + dst.append(")"); + continue; + } + } + String nfc = Normalizer.normalize(seq, Normalizer.Form.NFC); + if (!seq.equals(nfc) && !nfd.equals(nfc)) + dst.append("(?:" + seq + "|" + nfd + "|" + nfc + ")"); + else if (!seq.equals(nfd)) + dst.append("(?:" + seq + "|" + nfd + ")"); + else + dst.append(seq); } + } - if (eq != null) { - result = "(?:"+charClass.toString()+eq.toString()+")"; - } else { - result = charClass.toString(); - } - - newPattern.append(result); - return i; + private static void normalizeClazz(String src, int off, int limit, + StringBuilder dst) + { + dst.append(Normalizer.normalize(src.substring(off, limit), Form.NFC)); } /** @@ -1484,28 +1522,26 @@ public final class Pattern * combining marks that follow it, produce the alternation that will * match all canonical equivalences of that sequence. */ - private String produceEquivalentAlternation(String source) { - int len = countChars(source, 0, 1); - if (source.length() == len) - // source has one character. - return source; - - String base = source.substring(0,len); - String combiningMarks = source.substring(len); - - String[] perms = producePermutations(combiningMarks); - StringBuilder result = new StringBuilder(source); - - // Add combined permutations - for(int x=0; x0) - result.append("|"+next); - next = composeOneStep(next); - if (next != null) - result.append("|"+produceEquivalentAlternation(next)); + private static void produceEquivalentAlternation(String src, + Set dst) + { + int len = countChars(src, 0, 1); + if (src.length() == len) { + dst.add(src); // source has one character. 
+ return; + } + String base = src.substring(0,len); + String combiningMarks = src.substring(len); + String[] perms = producePermutations(combiningMarks); + // Add combined permutations + for(int x = 0; x < perms.length; x++) { + String next = base + perms[x]; + dst.add(next); + next = composeOneStep(next); + if (next != null) { + produceEquivalentAlternation(next, dst); + } } - return result.toString(); } /** @@ -1517,7 +1553,7 @@ public final class Pattern * possibilities must be removed because they are not canonically * equivalent. */ - private String[] producePermutations(String input) { + private static String[] producePermutations(String input) { if (input.length() == countChars(input, 0, 1)) return new String[] {input}; @@ -1575,7 +1611,7 @@ loop: for(int x=0, offset=0; x(10); if (has(LITERAL)) { // Literal pattern handling @@ -1737,12 +1773,26 @@ loop: for(int x=0, offset=0; x namedGroups() { @@ -1753,44 +1803,6 @@ loop: for(int x=0, offset=0; xnext:"); - if (node == Pattern.accept) { - System.out.println("Accept Node"); - node = null; - } - } - } - /** * Used to accumulate information about a subtree of the object graph * so that optimizations can be applied to the subtree. 
@@ -2083,7 +2095,10 @@ loop: for(int x=0, offset=0; x does not exit"); if (create) { + hasGroupRef = true; if (has(CASE_INSENSITIVE)) root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE)); else @@ -2473,9 +2529,12 @@ loop: for(int x=0, offset=0; x ch < 256 && bits.bits[ch]; + + boolean isNeg = false; + boolean hasBits = false; int ch = next(); + + // Negates if first char in a class, otherwise literal + if (ch == '^' && temp[cursor-1] == '[') { + ch = next(); + isNeg = true; + } for (;;) { switch (ch) { - case '^': - // Negates if first char in a class, otherwise literal - if (firstInClass) { - if (temp[cursor-1] != '[') - break; - ch = next(); - include = !include; - continue; - } else { - // ^ not first in class, treat as literal - break; - } case '[': - firstInClass = false; - node = clazz(true); + curr = clazz(true); if (prev == null) - prev = node; + prev = curr; else - prev = union(prev, node); + prev = prev.union(curr); ch = peek(); continue; case '&': - firstInClass = false; ch = next(); if (ch == '&') { ch = next(); - CharProperty rightNode = null; + CharPredicate right = null; while (ch != ']' && ch != '&') { if (ch == '[') { - if (rightNode == null) - rightNode = clazz(true); + if (right == null) + right = clazz(true); else - rightNode = union(rightNode, clazz(true)); + right = right.union(clazz(true)); } else { // abc&&def unread(); - rightNode = clazz(false); + right = clazz(false); } ch = peek(); } - if (rightNode != null) - node = rightNode; + if (hasBits) { + // bits used, union has high precedence + if (prev == null) { + prev = curr = bitsP; + } else { + prev = prev.union(bitsP); + } + hasBits = false; + } + if (right != null) + curr = right; if (prev == null) { - if (rightNode == null) + if (right == null) throw error("Bad class syntax"); else - prev = rightNode; + prev = right; } else { - prev = intersection(prev, node); + prev = prev.and(curr); } } else { // treat as a literal & @@ -2585,43 +2654,39 @@ loop: for(int x=0, offset=0; x= 
patternLength) throw error("Unclosed character class"); break; case ']': - firstInClass = false; - if (prev != null) { + if (prev != null || hasBits) { if (consume) next(); + if (prev == null) + prev = bitsP; + else if (hasBits) + prev = prev.union(bitsP); + if (isNeg) + return prev.negate(); return prev; } break; default: - firstInClass = false; break; } - node = range(bits); - if (include) { - if (prev == null) { - prev = node; - } else { - if (prev != node) - prev = union(prev, node); - } + curr = range(bits); + if (curr == null) { // the bits used + hasBits = true; } else { - if (prev == null) { - prev = node.complement(); - } else { - if (prev != node) - prev = setDifference(prev, node); - } + if (prev == null) + prev = curr; + else if (prev != curr) + prev = prev.union(curr); } ch = peek(); } } - private CharProperty bitsOrSingle(BitClass bits, int ch) { + private CharPredicate bitsOrSingle(BitClass bits, int ch) { /* Bits can only handle codepoints in [u+0000-u+00ff] range. Use "single" node instead of bits when dealing with unicode case folding for codepoints listed below. 
@@ -2643,19 +2708,46 @@ loop: for(int x=0, offset=0; x, " - + "value=<" + value + ">}"); + break; } + if (p == null) + throw error("Unknown Unicode property {name=<" + name + ">, " + + "value=<" + value + ">}"); + } else { if (name.startsWith("In")) { - // \p{inBlockName} - node = unicodeBlockPropertyFor(name.substring(2)); + // \p{InBlockName} + p = CharPredicates.forUnicodeBlock(name.substring(2)); } else if (name.startsWith("Is")) { - // \p{isGeneralCategory} and \p{isScriptName} + // \p{IsGeneralCategory} and \p{IsScriptName} name = name.substring(2); - UnicodeProp uprop = UnicodeProp.forName(name); - if (uprop != null) - node = new Utype(uprop); - if (node == null) - node = CharPropertyNames.charPropertyFor(name); - if (node == null) - node = unicodeScriptPropertyFor(name); + p = CharPredicates.forUnicodeProperty(name); + if (p == null) + p = CharPredicates.forProperty(name); + if (p == null) + p = CharPredicates.forUnicodeScript(name); } else { if (has(UNICODE_CHARACTER_CLASS)) { - UnicodeProp uprop = UnicodeProp.forPOSIXName(name); - if (uprop != null) - node = new Utype(uprop); + p = CharPredicates.forPOSIXName(name); } - if (node == null) - node = charPropertyNodeFor(name); + if (p == null) + p = CharPredicates.forProperty(name); } + if (p == null) + throw error("Unknown character property name {In/Is" + name + "}"); } - if (maybeComplement) { - if (node instanceof Category || node instanceof Block) - hasSupplementary = true; - node = node.complement(); + if (isComplement) { + // it might be too expensive to detect if a complement of + // CharProperty can match "certain" supplementary. So just + // go with StartS. + hasSupplementary = true; + p = p.negate(); } - return node; - } - - - /** - * Returns a CharProperty matching all characters belong to - * a UnicodeScript. 
- */ - private CharProperty unicodeScriptPropertyFor(String name) { - final Character.UnicodeScript script; - try { - script = Character.UnicodeScript.forName(name); - } catch (IllegalArgumentException iae) { - throw error("Unknown character script name {" + name + "}"); - } - return new Script(script); - } - - /** - * Returns a CharProperty matching all characters in a UnicodeBlock. - */ - private CharProperty unicodeBlockPropertyFor(String name) { - final Character.UnicodeBlock block; - try { - block = Character.UnicodeBlock.forName(name); - } catch (IllegalArgumentException iae) { - throw error("Unknown character block name {" + name + "}"); - } - return new Block(block); - } - - /** - * Returns a CharProperty matching all characters in a named property. - */ - private CharProperty charPropertyNodeFor(String name) { - CharProperty p = CharPropertyNames.charPropertyFor(name); - if (p == null) - throw error("Unknown character property name {" + name + "}"); return p; } + private CharProperty newCharProperty(CharPredicate p) { + if (p == null) + return null; + if (p instanceof BmpCharPredicate) + return new BmpCharProperty((BmpCharPredicate)p); + else + return new CharProperty(p); + } + /** * Parses and returns the name of a "named capturing group", the trailing * ">" is consumed after parsing. 
@@ -2859,6 +2926,7 @@ loop: for(int x=0, offset=0; x ch < 256 && bits[ch]); + this.bits = bits; + } BitClass add(int c, int flags) { assert c >= 0 && c <= 255; if ((flags & CASE_INSENSITIVE) != 0) { @@ -3394,32 +3480,6 @@ loop: for(int x=0, offset=0; x= 0x0A && cp <= 0x0D) || - cp == 0x85 || cp == 0x2028 || cp == 0x2029; - } - } - - /** - * Node class that matches a Perl horizontal whitespace - */ - static final class HorizWS extends BmpCharProperty { - boolean isSatisfiedBy(int cp) { - return cp == 0x09 || cp == 0x20 || cp == 0xa0 || - cp == 0x1680 || cp == 0x180e || - cp >= 0x2000 && cp <= 0x200a || - cp == 0x202f || cp == 0x205f || cp == 0x3000; + boolean study(TreeInfo info) { + info.minLength++; + info.deterministic = false; + return next.study(info); } } @@ -4217,81 +4197,13 @@ loop: for(int x=0, offset=0; x= to) { + matcher.hitEnd = true; + } + while (n >= cmin) { + if (next.match(matcher, i, seq)) + return true; + if (n == cmin) + return false; + // backing off if match fails + int ch = Character.codePointBefore(seq, i); + i -= Character.charCount(ch); + n--; + } + return false; + } + + boolean study(TreeInfo info) { + info.minLength += cmin; + if (info.maxValid) { + info.maxLength += MAX_REPS; + } + info.deterministic = false; + return next.study(info); + } + } + + static final class BmpCharPropertyGreedy extends CharPropertyGreedy { + + BmpCharPropertyGreedy(BmpCharProperty bcp, int cmin) { + super(bcp, cmin); + } + + boolean match(Matcher matcher, int i, CharSequence seq) { + int n = 0; + int to = matcher.to; + while (i < to && predicate.is(seq.charAt(i))) { + i++; n++; + } + if (i >= to) { + matcher.hitEnd = true; + } + while (n >= cmin) { + if (next.match(matcher, i, seq)) + return true; + i--; n--; // backing off if match fails + } + return false; + } + } + /** * Handles the curly-brace style repetition with a specified minimum and * maximum occurrences. The * quantifier is handled as a special case. 
@@ -4331,11 +4316,11 @@ loop: for(int x=0, offset=0; x is(ch) && p.is(ch); + } + default CharPredicate union(CharPredicate p) { + return ch -> is(ch) || p.is(ch); + } + default CharPredicate union(CharPredicate p1, + CharPredicate p2 ) { + return ch -> is(ch) || p1.is(ch) || p2.is(ch); + } + default CharPredicate negate() { + return ch -> !is(ch); + } + } + + static interface BmpCharPredicate extends CharPredicate { + + default CharPredicate and(CharPredicate p) { + if(p instanceof BmpCharPredicate) + return (BmpCharPredicate)(ch -> is(ch) && p.is(ch)); + return ch -> is(ch) && p.is(ch); + } + default CharPredicate union(CharPredicate p) { + if (p instanceof BmpCharPredicate) + return (BmpCharPredicate)(ch -> is(ch) || p.is(ch)); + return ch -> is(ch) || p.is(ch); + } + static CharPredicate union(CharPredicate... predicates) { + CharPredicate cp = ch -> { + for (CharPredicate p : predicates) { + if (!p.is(ch)) + return false; + } + return true; + }; + for (CharPredicate p : predicates) { + if (! (p instanceof BmpCharPredicate)) + return cp; + } + return (BmpCharPredicate)cp; + } + } + + /** + * matches a Perl vertical whitespace + */ + static BmpCharPredicate VertWS = cp -> + (cp >= 0x0A && cp <= 0x0D) || cp == 0x85 || cp == 0x2028 || cp == 0x2029; + + /** + * matches a Perl horizontal whitespace + */ + static BmpCharPredicate HorizWS = cp -> + cp == 0x09 || cp == 0x20 || cp == 0xa0 || cp == 0x1680 || + cp == 0x180e || cp >= 0x2000 && cp <= 0x200a || cp == 0x202f || + cp == 0x205f || cp == 0x3000; + + /** + * for the Unicode category ALL and the dot metacharacter when + * in dotall mode. + */ + static CharPredicate ALL = ch -> true; + + /** + * for the dot metacharacter when dotall is not enabled. + */ + static CharPredicate DOT = ch -> (ch != '\n' && ch != '\r' + && (ch|1) != '\u2029' + && ch != '\u0085'); + /** + * the dot metacharacter when dotall is not enabled but UNIX_LINES is enabled. 
+ */ + static CharPredicate UNIXDOT = ch -> ch != '\n'; + + /** + * Indicate that matches a Supplementary Unicode character + */ + static CharPredicate SingleS(int c) { + return ch -> ch == c; + } + + /** + * A bmp/optimized predicate of single + */ + static BmpCharPredicate Single(int c) { + return ch -> ch == c; + } + + /** + * Case insensitive matches a given BMP character + */ + static BmpCharPredicate SingleI(int lower, int upper) { + return ch -> ch == lower || ch == upper; + } + + /** + * Unicode case insensitive matches a given Unicode character + */ + static CharPredicate SingleU(int lower) { + return ch -> lower == ch || + lower == Character.toLowerCase(Character.toUpperCase(ch)); + } + + private static boolean inRange(int lower, int ch, int upper) { + return lower <= ch && ch <= upper; + } + + /** + * Charactrs within a explicit value range + */ + static CharPredicate Range(int lower, int upper) { + if (upper < Character.MIN_HIGH_SURROGATE || + lower > Character.MAX_HIGH_SURROGATE && + upper < Character.MIN_SUPPLEMENTARY_CODE_POINT) + return (BmpCharPredicate)(ch -> inRange(lower, ch, upper)); + return ch -> inRange(lower, ch, upper); + } + + /** + * Charactrs within a explicit value range in a case insensitive manner. + */ + static CharPredicate CIRange(int lower, int upper) { + return ch -> inRange(lower, ch, upper) || + ASCII.isAscii(ch) && + (inRange(lower, ASCII.toUpper(ch), upper) || + inRange(lower, ASCII.toLower(ch), upper)); + } + + static CharPredicate CIRangeU(int lower, int upper) { + return ch -> { + if (inRange(lower, ch, upper)) + return true; + int up = Character.toUpperCase(ch); + return inRange(lower, up, upper) || + inRange(lower, Character.toLowerCase(up), upper); + }; + } /** * This must be the very first initializer. 
*/ - static Node accept = new Node(); + static final Node accept = new Node(); - static Node lastAccept = new LastNode(); - - private static class CharPropertyNames { - - static CharProperty charPropertyFor(String name) { - CharPropertyFactory m = map.get(name); - return m == null ? null : m.make(); - } - - private abstract static class CharPropertyFactory { - abstract CharProperty make(); - } - - private static void defCategory(String name, - final int typeMask) { - map.put(name, new CharPropertyFactory() { - CharProperty make() { return new Category(typeMask);}}); - } - - private static void defRange(String name, - final int lower, final int upper) { - map.put(name, new CharPropertyFactory() { - CharProperty make() { return rangeFor(lower, upper);}}); - } - - private static void defCtype(String name, - final int ctype) { - map.put(name, new CharPropertyFactory() { - CharProperty make() { return new Ctype(ctype);}}); - } - - private abstract static class CloneableProperty - extends CharProperty implements Cloneable - { - public CloneableProperty clone() { - try { - return (CloneableProperty) super.clone(); - } catch (CloneNotSupportedException e) { - throw new AssertionError(e); - } - } - } - - private static void defClone(String name, - final CloneableProperty p) { - map.put(name, new CharPropertyFactory() { - CharProperty make() { return p.clone();}}); - } - - private static final HashMap map - = new HashMap<>(); - - static { - // Unicode character property aliases, defined in - // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt - defCategory("Cn", 1< ids = new HashMap<>(); + + private static void print(Pattern.Node node, String text, int depth) { + if (!ids.containsKey(node)) + ids.put(node, ids.size()); + print("%6d:%" + (depth==0? 
"": depth<<1) + "s<%s>", ids.get(node), "", text); + if (ids.containsKey(node.next)) + print(" (=>%d)", ids.get(node.next)); + print("%n"); + } + + private static void print(String s, int depth) { + print(" %" + (depth==0?"":depth<<1) + "s<%s>%n", "", s); + } + + private static void print(String fmt, Object ... args) { + System.err.printf(fmt, args); + } + + private static String toStringCPS(int[] cps) { + StringBuilder sb = new StringBuilder(cps.length); + for (int cp : cps) + sb.append(toStringCP(cp)); + return sb.toString(); + } + + private static String toStringCP(int cp) { + return (isPrint(cp) ? "" + (char)cp + : "\\u" + Integer.toString(cp, 16)); + } + + private static String toStringRange(int min, int max) { + if (max == Pattern.MAX_REPS) { + if (min == 0) + return " * "; + else if (min == 1) + return " + "; + return "{" + min + ", max}"; + } + return "{" + min + ", " + max + "}"; + } + + private static String toStringCtype(int type) { + switch(type) { + case UPPER: return "ASCII.UPPER"; + case LOWER: return "ASCII.LOWER"; + case DIGIT: return "ASCII.DIGIT"; + case SPACE: return "ASCII.SPACE"; + case PUNCT: return "ASCII.PUNCT"; + case CNTRL: return "ASCII.CNTRL"; + case BLANK: return "ASCII.BLANK"; + case UNDER: return "ASCII.UNDER"; + case ASCII: return "ASCII.ASCII"; + case ALPHA: return "ASCII.ALPHA"; + case ALNUM: return "ASCII.ALNUM"; + case GRAPH: return "ASCII.GRAPH"; + case WORD: return "ASCII.WORD"; + case XDIGIT: return "ASCII.XDIGIT"; + default: return "ASCII ?"; + } + } + + private static String toString(Pattern.Node node) { + String name = node.getClass().getName(); + return name.substring(name.lastIndexOf('$') + 1); + } + + static HashMap pmap; + static { + pmap = new HashMap<>(); + pmap.put(Pattern.ALL, "All"); + pmap.put(Pattern.DOT, "Dot"); + pmap.put(Pattern.UNIXDOT, "UnixDot"); + pmap.put(Pattern.VertWS, "VertWS"); + pmap.put(Pattern.HorizWS, "HorizWS"); + + pmap.put(CharPredicates.ASCII_DIGIT, "ASCII.DIGIT"); + 
pmap.put(CharPredicates.ASCII_WORD, "ASCII.WORD"); + pmap.put(CharPredicates.ASCII_SPACE, "ASCII.SPACE"); + } + + static void walk(Pattern.Node node, int depth) { + depth++; + while(node != null) { + String name = toString(node); + String str; + if (node instanceof Pattern.Prolog) { + print(node, name, depth); + // print the loop here + Pattern.Loop loop = ((Pattern.Prolog)node).loop; + name = toString(loop); + str = name + " " + toStringRange(loop.cmin, loop.cmax); + print(loop, str, depth); + walk(loop.body, depth); + print("/" + name, depth); + node = loop; + } else if (node instanceof Pattern.Loop) { + return; // stop here, body.next -> loop + } else if (node instanceof Pattern.Curly) { + Pattern.Curly c = (Pattern.Curly)node; + str = "Curly " + c.type + " " + toStringRange(c.cmin, c.cmax); + print(node, str, depth); + walk(c.atom, depth); + print("/Curly", depth); + } else if (node instanceof Pattern.GroupCurly) { + Pattern.GroupCurly gc = (Pattern.GroupCurly)node; + str = "GroupCurly " + gc.groupIndex / 2 + + ", " + gc.type + " " + toStringRange(gc.cmin, gc.cmax); + print(node, str, depth); + walk(gc.atom, depth); + print("/GroupCurly", depth); + } else if (node instanceof Pattern.GroupHead) { + Pattern.GroupHead head = (Pattern.GroupHead)node; + Pattern.GroupTail tail = head.tail; + print(head, "Group.head " + (tail.groupIndex / 2), depth); + walk(head.next, depth); + print(tail, "/Group.tail " + (tail.groupIndex / 2), depth); + node = tail; + } else if (node instanceof Pattern.GroupTail) { + return; // stopper + } else if (node instanceof Pattern.Ques) { + print(node, "Ques " + ((Pattern.Ques)node).type, depth); + walk(((Pattern.Ques)node).atom, depth); + print("/Ques", depth); + } else if (node instanceof Pattern.Branch) { + Pattern.Branch b = (Pattern.Branch)node; + print(b, name, depth); + int i = 0; + while (true) { + if (b.atoms[i] != null) { + walk(b.atoms[i], depth); + } else { + print(" (accepted)", depth); + } + if (++i == b.size) + break; + 
print("-branch.separator-", depth); + } + node = b.conn; + print(node, "/Branch", depth); + } else if (node instanceof Pattern.BranchConn) { + return; + } else if (node instanceof Pattern.CharProperty) { + str = pmap.get(((Pattern.CharProperty)node).predicate); + if (str == null) + str = toString(node); + else + str = "Single \"" + str + "\""; + print(node, str, depth); + } else if (node instanceof Pattern.SliceNode) { + str = name + " \"" + + toStringCPS(((Pattern.SliceNode)node).buffer) + "\""; + print(node, str, depth); + } else if (node instanceof Pattern.CharPropertyGreedy) { + Pattern.CharPropertyGreedy gcp = (Pattern.CharPropertyGreedy)node; + String pstr = pmap.get(gcp.predicate); + if (pstr == null) + pstr = gcp.predicate.toString(); + else + pstr = "Single \"" + pstr + "\""; + str = name + " " + pstr + ((gcp.cmin == 0) ? "*" : "+"); + print(node, str, depth); + } else if (node instanceof Pattern.BackRef) { + str = "GroupBackRef " + ((Pattern.BackRef)node).groupIndex / 2; + print(node, str, depth); + } else if (node instanceof Pattern.LastNode) { + print(node, "END", depth); + } else if (node == Pattern.accept) { + return; + } else { + print(node, name, depth); + } + node = node.next; + } + } + + public static void main(String[] args) { + Pattern p = Pattern.compile(args[0]); + System.out.println(" Pattern: " + p); + walk(p.root, 0); + } +} diff --git a/jdk/src/java.base/share/classes/java/util/regex/UnicodeProp.java b/jdk/src/java.base/share/classes/java/util/regex/UnicodeProp.java deleted file mode 100644 index dccbd1c718e..00000000000 --- a/jdk/src/java.base/share/classes/java/util/regex/UnicodeProp.java +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Copyright (c) 2011, 2013, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. 
- */ - -package java.util.regex; - -import java.util.HashMap; -import java.util.Locale; - -enum UnicodeProp { - - ALPHABETIC { - public boolean is(int ch) { - return Character.isAlphabetic(ch); - } - }, - - LETTER { - public boolean is(int ch) { - return Character.isLetter(ch); - } - }, - - IDEOGRAPHIC { - public boolean is(int ch) { - return Character.isIdeographic(ch); - } - }, - - LOWERCASE { - public boolean is(int ch) { - return Character.isLowerCase(ch); - } - }, - - UPPERCASE { - public boolean is(int ch) { - return Character.isUpperCase(ch); - } - }, - - TITLECASE { - public boolean is(int ch) { - return Character.isTitleCase(ch); - } - }, - - WHITE_SPACE { - // \p{Whitespace} - public boolean is(int ch) { - return ((((1 << Character.SPACE_SEPARATOR) | - (1 << Character.LINE_SEPARATOR) | - (1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1) - != 0 || (ch >= 0x9 && ch <= 0xd) || (ch == 0x85); - } - }, - - CONTROL { - // \p{gc=Control} - public boolean is(int ch) { - return Character.getType(ch) == Character.CONTROL; - } - }, - - PUNCTUATION { - // \p{gc=Punctuation} - public boolean is(int ch) { - return ((((1 << Character.CONNECTOR_PUNCTUATION) | - (1 << Character.DASH_PUNCTUATION) | - (1 << Character.START_PUNCTUATION) | - (1 << Character.END_PUNCTUATION) | - (1 << Character.OTHER_PUNCTUATION) | - (1 << Character.INITIAL_QUOTE_PUNCTUATION) | - (1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1) - != 0; - } - }, - - HEX_DIGIT { - // \p{gc=Decimal_Number} - // \p{Hex_Digit} -> PropList.txt: Hex_Digit - public boolean is(int ch) { - return DIGIT.is(ch) || - (ch >= 0x0030 && ch <= 0x0039) || - (ch >= 0x0041 && ch <= 0x0046) || - (ch >= 0x0061 && ch <= 0x0066) || - (ch >= 0xFF10 && ch <= 0xFF19) || - (ch >= 0xFF21 && ch <= 0xFF26) || - (ch >= 0xFF41 && ch <= 0xFF46); - } - }, - - ASSIGNED { - public boolean is(int ch) { - return Character.getType(ch) != Character.UNASSIGNED; - } - }, - - NONCHARACTER_CODE_POINT { - // 
PropList.txt:Noncharacter_Code_Point - public boolean is(int ch) { - return (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef); - } - }, - - DIGIT { - // \p{gc=Decimal_Number} - public boolean is(int ch) { - return Character.isDigit(ch); - } - }, - - ALNUM { - // \p{alpha} - // \p{digit} - public boolean is(int ch) { - return ALPHABETIC.is(ch) || DIGIT.is(ch); - } - }, - - BLANK { - // \p{Whitespace} -- - // [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL} -> 0xa, 0xb, 0xc, 0xd, 0x85 - // \p{gc=Line_Separator} - // \p{gc=Paragraph_Separator}] - public boolean is(int ch) { - return Character.getType(ch) == Character.SPACE_SEPARATOR || - ch == 0x9; // \N{HT} - } - }, - - GRAPH { - // [^ - // \p{space} - // \p{gc=Control} - // \p{gc=Surrogate} - // \p{gc=Unassigned}] - public boolean is(int ch) { - return ((((1 << Character.SPACE_SEPARATOR) | - (1 << Character.LINE_SEPARATOR) | - (1 << Character.PARAGRAPH_SEPARATOR) | - (1 << Character.CONTROL) | - (1 << Character.SURROGATE) | - (1 << Character.UNASSIGNED)) >> Character.getType(ch)) & 1) - == 0; - } - }, - - PRINT { - // \p{graph} - // \p{blank} - // -- \p{cntrl} - public boolean is(int ch) { - return (GRAPH.is(ch) || BLANK.is(ch)) && !CONTROL.is(ch); - } - }, - - WORD { - // \p{alpha} - // \p{gc=Mark} - // \p{digit} - // \p{gc=Connector_Punctuation} - // \p{Join_Control} 200C..200D - - public boolean is(int ch) { - return ALPHABETIC.is(ch) || - ((((1 << Character.NON_SPACING_MARK) | - (1 << Character.ENCLOSING_MARK) | - (1 << Character.COMBINING_SPACING_MARK) | - (1 << Character.DECIMAL_DIGIT_NUMBER) | - (1 << Character.CONNECTOR_PUNCTUATION)) >> Character.getType(ch)) & 1) - != 0 || - JOIN_CONTROL.is(ch); - } - }, - - JOIN_CONTROL { - // 200C..200D PropList.txt:Join_Control - public boolean is(int ch) { - return (ch == 0x200C || ch == 0x200D); - } - }; - - private static final HashMap posix = new HashMap<>(); - private static final HashMap aliases = new HashMap<>(); - static { - posix.put("ALPHA", "ALPHABETIC"); - 
posix.put("LOWER", "LOWERCASE"); - posix.put("UPPER", "UPPERCASE"); - posix.put("SPACE", "WHITE_SPACE"); - posix.put("PUNCT", "PUNCTUATION"); - posix.put("XDIGIT","HEX_DIGIT"); - posix.put("ALNUM", "ALNUM"); - posix.put("CNTRL", "CONTROL"); - posix.put("DIGIT", "DIGIT"); - posix.put("BLANK", "BLANK"); - posix.put("GRAPH", "GRAPH"); - posix.put("PRINT", "PRINT"); - - aliases.put("WHITESPACE", "WHITE_SPACE"); - aliases.put("HEXDIGIT","HEX_DIGIT"); - aliases.put("NONCHARACTERCODEPOINT", "NONCHARACTER_CODE_POINT"); - aliases.put("JOINCONTROL", "JOIN_CONTROL"); - } - - public static UnicodeProp forName(String propName) { - propName = propName.toUpperCase(Locale.ENGLISH); - String alias = aliases.get(propName); - if (alias != null) - propName = alias; - try { - return valueOf (propName); - } catch (IllegalArgumentException x) {} - return null; - } - - public static UnicodeProp forPOSIXName(String propName) { - propName = posix.get(propName.toUpperCase(Locale.ENGLISH)); - if (propName == null) - return null; - return valueOf (propName); - } - - public abstract boolean is(int ch); -} diff --git a/jdk/test/java/util/regex/RegExTest.java b/jdk/test/java/util/regex/RegExTest.java index 2f5fca2b0c3..815cb99104e 100644 --- a/jdk/test/java/util/regex/RegExTest.java +++ b/jdk/test/java/util/regex/RegExTest.java @@ -33,6 +33,9 @@ * 6350801 6676425 6878475 6919132 6931676 6948903 6990617 7014645 7039066 * 7067045 7014640 7189363 8007395 8013252 8013254 8012646 8023647 6559590 * 8027645 8035076 8039124 8035975 8074678 6854417 8143854 8147531 7071819 + * 8151481 4867170 7080302 6728861 6995635 6736245 4916384 + * 6328855 6192895 6345469 6988218 6693451 7006761 8140212 + * * @library /lib/testlibrary * @build jdk.testlibrary.* * @run main RegExTest @@ -162,6 +165,7 @@ public class RegExTest { patternAsPredicate(); invalidFlags(); grapheme(); + expoBacktracking(); if (failure) { throw new @@ -2659,51 +2663,101 @@ public class RegExTest { check(p, "test\u00e4\u0323\u0300", true); 
check(p, "test\u00e4\u0300\u0323", true); - /* - * The following canonical equivalence tests don't work. Bug id: 4916384. - * - // Decomposed hangul (jamos) - p = Pattern.compile("\u1100\u1161", Pattern.CANON_EQ); - m = p.matcher("\u1100\u1161"); - if (!m.matches()) - failCount++; + Object[][] data = new Object[][] { - m.reset("\uac00"); - if (!m.matches()) - failCount++; + // JDK-4867170 + { "[\u1f80-\u1f82]", "ab\u1f80cd", "f", true }, + { "[\u1f80-\u1f82]", "ab\u1f81cd", "f", true }, + { "[\u1f80-\u1f82]", "ab\u1f82cd", "f", true }, + { "[\u1f80-\u1f82]", "ab\u03b1\u0314\u0345cd", "f", true }, + { "[\u1f80-\u1f82]", "ab\u03b1\u0345\u0314cd", "f", true }, + { "[\u1f80-\u1f82]", "ab\u1f01\u0345cd", "f", true }, + { "[\u1f80-\u1f82]", "ab\u1f00\u0345cd", "f", true }, + + { "\\p{IsGreek}", "ab\u1f80cd", "f", true }, + { "\\p{IsGreek}", "ab\u1f81cd", "f", true }, + { "\\p{IsGreek}", "ab\u1f82cd", "f", true }, + { "\\p{IsGreek}", "ab\u03b1\u0314\u0345cd", "f", true }, + { "\\p{IsGreek}", "ab\u1f01\u0345cd", "f", true }, + + // backtracking, force to match "\u1f80", instead of \u1f82" + { "ab\\p{IsGreek}\u0300cd", "ab\u03b1\u0313\u0345\u0300cd", "m", true }, + + { "[\\p{IsGreek}]", "\u03b1\u0314\u0345", "m", true }, + { "\\p{IsGreek}", "\u03b1\u0314\u0345", "m", true }, + + { "[^\u1f80-\u1f82]","\u1f81", "m", false }, + { "[^\u1f80-\u1f82]","\u03b1\u0314\u0345", "m", false }, + { "[^\u1f01\u0345]", "\u1f81", "f", false }, + + { "[^\u1f81]+", "\u1f80\u1f82", "f", true }, + { "[\u1f80]", "ab\u1f80cd", "f", true }, + { "\u1f80", "ab\u1f80cd", "f", true }, + { "\u1f00\u0345\u0300", "\u1f82", "m", true }, + { "\u1f80", "-\u1f00\u0345\u0300-", "f", true }, + { "\u1f82", "\u1f00\u0345\u0300", "m", true }, + { "\u1f82", "\u1f80\u0300", "m", true }, + + // JDK-7080302 # compile failed + { "a(\u0041\u0301\u0328)", "a\u0041\u0301\u0328", "m", true}, + + // JDK-6728861, same cause as above one + { "\u00e9\u00e9n", "e\u0301e\u0301n", "m", true}, + + // JDK-6995635 + { "(\u00e9)", 
"e\u0301", "m", true }, + + // JDK-6736245 + // intereting special case, nfc(u2add+u0338) -> u2add+u0338) NOT u2adc + { "\u2ADC", "\u2ADC", "m", true}, // NFC + { "\u2ADC", "\u2ADD\u0338", "m", true}, // NFD + + // 4916384. + // Decomposed hangul (jamos) works inside clazz + { "[\u1100\u1161]", "\u1100\u1161", "m", true}, + { "[\u1100\u1161]", "\uac00", "m", true}, + + { "[\uac00]", "\u1100\u1161", "m", true}, + { "[\uac00]", "\uac00", "m", true}, + + // Decomposed hangul (jamos) + { "\u1100\u1161", "\u1100\u1161", "m", true}, + { "\u1100\u1161", "\uac00", "m", true}, // Composed hangul - p = Pattern.compile("\uac00", Pattern.CANON_EQ); - m = p.matcher("\u1100\u1161"); - if (!m.matches()) - failCount++; + { "\uac00", "\u1100\u1161", "m", true }, + { "\uac00", "\uac00", "m", true }, - m.reset("\uac00"); - if (!m.matches()) - failCount++; + /* Need a NFDSlice to nfd the source to solve this issue + u+1d1c0 -> nfd: -> nfc: + u+1d1bc -> nfd: -> nfc: + -> nfd: -> nfc: // Decomposed supplementary outside char classes - p = Pattern.compile("test\ud834\uddbc\ud834\udd6f", Pattern.CANON_EQ); - m = p.matcher("test\ud834\uddc0"); - if (!m.matches()) - failCount++; - - m.reset("test\ud834\uddbc\ud834\udd6f"); - if (!m.matches()) - failCount++; - + // { "test\ud834\uddbc\ud834\udd6f", "test\ud834\uddc0", "m", true }, // Composed supplementary outside char classes - p = Pattern.compile("test\ud834\uddc0", Pattern.CANON_EQ); - m.reset("test\ud834\uddbc\ud834\udd6f"); - if (!m.matches()) - failCount++; - - m = p.matcher("test\ud834\uddc0"); - if (!m.matches()) - failCount++; - + // { "test\ud834\uddc0", "test\ud834\uddbc\ud834\udd6f", "m", true }, */ + { "test\ud834\uddbc\ud834\udd6f", "test\ud834\uddbc\ud834\udd6f", "m", true }, + { "test\ud834\uddc0", "test\ud834\uddbc\ud834\udd6f", "m", true }, + { "test\ud834\uddc0", "test\ud834\uddc0", "m", true }, + { "test\ud834\uddbc\ud834\udd6f", "test\ud834\uddc0", "m", true }, + }; + + int failCount = 0; + for (Object[] d : data) { + 
String pn = (String)d[0]; + String tt = (String)d[1]; + boolean isFind = "f".equals(((String)d[2])); + boolean expected = (boolean)d[3]; + boolean ret = isFind ? Pattern.compile(pn, Pattern.CANON_EQ).matcher(tt).find() + : Pattern.compile(pn, Pattern.CANON_EQ).matcher(tt).matches(); + if (ret != expected) { + failCount++; + continue; + } + } report("Canonical Equivalence"); } @@ -3846,7 +3900,6 @@ public class RegExTest { if (!patternString.startsWith("'")) { return Pattern.compile(patternString); } - int break1 = patternString.lastIndexOf("'"); String flagString = patternString.substring( break1+1, patternString.length()); @@ -4092,10 +4145,11 @@ public class RegExTest { report("NamedGroupCapture"); } - // This is for bug 6969132 + // This is for bug 6919132 private static void nonBmpClassComplementTest() throws Exception { Pattern p = Pattern.compile("\\P{Lu}"); Matcher m = p.matcher(new String(new int[] {0x1d400}, 0, 1)); + if (m.find() && m.start() == 1) failCount++; @@ -4113,6 +4167,11 @@ public class RegExTest { if (m.find() && m.start() == 1) failCount++; + p = Pattern.compile("\\P{sc=GRANTHA}"); + m = p.matcher(new String(new int[] {0x11350}, 0, 1)); + if (m.find() && m.start() == 1) + failCount++; + report("NonBmpClassComplement"); } @@ -4662,4 +4721,92 @@ public class RegExTest { failCount++; report("Unicode extended grapheme cluster"); } + + // hangup/timeout if go into exponential backtracking + private static void expoBacktracking() throws Exception { + + Object[][] patternMatchers = { + // 6328855 + { "(.*\n*)*", + "this little fine string lets\r\njava.lang.String.matches\r\ncrash\r\n(We don't know why but adding \r* to the regex makes it work again)", + false }, + // 6192895 + { " *([a-zA-Z0-9/\\-\\?:\\(\\)\\.,'\\+\\{\\}]+ *)+", + "Hello World this is a test this is a test this is a test A", + true }, + { " *([a-zA-Z0-9/\\-\\?:\\(\\)\\.,'\\+\\{\\}]+ *)+", + "Hello World this is a test this is a test this is a test \u4e00 ", + false }, + { " 
*([a-z0-9]+ *)+", + "hello world this is a test this is a test this is a test A", + false }, + // 4771934 [FIXED] #5013651? + { "^(\\w+([\\.-]?\\w+)*@\\w+([\\.-]?\\w+)*(\\.\\w{2,4})+[,;]?)+$", + "abc@efg.abc,efg@abc.abc,abc@xyz.mno;abc@sdfsd.com", + true }, + // 4866249 [FIXED] + { "<\\s*" + "(meta|META)" + "(\\s|[^>])+" + "(CHARSET|charset)=" + "(\\s|[^>])+>", + "", + true }, + { "^(\\w+([\\.-]?\\w+)*@\\w+([\\.-]?\\w+)*(\\.\\w{2,4})+[,;]?)+$", + "abc@efg.abc,efg@abc.abc,abc@xyz.mno;sdfsd.com", + false }, + // 6345469 + { "((<[^>]+>)?(((\\s)?)*(\\ )?)*((\\s)?)*)+", + "   < br/>   < / p>

 

", + true }, // --> matched + { "((<[^>]+>)?(((\\s)?)*(\\ )?)*((\\s)?)*)+", + "   < br/>   < / p>

  p

", + false }, + // 5026912 + { "^\\s*" + "(\\w|\\d|[\\xC0-\\xFF]|/)+" + "\\s+|$", + "156580451111112225588087755221111111566969655555555", + false}, + // 6988218 + { "^([+-]?((0[xX](\\p{XDigit}+))|(((\\p{Digit}+)(\\.)?((\\p{Digit}+)?)([eE][+-]?(\\p{Digit}+))?)|(\\.((\\p{Digit}+))([eE][+-]?(\\p{Digit}+))?)))|[n|N]?'([^']*(?:'')*[^']*)*')", + "'%)) order by ANGEBOT.ID", + false}, // find + // 6693451 + { "^(\\s*foo\\s*)*$", + "foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo", + true }, + { "^(\\s*foo\\s*)*$", + "foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo foo fo", + false + }, + // 7006761 + { "(([0-9A-Z]+)([_]?+)*)*", "FOOOOO_BAAAR_FOOOOOOOOO_BA_", true}, + { "(([0-9A-Z]+)([_]?+)*)*", "FOOOOO_BAAAR_FOOOOOOOOO_BA_ ", false}, + // 8140212 + { "(?.*)\\{(?\\w+):(?\\w+(\\.?\\w+(\\(((?(('[^']*')|((/|\\w)+))(,(('[^']*')|((/|\\w)+)))*))?\\))?)*)\\}(?.*)", + "{CeGlobal:getSodCutoff.getGui.getAmqp.getSimpleModeEnabled()", + false + }, + { "^(a+)+$", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", true}, + { "^(a+)+$", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa!", false}, + + { "(x+)*y", "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxy", true }, + { "(x+)*y", "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxz", false}, + + { "(x+x+)+y", "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxy", true}, + { "(x+x+)+y", "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxz", false}, + + { "(([0-9A-Z]+)([_]?+)*)*", "--------------------------------------", false}, + + /* not fixed + //8132141 ---> second level exponential backtracking + { "(h|h|ih(((i|a|c|c|a|i|i|j|b|a|i|b|a|a|j))+h)ahbfhba|c|i)*", + 
"hchcchicihcchciiicichhcichcihcchiihichiciiiihhcchicchhcihchcihiihciichhccciccichcichiihcchcihhicchcciicchcccihiiihhihihihichicihhcciccchihhhcchichchciihiicihciihcccciciccicciiiiiiiiicihhhiiiihchccchchhhhiiihchihcccchhhiiiiiiiicicichicihcciciihichhhhchihciiihhiccccccciciihhichiccchhicchicihihccichicciihcichccihhiciccccccccichhhhihihhcchchihihiihhihihihicichihiiiihhhhihhhchhichiicihhiiiiihchccccchichci" }, + */ + }; + + for (Object[] pm : patternMatchers) { + String p = (String)pm[0]; + String s = (String)pm[1]; + boolean r = (Boolean)pm[2]; + if (r != Pattern.compile(p).matcher(s).matches()) { + failCount++; + } + } + } } diff --git a/jdk/test/java/util/regex/TestCases.txt b/jdk/test/java/util/regex/TestCases.txt index 4490eaf9516..3c9174939ba 100644 --- a/jdk/test/java/util/regex/TestCases.txt +++ b/jdk/test/java/util/regex/TestCases.txt @@ -139,6 +139,71 @@ false 0 aaabbbcccdefg true defg 0 +// Negation with nested char class and intersection +[^[c]] +c +false 0 + +[^[a-z]] +e +false 0 + +[^[a-z][A-Z]] +E +false 0 + +[^a-d[0-9][m-p]] +e +true e 0 + +[^a-d[0-9][m-p]] +8 +false 0 + +[^[a-c]&&[d-f]] +z +true z 0 + +[^a-c&&d-f] +a +true a 0 + +[^a-m&&m-z] +m +false 0 + +[^a-m&&m-z&&a-c] +m +true m 0 + +[^a-cd-f&&[d-f]] +c +true c 0 + +[^[a-c][d-f]&&abc] +a +false 0 + +[^[a-c][d-f]&&abc] +d +true d 0 + +[^[a-c][d-f]&&abc[def]] +a +false 0 + +[^[a-c][d-f]&&abc[def]] +e +false 0 + +[^[a-c]&&[b-d]&&[c-e]] +a +true a 0 + +[^[a-c]&&[b-d]&&[c-e]] +c +false 0 + // Making sure a ^ not in first position matches literal ^ [abc^b] b