From 722199f9b2a4fef6c7bbdf2e246783f78425c0d1 Mon Sep 17 00:00:00 2001 From: Xueming Shen Date: Mon, 6 May 2013 21:24:37 -0700 Subject: [PATCH] 8013252: Regex Matcher .start and .end should be accessible by group name 8013254: Constructor \w need update to add the support of \p{Join_Control} Added the requested methods and updated the \w constructor Reviewed-by: mchung, alanb --- .../classes/java/util/regex/Matcher.java | 76 ++++++++++++++++--- .../classes/java/util/regex/Pattern.java | 3 +- .../classes/java/util/regex/UnicodeProp.java | 12 ++- jdk/test/java/util/regex/POSIX_Unicode.java | 7 +- jdk/test/java/util/regex/RegExTest.java | 50 +++++++++--- 5 files changed, 124 insertions(+), 24 deletions(-) diff --git a/jdk/src/share/classes/java/util/regex/Matcher.java b/jdk/src/share/classes/java/util/regex/Matcher.java index fb1cca4b865..b01ec84262a 100644 --- a/jdk/src/share/classes/java/util/regex/Matcher.java +++ b/jdk/src/share/classes/java/util/regex/Matcher.java @@ -25,6 +25,7 @@ package java.util.regex; +import java.util.Objects; /** * An engine that performs match operations on a {@link java.lang.CharSequence @@ -370,11 +371,36 @@ public final class Matcher implements MatchResult { public int start(int group) { if (first < 0) throw new IllegalStateException("No match available"); - if (group > groupCount()) + if (group < 0 || group > groupCount()) throw new IndexOutOfBoundsException("No group " + group); return groups[group * 2]; } + /** + * Returns the start index of the subsequence captured by the given + * named-capturing group during the + * previous match operation. 
+ * + * @param name + * The name of a named-capturing group in this matcher's pattern + * + * @return The index of the first character captured by the group, + * or {@code -1} if the match was successful but the group + * itself did not match anything + * + * @throws IllegalStateException + * If no match has yet been attempted, + * or if the previous match operation failed + * + * @throws IllegalArgumentException + * If there is no capturing group in the pattern + * with the given name + * @since 1.8 + */ + public int start(String name) { + return groups[getMatchedGroupIndex(name) * 2]; + } + /** * Returns the offset after the last character matched.

* @@ -417,11 +443,36 @@ public final class Matcher implements MatchResult { public int end(int group) { if (first < 0) throw new IllegalStateException("No match available"); - if (group > groupCount()) + if (group < 0 || group > groupCount()) throw new IndexOutOfBoundsException("No group " + group); return groups[group * 2 + 1]; } + /** + * Returns the offset after the last character of the subsequence + * captured by the given named-capturing + * group during the previous match operation. + * + * @param name + * The name of a named-capturing group in this matcher's pattern + * + * @return The offset after the last character captured by the group, + * or {@code -1} if the match was successful + * but the group itself did not match anything + * + * @throws IllegalStateException + * If no match has yet been attempted, + * or if the previous match operation failed + * + * @throws IllegalArgumentException + * If there is no capturing group in the pattern + * with the given name + * @since 1.8 + */ + public int end(String name) { + return groups[getMatchedGroupIndex(name) * 2 + 1]; + } + /** * Returns the input subsequence matched by the previous match. * @@ -518,13 +569,7 @@ public final class Matcher implements MatchResult { * @since 1.7 */ public String group(String name) { - if (name == null) - throw new NullPointerException("Null group name"); - if (first < 0) - throw new IllegalStateException("No match found"); - if (!parentPattern.namedGroups().containsKey(name)) - throw new IllegalArgumentException("No group with name <" + name + ">"); - int group = parentPattern.namedGroups().get(name); + int group = getMatchedGroupIndex(name); if ((groups[group*2] == -1) || (groups[group*2+1] == -1)) return null; return getSubSequence(groups[group * 2], groups[group * 2 + 1]).toString(); @@ -1257,4 +1302,17 @@ public final class Matcher implements MatchResult { return text.charAt(i); } + /** + * Returns the group index of the matched capturing group. 
+ * + * @return the index of the named-capturing group + */ + int getMatchedGroupIndex(String name) { + Objects.requireNonNull(name, "Group name"); + if (first < 0) + throw new IllegalStateException("No match found"); + if (!parentPattern.namedGroups().containsKey(name)) + throw new IllegalArgumentException("No group with name <" + name + ">"); + return parentPattern.namedGroups().get(name); + } } diff --git a/jdk/src/share/classes/java/util/regex/Pattern.java b/jdk/src/share/classes/java/util/regex/Pattern.java index 529b07c666e..652784ad8d1 100644 --- a/jdk/src/share/classes/java/util/regex/Pattern.java +++ b/jdk/src/share/classes/java/util/regex/Pattern.java @@ -612,6 +612,7 @@ import java.util.Arrays; *
  • White_Space *
  • Digit *
  • Hex_Digit + *
  • Join_Control *
  • Noncharacter_Code_Point *
  • Assigned * @@ -662,7 +663,7 @@ import java.util.Arrays; * \S * A non-whitespace character: [^\s] * \w - * A word character: [\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}] + * A word character: [\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}\p{IsJoin_Control}] * \W * A non-word character: [^\w] * diff --git a/jdk/src/share/classes/java/util/regex/UnicodeProp.java b/jdk/src/share/classes/java/util/regex/UnicodeProp.java index d1a68c08fa1..2c7c128a557 100644 --- a/jdk/src/share/classes/java/util/regex/UnicodeProp.java +++ b/jdk/src/share/classes/java/util/regex/UnicodeProp.java @@ -181,6 +181,7 @@ enum UnicodeProp { // \p{gc=Mark} // \p{digit} // \p{gc=Connector_Punctuation} + // \p{Join_Control} 200C..200D public boolean is(int ch) { return ALPHABETIC.is(ch) || @@ -189,7 +190,15 @@ enum UnicodeProp { (1 << Character.COMBINING_SPACING_MARK) | (1 << Character.DECIMAL_DIGIT_NUMBER) | (1 << Character.CONNECTOR_PUNCTUATION)) >> Character.getType(ch)) & 1) - != 0; + != 0 || + JOIN_CONTROL.is(ch); + } + }, + + JOIN_CONTROL { + // 200C..200D PropList.txt:Join_Control + public boolean is(int ch) { + return (ch == 0x200C || ch == 0x200D); } }; @@ -212,6 +221,7 @@ enum UnicodeProp { aliases.put("WHITESPACE", "WHITE_SPACE"); aliases.put("HEXDIGIT","HEX_DIGIT"); aliases.put("NONCHARACTERCODEPOINT", "NONCHARACTER_CODE_POINT"); + aliases.put("JOINCONTROL", "JOIN_CONTROL"); } public static UnicodeProp forName(String propName) { diff --git a/jdk/test/java/util/regex/POSIX_Unicode.java b/jdk/test/java/util/regex/POSIX_Unicode.java index da691fe6b57..817e445d605 100644 --- a/jdk/test/java/util/regex/POSIX_Unicode.java +++ b/jdk/test/java/util/regex/POSIX_Unicode.java @@ -125,6 +125,10 @@ final public class POSIX_Unicode { return (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef); } + public static boolean isJoinControl(int ch) { + return (ch == 0x200C || ch == 0x200D); + } + // \p{alpha} // \p{gc=Mark} // \p{digit} @@ -136,6 +140,7 @@ final public class 
POSIX_Unicode { (1 << Character.COMBINING_SPACING_MARK) | (1 << Character.CONNECTOR_PUNCTUATION)) >> Character.getType(ch)) & 1) != 0 || - isDigit(ch); + isDigit(ch) || + isJoinControl(ch); } } diff --git a/jdk/test/java/util/regex/RegExTest.java b/jdk/test/java/util/regex/RegExTest.java index bc345eda853..8626a5102ea 100644 --- a/jdk/test/java/util/regex/RegExTest.java +++ b/jdk/test/java/util/regex/RegExTest.java @@ -33,7 +33,7 @@ * 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940 * 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133 * 6350801 6676425 6878475 6919132 6931676 6948903 6990617 7014645 7039066 - * 7067045 7014640 7189363 8007395 + * 7067045 7014640 7189363 8007395 8013252 8013254 */ import java.util.regex.*; @@ -3390,7 +3390,9 @@ public class RegExTest { private static void check(Pattern p, String s, String g, String expected) { Matcher m = p.matcher(s); m.find(); - if (!m.group(g).equals(expected)) + if (!m.group(g).equals(expected) || + s.charAt(m.start(g)) != expected.charAt(0) || + s.charAt(m.end(g) - 1) != expected.charAt(expected.length() - 1)) failCount++; } @@ -3420,19 +3422,42 @@ public class RegExTest { failCount++; } - private static void checkExpectedFail(Matcher m, String g) { + private static void checkExpectedIAE(Matcher m, String g) { m.find(); try { m.group(g); - } catch (IllegalArgumentException iae) { + } catch (IllegalArgumentException x) { //iae.printStackTrace(); - return; - } catch (NullPointerException npe) { - return; + try { + m.start(g); + } catch (IllegalArgumentException xx) { + try { + m.end(g); /* end(String) must reject the unknown group name too */ + } catch (IllegalArgumentException xxx) { + return; + } + } } failCount++; } + private static void checkExpectedNPE(Matcher m) { + m.find(); + try { + m.group(null); + } catch (NullPointerException x) { + try { + m.start(null); + } catch (NullPointerException xx) { + try { + m.end(null); + } catch (NullPointerException xxx) { + return; + } + } + } + failCount++; + } private static
void namedGroupCaptureTest() throws Exception { check(Pattern.compile("x+(?y+)z+"), @@ -3559,10 +3584,9 @@ public class RegExTest { checkExpectedFail("(?<6groupnamestartswithdigit>abc)(def)"); checkExpectedFail("(?abc)(def)\\k"); checkExpectedFail("(?abc)(?def)\\k"); - checkExpectedFail(Pattern.compile("(?abc)(def)").matcher("abcdef"), - "gnameX"); - checkExpectedFail(Pattern.compile("(?abc)(def)").matcher("abcdef"), - null); + checkExpectedIAE(Pattern.compile("(?abc)(def)").matcher("abcdef"), + "gnameX"); + checkExpectedNPE(Pattern.compile("(?abc)(def)").matcher("abcdef")); report("NamedGroupCapture"); } @@ -3759,6 +3783,7 @@ public class RegExTest { Matcher spaceP = Pattern.compile("\\p{IsWhiteSpace}").matcher(""); Matcher definedP = Pattern.compile("\\p{IsAssigned}").matcher(""); Matcher nonCCPP = Pattern.compile("\\p{IsNoncharacterCodePoint}").matcher(""); + Matcher joinCrtl = Pattern.compile("\\p{IsJoinControl}").matcher(""); // javaMethod Matcher lowerJ = Pattern.compile("\\p{javaLowerCase}").matcher(""); @@ -3829,7 +3854,8 @@ public class RegExTest { Character.isIdeographic(cp) != ideogP.reset(str).matches() || Character.isIdeographic(cp) != ideogJ.reset(str).matches() || (Character.UNASSIGNED == type) == definedP.reset(str).matches() || - POSIX_Unicode.isNoncharacterCodePoint(cp) != nonCCPP.reset(str).matches()) + POSIX_Unicode.isNoncharacterCodePoint(cp) != nonCCPP.reset(str).matches() || + POSIX_Unicode.isJoinControl(cp) != joinCrtl.reset(str).matches()) failCount++; }