From f01cce235b62e378e91a3bae32942e2f3dfc5c7e Mon Sep 17 00:00:00 2001 From: Ian Graves Date: Tue, 29 Mar 2022 00:01:57 +0000 Subject: [PATCH] 8264160: Regex \b is not consistent with \w without UNICODE_CHARACTER_CLASS Reviewed-by: lancea, bpb, naoto --- .../classes/java/util/regex/Pattern.java | 19 ++-- test/jdk/java/util/regex/RegExTest.java | 90 ++++++++++++++++--- 2 files changed, 91 insertions(+), 18 deletions(-) diff --git a/src/java.base/share/classes/java/util/regex/Pattern.java b/src/java.base/share/classes/java/util/regex/Pattern.java index 5bb6ec76a7c..a06afacc6d2 100644 --- a/src/java.base/share/classes/java/util/regex/Pattern.java +++ b/src/java.base/share/classes/java/util/regex/Pattern.java @@ -158,7 +158,8 @@ import jdk.internal.util.ArraysSupport; * {@code .} * Any character (may or may not match line terminators) * {@code \d} - * A digit: {@code [0-9]} + * A digit: {@code [0-9]} if + * * UNICODE_CHARACTER_CLASS is not set. See Unicode Support. * {@code \D} * A non-digit: {@code [^0-9]} * {@code \h} @@ -167,7 +168,9 @@ import jdk.internal.util.ArraysSupport; * {@code \H} * A non-horizontal whitespace character: {@code [^\h]} * {@code \s} - * A whitespace character: {@code [ \t\n\x0B\f\r]} + * A whitespace character: {@code [ \t\n\x0B\f\r]} if + * UNICODE_CHARACTER_CLASS is not set. See + * Unicode Support. * {@code \S} * A non-whitespace character: {@code [^\s]} * {@code \v} @@ -176,7 +179,8 @@ import jdk.internal.util.ArraysSupport; * {@code \V} * A non-vertical whitespace character: {@code [^\v]} * {@code \w} - * A word character: {@code [a-zA-Z_0-9]} + * A word character: {@code [a-zA-Z_0-9]} if + * UNICODE_CHARACTER_CLASS is not set. See Unicode Support. 
* {@code \W} * A non-word character: {@code [^\w]} * @@ -246,11 +250,12 @@ import jdk.internal.util.ArraysSupport; * {@code $} * The end of a line * {@code \b} - * A word boundary + * A word boundary: {@code (?:(?<=\w)(?=\W)|(?<=\W)(?=\w))} (the location + * where a non-word character abuts a word character) * {@code \b{g}} * A Unicode extended grapheme cluster boundary * {@code \B} - * A non-word boundary + * A non-word boundary: {@code [^\b]} * {@code \A} * The beginning of the input * {@code \G} @@ -535,7 +540,7 @@ import jdk.internal.util.ArraysSupport; * that do not capture text and do not count towards the group total, or * named-capturing group. * - *

<h2> Unicode support </h2>

+ *

<h2><a id="unicodesupport"> Unicode support </a></h2>

* *

This class is in conformance with Level 1 of Unicode Technical @@ -5377,7 +5382,7 @@ loop: for(int x=0, offset=0; x basicWordCharCheck = + (cp) -> cpMatches(basicWordCharPattern, cp, false); + + IntFunction<Boolean> basicBoundaryCharCheck = + (cp) -> cpMatches(basicWordCharBoundaryPattern, + cp, true); + + IntFunction<Boolean> unicodeWordCharCheck = + (cp) -> cpMatches(unicodeWordCharPattern, cp, false); + + IntFunction<Boolean> unicodeBoundaryCharCheck = + (cp) -> cpMatches(unicodeWordCharBoundaryPattern, + cp,true); + + //basic pattern comparison + for(int cp = 0; cp <= Character.MAX_CODE_POINT; cp++){ + assertEquals(basicWordCharCheck.apply(cp), + basicBoundaryCharCheck.apply(cp), + "Codepoint: " + cp); + assertEquals(unicodeWordCharCheck.apply(cp), + unicodeBoundaryCharCheck.apply(cp), + "Codepoint: " + cp); + } + } + + private static boolean cpMatches(Pattern p, int cp, boolean boundary) { + String cpString; + if (Character.isBmpCodePoint(cp)) { + cpString = "" + ((char) cp); + } else { + cpString = "" + Character.highSurrogate(cp) + + Character.lowSurrogate(cp); + } + + if (boundary) { + return p.matcher(";" + cpString).matches(); + } else { + return p.matcher(cpString).matches(); + } + } + //This test is for 8281560 + //Checks that when the Canonical Equivalence flag is set, the behavior for + //Matcher::hitEnd is equivalent for these similar patterns that saw + //inconsistencies. @Test public static void prematureHitEndInNFCCharProperty() { var testInput = "a1a1"; @@ -4582,6 +4648,8 @@ public class RegExTest { } //This test is for 8281315 + //Checks that we are able to correctly match this case with a backref + //without encountering an IndexOutOfBoundsException. @Test public static void iOOBForCIBackrefs(){ String line = "\ud83d\udc95\ud83d\udc95\ud83d\udc95";