diff --git a/src/java.base/share/classes/java/util/regex/Pattern.java b/src/java.base/share/classes/java/util/regex/Pattern.java
index 5bb6ec76a7c..a06afacc6d2 100644
--- a/src/java.base/share/classes/java/util/regex/Pattern.java
+++ b/src/java.base/share/classes/java/util/regex/Pattern.java
@@ -158,7 +158,8 @@ import jdk.internal.util.ArraysSupport;
*
| {@code .} |
* Any character (may or may not match line terminators) |
* | {@code \d} |
- * A digit: {@code [0-9]} |
+ * A digit: {@code [0-9]} if
+ * UNICODE_CHARACTER_CLASS is not set. See Unicode Support. |
* | {@code \D} |
* A non-digit: {@code [^0-9]} |
* | {@code \h} |
@@ -167,7 +168,9 @@ import jdk.internal.util.ArraysSupport;
*
|---|
| {@code \H} |
* A non-horizontal whitespace character: {@code [^\h]} |
* | {@code \s} |
- * A whitespace character: {@code [ \t\n\x0B\f\r]} |
+ * A whitespace character: {@code [ \t\n\x0B\f\r]} if
+ * UNICODE_CHARACTER_CLASS is not set. See
+ * Unicode Support. |
* | {@code \S} |
* A non-whitespace character: {@code [^\s]} |
* | {@code \v} |
@@ -176,7 +179,8 @@ import jdk.internal.util.ArraysSupport;
*
|---|
| {@code \V} |
* A non-vertical whitespace character: {@code [^\v]} |
* | {@code \w} |
- * A word character: {@code [a-zA-Z_0-9]} |
+ * A word character: {@code [a-zA-Z_0-9]} if
+ * UNICODE_CHARACTER_CLASS is not set. See Unicode Support. |
* | {@code \W} |
* A non-word character: {@code [^\w]} |
*
@@ -246,11 +250,12 @@ import jdk.internal.util.ArraysSupport;
* | {@code $} |
* The end of a line |
* | {@code \b} |
- * A word boundary |
+ * A word boundary: {@code (?:(?<=\w)(?=\W)|(?<=\W)(?=\w))} (the location
+ * where a non-word character abuts a word character) |
* | {@code \b{g}} |
* A Unicode extended grapheme cluster boundary |
* | {@code \B} |
- * A non-word boundary |
+ * A non-word boundary: {@code [^\b]} (any position that is not a word boundary) |
* | {@code \A} |
* The beginning of the input |
* | {@code \G} |
@@ -535,7 +540,7 @@ import jdk.internal.util.ArraysSupport;
* that do not capture text and do not count towards the group total, or
* named-capturing group.
*
- * Unicode support
+ * Unicode support
*
* This class is in conformance with Level 1 of Unicode Technical
@@ -5377,7 +5382,7 @@ loop: for(int x=0, offset=0; x basicWordCharCheck =
+ (cp) -> cpMatches(basicWordCharPattern, cp, false);
+
+ IntFunction basicBoundaryCharCheck =
+ (cp) -> cpMatches(basicWordCharBoundaryPattern,
+ cp, true);
+
+ IntFunction unicodeWordCharCheck =
+ (cp) -> cpMatches(unicodeWordCharPattern, cp, false);
+
+ IntFunction unicodeBoundaryCharCheck =
+ (cp) -> cpMatches(unicodeWordCharBoundaryPattern,
+ cp,true);
+
+ //basic pattern comparison
+ for(int cp = 0; cp <= Character.MAX_CODE_POINT; cp++){
+ assertEquals(basicWordCharCheck.apply(cp),
+ basicBoundaryCharCheck.apply(cp),
+ "Codepoint: " + cp);
+ assertEquals(unicodeWordCharCheck.apply(cp),
+ unicodeBoundaryCharCheck.apply(cp),
+ "Codepoint: " + cp);
+ }
+ }
+
+ /**
+  * Reports whether {@code p} matches the entirety of a short probe string
+  * built from the single code point {@code cp}.
+  *
+  * @param p the compiled pattern to test
+  * @param cp the code point to convert to a string and match
+  * @param boundary if {@code true}, the probe is {@code ";"} followed by
+  *        the code point; otherwise the probe is the code point alone
+  * @return {@code true} if {@code p} matches the whole probe string
+  * @throws IllegalArgumentException if {@code cp} is not a valid Unicode
+  *         code point
+  */
+ private static boolean cpMatches(Pattern p, int cp, boolean boundary) {
+     // Character.toString(int) produces a single char for BMP code points
+     // and a high/low surrogate pair for supplementary ones.
+     String cpString = Character.toString(cp);
+
+     // NOTE(review): the ';' prefix presumably gives boundary patterns a
+     // preceding non-word character to anchor on -- confirm against the
+     // boundary patterns defined by the caller.
+     String probe = boundary ? ";" + cpString : cpString;
+     return p.matcher(probe).matches();
+ }
+
//This test is for 8281560
+ //Checks that when the Canonical Equivalence flag is set, the behavior for
+ //Matcher::hitEnd is equivalent for these similar patterns that saw
+ //inconsistencies.
@Test
public static void prematureHitEndInNFCCharProperty() {
var testInput = "a1a1";
@@ -4582,6 +4648,8 @@ public class RegExTest {
}
//This test is for 8281315
+ //Checks that we are able to correctly match this case with a backref
+ //without encountering an IndexOutOfBoundsException.
@Test
public static void iOOBForCIBackrefs(){
String line = "\ud83d\udc95\ud83d\udc95\ud83d\udc95";
|---|