From f01cce235b62e378e91a3bae32942e2f3dfc5c7e Mon Sep 17 00:00:00 2001
From: Ian Graves <igraves@openjdk.org>
Date: Tue, 29 Mar 2022 00:01:57 +0000
Subject: [PATCH] 8264160: Regex \b is not consistent with \w without
 UNICODE_CHARACTER_CLASS

Reviewed-by: lancea, bpb, naoto
---
 .../classes/java/util/regex/Pattern.java      | 19 ++--
 test/jdk/java/util/regex/RegExTest.java       | 90 ++++++++++++++++---
 2 files changed, 91 insertions(+), 18 deletions(-)
diff --git a/src/java.base/share/classes/java/util/regex/Pattern.java b/src/java.base/share/classes/java/util/regex/Pattern.java
index 5bb6ec76a7c..a06afacc6d2 100644
--- a/src/java.base/share/classes/java/util/regex/Pattern.java
+++ b/src/java.base/share/classes/java/util/regex/Pattern.java
@@ -158,7 +158,8 @@ import jdk.internal.util.ArraysSupport;
  * <tr><th style="vertical-align:top; font-weight:normal" id="any">{@code .}</th>
  *     <td headers="matches predef any">Any character (may or may not match <a href="#lt">line terminators</a>)</td></tr>
  * <tr><th style="vertical-align:top; font-weight:normal" id="digit">{@code \d}</th>
- *     <td headers="matches predef digit">A digit: {@code [0-9]}</td></tr>
+ *     <td headers="matches predef digit">A digit: {@code [0-9]} if <a href="#UNICODE_CHARACTER_CLASS">
+ *         UNICODE_CHARACTER_CLASS</a> is not set. See <a href="#unicodesupport">Unicode Support</a>.</td></tr>
  * <tr><th style="vertical-align:top; font-weight:normal" id="non_digit">{@code \D}</th>
  *     <td headers="matches predef non_digit">A non-digit: {@code [^0-9]}</td></tr>
  * <tr><th style="vertical-align:top; font-weight:normal" id="horiz_white">{@code \h}</th>
@@ -167,7 +168,9 @@ import jdk.internal.util.ArraysSupport;
  * <tr><th style="vertical-align:top; font-weight:normal" id="non_horiz_white">{@code \H}</th>
  *     <td headers="matches predef non_horiz_white">A non-horizontal whitespace character: {@code [^\h]}</td></tr>
  * <tr><th style="vertical-align:top; font-weight:normal" id="white">{@code \s}</th>
- *     <td headers="matches predef white">A whitespace character: {@code [ \t\n\x0B\f\r]}</td></tr>
+ *     <td headers="matches predef white">A whitespace character: {@code [ \t\n\x0B\f\r]} if
+ *     <a href="#UNICODE_CHARACTER_CLASS">UNICODE_CHARACTER_CLASS</a> is not set. See
+ *     <a href="#unicodesupport">Unicode Support</a>.</td></tr>
  * <tr><th style="vertical-align:top; font-weight:normal" id="non_white">{@code \S}</th>
  *     <td headers="matches predef non_white">A non-whitespace character: {@code [^\s]}</td></tr>
  * <tr><th style="vertical-align:top; font-weight:normal" id="vert_white">{@code \v}</th>
@@ -176,7 +179,8 @@ import jdk.internal.util.ArraysSupport;
  * <tr><th style="vertical-align:top; font-weight:normal" id="non_vert_white">{@code \V}</th>
  *     <td headers="matches predef non_vert_white">A non-vertical whitespace character: {@code [^\v]}</td></tr>
  * <tr><th style="vertical-align:top; font-weight:normal" id="word">{@code \w}</th>
- *     <td headers="matches predef word">A word character: {@code [a-zA-Z_0-9]}</td></tr>
+ *     <td headers="matches predef word">A word character: {@code [a-zA-Z_0-9]} if <a href="#UNICODE_CHARACTER_CLASS">
+ *         UNICODE_CHARACTER_CLASS</a> is not set. See <a href="#unicodesupport">Unicode Support</a>. </td></tr>
  * <tr><th style="vertical-align:top; font-weight:normal" id="non_word">{@code \W}</th>
  *     <td headers="matches predef non_word">A non-word character: {@code [^\w]}</td></tr>
  *
@@ -246,11 +250,12 @@ import jdk.internal.util.ArraysSupport;
  * <tr><th style="vertical-align:top; font-weight:normal" id="end_line">{@code $}</th>
  *     <td headers="matches bounds end_line">The end of a line</td></tr>
  * <tr><th style="vertical-align:top; font-weight:normal" id="word_boundary">{@code \b}</th>
- *     <td headers="matches bounds word_boundary">A word boundary</td></tr>
+ *     <td headers="matches bounds word_boundary">A word boundary: {@code (?:(?<=\w)(?=\W)|(?<=\W)(?=\w))} (the location
+ *     where a non-word character abuts a word character)</td></tr>
  * <tr><th style="vertical-align:top; font-weight:normal" id="grapheme_cluster_boundary">{@code \b{g}}</th>
  *     <td headers="matches bounds grapheme_cluster_boundary">A Unicode extended grapheme cluster boundary</td></tr>
  * <tr><th style="vertical-align:top; font-weight:normal" id="non_word_boundary">{@code \B}</th>
- *     <td headers="matches bounds non_word_boundary">A non-word boundary</td></tr>
+ *     <td headers="matches bounds non_word_boundary">A non-word boundary: any position where {@code \b} does not match</td></tr>
  * <tr><th style="vertical-align:top; font-weight:normal" id="begin_input">{@code \A}</th>
  *     <td headers="matches bounds begin_input">The beginning of the input</td></tr>
  * <tr><th style="vertical-align:top; font-weight:normal" id="end_prev_match">{@code \G}</th>
@@ -535,7 +540,7 @@ import jdk.internal.util.ArraysSupport;
  * that do not capture text and do not count towards the group total, or
  * <i>named-capturing</i> group.
  *
- * <h2> Unicode support </h2>
+ * <h2 id="unicodesupport"> Unicode support </h2>
  *
  * <p> This class is in conformance with Level 1 of <a
  * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
@@ -5377,7 +5382,7 @@ loop:   for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
 
         boolean isWord(int ch) {
             return useUWORD ? CharPredicates.WORD().is(ch)
-                            : (ch == '_' || Character.isLetterOrDigit(ch));
+                            : CharPredicates.ASCII_WORD().is(ch);
         }
 
         int check(Matcher matcher, int i, CharSequence seq) {
diff --git a/test/jdk/java/util/regex/RegExTest.java b/test/jdk/java/util/regex/RegExTest.java
index 39c454ce4f4..41db0915d17 100644
--- a/test/jdk/java/util/regex/RegExTest.java
+++ b/test/jdk/java/util/regex/RegExTest.java
@@ -36,7 +36,7 @@
  * 6345469 6988218 6693451 7006761 8140212 8143282 8158482 8176029 8184706
  * 8194667 8197462 8184692 8221431 8224789 8228352 8230829 8236034 8235812
  * 8216332 8214245 8237599 8241055 8247546 8258259 8037397 8269753 8276694
- *
+ * 8280403 8264160 8281315
  * @library /test/lib
  * @library /lib/testlibrary/java/lang
  * @build jdk.test.lib.RandomFactory
@@ -51,14 +51,9 @@ import java.nio.CharBuffer;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-import java.util.Scanner;
+import java.util.*;
 import java.util.function.Function;
+import java.util.function.IntFunction;
 import java.util.function.Predicate;
 import java.util.regex.Matcher;
 import java.util.regex.MatchResult;
@@ -3854,11 +3849,11 @@ public class RegExTest {
         }
 
         // bounds/word align
-        twoFindIndexes(" \u0180sherman\u0400 ", bound, 1, 10);
+        twoFindIndexes(" \u0180sherman\u0400 ", boundU, 1, 10);
         assertTrue(bwbU.reset("\u0180sherman\u0400").matches());
-        twoFindIndexes(" \u0180sh\u0345erman\u0400 ", bound, 1, 11);
+        twoFindIndexes(" \u0180sh\u0345erman\u0400 ", boundU, 1, 11);
         assertTrue(bwbU.reset("\u0180sh\u0345erman\u0400").matches());
-        twoFindIndexes(" \u0724\u0739\u0724 ", bound, 1, 4);
+        twoFindIndexes(" \u0724\u0739\u0724 ", boundU, 1, 4);
         assertTrue(bwbU.reset("\u0724\u0739\u0724").matches());
         assertTrue(bwbEU.reset("\u0724\u0739\u0724").matches());
     }
@@ -4503,6 +4498,8 @@ public class RegExTest {
     }
 
     //This test is for 8037397
+    //Ensure we don't drop nested interior character classes to the right of an
+    //intersection operator.
     @Test
     public static void droppedClassesWithIntersection() {
         String rx = "[A-Z&&[A-Z]0-9]";
@@ -4530,6 +4527,9 @@ public class RegExTest {
     }
 
     //This test is for 8269753
+    //This is for ensuring that the caret doesn't point at the wrong character
+    //in a syntax exception message because we previously didn't compensate for
+    //tabs when rendering the offending string that contained tab characters.
     @Test
     public static void errorMessageCaretIndentation() {
         String pattern = "\t**";
@@ -4540,6 +4540,8 @@ public class RegExTest {
     }
 
     //This test is for 8276694
+    //Ensure our error message indicates we have an unescaped backslash when we
+    //encounter one.
     @Test
     public static void unescapedBackslash() {
         String pattern = "\\";
@@ -4549,6 +4551,7 @@ public class RegExTest {
     }
 
     //This test is for 8280403
+    //Given bad intersection syntax, we should throw a PatternSyntaxException.
     @Test
     public static void badIntersectionSyntax() {
         String pattern = "[˜\\H +F&&]";
@@ -4557,7 +4560,70 @@ public class RegExTest {
         assertTrue(e.getMessage().contains("Bad intersection syntax"));
     }
 
+    //This test is for 8264160
+    //Here we check for inconsistencies between the behavior of \w and the
+    //behavior of \b. Prior to this fix, the two constructs did not behave in
+    //a consistent way, i.e. \b would recognize non-\w characters as part of a
+    //word in some cases. This test verifies that the two behave consistently
+    //for all codepoints we support.
+    @Test
+    public static void wordBoundaryInconsistencies() {
+        Pattern basicWordCharPattern = Pattern.compile("\\w");
+        Pattern basicWordCharBoundaryPattern =
+                Pattern.compile(";\\b.", Pattern.DOTALL);
+
+        Pattern unicodeWordCharPattern =
+                Pattern.compile("\\w", Pattern.UNICODE_CHARACTER_CLASS);
+
+        Pattern unicodeWordCharBoundaryPattern =
+                Pattern.compile(";\\b.",
+                        Pattern.DOTALL | Pattern.UNICODE_CHARACTER_CLASS);
+
+        IntFunction<Boolean> basicWordCharCheck =
+                (cp) -> cpMatches(basicWordCharPattern, cp, false);
+
+        IntFunction<Boolean> basicBoundaryCharCheck =
+                (cp) -> cpMatches(basicWordCharBoundaryPattern,
+                                  cp, true);
+
+        IntFunction<Boolean> unicodeWordCharCheck =
+                (cp) -> cpMatches(unicodeWordCharPattern, cp, false);
+
+        IntFunction<Boolean> unicodeBoundaryCharCheck =
+                (cp) -> cpMatches(unicodeWordCharBoundaryPattern,
+                                  cp,true);
+
+        //compare \w with \b for both the basic and the Unicode patterns
+        for(int cp = 0; cp <= Character.MAX_CODE_POINT; cp++){
+            assertEquals(basicWordCharCheck.apply(cp),
+                    basicBoundaryCharCheck.apply(cp),
+                    "Codepoint: " + cp);
+            assertEquals(unicodeWordCharCheck.apply(cp),
+                    unicodeBoundaryCharCheck.apply(cp),
+                    "Codepoint: " + cp);
+        }
+    }
+
+    private static boolean cpMatches(Pattern p, int cp, boolean boundary) {
+        String cpString;
+        if (Character.isBmpCodePoint(cp)) {
+            cpString = "" + ((char) cp);
+        } else {
+            cpString = "" + Character.highSurrogate(cp) +
+                    Character.lowSurrogate(cp);
+        }
+
+        if (boundary) {
+            return p.matcher(";" + cpString).matches();
+        } else {
+            return p.matcher(cpString).matches();
+        }
+    }
+
     //This test is for 8281560
+    //Checks that when the Canonical Equivalence flag is set, the behavior for
+    //Matcher::hitEnd is equivalent for these similar patterns that saw
+    //inconsistencies.
     @Test
     public static void prematureHitEndInNFCCharProperty() {
         var testInput = "a1a1";
@@ -4582,6 +4648,8 @@ public class RegExTest {
     }
 
     //This test is for 8281315
+    //Checks that we are able to correctly match this case with a backref
+    //without encountering an IndexOutOfBoundsException.
     @Test
     public static void iOOBForCIBackrefs(){
         String line = "\ud83d\udc95\ud83d\udc95\ud83d\udc95";