8360459: UNICODE_CASE and character class with non-ASCII range does not match ASCII char

Reviewed-by: naoto
2026-06-28 21:30:26 +00:00 · 2025-07-15 17:57:13 +00:00 · 2025-07-15 17:57:13 +00:00 · 401af27b9d
commit 401af27b9d
parent 38af17d078
9 changed files with 2084 additions and 5 deletions
--- a/make/ToolsJdk.gmk
+++ b/make/ToolsJdk.gmk
@ -78,6 +78,9 @@ TOOL_GENERATECACERTS = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_class
 TOOL_GENERATEEXTRAPROPERTIES = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \
    build.tools.generateextraproperties.GenerateExtraProperties

+TOOL_GENERATECASEFOLDING = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \
+    build.tools.generatecharacter.CaseFolding
+
 TOOL_MAKEZIPREPRODUCIBLE = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \
    build.tools.makezipreproducible.MakeZipReproducible

--- a/make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java
+++ b/make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java
@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.  Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package build.tools.generatecharacter;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+public class CaseFolding {
+
+    public static void main(String[] args) throws Throwable {
+        if (args.length != 3) {
+            System.err.println("Usage: java CaseFolding TemplateFile CaseFolding.txt CaseFolding.java");
+            System.exit(1);
+        }
+        var templateFile = Paths.get(args[0]);
+        var caseFoldingTxt = Paths.get(args[1]);
+        var genSrcFile = Paths.get(args[2]);
+        var supportedTypes = "^.*; [CTS]; .*$";
+        var caseFoldingEntries = Files.lines(caseFoldingTxt)
+            .filter(line -> !line.startsWith("#") && line.matches(supportedTypes))
+            .map(line -> {
+                String[] cols = line.split("; ");
+                return new String[] {cols[0], cols[1], cols[2]};
+            })
+            .filter(cols -> {
+                // keep only entries whose folding does not map back to the original char.
+                var cp1 = Integer.parseInt(cols[0], 16);
+                var cp2 = Integer.parseInt(cols[2], 16);
+                return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
+            })
+            .map(cols -> String.format("        entry(0x%s, 0x%s)", cols[0], cols[2]))
+            .collect(Collectors.joining(",\n", "", ""));
+
+        // Hack: the filter above does not pick up 0131, so add the 0131 -> 0049 entry manually to support 'I's.
+        // 0049; T; 0131; # LATIN CAPITAL LETTER I
+        final String T_0x0131_0x49 = String.format("        entry(0x%04x, 0x%04x),\n", 0x0131, 0x49);
+
+        // Generate .java file
+        Files.write(
+            genSrcFile,
+            Files.lines(templateFile)
+                .map(line -> line.contains("%%%Entries") ? T_0x0131_0x49 + caseFoldingEntries : line)
+                .collect(Collectors.toList()),
+            StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
+    }
+}
--- a/make/modules/java.base/gensrc/GensrcRegex.gmk
+++ b/make/modules/java.base/gensrc/GensrcRegex.gmk
@ -50,5 +50,22 @@ TARGETS += $(GENSRC_INDICCONJUNCTBREAK)

 ################################################################################

+GENSRC_CASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/util/regex/CaseFolding.java
+
+CASEFOLDINGTEMP := $(MODULE_SRC)/share/classes/jdk/internal/util/regex/CaseFolding.java.template
+CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt
+
+$(GENSRC_CASEFOLDING): $(BUILD_TOOLS_JDK) $(CASEFOLDINGTEMP) $(CASEFOLDINGTXT)
+	$(call LogInfo, Generating $@)
+	$(call MakeTargetDir)
+	$(TOOL_GENERATECASEFOLDING) \
+	    $(CASEFOLDINGTEMP) \
+	    $(CASEFOLDINGTXT) \
+	    $(GENSRC_CASEFOLDING)
+
+TARGETS += $(GENSRC_CASEFOLDING)
+
+################################################################################
+
 endif # include guard
 include MakeIncludeEnd.gmk
--- a/src/java.base/share/classes/java/util/regex/Pattern.java
+++ b/src/java.base/share/classes/java/util/regex/Pattern.java
@ -44,6 +44,7 @@ import java.util.stream.Stream;
 import java.util.stream.StreamSupport;

 import jdk.internal.util.ArraysSupport;
+import jdk.internal.util.regex.CaseFolding;
 import jdk.internal.util.regex.Grapheme;

 /**
@ -2915,6 +2916,8 @@ loop:   for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
              toLowerCase(u+212a) ==> u+006B
           (6)AngstromSign u+212b
              toLowerCase(u+212b) ==> u+00e5
+           (7) Latin Capital Letter Sharp S u+1e9e, was added in version 5.1
+              toLowerCase(u+1e9e) ==> u+00df
        */
        if (ch < 256 &&
            !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&
@ -2922,7 +2925,11 @@ loop:   for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
               ch == 0x49 || ch == 0x69 ||    //I and i
               ch == 0x53 || ch == 0x73 ||    //S and s
               ch == 0x4b || ch == 0x6b ||    //K and k
-               ch == 0xc5 || ch == 0xe5))) {  //A+ring
+               ch == 0xc5 || ch == 0xe5 ||    //A+ring
+               // need to force single() to use SingleU specifically for u+00df.
+               // u+00df <-> u+1e9e, see https://codepoints.net/U+00DF.
+               // Character.toUpperCase('u+00df') still returns u+00df for now.
+                ch == 0xdf))) {               // Sharp S
            bits.add(ch, flags0);
            return null;
        }
@ -2939,7 +2946,7 @@ loop:   for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
                upper = Character.toUpperCase(ch);
                lower = Character.toLowerCase(upper);
                // Unicode case insensitive matches
-                if (upper != lower)
+                if (upper != lower || ch == 0xDF)
                    return SingleU(lower);
            } else if (ASCII.isAscii(ch)) {
                lower = ASCII.toLower(ch);
@ -5960,12 +5967,29 @@ NEXT:       while (i <= last) {
    }

    static CharPredicate CIRangeU(int lower, int upper) {
+        int[] closingCharacters = CaseFolding.getClassRangeClosingCharacters(lower, upper);
+        if (closingCharacters.length == 0) {
+            return ch -> {
+                if (inRange(lower, ch, upper))
+                    return true;
+                int up = Character.toUpperCase(ch);
+                return (inRange(lower, up, upper) ||
+                        inRange(lower, Character.toLowerCase(up), upper));
+            };
+        }
        return ch -> {
            if (inRange(lower, ch, upper))
                return true;
            int up = Character.toUpperCase(ch);
-            return inRange(lower, up, upper) ||
-                   inRange(lower, Character.toLowerCase(up), upper);
+            int lo = Character.toLowerCase(up);
+            if (inRange(lower, up, upper) ||
+                inRange(lower, lo, upper))
+                return true;
+            for (int cp : closingCharacters) {
+                if (up == cp || lo == cp)
+                return true;
+            }
+            return false;
        };
    }

--- a/src/java.base/share/classes/jdk/internal/util/regex/CaseFolding.java.template
+++ b/src/java.base/share/classes/jdk/internal/util/regex/CaseFolding.java.template
@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.  Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package jdk.internal.util.regex;
+
+import java.util.Arrays;
+import java.util.Map;
+import java.util.Objects;
+
+import static java.util.Map.entry;
+
+public final class CaseFolding {
+
+    private static final Map<Integer, Integer> expanded_case_map = Map.ofEntries(
+%%%Entries
+    );
+
+    private static final int[] expanded_case_cps = expanded_case_map.keySet()
+      .stream()
+      .mapToInt(Integer::intValue)
+      .toArray();
+
+    private CaseFolding()  {}
+
+    /**
+     * Returns an expansion set to "close" a given regex Unicode character class range for case-insensitive
+     * matching, according to the
+     * <a href="https://www.unicode.org/reports/tr18/#Simple_Loose_Matches">Simple Loose Matches</a>
+     * rule defined in Unicode Technical Standard #18: Unicode Regular Expressions.
+     * <p>
+     * To conform with Level 1 of UTS #18, specifically RL1.5: Simple Loose Matches, simple case folding must
+     * be applied to literals and (optionally) to character classes. When applied to character classes, each
+     * character class is expected to be closed under simple case folding. See the standard for the
+     * detailed explanation and example of "closed".
+     * <p>
+     * RL1.5 states: To meet this requirement, an implementation that supports case-insensitive matching should
+     * <ol>
+     * <li>Provide at least the simple, default Unicode case-insensitive matching, and</li>
+     * <li>Specify which character properties or constructs are closed under the matching.</li>
+     * </ol>
+     * <p>
+     * In the {@code Pattern} implementation, 5 types of constructs may be matched case-insensitively:
+     * back-refs, string slices (sequences), single, family (char-property) and class range. Single and
+     * family may appear independently or within a class.
+     * <p>
+     * For loose/case-insensitive matching, the back-refs, slices and singles apply {@code toUpperCase} and
+     * {@code toLowerCase} to both the pattern and the input string. This effectively 'closes' the class for
+     * matching.
+     * <p>
+     * The family/char-properties are not "closed" and should remain unchanged. This is acceptable per RL1.5,
+     * if their behavior is clearly specified.
+     * <p>
+     * This method addresses that requirement for the "range" construct within a character class by computing
+     * the additional characters that should be included to close the range under simple case folding:
+     * <p>
+     * For each character in the input range {@code [start, end]} (inclusive), if the character has a simple
+     * case folding mapping in Unicode's CaseFolding.txt, the mapping is not a round-trip map, and the mapped
+     * character is not already in the range, then that mapped character (typically lowercase) is added to
+     * the expansion set.
+     * <p>
+     * This allows regex character class "range" implementation to use the returned expansion set to support
+     * additional case-insensitive matching, without duplicating characters already covered by the existing
+     * regex range implementation. The expectation is the matching is done using both the uppercase and
+     * lowercase forms of the input character, for example
+     *
+     * <pre>{@code
+     *
+     *     ch -> inRange(lower, Character.toUpperCase(ch), upper) ||
+     *           inRange(lower, Character.toLowerCase(ch), upper) ||
+     *           additionalClosingCharacters.contains(Character.toUpperCase(ch)) ||
+     *           additionalClosingCharacters.contains(Character.toLowerCase(ch))
+     * }</pre>
+     *
+     * <p>
+     * @spec https://www.unicode.org/reports/tr18/#Simple_Loose_Matches
+     * @param start the starting code point of the character range
+     * @param end the ending code point of the character range
+     * @return an {@code int[]} containing all the simple case equivalents of characters in the range, excluding
+     *         those already in the range
+     */
+    public static int[] getClassRangeClosingCharacters(int start, int end) {
+        int[] expanded = new int[expanded_case_cps.length];
+        int off = 0;
+        for (int cp : expanded_case_cps) {
+            if (cp >= start && cp <= end) {
+                int folding = expanded_case_map.get(cp);
+                if (folding < start || folding > end) {
+                    expanded[off++] = folding;
+                }
+            }
+        }
+        return Arrays.copyOf(expanded, off);
+    }
+}
--- a/src/java.base/share/data/unicodedata/CaseFolding.txt
+++ b/src/java.base/share/data/unicodedata/CaseFolding.txt
--- a/test/jdk/java/util/regex/CaseFoldingTest.java
+++ b/test/jdk/java/util/regex/CaseFoldingTest.java
@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+ * @test
+ * @summary tests RegExp unicode case-insensitive match (?ui)
+ * @bug 8360459
+ * @library /lib/testlibrary/java/lang
+ * @run junit CaseFoldingTest
+ */
+
+import java.nio.file.Files;
+import java.util.ArrayList;
+import java.util.Set;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+public class CaseFoldingTest {
+
+    @Test
+    void testUnicodeCaseInsensitiveMatch() throws Throwable {
+        var testAll = true;   // true to test all codepoints defined in CaseFolding.txt
+        var verbose = true;   // true to display all codepoints being tested
+        var filter = "^.*; [CTS]; .*$";  // update C,T,S to test different type
+        var excluded = Set.of(
+            // These 'S' characters fail for a known reason: they do not map to their
+            // folding form with toUpperCase or toLowerCase, only with case folding.
+            // Exclude them for now.
+            0x1fd3,  // 1FD3 [lo: 1fd3, up: 1fd3]  0390 [lo: 0390, up: 0390]
+            0x1fe3,  // 1FE3 [lo: 1fe3, up: 1fe3]  03B0 [lo: 03b0, up: 03b0]
+            0xfb05   // FB05 [lo: fb05, up: fb05]  FB06 [lo: fb06, up: fb06]
+        );
+
+        var results = Files.lines(UCDFiles.CASEFOLDING)
+            .filter(line -> !line.startsWith("#") && line.matches(filter))
+            .map(line -> {
+                var strs = line.split("; ");
+                return new String[] {strs[0], strs[1], strs[2]};
+            })
+            .filter(cps -> {
+                var cp1 = Integer.parseInt(cps[0], 16);
+                var cp2 = Integer.parseInt(cps[2], 16);
+                if (excluded.contains(cp1))
+                    return false;
+                if (testAll) {
+                    return true;
+                }
+                // the folding codepoint doesn't map back to the original codepoint.
+                return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
+            })
+            .flatMap(cps -> {
+                // test slice, single & range
+                var cp = Integer.parseInt(cps[0], 16);
+                var folding = Integer.parseInt(cps[2], 16);
+                var errors = testCaseFolding(cp, folding);
+                if (verbose)
+                    System.out.format(" [%s] %s [lo: %04x, up: %04x]  %s [lo: %04x, up: %04x]\n",
+                        cps[1],
+                        cps[0],
+                        Character.toLowerCase(cp),
+                        Character.toUpperCase(cp),
+                        cps[2],
+                        Character.toLowerCase(folding),
+                        Character.toUpperCase(folding)
+                    );
+                errors.forEach(error -> System.out.print(error));
+                return errors.stream();
+            })
+            .collect(Collectors.toList());
+        assertEquals(results.size(), 0);
+    }
+
+    private static ArrayList<String> testCaseFolding(int cp, int folding) {
+        ArrayList<String> errors = new ArrayList<>();
+        testCaseFolding0(cp, folding, errors, "s-t");
+        testCaseFolding0(folding, cp, errors, "t-s");
+        // test all uppercase, lowercase combinations
+        var up = Character.toUpperCase(cp);
+        var lo = Character.toLowerCase(cp);
+        var folding_up = Character.toUpperCase(folding);  // folding should be normally lowercase
+        if (up != cp) {
+            testCaseFolding0(up, folding, errors, "s(u)-t");
+            testCaseFolding0(folding, up, errors, "t-s(u)");
+            if (folding_up != folding) {
+                testCaseFolding0(up, folding_up, errors, "s(u)-t(u)");
+                testCaseFolding0(folding_up, up, errors, "t(u)-s(u)");
+            }
+        }
+        if (lo != cp) {
+            testCaseFolding0(lo, folding, errors, "s(l)-t");
+            testCaseFolding0(folding, lo, errors, "t-s(l)");
+            if (folding_up != folding) {
+                testCaseFolding0(lo, folding_up, errors, "s(l)-t(u)");
+                testCaseFolding0(folding_up, lo, errors, "t(u)-s(l)");
+            }
+        }
+        return errors;
+    }
+
+    private static void testCaseFolding0(int cp, int folding, ArrayList<String> errors, String type) {
+        var cp_str = Character.isSupplementaryCodePoint(cp)
+            ? String.format("\\u%04x\\u%04x", (int)Character.highSurrogate(cp), (int)Character.lowSurrogate(cp))
+            : String.format("\\u%04x", cp);
+
+        var t = new String(Character.toChars(folding));
+        var p = String.format("(?iu)%s", cp_str);
+
+        if (Pattern.compile(p).matcher(t).matches() == false) {
+            errors.add(String.format("     [FAILED] slice:  %-20s  t: u+%04x  (%s)\n", p, folding, type));
+        }
+
+        p = String.format("(?iu)[%s]", cp_str);
+        if (Pattern.compile(p).matcher(t).matches() == false) {
+            errors.add(String.format("     [FAILED] single: %-20s  t: u+%04x  (%s)\n", p, folding, type));
+        }
+
+        p = String.format("(?iu)[%s-%s]", cp_str, cp_str);
+        if (Pattern.compile(p).matcher(t).matches() == false) {
+            errors.add(String.format("     [FAILED] range:  %-20s  t: u+%04x  (%s)\n", p, folding, type));
+        }
+
+        // small range
+        var end_cp = cp + 16;
+        var end_cp_str = Character.isSupplementaryCodePoint(end_cp)
+                ? String.format("\\u%04x\\u%04x", (int)Character.highSurrogate(end_cp), (int)Character.lowSurrogate(end_cp))
+                : String.format("\\u%04x", end_cp);
+        p = String.format("(?iu)[%s-%s]", cp_str, end_cp_str);
+        if (Pattern.compile(p).matcher(t).matches() == false) {
+            errors.add(String.format("     [FAILED] range:  %-20s  t: u+%04x  (%s)\n", p, folding, type));
+        }
+
+        end_cp = cp + 128;  // bigger than the expanded_casefolding_map.
+        end_cp_str = Character.isSupplementaryCodePoint(end_cp)
+                ? String.format("\\u%04x\\u%04x", (int)Character.highSurrogate(end_cp), (int)Character.lowSurrogate(end_cp))
+                : String.format("\\u%04x", end_cp);
+        p = String.format("(?iu)[%s-%s]", cp_str, end_cp_str);
+        if (Pattern.compile(p).matcher(t).matches() == false) {
+            errors.add(String.format("     [FAILED] range:  %-20s  t: u+%04x  (%s)\n", p, folding, type));
+        }
+    }
+}
--- a/test/jdk/java/util/regex/TestCases.txt
+++ b/test/jdk/java/util/regex/TestCases.txt
@ -1273,3 +1273,28 @@ true 11111111 1 1111
 ^(1{2,})\1+$
 11111111
 true 11111111 1 1111
+
+//
+(?ui)\u00df
+\u1e9e
+true \u1e9e 0
+
+(?ui)[\u00df]
+\u1e9e
+true \u1e9e 0
+
+(?ui)[\u00df-\u00df]
+\u1e9e
+true \u1e9e 0
+
+(?ui)\u1e9e
+\u00df
+true \u00df 0
+
+(?ui)[\u1e9e]
+\u00df
+true \u00df 0
+
+(?ui)[\u1e9e-\u1e9e]
+\u00df
+true \u00df 0
--- a/test/jdk/lib/testlibrary/java/lang/UCDFiles.java
+++ b/test/jdk/lib/testlibrary/java/lang/UCDFiles.java
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, 2022, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2019, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -55,4 +55,6 @@ public class UCDFiles {
        UCD_DIR.resolve("UnicodeData.txt");
    public static Path EMOJI_DATA =
        UCD_DIR.resolve("emoji").resolve("emoji-data.txt");
+    public static Path CASEFOLDING =
+        UCD_DIR.resolve("CaseFolding.txt");
 }