8360459: UNICODE_CASE and character class with non-ASCII range does not match ASCII char

Reviewed-by: naoto
This commit is contained in:
Xueming Shen 2025-07-15 17:57:13 +00:00
parent 38af17d078
commit 401af27b9d
9 changed files with 2084 additions and 5 deletions

View File

@ -78,6 +78,9 @@ TOOL_GENERATECACERTS = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_class
TOOL_GENERATEEXTRAPROPERTIES = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \
build.tools.generateextraproperties.GenerateExtraProperties
# Command line for the build.tools.generatecharacter.CaseFolding build tool,
# run from the compiled jdk_tools_classes directory.
TOOL_GENERATECASEFOLDING = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \
build.tools.generatecharacter.CaseFolding
TOOL_MAKEZIPREPRODUCIBLE = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \
build.tools.makezipreproducible.MakeZipReproducible

View File

@ -0,0 +1,73 @@
/*
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package build.tools.generatecharacter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
 * Build-time generator that produces a Java source file from a template
 * and the Unicode {@code CaseFolding.txt} data file.
 *
 * <p>The line of the template that contains the {@code %%%Entries} marker is
 * replaced by a comma separated list of {@code entry(0x<cp>, 0x<folding>)}
 * items; every other template line is copied unchanged.
 */
public class CaseFolding {

    /**
     * Generates the output source file.
     *
     * @param args exactly three paths: the template file, the
     *             {@code CaseFolding.txt} data file, and the file to generate
     * @throws Throwable if any of the files cannot be read or written, or if
     *                   a selected data line does not contain hexadecimal
     *                   code points
     */
    public static void main(String[] args) throws Throwable {
        if (args.length != 3) {
            System.err.println("Usage: java CaseFolding TemplateFile CaseFolding.txt CaseFolding.java");
            System.exit(1);
        }
        var templateFile = Paths.get(args[0]);
        var caseFoldingTxt = Paths.get(args[1]);
        var genSrcFile = Paths.get(args[2]);
        // Only the C, T and S status lines of CaseFolding.txt are considered.
        var supportedTypes = "^.*; [CTS]; .*$";

        // Files.lines keeps the file open until the stream is closed, so the
        // stream is scoped with try-with-resources.
        String caseFoldingEntries;
        try (Stream<String> dataLines = Files.lines(caseFoldingTxt)) {
            caseFoldingEntries = dataLines
                .filter(line -> !line.startsWith("#") && line.matches(supportedTypes))
                .map(line -> {
                    String[] cols = line.split("; ");
                    return new String[] {cols[0], cols[1], cols[2]};
                })
                .filter(cols -> {
                    // the folding case doesn't map back to the original char.
                    var cp1 = Integer.parseInt(cols[0], 16);
                    var cp2 = Integer.parseInt(cols[2], 16);
                    return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
                })
                .map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2]))
                .collect(Collectors.joining(",\n", "", ""));
        }

        // hack, hack, hack! the logic does not pick 0131. just add manually to support 'I's.
        // 0049; T; 0131; # LATIN CAPITAL LETTER I
        final String T_0x0131_0x49 = String.format(" entry(0x%04x, 0x%04x),\n", 0x0131, 0x49);
        final String entriesText = T_0x0131_0x49 + caseFoldingEntries;

        // Generate .java file
        try (Stream<String> templateLines = Files.lines(templateFile)) {
            Files.write(
                genSrcFile,
                templateLines
                    .map(line -> line.contains("%%%Entries") ? entriesText : line)
                    .collect(Collectors.toList()),
                StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
        }
    }
}

View File

@ -50,5 +50,22 @@ TARGETS += $(GENSRC_INDICCONJUNCTBREAK)
################################################################################
# Generated source file, the template it is produced from, and the Unicode
# data file that supplies the entries.
GENSRC_CASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/util/regex/CaseFolding.java
CASEFOLDINGTEMP := $(MODULE_SRC)/share/classes/jdk/internal/util/regex/CaseFolding.java.template
CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt
# Regenerate the source whenever the build tools, the template or the data
# file change. The tool receives: template, data file, output file.
$(GENSRC_CASEFOLDING): $(BUILD_TOOLS_JDK) $(CASEFOLDINGTEMP) $(CASEFOLDINGTXT)
$(call LogInfo, Generating $@)
$(call MakeTargetDir)
$(TOOL_GENERATECASEFOLDING) \
$(CASEFOLDINGTEMP) \
$(CASEFOLDINGTXT) \
$(GENSRC_CASEFOLDING)
TARGETS += $(GENSRC_CASEFOLDING)
################################################################################
endif # include guard
include MakeIncludeEnd.gmk

View File

@ -44,6 +44,7 @@ import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import jdk.internal.util.ArraysSupport;
import jdk.internal.util.regex.CaseFolding;
import jdk.internal.util.regex.Grapheme;
/**
@ -2915,6 +2916,8 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
toLowerCase(u+212a) ==> u+006B
(6)AngstromSign u+212b
toLowerCase(u+212b) ==> u+00e5
(7) Latin Capital Letter Sharp S u+1e9e, was added in version 5.1
toLowerCase(u+1e9e) ==> u+00df
*/
if (ch < 256 &&
!(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&
@ -2922,7 +2925,11 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
ch == 0x49 || ch == 0x69 || //I and i
ch == 0x53 || ch == 0x73 || //S and s
ch == 0x4b || ch == 0x6b || //K and k
ch == 0xc5 || ch == 0xe5))) { //A+ring
ch == 0xc5 || ch == 0xe5 || //A+ring
// need to force single() to use SingleU specifically for u+00df.
// u+00df <-> u+1e9e, see https://codepoints.net/U+00DF.
// Character.toUpperCase('u+00df') still returns u+00df for now.
ch == 0xdf))) { // Shape S
bits.add(ch, flags0);
return null;
}
@ -2939,7 +2946,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
upper = Character.toUpperCase(ch);
lower = Character.toLowerCase(upper);
// Unicode case insensitive matches
if (upper != lower)
if (upper != lower || ch == 0xDF)
return SingleU(lower);
} else if (ASCII.isAscii(ch)) {
lower = ASCII.toLower(ch);
@ -5960,12 +5967,29 @@ NEXT: while (i <= last) {
}
/**
 * Returns a predicate that tests whether a character falls in the range
 * {@code [lower, upper]} either directly, through its
 * {@code Character.toUpperCase} form, or through the lower case of that
 * upper case form. When {@code CaseFolding} reports additional closing
 * characters for the range, the upper and lower case forms are also
 * compared against those characters.
 *
 * @param lower the first code point of the range (inclusive)
 * @param upper the last code point of the range (inclusive)
 * @return the predicate for the range
 */
static CharPredicate CIRangeU(int lower, int upper) {
// Extra characters, outside [lower, upper], that CaseFolding returns for this range.
int[] closingCharacters = CaseFolding.getClassRangeClosingCharacters(lower, upper);
if (closingCharacters.length == 0) {
// No extra characters: only the range checks are needed.
return ch -> {
if (inRange(lower, ch, upper))
return true;
int up = Character.toUpperCase(ch);
return (inRange(lower, up, upper) ||
inRange(lower, Character.toLowerCase(up), upper));
};
}
return ch -> {
if (inRange(lower, ch, upper))
return true;
int up = Character.toUpperCase(ch);
// NOTE(review): the return statement on the next two lines makes everything
// after it unreachable as written; it looks like the pre-change line of the
// diff shown next to its replacement -- confirm against the actual file.
return inRange(lower, up, upper) ||
inRange(lower, Character.toLowerCase(up), upper);
int lo = Character.toLowerCase(up);
if (inRange(lower, up, upper) ||
inRange(lower, lo, upper))
return true;
// Fall back to the closing characters computed for this range.
for (int cp : closingCharacters) {
if (up == cp || lo == cp)
return true;
}
return false;
};
}

View File

@ -0,0 +1,116 @@
/*
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package jdk.internal.util.regex;
import java.util.Arrays;
import java.util.Map;
import java.util.Objects;
import static java.util.Map.entry;
/**
 * Holder for the case folding entries used to close regex character class
 * ranges. This is a source template: the {@code %%%Entries} line below is a
 * placeholder that must be replaced with {@code entry(...)} items before the
 * file can be compiled.
 */
public final class CaseFolding {
// Maps a code point to its folding; the entries are filled in from the placeholder.
private static final Map<Integer, Integer> expanded_case_map = Map.ofEntries(
%%%Entries
);
// The keys of expanded_case_map as a primitive array, in the key set's iteration order.
private static final int[] expanded_case_cps = expanded_case_map.keySet()
.stream()
.mapToInt(Integer::intValue)
.toArray();
// Static members only; not instantiable.
private CaseFolding() {}
/**
* Returns an expansion set to "close" a given regex Unicode character class range for case-insensitive
* matching, according to the
* <a href="https://www.unicode.org/reports/tr18/#Simple_Loose_Matches">Simple Loose Matches</a>
* rule defined in Unicode Technical Standard #18: Unicode Regular Expressions.
* <p>
* To conform with Level 1 of UTS #18, specifically RL1.5: Simple Loose Matches, simple case folding must
* be applied to literals and (optionally) to character classes. When applied to character classes, each
* character class is expected to be closed under simple case folding. See the standard for the
* detailed explanation and example of "closed".
* <p>
* RL1.5 states: To meet this requirement, an implementation that supports case-insensitive matching should
* <ol>
* <li>Provide at least the simple, default Unicode case-insensitive matching, and</li>
* <li>Specify which character properties or constructs are closed under the matching.</li>
* </ol>
* <p>
* In the {@code Pattern} implementation, 5 types of constructs may be case-insensitive when matching:
* back-refs, string slice (sequences), single, family (char-property) and class range. Single and
* family may appear independently or within a class.
* <p>
* For loose/case-insensitive matching, the back-refs, slices and singles apply {@code toUpperCase} and
* {@code toLowerCase} to both the pattern and the input string. This effectively "closes" the class for
* matching.
* <p>
* The family/char-properties are not "closed" and should remain unchanged. This is acceptable per RL1.5,
* if their behavior is clearly specified.
* <p>
* This method addresses that requirement for the "range" construct within a character class by computing
* the additional characters that should be included to close the range under simple case folding:
* <p>
* For each character in the input range {@code [start, end]} (inclusive), if the character has a simple
* case folding mapping in Unicode's CaseFolding.txt, the mapping is not a round-trip map, and the mapped
* character is not already in the range, then that mapped character (typically lowercase) is added to
* the expansion set.
* <p>
* This allows the regex character class "range" implementation to use the returned expansion set to support
* additional case-insensitive matching, without duplicating characters already covered by the existing
* regex range implementation. The expectation is that the matching is done using both the uppercase and
* lowercase forms of the input character, for example
*
* <pre>{@code
*
* ch -> inRange(lower, Character.toUpperCase(ch), upper) ||
* inRange(lower, Character.toLowerCase(ch), upper) ||
* additionalClosingCharacters.contains(Character.toUpperCase(ch)) ||
* additionalClosingCharacters.contains(Character.toLowerCase(ch))
* }</pre>
*
* @spec https://www.unicode.org/reports/tr18/#Simple_Loose_Matches
* @param start the starting code point of the character range
* @param end the ending code point of the character range
* @return an {@code int[]} containing all the simple case equivalents of characters in the range, excluding
* those already in the range; empty if there are none
*/
public static int[] getClassRangeClosingCharacters(int start, int end) {
// Worst case: every known code point contributes one folding.
int[] expanded = new int[expanded_case_cps.length];
int off = 0;
for (int cp : expanded_case_cps) {
if (cp >= start && cp <= end) {
int folding = expanded_case_map.get(cp);
// Keep only foldings that the range itself does not already cover.
if (folding < start || folding > end) {
expanded[off++] = folding;
}
}
}
// Trim to the number of foldings actually collected.
return Arrays.copyOf(expanded, off);
}
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,165 @@
/*
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
* @test
* @summary tests RegExp unicode case-insensitive match (?ui)
* @bug 8360459
* @library /lib/testlibrary/java/lang
* @run junit CaseFoldingTest
*/
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertEquals;
/**
 * Checks that {@code (?iu)} patterns built from the code points listed in
 * {@code CaseFolding.txt} match their case folding, for the slice, single
 * and range constructs.
 */
public class CaseFoldingTest {

    /**
     * Runs the case folding checks for every selected line of
     * {@code CaseFolding.txt} and expects no failure to be reported.
     *
     * @throws Throwable if the data file cannot be read
     */
    @Test
    void testUnicodeCaseInsensitiveMatch() throws Throwable {
        var testAll = true; // true to test all codepoints defined in CaseFolding.txt
        var verbose = true; // true to display all codepoints being tested
        var filter = "^.*; [CTS]; .*$"; // update C,T,S to test different type
        var excluded = Set.of(
            // these 'S' characters failed for known reason. they don't map to their
            // folding form with toUpperCase or toLowerCase, only map with case-folding.
            // exclude them for now.
            0x1fd3, // 1FD3 [lo: 1fd3, up: 1fd3] 0390 [lo: 0390, up: 0390]
            0x1fe3, // 1FE3 [lo: 1fe3, up: 1fe3] 03B0 [lo: 03b0, up: 03b0]
            0xfb05 // FB05 [lo: fb05, up: fb05] FB06 [lo: fb06, up: fb06]
        );
        // Files.lines keeps the file open until the stream is closed.
        try (var lines = Files.lines(UCDFiles.CASEFOLDING)) {
            var results = lines
                .filter(line -> !line.startsWith("#") && line.matches(filter))
                .map(line -> {
                    var strs = line.split("; ");
                    return new String[] {strs[0], strs[1], strs[2]};
                })
                .filter(cps -> {
                    var cp1 = Integer.parseInt(cps[0], 16);
                    var cp2 = Integer.parseInt(cps[2], 16);
                    if (excluded.contains(cp1))
                        return false;
                    if (testAll) {
                        return true;
                    }
                    // the folding codepoint doesn't map back to the original codepoint.
                    return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
                })
                .flatMap(cps -> {
                    // test slice, single & range
                    var cp = Integer.parseInt(cps[0], 16);
                    var folding = Integer.parseInt(cps[2], 16);
                    var errors = testCaseFolding(cp, folding);
                    if (verbose)
                        System.out.format(" [%s] %s [lo: %04x, up: %04x] %s [lo: %04x, up: %04x]\n",
                            cps[1],
                            cps[0],
                            Character.toLowerCase(cp),
                            Character.toUpperCase(cp),
                            cps[2],
                            Character.toLowerCase(folding),
                            Character.toUpperCase(folding)
                        );
                    errors.forEach(error -> System.out.print(error));
                    return errors.stream();
                })
                .collect(Collectors.toList());
            // JUnit's contract is assertEquals(expected, actual).
            assertEquals(0, results.size());
        }
    }

    /**
     * Tests {@code cp} against {@code folding} in both directions, and also
     * the upper and lower case forms of {@code cp} when they differ from it.
     *
     * @return the failure messages collected; empty if everything matched
     */
    private static ArrayList<String> testCaseFolding(int cp, int folding) {
        ArrayList<String> errors = new ArrayList<>();
        testCaseFolding0(cp, folding, errors, "s-t");
        testCaseFolding0(folding, cp, errors, "t-s");
        // test all uppercase, lowercase combinations
        var up = Character.toUpperCase(cp);
        var lo = Character.toLowerCase(cp);
        var folding_up = Character.toUpperCase(folding); // folding should be normally lowercase
        if (up != cp) {
            testCaseFolding0(up, folding, errors, "s(u)-t");
            testCaseFolding0(folding, up, errors, "t-s(u)");
            if (folding_up != folding) {
                testCaseFolding0(up, folding_up, errors, "s(u)-t(u)");
                testCaseFolding0(folding_up, up, errors, "t(u)-s(u)");
            }
        }
        if (lo != cp) {
            testCaseFolding0(lo, folding, errors, "s(l)-t");
            testCaseFolding0(folding, lo, errors, "t-s(l)");
            if (folding_up != folding) {
                testCaseFolding0(lo, folding_up, errors, "s(l)-t(u)");
                testCaseFolding0(folding_up, lo, errors, "t(u)-s(l)");
            }
        }
        return errors;
    }

    /**
     * Compiles {@code (?iu)} patterns for {@code cp} as a slice, a single, a
     * one-element range and two wider ranges, and records a failure for each
     * pattern that does not match the {@code folding} character.
     */
    private static void testCaseFolding0(int cp, int folding, ArrayList<String> errors, String type) {
        var cp_str = toUnicodeEscape(cp);
        var t = new String(Character.toChars(folding));

        expectMatch(String.format("(?iu)%s", cp_str), t,
            " [FAILED] slice: %-20s t: u+%04x (%s)\n", folding, type, errors);
        expectMatch(String.format("(?iu)[%s]", cp_str), t,
            " [FAILED] single: %-20s t: u+%04x (%s)\n", folding, type, errors);
        expectMatch(String.format("(?iu)[%s-%s]", cp_str, cp_str), t,
            " [FAILED] range: %-20s t: u+%04x (%s)\n", folding, type, errors);
        // small range
        expectMatch(String.format("(?iu)[%s-%s]", cp_str, toUnicodeEscape(cp + 16)), t,
            " [FAILED] range: %-20s t: u+%04x (%s)\n", folding, type, errors);
        // bigger than the expanded_casefolding_map.
        expectMatch(String.format("(?iu)[%s-%s]", cp_str, toUnicodeEscape(cp + 128)), t,
            " [FAILED] range: %-20s t: u+%04x (%s)\n", folding, type, errors);
    }

    /**
     * Formats a code point as a regex unicode escape, using a surrogate pair
     * of escapes for supplementary code points.
     */
    private static String toUnicodeEscape(int cp) {
        return Character.isSupplementaryCodePoint(cp)
            ? String.format("\\u%04x\\u%04x", (int)Character.highSurrogate(cp), (int)Character.lowSurrogate(cp))
            : String.format("\\u%04x", cp);
    }

    /**
     * Adds a message built from {@code failureFormat} to {@code errors} when
     * {@code pattern} does not match the whole of {@code input}.
     */
    private static void expectMatch(String pattern, String input, String failureFormat,
                                    int folding, String type, ArrayList<String> errors) {
        if (!Pattern.compile(pattern).matcher(input).matches()) {
            errors.add(String.format(failureFormat, pattern, folding, type));
        }
    }
}

View File

@ -1273,3 +1273,28 @@ true 11111111 1 1111
^(1{2,})\1+$
11111111
true 11111111 1 1111
//
(?ui)\u00df
\u1e9e
true \u1e9e 0
(?ui)[\u00df]
\u1e9e
true \u1e9e 0
(?ui)[\u00df-\u00df]
\u1e9e
true \u1e9e 0
(?ui)\u1e9e
\u00df
true \u00df 0
(?ui)[\u1e9e]
\u00df
true \u00df 0
(?ui)[\u1e9e-\u1e9e]
\u00df
true \u00df 0

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, 2022, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2019, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -55,4 +55,6 @@ public class UCDFiles {
UCD_DIR.resolve("UnicodeData.txt");
public static Path EMOJI_DATA =
UCD_DIR.resolve("emoji").resolve("emoji-data.txt");
public static Path CASEFOLDING =
UCD_DIR.resolve("CaseFolding.txt");
}