mirror of
https://github.com/openjdk/jdk.git
synced 2026-05-04 02:35:33 +00:00
8360459: UNICODE_CASE and character class with non-ASCII range does not match ASCII char
Reviewed-by: naoto
This commit is contained in:
parent
38af17d078
commit
401af27b9d
@ -78,6 +78,9 @@ TOOL_GENERATECACERTS = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_class
|
||||
TOOL_GENERATEEXTRAPROPERTIES = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \
|
||||
build.tools.generateextraproperties.GenerateExtraProperties
|
||||
|
||||
TOOL_GENERATECASEFOLDING = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \
|
||||
build.tools.generatecharacter.CaseFolding
|
||||
|
||||
TOOL_MAKEZIPREPRODUCIBLE = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \
|
||||
build.tools.makezipreproducible.MakeZipReproducible
|
||||
|
||||
|
||||
@ -0,0 +1,73 @@
|
||||
/*
|
||||
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
package build.tools.generatecharacter;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
/**
 * Build tool that expands a Java source template with case folding entries
 * read from a Unicode {@code CaseFolding.txt} data file.
 * <p>
 * Usage: {@code java CaseFolding TemplateFile CaseFolding.txt CaseFolding.java}
 * <p>
 * Every template line containing the marker {@code %%%Entries} is replaced by
 * a comma separated list of {@code entry(0x<cp>, 0x<folding>)} items; all
 * other template lines are copied to the output unchanged.
 */
public class CaseFolding {

    // Selects the data lines whose status field is C, T or S. Compiled once
    // rather than once per input line (which is what String.matches() does).
    private static final java.util.regex.Pattern SUPPORTED_TYPES =
            java.util.regex.Pattern.compile("^.*; [CTS]; .*$");

    /**
     * Generates the output source file.
     *
     * @param args {@code args[0]} the template file, {@code args[1]} the
     *             CaseFolding.txt data file, {@code args[2]} the file to generate
     *             (created if missing, truncated if present)
     * @throws IOException if any of the files cannot be read or written
     */
    public static void main(String[] args) throws IOException {
        if (args.length != 3) {
            System.err.println("Usage: java CaseFolding TemplateFile CaseFolding.txt CaseFolding.java");
            System.exit(1);
        }
        var templateFile = Paths.get(args[0]);
        var caseFoldingTxt = Paths.get(args[1]);
        var genSrcFile = Paths.get(args[2]);

        final String caseFoldingEntries;
        // Files.lines() keeps the file open until the stream is closed.
        try (var lines = Files.lines(caseFoldingTxt)) {
            caseFoldingEntries = lines
                .filter(line -> !line.startsWith("#") && SUPPORTED_TYPES.matcher(line).matches())
                .map(line -> {
                    String[] cols = line.split("; ");
                    return new String[] {cols[0], cols[1], cols[2]};
                })
                .filter(cols -> {
                    // keep only the mappings whose folded form does not map back
                    // to the original char via toUpperCase/toLowerCase.
                    var cp1 = Integer.parseInt(cols[0], 16);
                    var cp2 = Integer.parseInt(cols[2], 16);
                    return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
                })
                .map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2]))
                .collect(Collectors.joining(",\n", "", ""));
        }

        // hack: the filter above does not pick 0131, so add it manually to support 'I's.
        // 0049; T; 0131; # LATIN CAPITAL LETTER I
        final String T_0x0131_0x49 = String.format(" entry(0x%04x, 0x%04x)", 0x0131, 0x49);

        // Only emit the separator when something follows it; a dangling comma
        // would make the generated entry list syntactically invalid.
        final String allEntries = caseFoldingEntries.isEmpty()
            ? T_0x0131_0x49
            : T_0x0131_0x49 + ",\n" + caseFoldingEntries;

        // Generate .java file
        try (var template = Files.lines(templateFile)) {
            Files.write(
                genSrcFile,
                template
                    .map(line -> line.contains("%%%Entries") ? allEntries : line)
                    .collect(Collectors.toList()),
                StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
        }
    }
}
|
||||
@ -50,5 +50,22 @@ TARGETS += $(GENSRC_INDICCONJUNCTBREAK)
|
||||
|
||||
################################################################################
|
||||
|
||||
GENSRC_CASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/util/regex/CaseFolding.java
|
||||
|
||||
CASEFOLDINGTEMP := $(MODULE_SRC)/share/classes/jdk/internal/util/regex/CaseFolding.java.template
|
||||
CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt
|
||||
|
||||
$(GENSRC_CASEFOLDING): $(BUILD_TOOLS_JDK) $(CASEFOLDINGTEMP) $(CASEFOLDINGTXT)
|
||||
$(call LogInfo, Generating $@)
|
||||
$(call MakeTargetDir)
|
||||
$(TOOL_GENERATECASEFOLDING) \
|
||||
$(CASEFOLDINGTEMP) \
|
||||
$(CASEFOLDINGTXT) \
|
||||
$(GENSRC_CASEFOLDING)
|
||||
|
||||
TARGETS += $(GENSRC_CASEFOLDING)
|
||||
|
||||
################################################################################
|
||||
|
||||
endif # include guard
|
||||
include MakeIncludeEnd.gmk
|
||||
|
||||
@ -44,6 +44,7 @@ import java.util.stream.Stream;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
import jdk.internal.util.ArraysSupport;
|
||||
import jdk.internal.util.regex.CaseFolding;
|
||||
import jdk.internal.util.regex.Grapheme;
|
||||
|
||||
/**
|
||||
@ -2915,6 +2916,8 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
toLowerCase(u+212a) ==> u+006B
|
||||
(6)AngstromSign u+212b
|
||||
toLowerCase(u+212b) ==> u+00e5
|
||||
(7) Latin Capital Letter Sharp S u+1e9e, was added in version 5.1
|
||||
toLowerCase(u+1e9e) ==> u+00df
|
||||
*/
|
||||
if (ch < 256 &&
|
||||
!(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&
|
||||
@ -2922,7 +2925,11 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
ch == 0x49 || ch == 0x69 || //I and i
|
||||
ch == 0x53 || ch == 0x73 || //S and s
|
||||
ch == 0x4b || ch == 0x6b || //K and k
|
||||
ch == 0xc5 || ch == 0xe5))) { //A+ring
|
||||
ch == 0xc5 || ch == 0xe5 || //A+ring
|
||||
// need to force single() to use SingleU specifically for u+00df.
|
||||
// u+00df <-> u+1e9e, see https://codepoints.net/U+00DF.
|
||||
// Character.toUpperCase('u+00df') still returns u+00df for now.
|
||||
ch == 0xdf))) { // Sharp S
|
||||
bits.add(ch, flags0);
|
||||
return null;
|
||||
}
|
||||
@ -2939,7 +2946,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
||||
upper = Character.toUpperCase(ch);
|
||||
lower = Character.toLowerCase(upper);
|
||||
// Unicode case insensitive matches
|
||||
if (upper != lower)
|
||||
if (upper != lower || ch == 0xDF)
|
||||
return SingleU(lower);
|
||||
} else if (ASCII.isAscii(ch)) {
|
||||
lower = ASCII.toLower(ch);
|
||||
@ -5960,12 +5967,29 @@ NEXT: while (i <= last) {
|
||||
}
|
||||
|
||||
static CharPredicate CIRangeU(int lower, int upper) {
|
||||
int[] closingCharacters = CaseFolding.getClassRangeClosingCharacters(lower, upper);
|
||||
if (closingCharacters.length == 0) {
|
||||
return ch -> {
|
||||
if (inRange(lower, ch, upper))
|
||||
return true;
|
||||
int up = Character.toUpperCase(ch);
|
||||
return (inRange(lower, up, upper) ||
|
||||
inRange(lower, Character.toLowerCase(up), upper));
|
||||
};
|
||||
}
|
||||
return ch -> {
|
||||
if (inRange(lower, ch, upper))
|
||||
return true;
|
||||
int up = Character.toUpperCase(ch);
|
||||
return inRange(lower, up, upper) ||
|
||||
inRange(lower, Character.toLowerCase(up), upper);
|
||||
int lo = Character.toLowerCase(up);
|
||||
if (inRange(lower, up, upper) ||
|
||||
inRange(lower, lo, upper))
|
||||
return true;
|
||||
for (int cp : closingCharacters) {
|
||||
if (up == cp || lo == cp)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@ -0,0 +1,116 @@
|
||||
/*
|
||||
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
package jdk.internal.util.regex;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
|
||||
import static java.util.Map.entry;
|
||||
|
||||
public final class CaseFolding {
|
||||
|
||||
private static final Map<Integer, Integer> expanded_case_map = Map.ofEntries(
|
||||
%%%Entries
|
||||
);
|
||||
|
||||
private static final int[] expanded_case_cps = expanded_case_map.keySet()
|
||||
.stream()
|
||||
.mapToInt(Integer::intValue)
|
||||
.toArray();
|
||||
|
||||
private CaseFolding() {}
|
||||
|
||||
/**
|
||||
* Returns an expansion set to "close" a given regex Unicode character class range for case-insensitive
|
||||
* matching, according to the
|
||||
* <a href="https://www.unicode.org/reports/tr18/#Simple_Loose_Matches">Simple Loose Matches</a>
|
||||
* rule defined in Unicode Technical Standard #18: Unicode Regular Expressions.
|
||||
* <p>
|
||||
* To conform with Level 1 of UTS #18, specifically RL1.5: Simple Loose Matches, simple case folding must
|
||||
* be applied to literals and (optionally) to character classes. When applied to character classes, each
|
||||
* character class is expected to be closed under simple case folding. See the standard for the
|
||||
* detailed explanation and example of "closed".
|
||||
* <p>
|
||||
* RL1.5 states: To meet this requirement, an implementation that supports case-insensitive matching should
|
||||
* <ol>
|
||||
* <li>Provide at least the simple, default Unicode case-insensitive matching, and</li>
|
||||
* <li>Specify which character properties or constructs are closed under the matching.</li>
|
||||
* </ol>
|
||||
* <p>
|
||||
* In the {@code Pattern} implementation, 5 types of constructs may be case-sensitive when matching:
|
||||
* back-refs, string slice (sequences), single, family(char-property) and class range. Single and
|
||||
* family may appear independently or within a class.
|
||||
* <p>
|
||||
* For loose/case-insensitive matching, the back-refs, slices and singles apply {@code toUpperCase} and
|
||||
* {@code toLowerCase} to both the pattern and the input string. This effectively closes the class for
|
||||
* matching.
|
||||
* <p>
|
||||
* The family/char-properties are not "closed" and should remain unchanged. This is acceptable per RL1.5,
|
||||
* if their behavior is clearly specified.
|
||||
* <p>
|
||||
* This method addresses that requirement for the "range" construct within a character class by computing
|
||||
* the additional characters that should be included to close the range under simple case folding:
|
||||
* <p>
|
||||
* For each character in the input range {@code [start, end]} (inclusive), if the character has a simple
|
||||
* case folding mapping in Unicode's CaseFolding.txt, the mapping is not a round-trip map, and the mapped
|
||||
* character is not already in the range, then that mapped character (typically lowercase) is added to
|
||||
* the expansion set.
|
||||
* <p>
|
||||
* This allows regex character class "range" implementation to use the returned expansion set to support
|
||||
* additional case-insensitive matching, without duplicating characters already covered by the existing
|
||||
* regex range implementation. The expectation is the matching is done using both the uppercase and
|
||||
* lowercase forms of the input character, for example
|
||||
*
|
||||
* <pre>{@code
|
||||
*
|
||||
* ch -> inRange(lower, Character.toUpperCase(ch), upper) ||
|
||||
* inRange(lower, Character.toLowerCase(ch), upper) ||
|
||||
* additionalClosingCharacters.contains(Character.toUpperCase(ch)) ||
|
||||
* additionalClosingCharacters.contains(Character.toLowerCase(ch))
|
||||
* }</pre>
|
||||
*
|
||||
* <p>
|
||||
* @spec https://www.unicode.org/reports/tr18/#Simple_Loose_Matches
|
||||
* @param start the starting code point of the character range
|
||||
* @param end the ending code point of the character range
|
||||
* @return a {@code int[]} containing all the simple case equivalents of characters in the range, excluding
|
||||
* those already in the range
|
||||
*/
|
||||
public static int[] getClassRangeClosingCharacters(int start, int end) {
|
||||
int[] expanded = new int[expanded_case_cps.length];
|
||||
int off = 0;
|
||||
for (int cp : expanded_case_cps) {
|
||||
if (cp >= start && cp <= end) {
|
||||
int folding = expanded_case_map.get(cp);
|
||||
if (folding < start || folding > end) {
|
||||
expanded[off++] = folding;
|
||||
}
|
||||
}
|
||||
}
|
||||
return Arrays.copyOf(expanded, off);
|
||||
}
|
||||
}
|
||||
1654
src/java.base/share/data/unicodedata/CaseFolding.txt
Normal file
1654
src/java.base/share/data/unicodedata/CaseFolding.txt
Normal file
File diff suppressed because it is too large
Load Diff
165
test/jdk/java/util/regex/CaseFoldingTest.java
Normal file
165
test/jdk/java/util/regex/CaseFoldingTest.java
Normal file
@ -0,0 +1,165 @@
|
||||
/*
|
||||
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @test
|
||||
* @summary tests RegExp unicode case-insensitive match (?ui)
|
||||
* @bug 8360459
|
||||
* @library /lib/testlibrary/java/lang
|
||||
* @run junit CaseFoldingTest
|
||||
*/
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
public class CaseFoldingTest {
|
||||
|
||||
@Test
|
||||
void testUnicodeCaseInsensitiveMatch() throws Throwable {
|
||||
var testAll = true; // true to test all codepoints defined in CaseFolding.txt
|
||||
var verbose = true; // true to display all codepoints being tested
|
||||
var filter = "^.*; [CTS]; .*$"; // update C,T,S to test different type
|
||||
var excluded = Set.of(
|
||||
// these 'S' characters failed for known reason. they don't map to their
|
||||
// folding form with toUpperCase or toLowerCase, only map with case-folding.
|
||||
// exclude them for now.
|
||||
0x1fd3, // 1FD3 [lo: 1fd3, up: 1fd3] 0390 [lo: 0390, up: 0390]
|
||||
0x1fe3, // 1FE3 [lo: 1fe3, up: 1fe3] 03B0 [lo: 03b0, up: 03b0]
|
||||
0xfb05 // FB05 [lo: fb05, up: fb05] FB06 [lo: fb06, up: fb06]
|
||||
);
|
||||
|
||||
var results = Files.lines(UCDFiles.CASEFOLDING)
|
||||
.filter(line -> !line.startsWith("#") && line.matches(filter))
|
||||
.map(line -> {
|
||||
var strs = line.split("; ");
|
||||
return new String[] {strs[0], strs[1], strs[2]};
|
||||
})
|
||||
.filter(cps -> {
|
||||
var cp1 = Integer.parseInt(cps[0], 16);
|
||||
var cp2 = Integer.parseInt(cps[2], 16);
|
||||
if (excluded.contains(cp1))
|
||||
return false;
|
||||
if (testAll) {
|
||||
return true;
|
||||
}
|
||||
// the folding codepoint doesn't map back to the original codepoint.
|
||||
return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
|
||||
})
|
||||
.flatMap(cps -> {
|
||||
// test slice, single & range
|
||||
var cp = Integer.parseInt(cps[0], 16);
|
||||
var folding = Integer.parseInt(cps[2], 16);
|
||||
var errors = testCaseFolding(cp, folding);
|
||||
if (verbose)
|
||||
System.out.format(" [%s] %s [lo: %04x, up: %04x] %s [lo: %04x, up: %04x]\n",
|
||||
cps[1],
|
||||
cps[0],
|
||||
Character.toLowerCase(cp),
|
||||
Character.toUpperCase(cp),
|
||||
cps[2],
|
||||
Character.toLowerCase(folding),
|
||||
Character.toUpperCase(folding)
|
||||
);
|
||||
errors.forEach(error -> System.out.print(error));
|
||||
return errors.stream();
|
||||
})
|
||||
.collect(Collectors.toList());
|
||||
assertEquals(results.size(), 0);
|
||||
}
|
||||
|
||||
private static ArrayList<String> testCaseFolding(int cp, int folding) {
|
||||
ArrayList<String> errors = new ArrayList<>();
|
||||
testCaseFolding0(cp, folding, errors, "s-t");
|
||||
testCaseFolding0(folding, cp, errors, "t-s");
|
||||
// test all uppercase, lowercase combinations
|
||||
var up = Character.toUpperCase(cp);
|
||||
var lo = Character.toLowerCase(cp);
|
||||
var folding_up = Character.toUpperCase(folding); // folding should be normally lowercase
|
||||
if (up != cp) {
|
||||
testCaseFolding0(up, folding, errors, "s(u)-t");
|
||||
testCaseFolding0(folding, up, errors, "t-s(u)");
|
||||
if (folding_up != folding) {
|
||||
testCaseFolding0(up, folding_up, errors, "s(u)-t(u)");
|
||||
testCaseFolding0(folding_up, up, errors, "t(u)-s(u)");
|
||||
}
|
||||
}
|
||||
if (lo != cp) {
|
||||
testCaseFolding0(lo, folding, errors, "s(l)-t");
|
||||
testCaseFolding0(folding, lo, errors, "t-s(l)");
|
||||
if (folding_up != folding) {
|
||||
testCaseFolding0(lo, folding_up, errors, "s(l)-t(u)");
|
||||
testCaseFolding0(folding_up, lo, errors, "t(u)-s(l)");
|
||||
}
|
||||
}
|
||||
return errors;
|
||||
}
|
||||
|
||||
private static void testCaseFolding0(int cp, int folding, ArrayList<String> errors, String type) {
|
||||
var cp_str = Character.isSupplementaryCodePoint(cp)
|
||||
? String.format("\\u%04x\\u%04x", (int)Character.highSurrogate(cp), (int)Character.lowSurrogate(cp))
|
||||
: String.format("\\u%04x", cp);
|
||||
|
||||
var t = new String(Character.toChars(folding));
|
||||
var p = String.format("(?iu)%s", cp_str);
|
||||
|
||||
if (Pattern.compile(p).matcher(t).matches() == false) {
|
||||
errors.add(String.format(" [FAILED] slice: %-20s t: u+%04x (%s)\n", p, folding, type));
|
||||
}
|
||||
|
||||
p = String.format("(?iu)[%s]", cp_str);
|
||||
if (Pattern.compile(p).matcher(t).matches() == false) {
|
||||
errors.add(String.format(" [FAILED] single: %-20s t: u+%04x (%s)\n", p, folding, type));
|
||||
}
|
||||
|
||||
p = String.format("(?iu)[%s-%s]", cp_str, cp_str);
|
||||
if (Pattern.compile(p).matcher(t).matches() == false) {
|
||||
errors.add(String.format(" [FAILED] range: %-20s t: u+%04x (%s)\n", p, folding, type));
|
||||
}
|
||||
|
||||
// small range
|
||||
var end_cp = cp + 16;
|
||||
var end_cp_str = Character.isSupplementaryCodePoint(end_cp)
|
||||
? String.format("\\u%04x\\u%04x", (int)Character.highSurrogate(end_cp), (int)Character.lowSurrogate(end_cp))
|
||||
: String.format("\\u%04x", end_cp);
|
||||
p = String.format("(?iu)[%s-%s]", cp_str, end_cp_str);
|
||||
if (Pattern.compile(p).matcher(t).matches() == false) {
|
||||
errors.add(String.format(" [FAILED] range: %-20s t: u+%04x (%s)\n", p, folding, type));
|
||||
}
|
||||
|
||||
end_cp = cp + 128; // bigger than the expanded_casefolding_map.
|
||||
end_cp_str = Character.isSupplementaryCodePoint(end_cp)
|
||||
? String.format("\\u%04x\\u%04x", (int)Character.highSurrogate(end_cp), (int)Character.lowSurrogate(end_cp))
|
||||
: String.format("\\u%04x", end_cp);
|
||||
p = String.format("(?iu)[%s-%s]", cp_str, end_cp_str);
|
||||
if (Pattern.compile(p).matcher(t).matches() == false) {
|
||||
errors.add(String.format(" [FAILED] range: %-20s t: u+%04x (%s)\n", p, folding, type));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1273,3 +1273,28 @@ true 11111111 1 1111
|
||||
^(1{2,})\1+$
|
||||
11111111
|
||||
true 11111111 1 1111
|
||||
|
||||
//
|
||||
(?ui)\u00df
|
||||
\u1e9e
|
||||
true \u1e9e 0
|
||||
|
||||
(?ui)[\u00df]
|
||||
\u1e9e
|
||||
true \u1e9e 0
|
||||
|
||||
(?ui)[\u00df-\u00df]
|
||||
\u1e9e
|
||||
true \u1e9e 0
|
||||
|
||||
(?ui)\u1e9e
|
||||
\u00df
|
||||
true \u00df 0
|
||||
|
||||
(?ui)[\u1e9e]
|
||||
\u00df
|
||||
true \u00df 0
|
||||
|
||||
(?ui)[\u1e9e-\u1e9e]
|
||||
\u00df
|
||||
true \u00df 0
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2019, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2019, 2025, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -55,4 +55,6 @@ public class UCDFiles {
|
||||
UCD_DIR.resolve("UnicodeData.txt");
|
||||
public static Path EMOJI_DATA =
|
||||
UCD_DIR.resolve("emoji").resolve("emoji-data.txt");
|
||||
public static Path CASEFOLDING =
|
||||
UCD_DIR.resolve("CaseFolding.txt");
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user