mirror of
https://github.com/openjdk/jdk.git
synced 2026-01-28 03:58:21 +00:00
8365675: Add String Unicode Case-Folding Support
Reviewed-by: rriggs, naoto, ihse
This commit is contained in:
parent
618732ffc0
commit
b97ed667db
@ -79,7 +79,7 @@ TOOL_GENERATEEXTRAPROPERTIES = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_too
|
|||||||
build.tools.generateextraproperties.GenerateExtraProperties
|
build.tools.generateextraproperties.GenerateExtraProperties
|
||||||
|
|
||||||
TOOL_GENERATECASEFOLDING = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \
|
TOOL_GENERATECASEFOLDING = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \
|
||||||
build.tools.generatecharacter.CaseFolding
|
build.tools.generatecharacter.GenerateCaseFolding
|
||||||
|
|
||||||
TOOL_MAKEZIPREPRODUCIBLE = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \
|
TOOL_MAKEZIPREPRODUCIBLE = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \
|
||||||
build.tools.makezipreproducible.MakeZipReproducible
|
build.tools.makezipreproducible.MakeZipReproducible
|
||||||
|
|||||||
@ -1,73 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
|
|
||||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
|
||||||
*
|
|
||||||
* This code is free software; you can redistribute it and/or modify it
|
|
||||||
* under the terms of the GNU General Public License version 2 only, as
|
|
||||||
* published by the Free Software Foundation. Oracle designates this
|
|
||||||
* particular file as subject to the "Classpath" exception as provided
|
|
||||||
* by Oracle in the LICENSE file that accompanied this code.
|
|
||||||
*
|
|
||||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
|
||||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
||||||
* version 2 for more details (a copy is included in the LICENSE file that
|
|
||||||
* accompanied this code).
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License version
|
|
||||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
|
||||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
*
|
|
||||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
|
||||||
* or visit www.oracle.com if you need additional information or have any
|
|
||||||
* questions.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package build.tools.generatecharacter;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Paths;
|
|
||||||
import java.nio.file.StandardOpenOption;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
import java.util.stream.Stream;
|
|
||||||
|
|
||||||
public class CaseFolding {
|
|
||||||
|
|
||||||
public static void main(String[] args) throws Throwable {
|
|
||||||
if (args.length != 3) {
|
|
||||||
System.err.println("Usage: java CaseFolding TemplateFile CaseFolding.txt CaseFolding.java");
|
|
||||||
System.exit(1);
|
|
||||||
}
|
|
||||||
var templateFile = Paths.get(args[0]);
|
|
||||||
var caseFoldingTxt = Paths.get(args[1]);
|
|
||||||
var genSrcFile = Paths.get(args[2]);
|
|
||||||
var supportedTypes = "^.*; [CTS]; .*$";
|
|
||||||
var caseFoldingEntries = Files.lines(caseFoldingTxt)
|
|
||||||
.filter(line -> !line.startsWith("#") && line.matches(supportedTypes))
|
|
||||||
.map(line -> {
|
|
||||||
String[] cols = line.split("; ");
|
|
||||||
return new String[] {cols[0], cols[1], cols[2]};
|
|
||||||
})
|
|
||||||
.filter(cols -> {
|
|
||||||
// the folding case doesn't map back to the original char.
|
|
||||||
var cp1 = Integer.parseInt(cols[0], 16);
|
|
||||||
var cp2 = Integer.parseInt(cols[2], 16);
|
|
||||||
return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
|
|
||||||
})
|
|
||||||
.map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2]))
|
|
||||||
.collect(Collectors.joining(",\n", "", ""));
|
|
||||||
|
|
||||||
// hack, hack, hack! the logic does not pick 0131. just add manually to support 'I's.
|
|
||||||
// 0049; T; 0131; # LATIN CAPITAL LETTER I
|
|
||||||
final String T_0x0131_0x49 = String.format(" entry(0x%04x, 0x%04x),\n", 0x0131, 0x49);
|
|
||||||
|
|
||||||
// Generate .java file
|
|
||||||
Files.write(
|
|
||||||
genSrcFile,
|
|
||||||
Files.lines(templateFile)
|
|
||||||
.map(line -> line.contains("%%%Entries") ? T_0x0131_0x49 + caseFoldingEntries : line)
|
|
||||||
.collect(Collectors.toList()),
|
|
||||||
StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@ -0,0 +1,134 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
|
||||||
|
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||||
|
*
|
||||||
|
* This code is free software; you can redistribute it and/or modify it
|
||||||
|
* under the terms of the GNU General Public License version 2 only, as
|
||||||
|
* published by the Free Software Foundation. Oracle designates this
|
||||||
|
* particular file as subject to the "Classpath" exception as provided
|
||||||
|
* by Oracle in the LICENSE file that accompanied this code.
|
||||||
|
*
|
||||||
|
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||||
|
* version 2 for more details (a copy is included in the LICENSE file that
|
||||||
|
* accompanied this code).
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License version
|
||||||
|
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||||
|
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
*
|
||||||
|
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||||
|
* or visit www.oracle.com if you need additional information or have any
|
||||||
|
* questions.
|
||||||
|
*/
|
||||||
|
package build.tools.generatecharacter;
|
||||||
|
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
import java.nio.file.StandardOpenOption;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.IntStream;
|
||||||
|
|
||||||
|
/**
 * Build tool that expands a {@code CaseFolding.java} template using the
 * mappings read from the Unicode {@code CaseFolding.txt} data file.
 *
 * <p>Two placeholders are recognised in the template:
 * <ul>
 * <li>a line containing {@code %%%Entries} is replaced by two parallel array
 *     declarations built from the status {@code C} and {@code F} entries;</li>
 * <li>a line containing {@code %%%Expanded_Case_Map_Entries} is replaced by
 *     {@code entry(...)} items built from the status {@code C}, {@code T} and
 *     {@code S} entries whose folding is not already reachable through
 *     {@link Character#toUpperCase(int)} / {@link Character#toLowerCase(int)}.</li>
 * </ul>
 */
public class GenerateCaseFolding {

    /**
     * Largest number of folded chars a packed value can carry: three 16-bit
     * slots below the count that is stored starting at bit 48.
     */
    private static final int MAX_FOLDED_CHARS = 3;

    /**
     * Entry point.
     *
     * @param args {@code TemplateFile CaseFolding.txt CaseFolding.java}
     * @throws Throwable if any of the files cannot be read or written, or the
     *         data file contains a malformed entry
     */
    public static void main(String[] args) throws Throwable {
        if (args.length != 3) {
            System.err.println("Usage: java GenerateCaseFolding TemplateFile CaseFolding.txt CaseFolding.java");
            System.exit(1);
        }
        var templateFile = Paths.get(args[0]);
        var caseFoldingTxt = Paths.get(args[1]);
        var genSrcFile = Paths.get(args[2]);

        // java.lang
        var supportedTypes = "^.*; [CF]; .*$";  // full/1:M case folding
        final String[][] caseFoldings;
        // Files.lines keeps the file open until the stream is closed.
        try (var lines = Files.lines(caseFoldingTxt)) {
            caseFoldings = lines
                .filter(line -> !line.startsWith("#") && line.matches(supportedTypes))
                .map(line -> {
                    // "<cp>; <status>; <folded cp> [<folded cp> ...]; # name"
                    var fields = line.split("; ");
                    var mapped = fields[2].trim().split(" ");
                    var folding = new String[mapped.length + 1];
                    folding[0] = fields[0];
                    System.arraycopy(mapped, 0, folding, 1, mapped.length);
                    return folding;
                })
                .toArray(String[][]::new);
        }

        // util.regex
        var expandedSupportedTypes = "^.*; [CTS]; .*$";
        final String expanded_caseFoldingEntries;
        try (var lines = Files.lines(caseFoldingTxt)) {
            expanded_caseFoldingEntries = lines
                .filter(line -> !line.startsWith("#") && line.matches(expandedSupportedTypes))
                .map(line -> line.split("; "))
                .filter(cols -> {
                    // the folding case doesn't map back to the original char.
                    var cp1 = Integer.parseInt(cols[0], 16);
                    var cp2 = Integer.parseInt(cols[2], 16);
                    return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
                })
                .map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2]))
                .collect(Collectors.joining(",\n", "", ""));
        }

        // hack, hack, hack! the logic does not pick 0131. just add manually to support 'I's.
        // 0049; T; 0131; # LATIN CAPITAL LETTER I
        final String T_0x0131_0x49 = String.format(" entry(0x%04x, 0x%04x),\n", 0x0131, 0x49);

        final String foldingEntries = genFoldingEntries(caseFoldings);
        final String expandedEntries = T_0x0131_0x49 + expanded_caseFoldingEntries;

        // Generate .java file; the template is read completely before the
        // output file is created or truncated.
        try (var template = Files.lines(templateFile)) {
            Files.write(
                genSrcFile,
                template
                    .map(line -> line.contains("%%%Entries") ? foldingEntries : line)
                    .map(line -> line.contains("%%%Expanded_Case_Map_Entries") ? expandedEntries : line)
                    .collect(Collectors.toList()),
                StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
        }
    }

    /**
     * Packs one folding into a {@code long}.
     *
     * <p>A folding to a single code point, or any folding of a supplementary
     * code point, is stored as the first folded code point only. Otherwise the
     * folded chars occupy consecutive 16-bit slots, first char lowest, and the
     * number of folded chars is stored starting at bit 48.
     *
     * @param folding hex strings: the source code point followed by its folding
     * @return the packed value
     * @throws IllegalArgumentException if a 1:M folding has more chars than
     *         fit below bit 48
     */
    private static long foldingToLong(String[] folding) {
        int cp = Integer.parseInt(folding[0], 16);
        long value = Integer.parseInt(folding[1], 16);
        int count = folding.length - 1;
        if (!Character.isSupplementaryCodePoint(cp) && count != 1) {
            if (count > MAX_FOLDED_CHARS) {
                // A further slot would overlap the count field.
                throw new IllegalArgumentException(
                    "Folding of " + folding[0] + " has " + count
                    + " chars; at most " + MAX_FOLDED_CHARS + " can be encoded");
            }
            for (int j = 2; j < folding.length; j++) {
                // folding[j] is the (j-1)-th folded char (0-based): slot j-1.
                value |= (long) Integer.parseInt(folding[j], 16) << (16 * (j - 1));
            }
            value |= (long) count << 48;
        }
        return value;
    }

    /**
     * Renders the source text for the {@code CASE_FOLDING_CPS} and
     * {@code CASE_FOLDING_VALUES} arrays; element {@code i} of both arrays
     * comes from {@code foldings[i]}.
     *
     * @param foldings one row per mapping, as accepted by {@link #foldingToLong}
     * @return the two array declarations as Java source text
     */
    private static String genFoldingEntries(String[][] foldings) {
        StringBuilder sb = new StringBuilder();
        sb.append(" private static final int[] CASE_FOLDING_CPS = {\n");
        int width = 10;  // code points per generated source line
        for (int i = 0; i < foldings.length; i++) {
            if (i % width == 0) {
                sb.append(" ");  // indent
            }
            sb.append(String.format("0X%s", foldings[i][0]));
            if (i < foldings.length - 1) {
                sb.append(", ");
            }
            if (i % width == width - 1 || i == foldings.length - 1) {
                sb.append("\n");
            }
        }
        sb.append(" };\n\n");

        sb.append(" private static final long[] CASE_FOLDING_VALUES = {\n");
        width = 6;  // packed values per generated source line
        for (int i = 0; i < foldings.length; i++) {
            if (i % width == 0) {
                sb.append(" ");  // indent
            }
            sb.append(String.format("0x%013xL", foldingToLong(foldings[i])));
            if (i < foldings.length - 1) {
                sb.append(", ");
            }
            if (i % width == width - 1 || i == foldings.length - 1) {
                sb.append("\n");
            }
        }
        sb.append(" };\n");
        return sb.toString();
    }
}
|
||||||
@ -72,5 +72,22 @@ TARGETS += $(GENSRC_CHARACTERDATA)
|
|||||||
|
|
||||||
################################################################################
|
################################################################################
|
||||||
|
|
||||||
|
|
||||||
|
GENSRC_STRINGCASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/lang/CaseFolding.java
|
||||||
|
|
||||||
|
STRINGCASEFOLDING_TEMPLATE := $(MODULE_SRC)/share/classes/jdk/internal/lang/CaseFolding.java.template
|
||||||
|
CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt
|
||||||
|
|
||||||
|
$(GENSRC_STRINGCASEFOLDING): $(BUILD_TOOLS_JDK) $(STRINGCASEFOLDING_TEMPLATE) $(CASEFOLDINGTXT)
|
||||||
|
$(call LogInfo, Generating $@)
|
||||||
|
$(call MakeTargetDir)
|
||||||
|
$(TOOL_GENERATECASEFOLDING) \
|
||||||
|
$(STRINGCASEFOLDING_TEMPLATE) \
|
||||||
|
$(CASEFOLDINGTXT) \
|
||||||
|
$(GENSRC_STRINGCASEFOLDING)
|
||||||
|
|
||||||
|
TARGETS += $(GENSRC_STRINGCASEFOLDING)
|
||||||
|
|
||||||
|
|
||||||
endif # include guard
|
endif # include guard
|
||||||
include MakeIncludeEnd.gmk
|
include MakeIncludeEnd.gmk
|
||||||
|
|||||||
@ -50,22 +50,5 @@ TARGETS += $(GENSRC_INDICCONJUNCTBREAK)
|
|||||||
|
|
||||||
################################################################################
|
################################################################################
|
||||||
|
|
||||||
GENSRC_CASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/util/regex/CaseFolding.java
|
|
||||||
|
|
||||||
CASEFOLDINGTEMP := $(MODULE_SRC)/share/classes/jdk/internal/util/regex/CaseFolding.java.template
|
|
||||||
CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt
|
|
||||||
|
|
||||||
$(GENSRC_CASEFOLDING): $(BUILD_TOOLS_JDK) $(CASEFOLDINGTEMP) $(CASEFOLDINGTXT)
|
|
||||||
$(call LogInfo, Generating $@)
|
|
||||||
$(call MakeTargetDir)
|
|
||||||
$(TOOL_GENERATECASEFOLDING) \
|
|
||||||
$(CASEFOLDINGTEMP) \
|
|
||||||
$(CASEFOLDINGTXT) \
|
|
||||||
$(GENSRC_CASEFOLDING)
|
|
||||||
|
|
||||||
TARGETS += $(GENSRC_CASEFOLDING)
|
|
||||||
|
|
||||||
################################################################################
|
|
||||||
|
|
||||||
endif # include guard
|
endif # include guard
|
||||||
include MakeIncludeEnd.gmk
|
include MakeIncludeEnd.gmk
|
||||||
|
|||||||
@ -117,9 +117,38 @@ import sun.nio.cs.UTF_8;
|
|||||||
* Unicode code points (i.e., characters), in addition to those for
|
* Unicode code points (i.e., characters), in addition to those for
|
||||||
* dealing with Unicode code units (i.e., {@code char} values).
|
* dealing with Unicode code units (i.e., {@code char} values).
|
||||||
*
|
*
|
||||||
* <p>Unless otherwise noted, methods for comparing Strings do not take locale
|
* <p><b>String comparison and case-insensitive matching</b>
|
||||||
* into account. The {@link java.text.Collator} class provides methods for
|
*
|
||||||
* finer-grain, locale-sensitive String comparison.
|
* <p>There are several related ways to compare {@code String} values; choose
|
||||||
|
* the one whose semantics fit your purpose:
|
||||||
|
*
|
||||||
|
* <ul>
|
||||||
|
* <li><b>Exact content equality</b> — {@link #equals(Object)} checks that two
|
||||||
|
* strings contain the identical char sequence of UTF-16 code units. This is
|
||||||
|
* a strict, case-sensitive comparison suitable for exact matching, hashing
|
||||||
|
* and any situation that requires bit-for-bit stability.</li>
|
||||||
|
*
|
||||||
|
* <li><b>Simple case-insensitive equality</b> — {@link #equalsIgnoreCase(String)}
|
||||||
|
* (and the corresponding {@link #compareToIgnoreCase(String)} and {@link #CASE_INSENSITIVE_ORDER})
|
||||||
|
* performs a per-code-point, locale-independent comparison using
|
||||||
|
* {@link Character#toUpperCase(int)} and {@link Character#toLowerCase(int)}.
|
||||||
|
* It is convenient for many common case-insensitive checks.</li>
|
||||||
|
*
|
||||||
|
* <li><b>Unicode case-folded equivalence</b> — {@link #equalsFoldCase(String)}
|
||||||
|
* (and the corresponding {@link #compareToFoldCase(String)} and {@link #UNICODE_CASEFOLD_ORDER})
|
||||||
|
* implement the Unicode <em>{@index "full case folding"}</em> rules defined in
|
||||||
|
* <a href="https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt">Unicode CaseFolding.txt</a>.
|
||||||
|
* Case folding is locale-independent and language-neutral and may map a single code
|
||||||
|
* point to multiple code points (1:M mappings). For example, the German sharp
|
||||||
|
* s ({@code U+00DF}) is folded to the sequence {@code "ss"}.
|
||||||
|
* Use these methods when you need Unicode-compliant
|
||||||
|
* <a href="https://www.unicode.org/versions/latest/core-spec/chapter-5/#G21790">
|
||||||
|
* caseless matching</a>, searching, or ordering.</li>
|
||||||
|
* </ul>
|
||||||
|
*
|
||||||
|
* <p>Unless otherwise noted, methods for comparing Strings do not take locale into
|
||||||
|
* account. The {@link java.text.Collator} class provides methods for finer-grain,
|
||||||
|
* locale-sensitive String comparison.
|
||||||
*
|
*
|
||||||
* @implNote The implementation of the string concatenation operator is left to
|
* @implNote The implementation of the string concatenation operator is left to
|
||||||
* the discretion of a Java compiler, as long as the compiler ultimately conforms
|
* the discretion of a Java compiler, as long as the compiler ultimately conforms
|
||||||
@ -2179,6 +2208,7 @@ public final class String
|
|||||||
* false} otherwise
|
* false} otherwise
|
||||||
*
|
*
|
||||||
* @see #equals(Object)
|
* @see #equals(Object)
|
||||||
|
* @see #equalsFoldCase(String)
|
||||||
* @see #codePoints()
|
* @see #codePoints()
|
||||||
*/
|
*/
|
||||||
public boolean equalsIgnoreCase(String anotherString) {
|
public boolean equalsIgnoreCase(String anotherString) {
|
||||||
@ -2188,6 +2218,57 @@ public final class String
|
|||||||
&& regionMatches(true, 0, anotherString, 0, length());
|
&& regionMatches(true, 0, anotherString, 0, length());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compares this {@code String} to another {@code String} for equality,
|
||||||
|
* using <em>{@index "Unicode case folding"}</em>. Two strings are considered equal
|
||||||
|
* by this method if their case-folded forms are identical.
|
||||||
|
* <p>
|
||||||
|
* Case folding is defined by the Unicode Standard in
|
||||||
|
* <a href="https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt">CaseFolding.txt</a>,
|
||||||
|
* including 1:M mappings. For example, {@code "Fuß".equalsFoldCase("FUSS")}
|
||||||
|
* returns {@code true}, since the character {@code U+00DF} (sharp s) folds
|
||||||
|
* to {@code "ss"}.
|
||||||
|
* <p>
|
||||||
|
* Case folding is locale-independent and language-neutral, unlike
|
||||||
|
* locale-sensitive transformations such as {@link #toLowerCase()} or
|
||||||
|
* {@link #toUpperCase()}. It is intended for caseless matching,
|
||||||
|
* searching, and indexing.
|
||||||
|
*
|
||||||
|
* @apiNote
|
||||||
|
* This method is the Unicode-compliant alternative to
|
||||||
|
* {@link #equalsIgnoreCase(String)}. It implements full case folding as
|
||||||
|
* defined by the Unicode Standard, which may differ from the simpler
|
||||||
|
* per-character mapping performed by {@code equalsIgnoreCase}.
|
||||||
|
* For example:
|
||||||
|
* {@snippet lang=java :
|
||||||
|
* String a = "Fuß";
|
||||||
|
* String b = "FUSS";
|
||||||
|
* boolean equalsFoldCase = a.equalsFoldCase(b); // returns true
|
||||||
|
* boolean equalsIgnoreCase = a.equalsIgnoreCase(b); // returns false
|
||||||
|
* }
|
||||||
|
*
|
||||||
|
* @param anotherString
|
||||||
|
* The {@code String} to compare this {@code String} against
|
||||||
|
*
|
||||||
|
* @return {@code true} if the given object is not {@code null} and represents
|
||||||
|
* the same sequence of characters as this string under Unicode case
|
||||||
|
* folding; {@code false} otherwise.
|
||||||
|
*
|
||||||
|
* @spec https://www.unicode.org/versions/latest/core-spec/chapter-5/#G21790 Unicode Caseless Matching
|
||||||
|
* @see #compareToFoldCase(String)
|
||||||
|
* @see #equalsIgnoreCase(String)
|
||||||
|
* @since 26
|
||||||
|
*/
|
||||||
|
public boolean equalsFoldCase(String anotherString) {
|
||||||
|
if (this == anotherString) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (anotherString == null) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return UNICODE_CASEFOLD_ORDER.compare(this, anotherString) == 0;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Compares two strings lexicographically.
|
* Compares two strings lexicographically.
|
||||||
* The comparison is based on the Unicode value of each character in
|
* The comparison is based on the Unicode value of each character in
|
||||||
@ -2303,12 +2384,86 @@ public final class String
|
|||||||
* than this String, ignoring case considerations.
|
* than this String, ignoring case considerations.
|
||||||
* @see java.text.Collator
|
* @see java.text.Collator
|
||||||
* @see #codePoints()
|
* @see #codePoints()
|
||||||
|
* @see #compareToFoldCase(String)
|
||||||
* @since 1.2
|
* @since 1.2
|
||||||
*/
|
*/
|
||||||
public int compareToIgnoreCase(String str) {
|
public int compareToIgnoreCase(String str) {
|
||||||
return CASE_INSENSITIVE_ORDER.compare(this, str);
|
return CASE_INSENSITIVE_ORDER.compare(this, str);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A Comparator that orders {@code String} objects as by
|
||||||
|
* {@link #compareToFoldCase(String) compareToFoldCase()}.
|
||||||
|
*
|
||||||
|
* @see #compareToFoldCase(String)
|
||||||
|
* @since 26
|
||||||
|
*/
|
||||||
|
public static final Comparator<String> UNICODE_CASEFOLD_ORDER
|
||||||
|
= new FoldCaseComparator();
|
||||||
|
|
||||||
|
private static class FoldCaseComparator implements Comparator<String> {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int compare(String s1, String s2) {
|
||||||
|
byte[] v1 = s1.value;
|
||||||
|
byte[] v2 = s2.value;
|
||||||
|
if (s1.coder == s2.coder()) {
|
||||||
|
return s1.coder == LATIN1 ? StringLatin1.compareToFC(v1, v2)
|
||||||
|
: StringUTF16.compareToFC(v1, v2);
|
||||||
|
}
|
||||||
|
return s1.coder == LATIN1 ? StringLatin1.compareToFC_UTF16(v1, v2)
|
||||||
|
: StringUTF16.compareToFC_Latin1(v1, v2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compares two strings lexicographically using <em>{@index "Unicode case folding"}</em>.
|
||||||
|
* This method returns an integer whose sign is that of calling {@code compareTo}
|
||||||
|
* on the Unicode case folded version of the strings. Unicode Case folding
|
||||||
|
* eliminates differences in case according to the Unicode Standard, using the
|
||||||
|
* mappings defined in
|
||||||
|
* <a href="https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt">CaseFolding.txt</a>,
|
||||||
|
* including 1:M mappings, such as {@code "ß"} → {@code "ss"}.
|
||||||
|
* <p>
|
||||||
|
* Case folding is a locale-independent, language-neutral form of case mapping,
|
||||||
|
* primarily intended for caseless matching. Unlike {@link #compareToIgnoreCase(String)},
|
||||||
|
* which applies a simpler locale-insensitive uppercase mapping, this method
|
||||||
|
* follows the Unicode <em>{@index "full"}</em> case folding, providing stable and
|
||||||
|
* consistent results across all environments.
|
||||||
|
* <p>
|
||||||
|
* Note that this method does <em>not</em> take locale into account, and may
|
||||||
|
* produce results that differ from locale-sensitive ordering. Use
|
||||||
|
* {@link java.text.Collator} for locale-sensitive comparison.
|
||||||
|
*
|
||||||
|
* @apiNote
|
||||||
|
* This method is the Unicode-compliant alternative to
|
||||||
|
* {@link #compareToIgnoreCase(String)}. It implements the
|
||||||
|
* <em>{@index "full case folding"}</em> as defined by the Unicode Standard, which
|
||||||
|
* may differ from the simpler per-character mapping performed by
|
||||||
|
* {@code compareToIgnoreCase}.
|
||||||
|
* For example:
|
||||||
|
* {@snippet lang=java :
|
||||||
|
* String a = "Fuß";
|
||||||
|
* String b = "FUSS";
|
||||||
|
* int cmpFoldCase = a.compareToFoldCase(b); // returns 0
|
||||||
|
* int cmpIgnoreCase = a.compareToIgnoreCase(b); // returns > 0
|
||||||
|
* }
|
||||||
|
*
|
||||||
|
* @param str the {@code String} to be compared.
|
||||||
|
* @return a negative integer, zero, or a positive integer as the specified
|
||||||
|
* String is greater than, equal to, or less than this String,
|
||||||
|
* ignoring case considerations by case folding.
|
||||||
|
*
|
||||||
|
* @spec https://www.unicode.org/versions/latest/core-spec/chapter-5/#G21790 Unicode Caseless Matching
|
||||||
|
* @see java.text.Collator
|
||||||
|
* @see #compareToIgnoreCase(String)
|
||||||
|
* @see #equalsFoldCase(String)
|
||||||
|
* @since 26
|
||||||
|
*/
|
||||||
|
public int compareToFoldCase(String str) {
|
||||||
|
return UNICODE_CASEFOLD_ORDER.compare(this, str);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tests if two string regions are equal.
|
* Tests if two string regions are equal.
|
||||||
* <p>
|
* <p>
|
||||||
|
|||||||
@ -32,6 +32,8 @@ import java.util.function.Consumer;
|
|||||||
import java.util.function.IntConsumer;
|
import java.util.function.IntConsumer;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
import java.util.stream.StreamSupport;
|
import java.util.stream.StreamSupport;
|
||||||
|
|
||||||
|
import jdk.internal.lang.CaseFolding;
|
||||||
import jdk.internal.util.ArraysSupport;
|
import jdk.internal.util.ArraysSupport;
|
||||||
import jdk.internal.vm.annotation.IntrinsicCandidate;
|
import jdk.internal.vm.annotation.IntrinsicCandidate;
|
||||||
|
|
||||||
@ -179,6 +181,128 @@ final class StringLatin1 {
|
|||||||
return len1 - len2;
|
return len1 - len2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static int compareToFC0(byte[] value, int off, int last, byte[] other, int ooff, int olast) {
|
||||||
|
int k1 = off, k2 = ooff;
|
||||||
|
boolean lo1 = false, lo2 = false; // true if we have a leftover 's' from u+00df -> ss
|
||||||
|
while ((k1 < last || lo1) && (k2 < olast || lo2)) {
|
||||||
|
int c1, c2;
|
||||||
|
if (lo1) {
|
||||||
|
c1 = 0x73; // leftover 's'
|
||||||
|
lo1 = false;
|
||||||
|
} else {
|
||||||
|
c1 = getChar(value, k1++);
|
||||||
|
if (c1 == 0xdf) {
|
||||||
|
c1 = 0x73;
|
||||||
|
lo1 = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (lo2) {
|
||||||
|
c2 = 0x73; // 's'
|
||||||
|
lo2 = false;
|
||||||
|
} else {
|
||||||
|
c2 = getChar(other, k2++);
|
||||||
|
if (c2 == 0xdf) {
|
||||||
|
c2 = 0x73;
|
||||||
|
lo2 = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!CharacterDataLatin1.equalsIgnoreCase((byte)c1, (byte)c2)) {
|
||||||
|
return Character.toLowerCase(c1) - Character.toLowerCase(c2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (k1 < last || lo1) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (k2 < olast || lo2) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int compareToFC(byte[] value, byte[] other) {
|
||||||
|
int len = value.length;
|
||||||
|
int olen = other.length;
|
||||||
|
int lim = Math.min(len, olen);
|
||||||
|
for (int k = 0; k < lim; k++) {
|
||||||
|
byte b1 = value[k];
|
||||||
|
byte b2 = other[k];
|
||||||
|
if (!CharacterDataLatin1.equalsIgnoreCase(b1, b2)) {
|
||||||
|
int c1 = b1 & 0xff;
|
||||||
|
int c2 = b2 & 0xff;
|
||||||
|
if (c1 == 0xdf || c2 == 0xdf) { // 0xdf is the only 1:M in latin1 range
|
||||||
|
return compareToFC0(value, k, len, other, k, olen);
|
||||||
|
}
|
||||||
|
return Character.toLowerCase(c1) - Character.toLowerCase(c2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return len - olen;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static int compareToFC0_UTF16(byte[] value, int off, int last, byte[] other, int ooff, int olast) {
|
||||||
|
int f1 = 0, f2 = 0;
|
||||||
|
int k1 = off, k2 = ooff;
|
||||||
|
while ((k1 < last || f1 != 0) && (k2 < olast || f2 != 0)) {
|
||||||
|
int c1, c2;
|
||||||
|
if (f1 != 0) {
|
||||||
|
c1 = (f1 & 0xffff); f1 >>>= 16;
|
||||||
|
} else {
|
||||||
|
c1 = getChar(value, k1++);
|
||||||
|
var f = CaseFolding.fold(c1);
|
||||||
|
if (CaseFolding.isSingleCodePoint(f)) {
|
||||||
|
c1 = (int)(f & 0xfffff);
|
||||||
|
} else {
|
||||||
|
c1 = (int)f & 0xffff;
|
||||||
|
f1 = (int)(f >>> 16);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (f2 != 0) {
|
||||||
|
c2 = f2 & 0xffff; f2 >>>= 16;
|
||||||
|
} else {
|
||||||
|
c2 = StringUTF16.codePointAt(other, k2, olast, true);
|
||||||
|
k2 += Character.charCount(c2);
|
||||||
|
var f = CaseFolding.fold(c2);
|
||||||
|
if (CaseFolding.isSingleCodePoint(f)) {
|
||||||
|
c2 = (int)(f & 0xfffff);
|
||||||
|
} else {
|
||||||
|
c2 = (int)(f & 0xffff);
|
||||||
|
f2 = (int)(f >>> 16);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (c1 != c2) {
|
||||||
|
return c1 - c2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (k1 < last || f1 != 0) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (k2 < olast || f2 != 0) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// latin1 vs utf16
|
||||||
|
static int compareToFC_UTF16(byte[] value, byte[] other) {
|
||||||
|
int last = length(value);
|
||||||
|
int olast = StringUTF16.length(other);
|
||||||
|
int lim = Math.min(last, olast);
|
||||||
|
for (int k = 0; k < lim; k++) {
|
||||||
|
int cp1 = getChar(value, k);
|
||||||
|
int cp2 = StringUTF16.codePointAt(other, k, olast, true);
|
||||||
|
if (cp1 != cp2) {
|
||||||
|
long cf1 = CaseFolding.fold(cp1);
|
||||||
|
long cf2 = CaseFolding.fold(cp2);
|
||||||
|
if (cf1 != cf2) {
|
||||||
|
if (!CaseFolding.isSingleCodePoint(cf1) || !CaseFolding.isSingleCodePoint(cf2)) {
|
||||||
|
return compareToFC0_UTF16(value, k, last, other, k, olast);
|
||||||
|
}
|
||||||
|
return (int)(cf1 - cf2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return last - olast;
|
||||||
|
}
|
||||||
|
|
||||||
static int hashCode(byte[] value) {
|
static int hashCode(byte[] value) {
|
||||||
return ArraysSupport.hashCodeOfUnsigned(value, 0, value.length, 0);
|
return ArraysSupport.hashCodeOfUnsigned(value, 0, value.length, 0);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -34,6 +34,7 @@ import java.util.function.IntConsumer;
|
|||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
import java.util.stream.StreamSupport;
|
import java.util.stream.StreamSupport;
|
||||||
|
|
||||||
|
import jdk.internal.lang.CaseFolding;
|
||||||
import jdk.internal.misc.Unsafe;
|
import jdk.internal.misc.Unsafe;
|
||||||
import jdk.internal.util.ArraysSupport;
|
import jdk.internal.util.ArraysSupport;
|
||||||
import jdk.internal.vm.annotation.ForceInline;
|
import jdk.internal.vm.annotation.ForceInline;
|
||||||
@ -93,7 +94,7 @@ final class StringUTF16 {
|
|||||||
return value.length >> 1;
|
return value.length >> 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static int codePointAt(byte[] value, int index, int end, boolean checked) {
|
static int codePointAt(byte[] value, int index, int end, boolean checked) {
|
||||||
assert index < end;
|
assert index < end;
|
||||||
if (checked) {
|
if (checked) {
|
||||||
checkIndex(index, value);
|
checkIndex(index, value);
|
||||||
@ -592,6 +593,77 @@ final class StringUTF16 {
|
|||||||
return -StringLatin1.compareToCI_UTF16(other, value);
|
return -StringLatin1.compareToCI_UTF16(other, value);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static int compareToFC_Latin1(byte[] value, byte[] other) {
|
||||||
|
return -StringLatin1.compareToFC_UTF16(other, value);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static int compareToFC0(byte[] value, int off, int last, byte[] other, int ooff, int olast) {
|
||||||
|
int f1 = 0, f2 = 0;
|
||||||
|
int k1 = off, k2 = ooff;
|
||||||
|
while ((k1 < last || f1 != 0) && (k2 < olast || f2 != 0)) {
|
||||||
|
int c1, c2;
|
||||||
|
if (f1 != 0) {
|
||||||
|
c1 = f1 & 0xffff; f1 >>>= 16;
|
||||||
|
} else {
|
||||||
|
c1 = StringUTF16.codePointAt(value, k1, last, true);
|
||||||
|
k1 += Character.charCount(c1);
|
||||||
|
var f = CaseFolding.fold(c1);
|
||||||
|
if (CaseFolding.isSingleCodePoint(f)) {
|
||||||
|
c1 = (int)(f & 0xfffff);
|
||||||
|
} else {
|
||||||
|
c1 = (int)(f & 0xffff);
|
||||||
|
f1 = (int)(f >> 16);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (f2 != 0) {
|
||||||
|
c2 = f2 & 0xffff; f2 >>>= 16;
|
||||||
|
} else {
|
||||||
|
c2 = StringUTF16.codePointAt(other, k2, olast, true);
|
||||||
|
k2 += Character.charCount(c2);
|
||||||
|
var f = CaseFolding.fold(c2);
|
||||||
|
if (CaseFolding.isSingleCodePoint(f)) {
|
||||||
|
c2 = (int)(f & 0xfffff);
|
||||||
|
} else {
|
||||||
|
c2 = (int)(f & 0xffff);
|
||||||
|
f2 = (int)(f >>> 16);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (c1 != c2) {
|
||||||
|
return c1 - c2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (k1 < last || f1 != 0) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (k2 < olast || f2 != 0) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static int compareToFC(byte[] value, byte[] other) {
|
||||||
|
int tlast = length(value);
|
||||||
|
int olast = length(other);
|
||||||
|
int lim = Math.min(tlast, olast);
|
||||||
|
int k = 0;
|
||||||
|
while (k < lim) {
|
||||||
|
int cp1 = codePointAt(value, k, tlast, true);
|
||||||
|
int cp2 = codePointAt(other, k, olast, true);
|
||||||
|
if (cp1 != cp2) {
|
||||||
|
long cf1 = CaseFolding.fold(cp1);
|
||||||
|
long cf2 = CaseFolding.fold(cp2);
|
||||||
|
if (cf1 != cf2) {
|
||||||
|
if (!CaseFolding.isSingleCodePoint(cf1) || !CaseFolding.isSingleCodePoint(cf2)) {
|
||||||
|
return compareToFC0(value, k, tlast, other, k, olast);
|
||||||
|
}
|
||||||
|
return (int) cf1 - (int) cf2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
k += Character.charCount(cp1);
|
||||||
|
}
|
||||||
|
return tlast - olast;
|
||||||
|
}
|
||||||
|
|
||||||
static int hashCode(byte[] value) {
|
static int hashCode(byte[] value) {
|
||||||
return ArraysSupport.hashCodeOfUTF16(value, 0, value.length >> 1, 0);
|
return ArraysSupport.hashCodeOfUTF16(value, 0, value.length >> 1, 0);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -43,8 +43,8 @@ import java.util.function.Predicate;
|
|||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
import java.util.stream.StreamSupport;
|
import java.util.stream.StreamSupport;
|
||||||
|
|
||||||
|
import jdk.internal.lang.CaseFolding;
|
||||||
import jdk.internal.util.ArraysSupport;
|
import jdk.internal.util.ArraysSupport;
|
||||||
import jdk.internal.util.regex.CaseFolding;
|
|
||||||
import jdk.internal.util.regex.Grapheme;
|
import jdk.internal.util.regex.Grapheme;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@ -0,0 +1,208 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
|
||||||
|
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||||
|
*
|
||||||
|
* This code is free software; you can redistribute it and/or modify it
|
||||||
|
* under the terms of the GNU General Public License version 2 only, as
|
||||||
|
* published by the Free Software Foundation. Oracle designates this
|
||||||
|
* particular file as subject to the "Classpath" exception as provided
|
||||||
|
* by Oracle in the LICENSE file that accompanied this code.
|
||||||
|
*
|
||||||
|
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||||
|
* version 2 for more details (a copy is included in the LICENSE file that
|
||||||
|
* accompanied this code).
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License version
|
||||||
|
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||||
|
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
*
|
||||||
|
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||||
|
* or visit www.oracle.com if you need additional information or have any
|
||||||
|
* questions.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package jdk.internal.lang;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.IntStream;
|
||||||
|
|
||||||
|
import static java.util.Map.entry;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Utility class that handles Unicode case folding properties defined in
|
||||||
|
* CaseFolding.txt, including 1:M full case folding.
|
||||||
|
*/
|
||||||
|
public final class CaseFolding {
|
||||||
|
|
||||||
|
private CaseFolding() {}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests whether the specified code point has a folding mapping entry defined.
|
||||||
|
*
|
||||||
|
* @param cp
|
||||||
|
* the Unicode code point to test
|
||||||
|
* @return {@code true} if the given code point has a case folding mapping entry
|
||||||
|
* defined in (@code caseFoldingMap}, {@code false} otherwise
|
||||||
|
*/
|
||||||
|
public static boolean isDefined(int cp) {
|
||||||
|
return getDefined(cp) != -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the case-folded form of the specified code point according
|
||||||
|
* to the Unicode case folding mappings.
|
||||||
|
* <p>
|
||||||
|
* If the code point has no case folding mapping defined, this method returns
|
||||||
|
* the original code point.
|
||||||
|
*
|
||||||
|
* Possible combinations of the returning case-folding form as a long value
|
||||||
|
*
|
||||||
|
* +---+---------+--------+---------+--------+--------+
|
||||||
|
* | 1:1 mapping | 0000 | 0000 | 000x | xxxx | 0041 => 0061 or 1E921 => 1E943
|
||||||
|
* +---+---------+--------+---------+--------+--------+
|
||||||
|
* | 1:2 mapping | 0002 | 0000 | xxxx | xxxx | FB02 => 0066 006C
|
||||||
|
* +---+---------+--------+---------+--------+--------+
|
||||||
|
* | 1:3 mapping | 0003 | xxxx | xxxx | xxxx | FB03 => 0066 0066 0069
|
||||||
|
* +---+---------+--------+---------+--------+--------+
|
||||||
|
*
|
||||||
|
* @param cp
|
||||||
|
* the Unicode code point to fold
|
||||||
|
* @return a long value representing the case-folded form of the input
|
||||||
|
* code point, encoded as TBD
|
||||||
|
*/
|
||||||
|
public static long fold(int cp) {
|
||||||
|
var fold = getDefined(cp);
|
||||||
|
return fold == -1 ? cp : fold;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static boolean isSingleCodePoint(long fold) {
|
||||||
|
return (fold >> 48) == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an expansion set to "close" a given regex Unicode character class range for case-sensitive
|
||||||
|
* matching, according to the
|
||||||
|
* <a href="https://www.unicode.org/reports/tr18/#Simple_Loose_Matches">Simple Loose Matches</a>
|
||||||
|
* rule defined in Unicode Technical Standard #18: Unicode Regular Expressions.
|
||||||
|
* <p>
|
||||||
|
* To conform with Level 1 of UTS #18, specifically RL1.5: Simple Loose Matches, simple case folding must
|
||||||
|
* be applied to literals and (optionally) to character classes. When applied to character classes, each
|
||||||
|
* character class is expected to be closed under simple case folding. See the standard for the
|
||||||
|
* detailed explanation and example of "closed".
|
||||||
|
* <p>
|
||||||
|
* RL1.5 states: To meet this requirement, an implementation that supports case-sensitive matching should
|
||||||
|
* <ol>
|
||||||
|
* <li>Provide at least the simple, default Unicode case-insensitive matching, and</li>
|
||||||
|
* <li>Specify which character properties or constructs are closed under the matching.</li>
|
||||||
|
* </ol>
|
||||||
|
* <p>
|
||||||
|
* In the {@code Pattern} implementation, 5 types of constructs maybe case-sensitive when matching:
|
||||||
|
* back-refs, string slice (sequences), single, family(char-property) and class range. Single and
|
||||||
|
* family may appears independently or within a class.
|
||||||
|
* <p>
|
||||||
|
* For loose/case-insensitive matching, the back-refs, slices and singles apply {@code toUpperCase} and
|
||||||
|
* {@code toLowerCase} to both the pattern and the input string. This effectively 'close' the class for
|
||||||
|
* matching.
|
||||||
|
* <p>
|
||||||
|
* The family/char-properties are not "closed" and should remain unchanged. This is acceptable per RL1.5,
|
||||||
|
* if their behavior is clearly specified.
|
||||||
|
* <p>
|
||||||
|
* This method addresses that requirement for the "range" construct within in character class by computing
|
||||||
|
* the additional characters that should be included to close the range under simple case folding:
|
||||||
|
* <p>
|
||||||
|
* For each character in the input range {@code [start, end]} (inclusive), if the character has a simple
|
||||||
|
* case folding mapping in Unicode's CaseFolding.txt, the mapping is not a round-trip map, and the mapped
|
||||||
|
* character is not already in the range, then that mapped character (typically lowercase) is added to
|
||||||
|
* the expansion set.
|
||||||
|
* <p>
|
||||||
|
* This allows regex character class "range" implementation to use the returned expansion set to support
|
||||||
|
* additional case-insensitive matching, without duplicating characters already covered by the existing
|
||||||
|
* regex range implementation. The expectation is the matching is done using both the uppercase and
|
||||||
|
* lowercase forms of the input character, for example
|
||||||
|
*
|
||||||
|
* <pre>{@code
|
||||||
|
*
|
||||||
|
* ch -> inRange(lower, Character.toUpperCase(ch), upper) ||
|
||||||
|
* inRange(lower, Character.toLower(ch), upper) ||
|
||||||
|
* additionalClosingCharacters.contains(Character.toUpperCase(ch)) ||
|
||||||
|
* additionalClosingCharacters.contains(Character.toUpperCase(ch))
|
||||||
|
* }</pre>
|
||||||
|
*
|
||||||
|
* @param start the starting code point of the character range
|
||||||
|
* @param end the ending code point of the character range
|
||||||
|
* @return a {@code int[]} containing the all simple case equivalents of characters in the range, excluding
|
||||||
|
* those already in the range
|
||||||
|
* @spec https://www.unicode.org/reports/tr18/#Simple_Loose_Matches
|
||||||
|
*/
|
||||||
|
public static int[] getClassRangeClosingCharacters(int start, int end) {
|
||||||
|
int[] expanded = new int[expanded_case_cps.length];
|
||||||
|
int off = 0;
|
||||||
|
for (int cp : expanded_case_cps) {
|
||||||
|
if (cp >= start && cp <= end) {
|
||||||
|
int folding = expanded_case_map.get(cp);
|
||||||
|
if (folding < start || folding > end) {
|
||||||
|
expanded[off++] = folding;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return Arrays.copyOf(expanded, off);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static final Map<Integer, Integer> expanded_case_map = Map.ofEntries(
|
||||||
|
%%%Expanded_Case_Map_Entries
|
||||||
|
);
|
||||||
|
|
||||||
|
private static final int[] expanded_case_cps = expanded_case_map.keySet()
|
||||||
|
.stream()
|
||||||
|
.mapToInt(Integer::intValue)
|
||||||
|
.toArray();
|
||||||
|
|
||||||
|
private static final int HASH_CP = 0;
|
||||||
|
private static final int HASH_INDEX = 1;
|
||||||
|
private static final int HASH_NEXT = 2;
|
||||||
|
|
||||||
|
private static int[][] hashKeys(int[] keys) {
|
||||||
|
var hashes = new int[keys.length << 1][3]; // cp + hash + next
|
||||||
|
var off = keys.length;
|
||||||
|
for (int i = 0; i < keys.length; i++) {
|
||||||
|
var cp = keys[i];
|
||||||
|
var hash = cp % keys.length;
|
||||||
|
while (hashes[hash][HASH_CP] != 0) {
|
||||||
|
var next = hashes[hash][HASH_NEXT];
|
||||||
|
if (next == 0) {
|
||||||
|
hashes[hash][HASH_NEXT] = off;
|
||||||
|
hash = off++;
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
hash = next;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
hashes[hash][HASH_CP] = cp;
|
||||||
|
hashes[hash][HASH_INDEX] = i;
|
||||||
|
}
|
||||||
|
return Arrays.copyOf(hashes, off);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static long getDefined(int cp) {
|
||||||
|
var hashes = CASE_FOLDING_HASHES;
|
||||||
|
var length = CASE_FOLDING_CPS.length; // hashed based on total defined.
|
||||||
|
var hash = cp % length;
|
||||||
|
while (hashes[hash][HASH_CP] != cp) {
|
||||||
|
var next = hashes[hash][HASH_NEXT];
|
||||||
|
if (next == 0) {
|
||||||
|
return -1; // hash miss
|
||||||
|
}
|
||||||
|
hash = next;
|
||||||
|
}
|
||||||
|
var index = hashes[hash][HASH_INDEX];
|
||||||
|
return CASE_FOLDING_VALUES[index];
|
||||||
|
}
|
||||||
|
|
||||||
|
%%%Entries
|
||||||
|
|
||||||
|
private static final int[][] CASE_FOLDING_HASHES = hashKeys(CASE_FOLDING_CPS);
|
||||||
|
}
|
||||||
@ -1,116 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
|
|
||||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
|
||||||
*
|
|
||||||
* This code is free software; you can redistribute it and/or modify it
|
|
||||||
* under the terms of the GNU General Public License version 2 only, as
|
|
||||||
* published by the Free Software Foundation. Oracle designates this
|
|
||||||
* particular file as subject to the "Classpath" exception as provided
|
|
||||||
* by Oracle in the LICENSE file that accompanied this code.
|
|
||||||
*
|
|
||||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
|
||||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
||||||
* version 2 for more details (a copy is included in the LICENSE file that
|
|
||||||
* accompanied this code).
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License version
|
|
||||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
|
||||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
*
|
|
||||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
|
||||||
* or visit www.oracle.com if you need additional information or have any
|
|
||||||
* questions.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package jdk.internal.util.regex;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Objects;
|
|
||||||
|
|
||||||
import static java.util.Map.entry;
|
|
||||||
|
|
||||||
public final class CaseFolding {
|
|
||||||
|
|
||||||
private static final Map<Integer, Integer> expanded_case_map = Map.ofEntries(
|
|
||||||
%%%Entries
|
|
||||||
);
|
|
||||||
|
|
||||||
private static final int[] expanded_case_cps = expanded_case_map.keySet()
|
|
||||||
.stream()
|
|
||||||
.mapToInt(Integer::intValue)
|
|
||||||
.toArray();
|
|
||||||
|
|
||||||
private CaseFolding() {}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns an expansion set to "close" a given regex Unicode character class range for case-sensitive
|
|
||||||
* matching, according to the
|
|
||||||
* <a href="https://www.unicode.org/reports/tr18/#Simple_Loose_Matches">Simple Loose Matches</a>
|
|
||||||
* rule defined in Unicode Technical Standard #18: Unicode Regular Expressions.
|
|
||||||
* <p>
|
|
||||||
* To conform with Level 1 of UTS #18, specifically RL1.5: Simple Loose Matches, simple case folding must
|
|
||||||
* be applied to literals and (optionally) to character classes. When applied to character classes, each
|
|
||||||
* character class is expected to be closed under simple case folding. See the standard for the
|
|
||||||
* detailed explanation and example of "closed".
|
|
||||||
* <p>
|
|
||||||
* RL1.5 states: To meet this requirement, an implementation that supports case-sensitive matching should
|
|
||||||
* <ol>
|
|
||||||
* <li>Provide at least the simple, default Unicode case-insensitive matching, and</li>
|
|
||||||
* <li>Specify which character properties or constructs are closed under the matching.</li>
|
|
||||||
* </ol>
|
|
||||||
* <p>
|
|
||||||
* In the {@code Pattern} implementation, 5 types of constructs maybe case-sensitive when matching:
|
|
||||||
* back-refs, string slice (sequences), single, family(char-property) and class range. Single and
|
|
||||||
* family may appears independently or within a class.
|
|
||||||
* <p>
|
|
||||||
* For loose/case-insensitive matching, the back-refs, slices and singles apply {@code toUpperCase} and
|
|
||||||
* {@code toLowerCase} to both the pattern and the input string. This effectively 'close' the class for
|
|
||||||
* matching.
|
|
||||||
* <p>
|
|
||||||
* The family/char-properties are not "closed" and should remain unchanged. This is acceptable per RL1.5,
|
|
||||||
* if their behavior is clearly specified.
|
|
||||||
* <p>
|
|
||||||
* This method addresses that requirement for the "range" construct within in character class by computing
|
|
||||||
* the additional characters that should be included to close the range under simple case folding:
|
|
||||||
* <p>
|
|
||||||
* For each character in the input range {@code [start, end]} (inclusive), if the character has a simple
|
|
||||||
* case folding mapping in Unicode's CaseFolding.txt, the mapping is not a round-trip map, and the mapped
|
|
||||||
* character is not already in the range, then that mapped character (typically lowercase) is added to
|
|
||||||
* the expansion set.
|
|
||||||
* <p>
|
|
||||||
* This allows regex character class "range" implementation to use the returned expansion set to support
|
|
||||||
* additional case-insensitive matching, without duplicating characters already covered by the existing
|
|
||||||
* regex range implementation. The expectation is the matching is done using both the uppercase and
|
|
||||||
* lowercase forms of the input character, for example
|
|
||||||
*
|
|
||||||
* <pre>{@code
|
|
||||||
*
|
|
||||||
* ch -> inRange(lower, Character.toUpperCase(ch), upper) ||
|
|
||||||
* inRange(lower, Character.toLower(ch), upper) ||
|
|
||||||
* additionalClosingCharacters.contains(Character.toUpperCase(ch)) ||
|
|
||||||
* additionalClosingCharacters.contains(Character.toUpperCase(ch))
|
|
||||||
* }</pre>
|
|
||||||
*
|
|
||||||
* <p>
|
|
||||||
* @spec https://www.unicode.org/reports/tr18/#Simple_Loose_Matches
|
|
||||||
* @param start the starting code point of the character range
|
|
||||||
* @param end the ending code point of the character range
|
|
||||||
* @return a {@code int[]} containing the all simple case equivalents of characters in the range, excluding
|
|
||||||
* those already in the range
|
|
||||||
*/
|
|
||||||
public static int[] getClassRangeClosingCharacters(int start, int end) {
|
|
||||||
int[] expanded = new int[expanded_case_cps.length];
|
|
||||||
int off = 0;
|
|
||||||
for (int cp : expanded_case_cps) {
|
|
||||||
if (cp >= start && cp <= end) {
|
|
||||||
int folding = expanded_case_map.get(cp);
|
|
||||||
if (folding < start || folding > end) {
|
|
||||||
expanded[off++] = folding;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return Arrays.copyOf(expanded, off);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
329
test/jdk/java/lang/String/UnicodeCaseFoldingTest.java
Normal file
329
test/jdk/java/lang/String/UnicodeCaseFoldingTest.java
Normal file
@ -0,0 +1,329 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
|
||||||
|
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||||
|
*
|
||||||
|
* This code is free software; you can redistribute it and/or modify it
|
||||||
|
* under the terms of the GNU General Public License version 2 only, as
|
||||||
|
* published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||||
|
* version 2 for more details (a copy is included in the LICENSE file that
|
||||||
|
* accompanied this code).
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License version
|
||||||
|
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||||
|
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
*
|
||||||
|
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||||
|
* or visit www.oracle.com if you need additional information or have any
|
||||||
|
* questions.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @test
|
||||||
|
* @summary tests unicode case-folding based String comparison and equality
|
||||||
|
* @bug 4397357
|
||||||
|
* @library /lib/testlibrary/java/lang
|
||||||
|
* @modules java.base/jdk.internal.lang:+open
|
||||||
|
* @run junit/othervm
|
||||||
|
* UnicodeCaseFoldingTest
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
|
||||||
|
import org.junit.jupiter.params.ParameterizedTest;
|
||||||
|
import org.junit.jupiter.params.provider.Arguments;
|
||||||
|
import org.junit.jupiter.params.provider.MethodSource;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
|
||||||
|
import jdk.internal.lang.CaseFolding;
|
||||||
|
|
||||||
|
public class UnicodeCaseFoldingTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testAllCommnFullCodePointsListedInCaseFoldinigTxt() throws Throwable {
|
||||||
|
var filter = "^.*; [CF]; .*$"; // C=common, F=full, for full case folding
|
||||||
|
var results = Files.lines(UCDFiles.CASEFOLDING)
|
||||||
|
.filter(line -> !line.startsWith("#") && line.matches(filter))
|
||||||
|
.map(line -> {
|
||||||
|
var fields = line.split("; ");
|
||||||
|
var cp = Integer.parseInt(fields[0], 16);
|
||||||
|
fields = fields[2].trim().split(" ");
|
||||||
|
var folding = new int[fields.length];
|
||||||
|
for (int i = 0; i < folding.length; i++) {
|
||||||
|
folding[i] = Integer.parseInt(fields[i], 16);
|
||||||
|
}
|
||||||
|
var source = new String(Character.toChars(cp));
|
||||||
|
var expected = new String(folding, 0, folding.length);
|
||||||
|
// (1) Verify the folding result matches expected
|
||||||
|
assertEquals(expected, foldCase(source), "CaseFolding.fold(): ");
|
||||||
|
|
||||||
|
// (2) Verify compareToFoldCase() result
|
||||||
|
assertEquals(0, source.compareToFoldCase(expected), "source.compareToFoldCase(expected)");
|
||||||
|
assertEquals(0, expected.compareToFoldCase(source), "expected.compareToFoldCase(source)");
|
||||||
|
|
||||||
|
// (3) Verify equalsFoldCase() result
|
||||||
|
assertEquals(true, source.equalsFoldCase(expected), "source.equalsFoldCase(expected)");
|
||||||
|
assertEquals(true, expected.equalsFoldCase(source), "expected.equalsFoldCase(source)");
|
||||||
|
return null;
|
||||||
|
})
|
||||||
|
.filter(error -> error != null)
|
||||||
|
.toArray();
|
||||||
|
assertEquals(0, results.length);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testAllSimpleCodePointsListedInCaseFoldinigTxt() throws Throwable {
|
||||||
|
// S=simple, for simple case folding. The simple case folding should still matches
|
||||||
|
var filter = "^.*; [S]; .*$";
|
||||||
|
var results = Files.lines(UCDFiles.CASEFOLDING)
|
||||||
|
.filter(line -> !line.startsWith("#") && line.matches(filter))
|
||||||
|
.map(line -> {
|
||||||
|
var fields = line.split("; ");
|
||||||
|
var cp = Integer.parseInt(fields[0], 16);
|
||||||
|
fields = fields[2].trim().split(" ");
|
||||||
|
var folding = new int[fields.length];
|
||||||
|
for (int i = 0; i < folding.length; i++) {
|
||||||
|
folding[i] = Integer.parseInt(fields[i], 16);
|
||||||
|
}
|
||||||
|
var source = new String(Character.toChars(cp));
|
||||||
|
var expected = new String(folding, 0, folding.length);
|
||||||
|
|
||||||
|
// (1) Verify compareToFoldCase() result
|
||||||
|
assertEquals(0, source.compareToFoldCase(expected), "source.compareToFoldCase(expected)");
|
||||||
|
assertEquals(0, expected.compareToFoldCase(source), "expected.compareToFoldCase(source)");
|
||||||
|
|
||||||
|
// (2) Verify equalsFoldCase() result
|
||||||
|
assertEquals(true, source.equalsFoldCase(expected), "source.equalsFoldCase(expected)");
|
||||||
|
assertEquals(true, expected.equalsFoldCase(source), "expected.equalsFoldCase(source)");
|
||||||
|
return null;
|
||||||
|
})
|
||||||
|
.filter(error -> error != null)
|
||||||
|
.toArray();
|
||||||
|
assertEquals(0, results.length);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testAllCodePointsFoldToThemselvesIfNotListed() throws Exception {
|
||||||
|
// Collect all code points that appear in CaseFolding.txt
|
||||||
|
var listed = Files.lines(UCDFiles.CASEFOLDING)
|
||||||
|
.filter(line -> !line.startsWith("#") && line.matches("^.*; [CF]; .*$"))
|
||||||
|
.map(line -> Integer.parseInt(line.split("; ")[0], 16))
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
|
||||||
|
var failures = new ArrayList<String>();
|
||||||
|
|
||||||
|
// Scan BMP + Supplementary Plane 1 (U+0000..U+1FFFF)
|
||||||
|
for (int cp = Character.MIN_CODE_POINT; cp <= 0x1FFFF; cp++) {
|
||||||
|
if (!Character.isDefined(cp)) {
|
||||||
|
continue; // skip undefined
|
||||||
|
}
|
||||||
|
if (Character.isSurrogate((char) cp)) {
|
||||||
|
continue; // skip surrogate code units
|
||||||
|
}
|
||||||
|
if (listed.contains(cp)) {
|
||||||
|
continue; // already tested separately
|
||||||
|
}
|
||||||
|
String s = new String(Character.toChars(cp));
|
||||||
|
String folded = foldCase(s);
|
||||||
|
if (!s.equals(folded)) {
|
||||||
|
failures.add(String.format("Unexpected folding: U+%04X '%s' → '%s'", cp, s, folded));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
assertEquals(0, failures.size(),
|
||||||
|
() -> "Some unlisted code points folded unexpectedly:\n"
|
||||||
|
+ String.join("\n", failures));
|
||||||
|
}
|
||||||
|
|
||||||
|
@ParameterizedTest(name = "CaseFold \"{0}\" → \"{1}\"")
|
||||||
|
@MethodSource("caseFoldTestCases")
|
||||||
|
void testIndividualCaseFolding(String input, String expected) {
|
||||||
|
assertEquals(expected, foldCase(input));
|
||||||
|
}
|
||||||
|
|
||||||
|
static Stream<Arguments> caseFoldTestCases() {
|
||||||
|
return Stream.of(
|
||||||
|
// ASCII simple cases
|
||||||
|
Arguments.of("ABC", "abc"),
|
||||||
|
Arguments.of("already", "already"),
|
||||||
|
Arguments.of("MiXeD123", "mixed123"),
|
||||||
|
// --- Latin-1 to non-Latin-1 fold ---
|
||||||
|
Arguments.of("aBc\u00B5Efg", "abc\u03BCefg"), // "µ" → "μ"
|
||||||
|
Arguments.of("test\u00B5\ud801\udc00X", "test\u03bc\ud801\udc28x"),
|
||||||
|
// German Eszett
|
||||||
|
Arguments.of("Stra\u00DFe", "strasse"), // "Straße"
|
||||||
|
Arguments.of("\u1E9E", "ss"), // "ẞ" capital sharp S
|
||||||
|
// Turkish dotted I / dotless i
|
||||||
|
Arguments.of("I", "i"),
|
||||||
|
Arguments.of("\u0130", "i\u0307"), // capital dotted I → "i + dot above"
|
||||||
|
Arguments.of("\u0069\u0307", "i\u0307"), // small i + dot above remains
|
||||||
|
Arguments.of("\u0131", "\u0131"), // "ı" (dotless i stays dotless)
|
||||||
|
|
||||||
|
// Greek special cases ---
|
||||||
|
Arguments.of("\u039F\u03A3", "\u03BF\u03C3"), // "ΟΣ" → "οσ" final sigma always folds to normal sigma
|
||||||
|
Arguments.of("\u1F88", "\u1F00\u03B9"), // "ᾈ" → "ἀι" Alpha with psili + ypogegrammeni
|
||||||
|
Arguments.of("\u039C\u03AC\u03CA\u03BF\u03C2", "\u03BC\u03AC\u03CA\u03BF\u03C3"), // "Μάϊος" → "μάϊοσ"
|
||||||
|
Arguments.of("\u1F08", "\u1F00"), // Ἀ (Capital Alpha with psili) → ἀ
|
||||||
|
|
||||||
|
// Supplementary Plane characters
|
||||||
|
Arguments.of("\uD801\uDC00", "\uD801\uDC28"), // Deseret Capital Letter Long I → Small
|
||||||
|
Arguments.of("\uD801\uDC01", "\uD801\uDC29"), // Deseret Capital Letter Long E → Small
|
||||||
|
|
||||||
|
// Supplementary inside ASCII
|
||||||
|
Arguments.of("abc\uD801\uDC00def", "abc\uD801\uDC28def"),
|
||||||
|
// Ligatures and compatibility folds
|
||||||
|
Arguments.of("\uFB00", "ff"), // ff → ff
|
||||||
|
Arguments.of("\uFB03", "ffi"), // ffi → ffi
|
||||||
|
Arguments.of("\u212A", "k"), // Kelvin sign → k
|
||||||
|
|
||||||
|
Arguments.of("abc\uFB00def", "abcffdef"), // ff → ff
|
||||||
|
Arguments.of("abc\uFB03def", "abcffidef"), // ffi → ffi
|
||||||
|
Arguments.of("abc\u212Adef", "abckdef"), // Kelvin sign → k
|
||||||
|
|
||||||
|
// --- Fullwidth ---
|
||||||
|
Arguments.of("\uFF21\uFF22\uFF23", "\uFF41\uFF42\uFF43"), // "ABC" → "abc"
|
||||||
|
|
||||||
|
// --- Armenian ---
|
||||||
|
Arguments.of("\u0531", "\u0561"), // "Ա" → "ա"
|
||||||
|
|
||||||
|
// --- Cherokee ---
|
||||||
|
Arguments.of("\u13A0", "\u13A0"), // Capital Cherokee A folds to itself
|
||||||
|
Arguments.of("\uAB70", "\u13A0") // Small Cherokee A folds Capital Cherokee A
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
static Stream<Arguments> caseFoldEqualProvider() {
|
||||||
|
return Stream.of(
|
||||||
|
Arguments.of("abc", "ABC"),
|
||||||
|
Arguments.of("aBcDe", "AbCdE"),
|
||||||
|
Arguments.of("\u00C0\u00E7", "\u00E0\u00C7"), // Àç vs àÇ
|
||||||
|
Arguments.of("straße", "STRASSE"), // ß → ss
|
||||||
|
Arguments.of("\uD83C\uDDE6", "\uD83C\uDDE6"), // 🇦 vs 🇦
|
||||||
|
Arguments.of("\u1E9E", "ss"), // ẞ (capital sharp S)
|
||||||
|
Arguments.of("\u03A3", "\u03C3"), // Σ vs σ (Greek Sigma)
|
||||||
|
Arguments.of("\u03C3", "\u03C2"), // σ vs ς (Greek sigma/final sigma)
|
||||||
|
Arguments.of("\u212B", "\u00E5"), // Å (Angstrom sign) vs å
|
||||||
|
Arguments.of("\uFB00", "ff"), // ff (ligature)
|
||||||
|
Arguments.of("\u01C5", "\u01C5"), // Dž (Latin capital D with small z with caron)
|
||||||
|
Arguments.of("Caf\u00E9", "CAF\u00C9"), // Café vs CAFÉ
|
||||||
|
Arguments.of("\u03BA\u03B1\u03BB\u03B7\u03BC\u03AD\u03C1\u03B1", "\u039A\u0391\u039B\u0397\u039C\u0388\u03A1\u0391"), // καλημέρα vs ΚΑΛΗΜΕΡΑ
|
||||||
|
Arguments.of("\u4E2D\u56FD", "\u4E2D\u56FD"), // 中国
|
||||||
|
Arguments.of("\u03B1", "\u0391"), // α vs Α (Greek alpha)
|
||||||
|
Arguments.of("\u212B", "\u00C5"), // Å vs Å
|
||||||
|
// from StringCompareToIgnoreCase
|
||||||
|
Arguments.of("\u0100\u0102\u0104\u0106\u0108", "\u0100\u0102\u0104\u0106\u0109"), // ĀĂĄĆĈ vs ĀĂĄĆĉ
|
||||||
|
Arguments.of("\u0101\u0103\u0105\u0107\u0109", "\u0100\u0102\u0104\u0106\u0109"), // āăąćĉ vs ĀĂĄĆĉ
|
||||||
|
Arguments.of("\ud801\udc00\ud801\udc01\ud801\udc02\ud801\udc03\ud801\udc04",
|
||||||
|
"\ud801\udc00\ud801\udc01\ud801\udc02\ud801\udc03\ud801\udc2c"), // 𐐀𐐁𐐂𐐃𐐄 vs 𐐀𐐁𐐂𐐃𐐬
|
||||||
|
Arguments.of("\ud801\udc28\ud801\udc29\ud801\udc2a\ud801\udc2b\ud801\udc2c",
|
||||||
|
"\ud801\udc00\ud801\udc01\ud801\udc02\ud801\udc03\ud801\udc2c") // 𐐨𐐩𐐪𐐫𐐬 vs 𐐀𐐁𐐂𐐃𐐬
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
@ParameterizedTest
|
||||||
|
@MethodSource("caseFoldEqualProvider")
|
||||||
|
void testcompareToFoldCaseEquals(String s1, String s2) {
|
||||||
|
assertEquals(0, s1.compareToFoldCase(s2));
|
||||||
|
assertEquals(0, s2.compareToFoldCase(s1));
|
||||||
|
assertEquals(true, s1.equalsFoldCase(s2));
|
||||||
|
assertEquals(true, s2.equalsFoldCase(s1));
|
||||||
|
assertEquals(foldCase(s1), foldCase(s2));
|
||||||
|
}
|
||||||
|
|
||||||
|
static Stream<Arguments> caseFoldOrderingProvider() {
|
||||||
|
return Stream.of(
|
||||||
|
Arguments.of("asa", "aß", -1), // ß → ss → "asa" < "ass"
|
||||||
|
Arguments.of("aß", "asa", +1),
|
||||||
|
Arguments.of("a\u00DF", "ass", 0), // aß vs ass
|
||||||
|
Arguments.of("\uFB03", "ffi", 0), // ffi (ligature)
|
||||||
|
Arguments.of("\u00C5", "Z", 1), // Å vs Z
|
||||||
|
Arguments.of("A", "\u00C0", -1), // A vs À
|
||||||
|
Arguments.of("\u03A9", "\u03C9", 0), // Ω vs ω
|
||||||
|
Arguments.of("\u03C2", "\u03C3", 0), // ς vs σ
|
||||||
|
Arguments.of("\uD835\uDD23", "R", 1), // 𝔯 (fraktur r) vs R
|
||||||
|
Arguments.of("\uFF26", "E", 1), // F (full-width F) vs E
|
||||||
|
Arguments.of("\u00C9clair", "Eclair", 1), // Éclair vs Eclair
|
||||||
|
Arguments.of("\u03bc\u00df", "\u00b5s", 1),
|
||||||
|
Arguments.of("\u00b5s", "\u03bc\u00df", -1)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
@ParameterizedTest
|
||||||
|
@MethodSource("caseFoldOrderingProvider")
|
||||||
|
void testcompareToFoldCaseOrdering(String s1, String s2, int expectedSign) {
|
||||||
|
int cmp = s1.compareToFoldCase(s2);
|
||||||
|
assertEquals(expectedSign, Integer.signum(cmp));
|
||||||
|
}
|
||||||
|
|
||||||
|
static Stream<Arguments> roundTripProvider() {
|
||||||
|
return Stream.of(
|
||||||
|
Arguments.of("abc"),
|
||||||
|
Arguments.of("ABC"),
|
||||||
|
Arguments.of("straße"),
|
||||||
|
Arguments.of("Àç"),
|
||||||
|
Arguments.of("aß"),
|
||||||
|
Arguments.of("\uFB02uff"), // fluff (ligature in "fluff")
|
||||||
|
Arguments.of("\u00C9COLE") // ÉCOLE
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
@ParameterizedTest
|
||||||
|
@MethodSource("roundTripProvider")
|
||||||
|
void testCaseFoldRoundTrip(String s) {
|
||||||
|
String folded = foldCase(s);
|
||||||
|
assertEquals(0, s.compareToFoldCase(folded));
|
||||||
|
assertEquals(0, folded.compareToFoldCase(s));
|
||||||
|
assertEquals(true, s.equalsFoldCase(folded));
|
||||||
|
assertEquals(true, folded.equalsFoldCase(s));
|
||||||
|
}
|
||||||
|
|
||||||
|
// helper to test the integrity of folding mapping
|
||||||
|
private static int[] longToFolding(long value) {
|
||||||
|
int len = (int) (value >>> 48);
|
||||||
|
if (len == 0) {
|
||||||
|
return new int[]{(int) (value & 0xFFFFF)};
|
||||||
|
} else {
|
||||||
|
var folding = new int[len];
|
||||||
|
for (int i = 0; i < len; i++) {
|
||||||
|
folding[i] = (int) (value & 0xFFFF);
|
||||||
|
value >>= 16;
|
||||||
|
}
|
||||||
|
return folding;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String foldCase(String s) {
|
||||||
|
int first;
|
||||||
|
int len = s.length();
|
||||||
|
int cpCnt = 1;
|
||||||
|
for (first = 0; first < len; first += cpCnt) {
|
||||||
|
int cp = s.codePointAt(first);
|
||||||
|
if (CaseFolding.isDefined(cp)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
cpCnt = Character.charCount(cp);
|
||||||
|
}
|
||||||
|
if (first == len) {
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
StringBuilder sb = new StringBuilder(len);
|
||||||
|
sb.append(s, 0, first);
|
||||||
|
for (int i = first; i < len; i += cpCnt) {
|
||||||
|
int cp = s.codePointAt(i);
|
||||||
|
int[] folded = longToFolding(CaseFolding.fold(cp));
|
||||||
|
for (int f : folded) {
|
||||||
|
sb.appendCodePoint(f);
|
||||||
|
}
|
||||||
|
cpCnt = Character.charCount(cp);
|
||||||
|
}
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,200 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
|
||||||
|
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||||
|
*
|
||||||
|
* This code is free software; you can redistribute it and/or modify it
|
||||||
|
* under the terms of the GNU General Public License version 2 only, as
|
||||||
|
* published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||||
|
* version 2 for more details (a copy is included in the LICENSE file that
|
||||||
|
* accompanied this code).
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License version
|
||||||
|
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||||
|
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
*
|
||||||
|
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||||
|
* or visit www.oracle.com if you need additional information or have any
|
||||||
|
* questions.
|
||||||
|
*/
|
||||||
|
package org.openjdk.bench.java.lang;
|
||||||
|
|
||||||
|
import org.openjdk.jmh.annotations.*;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This benchmark naively explores String::compareToFoldCase performance
|
||||||
|
*/
|
||||||
|
@BenchmarkMode(Mode.AverageTime)
|
||||||
|
@OutputTimeUnit(TimeUnit.NANOSECONDS)
|
||||||
|
@State(Scope.Thread)
|
||||||
|
@Warmup(iterations = 5, time = 1)
|
||||||
|
@Measurement(iterations = 5, time = 1)
|
||||||
|
@Fork(3)
|
||||||
|
public class StringCompareToFoldCase {
|
||||||
|
|
||||||
|
private String asciiUpper = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
|
||||||
|
private String asciiUpperLower = "ABCDEFGHIJKLMNOpqrstuvwxyz";
|
||||||
|
private String asciiLower = "abcdefghijklmnopqrstuvwxyz";
|
||||||
|
|
||||||
|
private String asciiWithDF = "abcdßßßßßßßßßßßßßßßßWXYZ";
|
||||||
|
private String asciiWithDFSS = "abcdssssssssssssssssßßßßßßßßWXYZ";
|
||||||
|
|
||||||
|
private String asciiLatine1 = "ABCDEFGHIJKLMNOpqrstuvwxyz0";
|
||||||
|
private String asciiLatin1UTF16 = "abcdefghijklmnopqrstuvwxyz\u0391";
|
||||||
|
|
||||||
|
private String greekUpper = "\u0391\u0392\u0393\u0394\u0395\u0391\u0392\u0393\u0394\u0395"; // ΑΒΓΔΕ
|
||||||
|
private String greekUpperLower = "\u0391\u0392\u0393\u0394\u0395\u0391\u0392\u0393\u0394\u03B5"; // ΑΒΓΔε
|
||||||
|
private String greekLower = "\u03B1\u03B2\u03B3\u03B4\u03B5\u03B1\u03B2\u03B3\u03B4\u03B5"; // αβγδε
|
||||||
|
|
||||||
|
public String supUpper = "\ud801\udc00\ud801\udc01\ud801\udc02\ud801\udc03\ud801\udc04";
|
||||||
|
public String supUpperLower = "\ud801\udc00\ud801\udc01\ud801\udc02\ud801\udc03\ud801\udc2c";
|
||||||
|
public String supLower = "\ud801\udc28\ud801\udc29\ud801\udc2a\ud801\udc2b\ud801\udc2c";
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public int asciiUpperLower() {
|
||||||
|
return asciiUpper.compareToIgnoreCase(asciiUpperLower);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public int asciiLower() {
|
||||||
|
return asciiUpper.compareToIgnoreCase(asciiLower);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public int greekUpperLower() {
|
||||||
|
return greekUpper.compareToIgnoreCase(greekUpperLower);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public int greekLower() {
|
||||||
|
return greekUpper.compareToIgnoreCase(greekLower);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public int latin1UTF16() {
|
||||||
|
return asciiLatine1.compareToIgnoreCase(asciiLatin1UTF16);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public int supUpperLower() {
|
||||||
|
return supUpper.compareToIgnoreCase(supUpperLower);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public int supLower() {
|
||||||
|
return supUpper.compareToIgnoreCase(supLower);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public int asciiUpperLowerFC() {
|
||||||
|
return asciiUpper.compareToFoldCase(asciiUpperLower);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public int asciiLowerFC() {
|
||||||
|
return asciiUpper.compareToFoldCase(asciiLower);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public int asciiWithDFFC() {
|
||||||
|
return asciiWithDF.compareToFoldCase(asciiWithDFSS);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public int greekUpperLowerFC() {
|
||||||
|
return greekUpper.compareToFoldCase(greekUpperLower);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public int greekLowerFC() {
|
||||||
|
return greekUpper.compareToFoldCase(greekLower);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public int latin1UTF16FC() {
|
||||||
|
return asciiLatine1.compareToFoldCase(asciiLatin1UTF16); }
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public int supUpperLowerFC() {
|
||||||
|
return supUpper.compareToFoldCase(supUpperLower);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public int supLowerFC() {
|
||||||
|
return supUpper.compareToFoldCase(supLower);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public boolean asciiUpperLowerEQ() {
|
||||||
|
return asciiUpper.equalsIgnoreCase(asciiUpperLower);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public boolean asciiLowerEQ() {
|
||||||
|
return asciiUpper.equalsIgnoreCase(asciiLower);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public boolean greekUpperLowerEQ() {
|
||||||
|
return greekUpper.equalsIgnoreCase(greekUpperLower);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public boolean greekLowerEQ() {
|
||||||
|
return greekUpper.equalsIgnoreCase(greekLower);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public boolean latin1UTF16EQ() {
|
||||||
|
return asciiLatine1.equalsIgnoreCase(asciiLatin1UTF16);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public boolean supUpperLowerEQ() {
|
||||||
|
return supUpper.equalsIgnoreCase(supUpperLower);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public boolean supLowerEQ() {
|
||||||
|
return supUpper.equalsIgnoreCase(supLower);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public boolean asciiUpperLowerEQFC() {
|
||||||
|
return asciiUpper.equalsFoldCase(asciiUpperLower);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public boolean asciiLowerEQFC() {
|
||||||
|
return asciiUpper.equalsFoldCase(asciiLower);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public boolean greekUpperLowerEQFC() {
|
||||||
|
return greekUpper.equalsFoldCase(greekUpperLower);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public boolean greekLowerEQFC() {
|
||||||
|
return greekUpper.equalsFoldCase(greekLower);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public boolean latin1UTF16EQFC() {
|
||||||
|
return asciiLatine1.equalsFoldCase(asciiLatin1UTF16);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public boolean supUpperLowerEQFC() {
|
||||||
|
return supUpper.equalsFoldCase(supUpperLower);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public boolean supLowerEQFC() {
|
||||||
|
return supUpper.equalsFoldCase(supLower);
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user