From b97ed667db0bd527461b2b385af3001f53d71c19 Mon Sep 17 00:00:00 2001 From: Xueming Shen Date: Tue, 2 Dec 2025 19:47:18 +0000 Subject: [PATCH] 8365675: Add String Unicode Case-Folding Support Reviewed-by: rriggs, naoto, ihse --- make/ToolsJdk.gmk | 2 +- .../tools/generatecharacter/CaseFolding.java | 73 ---- .../GenerateCaseFolding.java | 134 +++++++ .../java.base/gensrc/GensrcCharacterData.gmk | 17 + make/modules/java.base/gensrc/GensrcRegex.gmk | 17 - .../share/classes/java/lang/String.java | 161 ++++++++- .../share/classes/java/lang/StringLatin1.java | 124 +++++++ .../share/classes/java/lang/StringUTF16.java | 74 +++- .../classes/java/util/regex/Pattern.java | 2 +- .../internal/lang/CaseFolding.java.template | 208 +++++++++++ .../util/regex/CaseFolding.java.template | 116 ------ .../lang/String/UnicodeCaseFoldingTest.java | 329 ++++++++++++++++++ .../java/lang/StringCompareToFoldCase.java | 200 +++++++++++ 13 files changed, 1245 insertions(+), 212 deletions(-) delete mode 100644 make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java create mode 100644 make/jdk/src/classes/build/tools/generatecharacter/GenerateCaseFolding.java create mode 100644 src/java.base/share/classes/jdk/internal/lang/CaseFolding.java.template delete mode 100644 src/java.base/share/classes/jdk/internal/util/regex/CaseFolding.java.template create mode 100644 test/jdk/java/lang/String/UnicodeCaseFoldingTest.java create mode 100644 test/micro/org/openjdk/bench/java/lang/StringCompareToFoldCase.java diff --git a/make/ToolsJdk.gmk b/make/ToolsJdk.gmk index 629cadbf83a..b04d7820c91 100644 --- a/make/ToolsJdk.gmk +++ b/make/ToolsJdk.gmk @@ -79,7 +79,7 @@ TOOL_GENERATEEXTRAPROPERTIES = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_too build.tools.generateextraproperties.GenerateExtraProperties TOOL_GENERATECASEFOLDING = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \ - build.tools.generatecharacter.CaseFolding + build.tools.generatecharacter.GenerateCaseFolding 
TOOL_MAKEZIPREPRODUCIBLE = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \ build.tools.makezipreproducible.MakeZipReproducible diff --git a/make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java b/make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java deleted file mode 100644 index 9abc2059b6a..00000000000 --- a/make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. 
- */ - -package build.tools.generatecharacter; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Paths; -import java.nio.file.StandardOpenOption; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -public class CaseFolding { - - public static void main(String[] args) throws Throwable { - if (args.length != 3) { - System.err.println("Usage: java CaseFolding TemplateFile CaseFolding.txt CaseFolding.java"); - System.exit(1); - } - var templateFile = Paths.get(args[0]); - var caseFoldingTxt = Paths.get(args[1]); - var genSrcFile = Paths.get(args[2]); - var supportedTypes = "^.*; [CTS]; .*$"; - var caseFoldingEntries = Files.lines(caseFoldingTxt) - .filter(line -> !line.startsWith("#") && line.matches(supportedTypes)) - .map(line -> { - String[] cols = line.split("; "); - return new String[] {cols[0], cols[1], cols[2]}; - }) - .filter(cols -> { - // the folding case doesn't map back to the original char. - var cp1 = Integer.parseInt(cols[0], 16); - var cp2 = Integer.parseInt(cols[2], 16); - return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1; - }) - .map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2])) - .collect(Collectors.joining(",\n", "", "")); - - // hack, hack, hack! the logic does not pick 0131. just add manually to support 'I's. - // 0049; T; 0131; # LATIN CAPITAL LETTER I - final String T_0x0131_0x49 = String.format(" entry(0x%04x, 0x%04x),\n", 0x0131, 0x49); - - // Generate .java file - Files.write( - genSrcFile, - Files.lines(templateFile) - .map(line -> line.contains("%%%Entries") ? 
T_0x0131_0x49 + caseFoldingEntries : line) - .collect(Collectors.toList()), - StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING); - } -} diff --git a/make/jdk/src/classes/build/tools/generatecharacter/GenerateCaseFolding.java b/make/jdk/src/classes/build/tools/generatecharacter/GenerateCaseFolding.java new file mode 100644 index 00000000000..2f6a9add5cb --- /dev/null +++ b/make/jdk/src/classes/build/tools/generatecharacter/GenerateCaseFolding.java @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ +package build.tools.generatecharacter; + +import java.nio.file.Files; +import java.nio.file.Paths; +import java.nio.file.StandardOpenOption; +import java.util.Arrays; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +public class GenerateCaseFolding { + + public static void main(String[] args) throws Throwable { + if (args.length != 3) { + System.err.println("Usage: java GenerateCaseFolding TemplateFile CaseFolding.txt CaseFolding.java"); + System.exit(1); + } + var templateFile = Paths.get(args[0]); + var caseFoldingTxt = Paths.get(args[1]); + var genSrcFile = Paths.get(args[2]); + + // java.lang + var supportedTypes = "^.*; [CF]; .*$"; // full/1:M case folding + String[][] caseFoldings = Files.lines(caseFoldingTxt) + .filter(line -> !line.startsWith("#") && line.matches(supportedTypes)) + .map(line -> { + var fields = line.split("; "); + var cp = fields[0]; + fields = fields[2].trim().split(" "); + var folding = new String[fields.length + 1]; + folding[0] = cp; + System.arraycopy(fields, 0, folding, 1, fields.length); + return folding; + }) + .toArray(size -> new String[size][]); + + // util.regex + var expandedSupportedTypes = "^.*; [CTS]; .*$"; + var expanded_caseFoldingEntries = Files.lines(caseFoldingTxt) + .filter(line -> !line.startsWith("#") && line.matches(expandedSupportedTypes)) + .map(line -> { + String[] cols = line.split("; "); + return new String[]{cols[0], cols[1], cols[2]}; + }) + .filter(cols -> { + // the folding case doesn't map back to the original char. + var cp1 = Integer.parseInt(cols[0], 16); + var cp2 = Integer.parseInt(cols[2], 16); + return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1; + }) + .map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2])) + .collect(Collectors.joining(",\n", "", "")); + + // hack, hack, hack! the logic does not pick 0131. just add manually to support 'I's. 
+ // 0049; T; 0131; # LATIN CAPITAL LETTER I + final String T_0x0131_0x49 = String.format(" entry(0x%04x, 0x%04x),\n", 0x0131, 0x49); + + Files.write( + genSrcFile, + Files.lines(templateFile) + .map(line -> line.contains("%%%Entries") ? genFoldingEntries(caseFoldings) : line) + .map(line -> line.contains("%%%Expanded_Case_Map_Entries") ? T_0x0131_0x49 + expanded_caseFoldingEntries : line) + .collect(Collectors.toList()), + StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING); + } + + private static long foldingToLong(String[] folding) { + int cp = Integer.parseInt(folding[0], 16); + long value = (long)Integer.parseInt(folding[1], 16); + if (!Character.isSupplementaryCodePoint(cp) && folding.length != 2) { + var shift = 16; + for (int j = 2; j < folding.length; j++) { + value |= (long)Integer.parseInt(folding[j], 16) << shift; + shift <<= 1; + } + value = value | (long) (folding.length - 1) << 48; + } + return value; + } + + private static String genFoldingEntries(String[][] foldings) { + StringBuilder sb = new StringBuilder(); + sb.append(" private static final int[] CASE_FOLDING_CPS = {\n"); + int width = 10; + for (int i = 0; i < foldings.length; i++) { + if (i % width == 0) + sb.append(" "); + sb.append(String.format("0X%s", foldings[i][0])); + if (i < foldings.length - 1) + sb.append(", "); + if (i % width == width - 1 || i == foldings.length - 1) + sb.append("\n"); + } + sb.append(" };\n\n"); + + sb.append(" private static final long[] CASE_FOLDING_VALUES = {\n"); + width = 6; + for (int i = 0; i < foldings.length; i++) { + if (i % width == 0) + sb.append(" "); // indent + sb.append(String.format("0x%013xL", foldingToLong(foldings[i]))); + if (i < foldings.length - 1) + sb.append(", "); + if (i % width == width - 1 || i == foldings.length - 1) { + sb.append("\n"); + } + } + sb.append(" };\n"); + return sb.toString(); + } +} diff --git a/make/modules/java.base/gensrc/GensrcCharacterData.gmk b/make/modules/java.base/gensrc/GensrcCharacterData.gmk 
index c05b126299b..d7947d907e2 100644 --- a/make/modules/java.base/gensrc/GensrcCharacterData.gmk +++ b/make/modules/java.base/gensrc/GensrcCharacterData.gmk @@ -72,5 +72,22 @@ TARGETS += $(GENSRC_CHARACTERDATA) ################################################################################ + +GENSRC_STRINGCASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/lang/CaseFolding.java + +STRINGCASEFOLDING_TEMPLATE := $(MODULE_SRC)/share/classes/jdk/internal/lang/CaseFolding.java.template +CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt + +$(GENSRC_STRINGCASEFOLDING): $(BUILD_TOOLS_JDK) $(STRINGCASEFOLDING_TEMPLATE) $(CASEFOLDINGTXT) + $(call LogInfo, Generating $@) + $(call MakeTargetDir) + $(TOOL_GENERATECASEFOLDING) \ + $(STRINGCASEFOLDING_TEMPLATE) \ + $(CASEFOLDINGTXT) \ + $(GENSRC_STRINGCASEFOLDING) + +TARGETS += $(GENSRC_STRINGCASEFOLDING) + + endif # include guard include MakeIncludeEnd.gmk diff --git a/make/modules/java.base/gensrc/GensrcRegex.gmk b/make/modules/java.base/gensrc/GensrcRegex.gmk index a30f22b34d4..c46a029e2c2 100644 --- a/make/modules/java.base/gensrc/GensrcRegex.gmk +++ b/make/modules/java.base/gensrc/GensrcRegex.gmk @@ -50,22 +50,5 @@ TARGETS += $(GENSRC_INDICCONJUNCTBREAK) ################################################################################ -GENSRC_CASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/util/regex/CaseFolding.java - -CASEFOLDINGTEMP := $(MODULE_SRC)/share/classes/jdk/internal/util/regex/CaseFolding.java.template -CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt - -$(GENSRC_CASEFOLDING): $(BUILD_TOOLS_JDK) $(CASEFOLDINGTEMP) $(CASEFOLDINGTXT) - $(call LogInfo, Generating $@) - $(call MakeTargetDir) - $(TOOL_GENERATECASEFOLDING) \ - $(CASEFOLDINGTEMP) \ - $(CASEFOLDINGTXT) \ - $(GENSRC_CASEFOLDING) - -TARGETS += $(GENSRC_CASEFOLDING) - -################################################################################ - endif # include guard include 
MakeIncludeEnd.gmk diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java index 52f908c9e98..d7aef113e15 100644 --- a/src/java.base/share/classes/java/lang/String.java +++ b/src/java.base/share/classes/java/lang/String.java @@ -117,9 +117,38 @@ import sun.nio.cs.UTF_8; * Unicode code points (i.e., characters), in addition to those for * dealing with Unicode code units (i.e., {@code char} values). * - *

Unless otherwise noted, methods for comparing Strings do not take locale - * into account. The {@link java.text.Collator} class provides methods for - * finer-grain, locale-sensitive String comparison. + *

String comparison and case-insensitive matching + * + *

There are several related ways to compare {@code String} values; choose + * the one whose semantics fit your purpose: + * + *

+ * + *

Unless otherwise noted, methods for comparing Strings do not take locale into + * account. The {@link java.text.Collator} class provides methods for finer-grain, + * locale-sensitive String comparison. * * @implNote The implementation of the string concatenation operator is left to * the discretion of a Java compiler, as long as the compiler ultimately conforms @@ -2179,6 +2208,7 @@ public final class String * false} otherwise * * @see #equals(Object) + * @see #equalsFoldCase(String) * @see #codePoints() */ public boolean equalsIgnoreCase(String anotherString) { @@ -2188,6 +2218,57 @@ public final class String && regionMatches(true, 0, anotherString, 0, length()); } + /** + * Compares this {@code String} to another {@code String} for equality, + * using {@index "Unicode case folding"}. Two strings are considered equal + * by this method if their case-folded forms are identical. + *

+ * Case folding is defined by the Unicode Standard in + * CaseFolding.txt, + * including 1:M mappings. For example, {@code "Fuß".equalsFoldCase("FUSS")} + * returns {@code true}, since the character {@code U+00DF} (sharp s) folds + * to {@code "ss"}. + *

+ * Case folding is locale-independent and language-neutral, unlike + * locale-sensitive transformations such as {@link #toLowerCase()} or + * {@link #toUpperCase()}. It is intended for caseless matching, + * searching, and indexing. + * + * @apiNote + * This method is the Unicode-compliant alternative to + * {@link #equalsIgnoreCase(String)}. It implements full case folding as + * defined by the Unicode Standard, which may differ from the simpler + * per-character mapping performed by {@code equalsIgnoreCase}. + * For example: + * {@snippet lang=java : + * String a = "Fuß"; + * String b = "FUSS"; + * boolean equalsFoldCase = a.equalsFoldCase(b); // returns true + * boolean equalsIgnoreCase = a.equalsIgnoreCase(b); // returns false + * } + * + * @param anotherString + * The {@code String} to compare this {@code String} against + * + * @return {@code true} if the given object is not {@code null} and represents + * the same sequence of characters as this string under Unicode case + * folding; {@code false} otherwise. + * + * @spec https://www.unicode.org/versions/latest/core-spec/chapter-5/#G21790 Unicode Caseless Matching + * @see #compareToFoldCase(String) + * @see #equalsIgnoreCase(String) + * @since 26 + */ + public boolean equalsFoldCase(String anotherString) { + if (this == anotherString) { + return true; + } + if (anotherString == null) { + return false; + } + return UNICODE_CASEFOLD_ORDER.compare(this, anotherString) == 0; + } + /** * Compares two strings lexicographically. * The comparison is based on the Unicode value of each character in @@ -2303,12 +2384,86 @@ public final class String * than this String, ignoring case considerations. * @see java.text.Collator * @see #codePoints() + * @see #compareToFoldCase(String) * @since 1.2 */ public int compareToIgnoreCase(String str) { return CASE_INSENSITIVE_ORDER.compare(this, str); } + /** + * A Comparator that orders {@code String} objects as by + * {@link #compareToFoldCase(String) compareToFoldCase()}. 
+ * + * @see #compareToFoldCase(String) + * @since 26 + */ + public static final Comparator<String> UNICODE_CASEFOLD_ORDER + = new FoldCaseComparator(); + + private static class FoldCaseComparator implements Comparator<String> { + + @Override + public int compare(String s1, String s2) { + byte[] v1 = s1.value; + byte[] v2 = s2.value; + if (s1.coder == s2.coder()) { // same coder on both sides + return s1.coder == LATIN1 ? StringLatin1.compareToFC(v1, v2) + : StringUTF16.compareToFC(v1, v2); + } + return s1.coder == LATIN1 ? StringLatin1.compareToFC_UTF16(v1, v2) + : StringUTF16.compareToFC_Latin1(v1, v2); + } + } + + /** + * Compares two strings lexicographically using {@index "Unicode case folding"}. + * This method returns an integer whose sign is that of calling {@code compareTo} + * on the Unicode case folded version of the strings. Unicode Case folding + * eliminates differences in case according to the Unicode Standard, using the + * mappings defined in + * CaseFolding.txt, + * including 1:M mappings, such as {@code "ß"} → {@code "ss"}. + *

+ * Case folding is a locale-independent, language-neutral form of case mapping, + * primarily intended for caseless matching. Unlike {@link #compareToIgnoreCase(String)}, + * which applies a simpler locale-insensitive uppercase mapping, this method + * follows the Unicode {@index "full"} case folding, providing stable and + * consistent results across all environments. + *

+ * Note that this method does not take locale into account, and may + * produce results that differ from locale-sensitive ordering. Use + * {@link java.text.Collator} for locale-sensitive comparison. + * + * @apiNote + * This method is the Unicode-compliant alternative to + * {@link #compareToIgnoreCase(String)}. It implements the + * {@index "full case folding"} as defined by the Unicode Standard, which + * may differ from the simpler per-character mapping performed by + * {@code compareToIgnoreCase}. + * For example: + * {@snippet lang=java : + * String a = "Fuß"; + * String b = "FUSS"; + * int cmpFoldCase = a.compareToFoldCase(b); // returns 0 + * int cmpIgnoreCase = a.compareToIgnoreCase(b); // returns > 0 + * } + * + * @param str the {@code String} to be compared. + * @return a negative integer, zero, or a positive integer as the specified + * String is greater than, equal to, or less than this String, + * ignoring case considerations by case folding. + * + * @spec https://www.unicode.org/versions/latest/core-spec/chapter-5/#G21790 Unicode Caseless Matching + * @see java.text.Collator + * @see #compareToIgnoreCase(String) + * @see #equalsFoldCase(String) + * @since 26 + */ + public int compareToFoldCase(String str) { + return UNICODE_CASEFOLD_ORDER.compare(this, str); + } + /** * Tests if two string regions are equal. *

diff --git a/src/java.base/share/classes/java/lang/StringLatin1.java b/src/java.base/share/classes/java/lang/StringLatin1.java index 61c62d049bc..21a8b2dd61a 100644 --- a/src/java.base/share/classes/java/lang/StringLatin1.java +++ b/src/java.base/share/classes/java/lang/StringLatin1.java @@ -32,6 +32,8 @@ import java.util.function.Consumer; import java.util.function.IntConsumer; import java.util.stream.Stream; import java.util.stream.StreamSupport; + +import jdk.internal.lang.CaseFolding; import jdk.internal.util.ArraysSupport; import jdk.internal.vm.annotation.IntrinsicCandidate; @@ -179,6 +181,128 @@ final class StringLatin1 { return len1 - len2; } + private static int compareToFC0(byte[] value, int off, int last, byte[] other, int ooff, int olast) { + int k1 = off, k2 = ooff; + boolean lo1 = false, lo2 = false; // true if we have a leftover 's' from u+00df -> ss + while ((k1 < last || lo1) && (k2 < olast || lo2)) { + int c1, c2; + if (lo1) { + c1 = 0x73; // leftover 's' + lo1 = false; + } else { + c1 = getChar(value, k1++); + if (c1 == 0xdf) { + c1 = 0x73; + lo1 = true; + } + } + if (lo2) { + c2 = 0x73; // 's' + lo2 = false; + } else { + c2 = getChar(other, k2++); + if (c2 == 0xdf) { + c2 = 0x73; + lo2 = true; + } + } + if (!CharacterDataLatin1.equalsIgnoreCase((byte)c1, (byte)c2)) { + return Character.toLowerCase(c1) - Character.toLowerCase(c2); + } + } + if (k1 < last || lo1) { + return 1; + } + if (k2 < olast || lo2) { + return -1; + } + return 0; + } + + static int compareToFC(byte[] value, byte[] other) { + int len = value.length; + int olen = other.length; + int lim = Math.min(len, olen); + for (int k = 0; k < lim; k++) { + byte b1 = value[k]; + byte b2 = other[k]; + if (!CharacterDataLatin1.equalsIgnoreCase(b1, b2)) { + int c1 = b1 & 0xff; + int c2 = b2 & 0xff; + if (c1 == 0xdf || c2 == 0xdf) { // 0xdf is the only 1:M in latin1 range + return compareToFC0(value, k, len, other, k, olen); + } + return Character.toLowerCase(c1) - Character.toLowerCase(c2); + 
} + } + return len - olen; + } + + private static int compareToFC0_UTF16(byte[] value, int off, int last, byte[] other, int ooff, int olast) { + int f1 = 0, f2 = 0; + int k1 = off, k2 = ooff; + while ((k1 < last || f1 != 0) && (k2 < olast || f2 != 0)) { + int c1, c2; + if (f1 != 0) { + c1 = (f1 & 0xffff); f1 >>>= 16; + } else { + c1 = getChar(value, k1++); + var f = CaseFolding.fold(c1); + if (CaseFolding.isSingleCodePoint(f)) { + c1 = (int)(f & 0x1fffff); // 21 bits: code points reach U+10FFFF + } else { + c1 = (int)f & 0xffff; + f1 = (int)(f >>> 16); + } + } + if (f2 != 0) { + c2 = f2 & 0xffff; f2 >>>= 16; + } else { + c2 = StringUTF16.codePointAt(other, k2, olast, true); + k2 += Character.charCount(c2); + var f = CaseFolding.fold(c2); + if (CaseFolding.isSingleCodePoint(f)) { + c2 = (int)(f & 0x1fffff); // 21 bits: unmapped code points pass through fold() unchanged + } else { + c2 = (int)(f & 0xffff); + f2 = (int)(f >>> 16); + } + } + if (c1 != c2) { + return c1 - c2; + } + } + if (k1 < last || f1 != 0) { + return 1; + } + if (k2 < olast || f2 != 0) { + return -1; + } + return 0; + } + + // latin1 vs utf16 + static int compareToFC_UTF16(byte[] value, byte[] other) { + int last = length(value); + int olast = StringUTF16.length(other); + int lim = Math.min(last, olast); + for (int k = 0; k < lim; k++) { + int cp1 = getChar(value, k); + int cp2 = StringUTF16.codePointAt(other, k, olast, true); + if (cp1 != cp2) { + long cf1 = CaseFolding.fold(cp1); + long cf2 = CaseFolding.fold(cp2); + if (cf1 != cf2) { + if (!CaseFolding.isSingleCodePoint(cf1) || !CaseFolding.isSingleCodePoint(cf2)) { + return compareToFC0_UTF16(value, k, last, other, k, olast); + } + return (int)(cf1 - cf2); + } + } + } + return last - olast; + } + static int hashCode(byte[] value) { return ArraysSupport.hashCodeOfUnsigned(value, 0, value.length, 0); } diff --git a/src/java.base/share/classes/java/lang/StringUTF16.java b/src/java.base/share/classes/java/lang/StringUTF16.java index 4e31c9728e9..75c9e8239ba 100644 --- a/src/java.base/share/classes/java/lang/StringUTF16.java +++ 
b/src/java.base/share/classes/java/lang/StringUTF16.java @@ -34,6 +34,7 @@ import java.util.function.IntConsumer; import java.util.stream.Stream; import java.util.stream.StreamSupport; +import jdk.internal.lang.CaseFolding; import jdk.internal.misc.Unsafe; import jdk.internal.util.ArraysSupport; import jdk.internal.vm.annotation.ForceInline; @@ -93,7 +94,7 @@ final class StringUTF16 { return value.length >> 1; } - private static int codePointAt(byte[] value, int index, int end, boolean checked) { + static int codePointAt(byte[] value, int index, int end, boolean checked) { assert index < end; if (checked) { checkIndex(index, value); @@ -592,6 +593,77 @@ final class StringUTF16 { return -StringLatin1.compareToCI_UTF16(other, value); } + public static int compareToFC_Latin1(byte[] value, byte[] other) { + return -StringLatin1.compareToFC_UTF16(other, value); + } + + private static int compareToFC0(byte[] value, int off, int last, byte[] other, int ooff, int olast) { + int f1 = 0, f2 = 0; + int k1 = off, k2 = ooff; + while ((k1 < last || f1 != 0) && (k2 < olast || f2 != 0)) { + int c1, c2; + if (f1 != 0) { + c1 = f1 & 0xffff; f1 >>>= 16; + } else { + c1 = StringUTF16.codePointAt(value, k1, last, true); + k1 += Character.charCount(c1); + var f = CaseFolding.fold(c1); + if (CaseFolding.isSingleCodePoint(f)) { + c1 = (int)(f & 0x1fffff); // 21 bits: code points reach U+10FFFF + } else { + c1 = (int)(f & 0xffff); + f1 = (int)(f >>> 16); + } + } + if (f2 != 0) { + c2 = f2 & 0xffff; f2 >>>= 16; + } else { + c2 = StringUTF16.codePointAt(other, k2, olast, true); + k2 += Character.charCount(c2); + var f = CaseFolding.fold(c2); + if (CaseFolding.isSingleCodePoint(f)) { + c2 = (int)(f & 0x1fffff); // 21 bits: unmapped code points pass through fold() unchanged + } else { + c2 = (int)(f & 0xffff); + f2 = (int)(f >>> 16); + } + } + if (c1 != c2) { + return c1 - c2; + } + } + if (k1 < last || f1 != 0) { + return 1; + } + if (k2 < olast || f2 != 0) { + return -1; + } + return 0; + } + + public static int compareToFC(byte[] value, byte[] other) { + int tlast = length(value); + int olast 
= length(other); + int lim = Math.min(tlast, olast); + int k = 0; + while (k < lim) { + int cp1 = codePointAt(value, k, tlast, true); + int cp2 = codePointAt(other, k, olast, true); + if (cp1 != cp2) { + long cf1 = CaseFolding.fold(cp1); + long cf2 = CaseFolding.fold(cp2); + if (cf1 != cf2) { + if (!CaseFolding.isSingleCodePoint(cf1) || !CaseFolding.isSingleCodePoint(cf2)) { + return compareToFC0(value, k, tlast, other, k, olast); + } + return (int) cf1 - (int) cf2; + } + } + k += Character.charCount(cp1); + } + return tlast - olast; + } + static int hashCode(byte[] value) { return ArraysSupport.hashCodeOfUTF16(value, 0, value.length >> 1, 0); } diff --git a/src/java.base/share/classes/java/util/regex/Pattern.java b/src/java.base/share/classes/java/util/regex/Pattern.java index 2908370acd5..58c9186924b 100644 --- a/src/java.base/share/classes/java/util/regex/Pattern.java +++ b/src/java.base/share/classes/java/util/regex/Pattern.java @@ -43,8 +43,8 @@ import java.util.function.Predicate; import java.util.stream.Stream; import java.util.stream.StreamSupport; +import jdk.internal.lang.CaseFolding; import jdk.internal.util.ArraysSupport; -import jdk.internal.util.regex.CaseFolding; import jdk.internal.util.regex.Grapheme; /** diff --git a/src/java.base/share/classes/jdk/internal/lang/CaseFolding.java.template b/src/java.base/share/classes/jdk/internal/lang/CaseFolding.java.template new file mode 100644 index 00000000000..24a183c8da0 --- /dev/null +++ b/src/java.base/share/classes/jdk/internal/lang/CaseFolding.java.template @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. 
Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package jdk.internal.lang; + +import java.util.Arrays; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import static java.util.Map.entry; + +/** + * Utility class that handles Unicode case folding properties defined in + * CaseFolding.txt, including 1:M full case folding. + */ +public final class CaseFolding { + + private CaseFolding() {} + + /** + * Tests whether the specified code point has a folding mapping entry defined. + * + * @param cp + * the Unicode code point to test + * @return {@code true} if the given code point has a case folding mapping entry + * defined in {@code caseFoldingMap}, {@code false} otherwise + */ + public static boolean isDefined(int cp) { + return getDefined(cp) != -1; + } + + /** + * Returns the case-folded form of the specified code point according + * to the Unicode case folding mappings. + *

+ * If the code point has no case folding mapping defined, this method returns + * the original code point. + * + * Possible combinations of the returning case-folding form as a long value + * + * +---+---------+--------+---------+--------+--------+ + * | 1:1 mapping | 0000 | 0000 | 000x | xxxx | 0041 => 0061 or 1E921 => 1E943 + * +---+---------+--------+---------+--------+--------+ + * | 1:2 mapping | 0002 | 0000 | xxxx | xxxx | FB02 => 0066 006C + * +---+---------+--------+---------+--------+--------+ + * | 1:3 mapping | 0003 | xxxx | xxxx | xxxx | FB03 => 0066 0066 0069 + * +---+---------+--------+---------+--------+--------+ + * + * @param cp + * the Unicode code point to fold + * @return a long value representing the case-folded form of the input + * code point, encoded as shown in the table above + */ + public static long fold(int cp) { + var fold = getDefined(cp); + return fold == -1 ? cp : fold; + } + + public static boolean isSingleCodePoint(long fold) { + return (fold >> 48) == 0; // top 16 bits (mapping length) are zero for a 1:1 result + } + + /** + * Returns an expansion set to "close" a given regex Unicode character class range for case-insensitive + * matching, according to the + * Simple Loose Matches + * rule defined in Unicode Technical Standard #18: Unicode Regular Expressions. + *

+ * To conform with Level 1 of UTS #18, specifically RL1.5: Simple Loose Matches, simple case folding must + * be applied to literals and (optionally) to character classes. When applied to character classes, each + * character class is expected to be closed under simple case folding. See the standard for the + * detailed explanation and example of "closed". + *

+ * RL1.5 states: To meet this requirement, an implementation that supports case-sensitive matching should + *

    + *
  1. Provide at least the simple, default Unicode case-insensitive matching, and
  2. + *
  3. Specify which character properties or constructs are closed under the matching.
  4. + *
+ *

+ * In the {@code Pattern} implementation, 5 types of constructs may be case-sensitive when matching: + * back-refs, string slice (sequences), single, family(char-property) and class range. Single and + * family may appear independently or within a class. + *

+ * For loose/case-insensitive matching, the back-refs, slices and singles apply {@code toUpperCase} and + * {@code toLowerCase} to both the pattern and the input string. This effectively 'closes' the class for + * matching. + *

+ * The family/char-properties are not "closed" and should remain unchanged. This is acceptable per RL1.5, + * if their behavior is clearly specified. + *

+ * This method addresses that requirement for the "range" construct within a character class by computing + * the additional characters that should be included to close the range under simple case folding: + *

+ * For each character in the input range {@code [start, end]} (inclusive), if the character has a simple + * case folding mapping in Unicode's CaseFolding.txt, the mapping is not a round-trip map, and the mapped + * character is not already in the range, then that mapped character (typically lowercase) is added to + * the expansion set. + *

+ * This allows regex character class "range" implementation to use the returned expansion set to support + * additional case-insensitive matching, without duplicating characters already covered by the existing + * regex range implementation. The expectation is the matching is done using both the uppercase and + * lowercase forms of the input character, for example + * + *

{@code
+    *
+    *     ch -> inRange(lower, Character.toUpperCase(ch), upper) ||
+    *           inRange(lower, Character.toLowerCase(ch), upper) ||
+    *           additionalClosingCharacters.contains(Character.toUpperCase(ch)) ||
+    *           additionalClosingCharacters.contains(Character.toLowerCase(ch))
+    * }
+ * + * @param start the starting code point of the character range + * @param end the ending code point of the character range + * @return a {@code int[]} containing the all simple case equivalents of characters in the range, excluding + * those already in the range + * @spec https://www.unicode.org/reports/tr18/#Simple_Loose_Matches + */ + public static int[] getClassRangeClosingCharacters(int start, int end) { + int[] expanded = new int[expanded_case_cps.length]; + int off = 0; + for (int cp : expanded_case_cps) { + if (cp >= start && cp <= end) { + int folding = expanded_case_map.get(cp); + if (folding < start || folding > end) { + expanded[off++] = folding; + } + } + } + return Arrays.copyOf(expanded, off); + } + + private static final Map expanded_case_map = Map.ofEntries( +%%%Expanded_Case_Map_Entries + ); + + private static final int[] expanded_case_cps = expanded_case_map.keySet() + .stream() + .mapToInt(Integer::intValue) + .toArray(); + + private static final int HASH_CP = 0; + private static final int HASH_INDEX = 1; + private static final int HASH_NEXT = 2; + + private static int[][] hashKeys(int[] keys) { + var hashes = new int[keys.length << 1][3]; // cp + hash + next + var off = keys.length; + for (int i = 0; i < keys.length; i++) { + var cp = keys[i]; + var hash = cp % keys.length; + while (hashes[hash][HASH_CP] != 0) { + var next = hashes[hash][HASH_NEXT]; + if (next == 0) { + hashes[hash][HASH_NEXT] = off; + hash = off++; + break; + } else { + hash = next; + } + } + hashes[hash][HASH_CP] = cp; + hashes[hash][HASH_INDEX] = i; + } + return Arrays.copyOf(hashes, off); + } + + private static long getDefined(int cp) { + var hashes = CASE_FOLDING_HASHES; + var length = CASE_FOLDING_CPS.length; // hashed based on total defined. 
+ var hash = cp % length; + while (hashes[hash][HASH_CP] != cp) { + var next = hashes[hash][HASH_NEXT]; + if (next == 0) { + return -1; // hash miss + } + hash = next; + } + var index = hashes[hash][HASH_INDEX]; + return CASE_FOLDING_VALUES[index]; + } + +%%%Entries + + private static final int[][] CASE_FOLDING_HASHES = hashKeys(CASE_FOLDING_CPS); +} diff --git a/src/java.base/share/classes/jdk/internal/util/regex/CaseFolding.java.template b/src/java.base/share/classes/jdk/internal/util/regex/CaseFolding.java.template deleted file mode 100644 index 8ffbde6c535..00000000000 --- a/src/java.base/share/classes/jdk/internal/util/regex/CaseFolding.java.template +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. 
- */ - -package jdk.internal.util.regex; - -import java.util.Arrays; -import java.util.Map; -import java.util.Objects; - -import static java.util.Map.entry; - -public final class CaseFolding { - - private static final Map expanded_case_map = Map.ofEntries( -%%%Entries - ); - - private static final int[] expanded_case_cps = expanded_case_map.keySet() - .stream() - .mapToInt(Integer::intValue) - .toArray(); - - private CaseFolding() {} - - /** - * Returns an expansion set to "close" a given regex Unicode character class range for case-sensitive - * matching, according to the - * Simple Loose Matches - * rule defined in Unicode Technical Standard #18: Unicode Regular Expressions. - *

- * To conform with Level 1 of UTS #18, specifically RL1.5: Simple Loose Matches, simple case folding must - * be applied to literals and (optionally) to character classes. When applied to character classes, each - * character class is expected to be closed under simple case folding. See the standard for the - * detailed explanation and example of "closed". - *

- * RL1.5 states: To meet this requirement, an implementation that supports case-sensitive matching should - *

    - *
  1. Provide at least the simple, default Unicode case-insensitive matching, and
  2. - *
  3. Specify which character properties or constructs are closed under the matching.
  4. - *
- *

- * In the {@code Pattern} implementation, 5 types of constructs maybe case-sensitive when matching: - * back-refs, string slice (sequences), single, family(char-property) and class range. Single and - * family may appears independently or within a class. - *

- * For loose/case-insensitive matching, the back-refs, slices and singles apply {code toUpperCase} and - * {@code toLowerCase} to both the pattern and the input string. This effectively 'close' the class for - * matching. - *

- * The family/char-properties are not "closed" and should remain unchanged. This is acceptable per RL1.5, - * if their behavior is clearly specified. - *

- * This method addresses that requirement for the "range" construct within in character class by computing - * the additional characters that should be included to close the range under simple case folding: - *

- * For each character in the input range {@code [start, end]} (inclusive), if the character has a simple - * case folding mapping in Unicode's CaseFolding.txt, the mapping is not a round-trip map, and the mapped - * character is not already in the range, then that mapped character (typically lowercase) is added to - * the expansion set. - *

- * This allows regex character class "range" implementation to use the returned expansion set to support - * additional case-insensitive matching, without duplicating characters already covered by the existing - * regex range implementation. The expectation is the matching is done using both the uppercase and - * lowercase forms of the input character, for example - * - *

{@code
-     *
-     *     ch -> inRange(lower, Character.toUpperCase(ch), upper) ||
-     *           inRange(lower, Character.toLower(ch), upper) ||
-     *           additionalClosingCharacters.contains(Character.toUpperCase(ch)) ||
-     *           additionalClosingCharacters.contains(Character.toUpperCase(ch))
-     * }
- * - *

- * @spec https://www.unicode.org/reports/tr18/#Simple_Loose_Matches - * @param start the starting code point of the character range - * @param end the ending code point of the character range - * @return a {@code int[]} containing the all simple case equivalents of characters in the range, excluding - * those already in the range - */ - public static int[] getClassRangeClosingCharacters(int start, int end) { - int[] expanded = new int[expanded_case_cps.length]; - int off = 0; - for (int cp : expanded_case_cps) { - if (cp >= start && cp <= end) { - int folding = expanded_case_map.get(cp); - if (folding < start || folding > end) { - expanded[off++] = folding; - } - } - } - return Arrays.copyOf(expanded, off); - } -} diff --git a/test/jdk/java/lang/String/UnicodeCaseFoldingTest.java b/test/jdk/java/lang/String/UnicodeCaseFoldingTest.java new file mode 100644 index 00000000000..86b3fba0a27 --- /dev/null +++ b/test/jdk/java/lang/String/UnicodeCaseFoldingTest.java @@ -0,0 +1,329 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/** + * @test + * @summary tests unicode case-folding based String comparison and equality + * @bug 4397357 + * @library /lib/testlibrary/java/lang + * @modules java.base/jdk.internal.lang:+open + * @run junit/othervm + * UnicodeCaseFoldingTest + */ + +import java.nio.file.Files; +import java.util.stream.Stream; +import java.util.stream.Collectors; +import java.util.ArrayList; + +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import jdk.internal.lang.CaseFolding; + +public class UnicodeCaseFoldingTest { + + @Test + void testAllCommnFullCodePointsListedInCaseFoldinigTxt() throws Throwable { + var filter = "^.*; [CF]; .*$"; // C=common, F=full, for full case folding + var results = Files.lines(UCDFiles.CASEFOLDING) + .filter(line -> !line.startsWith("#") && line.matches(filter)) + .map(line -> { + var fields = line.split("; "); + var cp = Integer.parseInt(fields[0], 16); + fields = fields[2].trim().split(" "); + var folding = new int[fields.length]; + for (int i = 0; i < folding.length; i++) { + folding[i] = Integer.parseInt(fields[i], 16); + } + var source = new String(Character.toChars(cp)); + var expected = new String(folding, 0, folding.length); + // (1) Verify the folding result matches expected + assertEquals(expected, foldCase(source), "CaseFolding.fold(): "); + + // (2) Verify compareToFoldCase() result + assertEquals(0, source.compareToFoldCase(expected), "source.compareToFoldCase(expected)"); + assertEquals(0, expected.compareToFoldCase(source), "expected.compareToFoldCase(source)"); + + // (3) Verify equalsFoldCase() result + assertEquals(true, 
source.equalsFoldCase(expected), "source.equalsFoldCase(expected)"); + assertEquals(true, expected.equalsFoldCase(source), "expected.equalsFoldCase(source)"); + return null; + }) + .filter(error -> error != null) + .toArray(); + assertEquals(0, results.length); + } + + @Test + void testAllSimpleCodePointsListedInCaseFoldinigTxt() throws Throwable { + // S=simple, for simple case folding. The simple case folding should still matches + var filter = "^.*; [S]; .*$"; + var results = Files.lines(UCDFiles.CASEFOLDING) + .filter(line -> !line.startsWith("#") && line.matches(filter)) + .map(line -> { + var fields = line.split("; "); + var cp = Integer.parseInt(fields[0], 16); + fields = fields[2].trim().split(" "); + var folding = new int[fields.length]; + for (int i = 0; i < folding.length; i++) { + folding[i] = Integer.parseInt(fields[i], 16); + } + var source = new String(Character.toChars(cp)); + var expected = new String(folding, 0, folding.length); + + // (1) Verify compareToFoldCase() result + assertEquals(0, source.compareToFoldCase(expected), "source.compareToFoldCase(expected)"); + assertEquals(0, expected.compareToFoldCase(source), "expected.compareToFoldCase(source)"); + + // (2) Verify equalsFoldCase() result + assertEquals(true, source.equalsFoldCase(expected), "source.equalsFoldCase(expected)"); + assertEquals(true, expected.equalsFoldCase(source), "expected.equalsFoldCase(source)"); + return null; + }) + .filter(error -> error != null) + .toArray(); + assertEquals(0, results.length); + } + + @Test + public void testAllCodePointsFoldToThemselvesIfNotListed() throws Exception { + // Collect all code points that appear in CaseFolding.txt + var listed = Files.lines(UCDFiles.CASEFOLDING) + .filter(line -> !line.startsWith("#") && line.matches("^.*; [CF]; .*$")) + .map(line -> Integer.parseInt(line.split("; ")[0], 16)) + .collect(Collectors.toSet()); + + var failures = new ArrayList(); + + // Scan BMP + Supplementary Plane 1 (U+0000..U+1FFFF) + for (int cp = 
Character.MIN_CODE_POINT; cp <= 0x1FFFF; cp++) { + if (!Character.isDefined(cp)) { + continue; // skip undefined + } + if (Character.isSurrogate((char) cp)) { + continue; // skip surrogate code units + } + if (listed.contains(cp)) { + continue; // already tested separately + } + String s = new String(Character.toChars(cp)); + String folded = foldCase(s); + if (!s.equals(folded)) { + failures.add(String.format("Unexpected folding: U+%04X '%s' → '%s'", cp, s, folded)); + } + } + + assertEquals(0, failures.size(), + () -> "Some unlisted code points folded unexpectedly:\n" + + String.join("\n", failures)); + } + + @ParameterizedTest(name = "CaseFold \"{0}\" → \"{1}\"") + @MethodSource("caseFoldTestCases") + void testIndividualCaseFolding(String input, String expected) { + assertEquals(expected, foldCase(input)); + } + + static Stream caseFoldTestCases() { + return Stream.of( + // ASCII simple cases + Arguments.of("ABC", "abc"), + Arguments.of("already", "already"), + Arguments.of("MiXeD123", "mixed123"), + // --- Latin-1 to non-Latin-1 fold --- + Arguments.of("aBc\u00B5Efg", "abc\u03BCefg"), // "µ" → "μ" + Arguments.of("test\u00B5\ud801\udc00X", "test\u03bc\ud801\udc28x"), + // German Eszett + Arguments.of("Stra\u00DFe", "strasse"), // "Straße" + Arguments.of("\u1E9E", "ss"), // "ẞ" capital sharp S + // Turkish dotted I / dotless i + Arguments.of("I", "i"), + Arguments.of("\u0130", "i\u0307"), // capital dotted I → "i + dot above" + Arguments.of("\u0069\u0307", "i\u0307"), // small i + dot above remains + Arguments.of("\u0131", "\u0131"), // "ı" (dotless i stays dotless) + + // Greek special cases --- + Arguments.of("\u039F\u03A3", "\u03BF\u03C3"), // "ΟΣ" → "οσ" final sigma always folds to normal sigma + Arguments.of("\u1F88", "\u1F00\u03B9"), // "ᾈ" → "ἀι" Alpha with psili + ypogegrammeni + Arguments.of("\u039C\u03AC\u03CA\u03BF\u03C2", "\u03BC\u03AC\u03CA\u03BF\u03C3"), // "Μάϊος" → "μάϊοσ" + Arguments.of("\u1F08", "\u1F00"), // Ἀ (Capital Alpha with psili) → ἀ + + 
// Supplementary Plane characters + Arguments.of("\uD801\uDC00", "\uD801\uDC28"), // Deseret Capital Letter Long I → Small + Arguments.of("\uD801\uDC01", "\uD801\uDC29"), // Deseret Capital Letter Long E → Small + + // Supplementary inside ASCII + Arguments.of("abc\uD801\uDC00def", "abc\uD801\uDC28def"), + // Ligatures and compatibility folds + Arguments.of("\uFB00", "ff"), // ff → ff + Arguments.of("\uFB03", "ffi"), // ffi → ffi + Arguments.of("\u212A", "k"), // Kelvin sign → k + + Arguments.of("abc\uFB00def", "abcffdef"), // ff → ff + Arguments.of("abc\uFB03def", "abcffidef"), // ffi → ffi + Arguments.of("abc\u212Adef", "abckdef"), // Kelvin sign → k + + // --- Fullwidth --- + Arguments.of("\uFF21\uFF22\uFF23", "\uFF41\uFF42\uFF43"), // "ABC" → "abc" + + // --- Armenian --- + Arguments.of("\u0531", "\u0561"), // "Ա" → "ա" + + // --- Cherokee --- + Arguments.of("\u13A0", "\u13A0"), // Capital Cherokee A folds to itself + Arguments.of("\uAB70", "\u13A0") // Small Cherokee A folds Capital Cherokee A + ); + } + + static Stream caseFoldEqualProvider() { + return Stream.of( + Arguments.of("abc", "ABC"), + Arguments.of("aBcDe", "AbCdE"), + Arguments.of("\u00C0\u00E7", "\u00E0\u00C7"), // Àç vs àÇ + Arguments.of("straße", "STRASSE"), // ß → ss + Arguments.of("\uD83C\uDDE6", "\uD83C\uDDE6"), // 🇦 vs 🇦 + Arguments.of("\u1E9E", "ss"), // ẞ (capital sharp S) + Arguments.of("\u03A3", "\u03C3"), // Σ vs σ (Greek Sigma) + Arguments.of("\u03C3", "\u03C2"), // σ vs ς (Greek sigma/final sigma) + Arguments.of("\u212B", "\u00E5"), // Å (Angstrom sign) vs å + Arguments.of("\uFB00", "ff"), // ff (ligature) + Arguments.of("\u01C5", "\u01C5"), // Dž (Latin capital D with small z with caron) + Arguments.of("Caf\u00E9", "CAF\u00C9"), // Café vs CAFÉ + Arguments.of("\u03BA\u03B1\u03BB\u03B7\u03BC\u03AD\u03C1\u03B1", "\u039A\u0391\u039B\u0397\u039C\u0388\u03A1\u0391"), // καλημέρα vs ΚΑΛΗΜΕΡΑ + Arguments.of("\u4E2D\u56FD", "\u4E2D\u56FD"), // 中国 + Arguments.of("\u03B1", "\u0391"), // α vs Α 
(Greek alpha) + Arguments.of("\u212B", "\u00C5"), // Å vs Å + // from StringCompareToIgnoreCase + Arguments.of("\u0100\u0102\u0104\u0106\u0108", "\u0100\u0102\u0104\u0106\u0109"), // ĀĂĄĆĈ vs ĀĂĄĆĉ + Arguments.of("\u0101\u0103\u0105\u0107\u0109", "\u0100\u0102\u0104\u0106\u0109"), // āăąćĉ vs ĀĂĄĆĉ + Arguments.of("\ud801\udc00\ud801\udc01\ud801\udc02\ud801\udc03\ud801\udc04", + "\ud801\udc00\ud801\udc01\ud801\udc02\ud801\udc03\ud801\udc2c"), // 𐐀𐐁𐐂𐐃𐐄 vs 𐐀𐐁𐐂𐐃𐐬 + Arguments.of("\ud801\udc28\ud801\udc29\ud801\udc2a\ud801\udc2b\ud801\udc2c", + "\ud801\udc00\ud801\udc01\ud801\udc02\ud801\udc03\ud801\udc2c") // 𐐨𐐩𐐪𐐫𐐬 vs 𐐀𐐁𐐂𐐃𐐬 + ); + } + + @ParameterizedTest + @MethodSource("caseFoldEqualProvider") + void testcompareToFoldCaseEquals(String s1, String s2) { + assertEquals(0, s1.compareToFoldCase(s2)); + assertEquals(0, s2.compareToFoldCase(s1)); + assertEquals(true, s1.equalsFoldCase(s2)); + assertEquals(true, s2.equalsFoldCase(s1)); + assertEquals(foldCase(s1), foldCase(s2)); + } + + static Stream caseFoldOrderingProvider() { + return Stream.of( + Arguments.of("asa", "aß", -1), // ß → ss → "asa" < "ass" + Arguments.of("aß", "asa", +1), + Arguments.of("a\u00DF", "ass", 0), // aß vs ass + Arguments.of("\uFB03", "ffi", 0), // ffi (ligature) + Arguments.of("\u00C5", "Z", 1), // Å vs Z + Arguments.of("A", "\u00C0", -1), // A vs À + Arguments.of("\u03A9", "\u03C9", 0), // Ω vs ω + Arguments.of("\u03C2", "\u03C3", 0), // ς vs σ + Arguments.of("\uD835\uDD23", "R", 1), // 𝔯 (fraktur r) vs R + Arguments.of("\uFF26", "E", 1), // F (full-width F) vs E + Arguments.of("\u00C9clair", "Eclair", 1), // Éclair vs Eclair + Arguments.of("\u03bc\u00df", "\u00b5s", 1), + Arguments.of("\u00b5s", "\u03bc\u00df", -1) + ); + } + + @ParameterizedTest + @MethodSource("caseFoldOrderingProvider") + void testcompareToFoldCaseOrdering(String s1, String s2, int expectedSign) { + int cmp = s1.compareToFoldCase(s2); + assertEquals(expectedSign, Integer.signum(cmp)); + } + + static Stream roundTripProvider() 
{ + return Stream.of( + Arguments.of("abc"), + Arguments.of("ABC"), + Arguments.of("straße"), + Arguments.of("Àç"), + Arguments.of("aß"), + Arguments.of("\uFB02uff"), // fluff (ligature in "fluff") + Arguments.of("\u00C9COLE") // ÉCOLE + ); + } + + @ParameterizedTest + @MethodSource("roundTripProvider") + void testCaseFoldRoundTrip(String s) { + String folded = foldCase(s); + assertEquals(0, s.compareToFoldCase(folded)); + assertEquals(0, folded.compareToFoldCase(s)); + assertEquals(true, s.equalsFoldCase(folded)); + assertEquals(true, folded.equalsFoldCase(s)); + } + + // helper to test the integrity of folding mapping + private static int[] longToFolding(long value) { + int len = (int) (value >>> 48); + if (len == 0) { + return new int[]{(int) (value & 0xFFFFF)}; + } else { + var folding = new int[len]; + for (int i = 0; i < len; i++) { + folding[i] = (int) (value & 0xFFFF); + value >>= 16; + } + return folding; + } + } + + private static String foldCase(String s) { + int first; + int len = s.length(); + int cpCnt = 1; + for (first = 0; first < len; first += cpCnt) { + int cp = s.codePointAt(first); + if (CaseFolding.isDefined(cp)) { + break; + } + cpCnt = Character.charCount(cp); + } + if (first == len) { + return s; + } + StringBuilder sb = new StringBuilder(len); + sb.append(s, 0, first); + for (int i = first; i < len; i += cpCnt) { + int cp = s.codePointAt(i); + int[] folded = longToFolding(CaseFolding.fold(cp)); + for (int f : folded) { + sb.appendCodePoint(f); + } + cpCnt = Character.charCount(cp); + } + return sb.toString(); + } +} diff --git a/test/micro/org/openjdk/bench/java/lang/StringCompareToFoldCase.java b/test/micro/org/openjdk/bench/java/lang/StringCompareToFoldCase.java new file mode 100644 index 00000000000..dff4d874705 --- /dev/null +++ b/test/micro/org/openjdk/bench/java/lang/StringCompareToFoldCase.java @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. 
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +package org.openjdk.bench.java.lang; + +import org.openjdk.jmh.annotations.*; +import java.util.concurrent.TimeUnit; + +/* + * This benchmark naively explores String::compareToFoldCase performance + */ +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@State(Scope.Thread) +@Warmup(iterations = 5, time = 1) +@Measurement(iterations = 5, time = 1) +@Fork(3) +public class StringCompareToFoldCase { + + private String asciiUpper = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + private String asciiUpperLower = "ABCDEFGHIJKLMNOpqrstuvwxyz"; + private String asciiLower = "abcdefghijklmnopqrstuvwxyz"; + + private String asciiWithDF = "abcdßßßßßßßßßßßßßßßßWXYZ"; + private String asciiWithDFSS = "abcdssssssssssssssssßßßßßßßßWXYZ"; + + private String asciiLatine1 = "ABCDEFGHIJKLMNOpqrstuvwxyz0"; + private String asciiLatin1UTF16 = "abcdefghijklmnopqrstuvwxyz\u0391"; + + private String greekUpper = "\u0391\u0392\u0393\u0394\u0395\u0391\u0392\u0393\u0394\u0395"; // ΑΒΓΔΕ + private String 
greekUpperLower = "\u0391\u0392\u0393\u0394\u0395\u0391\u0392\u0393\u0394\u03B5"; // ΑΒΓΔε + private String greekLower = "\u03B1\u03B2\u03B3\u03B4\u03B5\u03B1\u03B2\u03B3\u03B4\u03B5"; // αβγδε + + public String supUpper = "\ud801\udc00\ud801\udc01\ud801\udc02\ud801\udc03\ud801\udc04"; + public String supUpperLower = "\ud801\udc00\ud801\udc01\ud801\udc02\ud801\udc03\ud801\udc2c"; + public String supLower = "\ud801\udc28\ud801\udc29\ud801\udc2a\ud801\udc2b\ud801\udc2c"; + + @Benchmark + public int asciiUpperLower() { + return asciiUpper.compareToIgnoreCase(asciiUpperLower); + } + + @Benchmark + public int asciiLower() { + return asciiUpper.compareToIgnoreCase(asciiLower); + } + + @Benchmark + public int greekUpperLower() { + return greekUpper.compareToIgnoreCase(greekUpperLower); + } + + @Benchmark + public int greekLower() { + return greekUpper.compareToIgnoreCase(greekLower); + } + + @Benchmark + public int latin1UTF16() { + return asciiLatine1.compareToIgnoreCase(asciiLatin1UTF16); + } + + @Benchmark + public int supUpperLower() { + return supUpper.compareToIgnoreCase(supUpperLower); + } + + @Benchmark + public int supLower() { + return supUpper.compareToIgnoreCase(supLower); + } + + @Benchmark + public int asciiUpperLowerFC() { + return asciiUpper.compareToFoldCase(asciiUpperLower); + } + + @Benchmark + public int asciiLowerFC() { + return asciiUpper.compareToFoldCase(asciiLower); + } + + @Benchmark + public int asciiWithDFFC() { + return asciiWithDF.compareToFoldCase(asciiWithDFSS); + } + + @Benchmark + public int greekUpperLowerFC() { + return greekUpper.compareToFoldCase(greekUpperLower); + } + + @Benchmark + public int greekLowerFC() { + return greekUpper.compareToFoldCase(greekLower); + } + + @Benchmark + public int latin1UTF16FC() { + return asciiLatine1.compareToFoldCase(asciiLatin1UTF16); } + + @Benchmark + public int supUpperLowerFC() { + return supUpper.compareToFoldCase(supUpperLower); + } + + @Benchmark + public int supLowerFC() { + return 
supUpper.compareToFoldCase(supLower); + } + + @Benchmark + public boolean asciiUpperLowerEQ() { + return asciiUpper.equalsIgnoreCase(asciiUpperLower); + } + + @Benchmark + public boolean asciiLowerEQ() { + return asciiUpper.equalsIgnoreCase(asciiLower); + } + + @Benchmark + public boolean greekUpperLowerEQ() { + return greekUpper.equalsIgnoreCase(greekUpperLower); + } + + @Benchmark + public boolean greekLowerEQ() { + return greekUpper.equalsIgnoreCase(greekLower); + } + + @Benchmark + public boolean latin1UTF16EQ() { + return asciiLatine1.equalsIgnoreCase(asciiLatin1UTF16); + } + + @Benchmark + public boolean supUpperLowerEQ() { + return supUpper.equalsIgnoreCase(supUpperLower); + } + + @Benchmark + public boolean supLowerEQ() { + return supUpper.equalsIgnoreCase(supLower); + } + + @Benchmark + public boolean asciiUpperLowerEQFC() { + return asciiUpper.equalsFoldCase(asciiUpperLower); + } + + @Benchmark + public boolean asciiLowerEQFC() { + return asciiUpper.equalsFoldCase(asciiLower); + } + + @Benchmark + public boolean greekUpperLowerEQFC() { + return greekUpper.equalsFoldCase(greekUpperLower); + } + + @Benchmark + public boolean greekLowerEQFC() { + return greekUpper.equalsFoldCase(greekLower); + } + + @Benchmark + public boolean latin1UTF16EQFC() { + return asciiLatine1.equalsFoldCase(asciiLatin1UTF16); + } + + @Benchmark + public boolean supUpperLowerEQFC() { + return supUpper.equalsFoldCase(supUpperLower); + } + + @Benchmark + public boolean supLowerEQFC() { + return supUpper.equalsFoldCase(supLower); + } + }