diff --git a/make/ToolsJdk.gmk b/make/ToolsJdk.gmk index 629cadbf83a..b04d7820c91 100644 --- a/make/ToolsJdk.gmk +++ b/make/ToolsJdk.gmk @@ -79,7 +79,7 @@ TOOL_GENERATEEXTRAPROPERTIES = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_too build.tools.generateextraproperties.GenerateExtraProperties TOOL_GENERATECASEFOLDING = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \ - build.tools.generatecharacter.CaseFolding + build.tools.generatecharacter.GenerateCaseFolding TOOL_MAKEZIPREPRODUCIBLE = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \ build.tools.makezipreproducible.MakeZipReproducible diff --git a/make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java b/make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java deleted file mode 100644 index 9abc2059b6a..00000000000 --- a/make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
- * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -package build.tools.generatecharacter; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Paths; -import java.nio.file.StandardOpenOption; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -public class CaseFolding { - - public static void main(String[] args) throws Throwable { - if (args.length != 3) { - System.err.println("Usage: java CaseFolding TemplateFile CaseFolding.txt CaseFolding.java"); - System.exit(1); - } - var templateFile = Paths.get(args[0]); - var caseFoldingTxt = Paths.get(args[1]); - var genSrcFile = Paths.get(args[2]); - var supportedTypes = "^.*; [CTS]; .*$"; - var caseFoldingEntries = Files.lines(caseFoldingTxt) - .filter(line -> !line.startsWith("#") && line.matches(supportedTypes)) - .map(line -> { - String[] cols = line.split("; "); - return new String[] {cols[0], cols[1], cols[2]}; - }) - .filter(cols -> { - // the folding case doesn't map back to the original char. - var cp1 = Integer.parseInt(cols[0], 16); - var cp2 = Integer.parseInt(cols[2], 16); - return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1; - }) - .map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2])) - .collect(Collectors.joining(",\n", "", "")); - - // hack, hack, hack! the logic does not pick 0131. just add manually to support 'I's. - // 0049; T; 0131; # LATIN CAPITAL LETTER I - final String T_0x0131_0x49 = String.format(" entry(0x%04x, 0x%04x),\n", 0x0131, 0x49); - - // Generate .java file - Files.write( - genSrcFile, - Files.lines(templateFile) - .map(line -> line.contains("%%%Entries") ? 
T_0x0131_0x49 + caseFoldingEntries : line) - .collect(Collectors.toList()), - StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING); - } -} diff --git a/make/jdk/src/classes/build/tools/generatecharacter/GenerateCaseFolding.java b/make/jdk/src/classes/build/tools/generatecharacter/GenerateCaseFolding.java new file mode 100644 index 00000000000..2f6a9add5cb --- /dev/null +++ b/make/jdk/src/classes/build/tools/generatecharacter/GenerateCaseFolding.java @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ +package build.tools.generatecharacter; + +import java.nio.file.Files; +import java.nio.file.Paths; +import java.nio.file.StandardOpenOption; +import java.util.Arrays; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +public class GenerateCaseFolding { + + public static void main(String[] args) throws Throwable { + if (args.length != 3) { + System.err.println("Usage: java GenerateCaseFolding TemplateFile CaseFolding.txt CaseFolding.java"); + System.exit(1); + } + var templateFile = Paths.get(args[0]); + var caseFoldingTxt = Paths.get(args[1]); + var genSrcFile = Paths.get(args[2]); + + // java.lang + var supportedTypes = "^.*; [CF]; .*$"; // full/1:M case folding + String[][] caseFoldings = Files.lines(caseFoldingTxt) + .filter(line -> !line.startsWith("#") && line.matches(supportedTypes)) + .map(line -> { + var fields = line.split("; "); + var cp = fields[0]; + fields = fields[2].trim().split(" "); + var folding = new String[fields.length + 1]; + folding[0] = cp; + System.arraycopy(fields, 0, folding, 1, fields.length); + return folding; + }) + .toArray(size -> new String[size][]); + + // util.regex + var expandedSupportedTypes = "^.*; [CTS]; .*$"; + var expanded_caseFoldingEntries = Files.lines(caseFoldingTxt) + .filter(line -> !line.startsWith("#") && line.matches(expandedSupportedTypes)) + .map(line -> { + String[] cols = line.split("; "); + return new String[]{cols[0], cols[1], cols[2]}; + }) + .filter(cols -> { + // the folding case doesn't map back to the original char. + var cp1 = Integer.parseInt(cols[0], 16); + var cp2 = Integer.parseInt(cols[2], 16); + return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1; + }) + .map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2])) + .collect(Collectors.joining(",\n", "", "")); + + // hack, hack, hack! the logic does not pick 0131. just add manually to support 'I's. 
+ // 0049; T; 0131; # LATIN CAPITAL LETTER I + final String T_0x0131_0x49 = String.format(" entry(0x%04x, 0x%04x),\n", 0x0131, 0x49); + + Files.write( + genSrcFile, + Files.lines(templateFile) + .map(line -> line.contains("%%%Entries") ? genFoldingEntries(caseFoldings) : line) + .map(line -> line.contains("%%%Expanded_Case_Map_Entries") ? T_0x0131_0x49 + expanded_caseFoldingEntries : line) + .collect(Collectors.toList()), + StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING); + } + + private static long foldingToLong(String[] folding) { + int cp = Integer.parseInt(folding[0], 16); + long value = (long)Integer.parseInt(folding[1], 16); + if (!Character.isSupplementaryCodePoint(cp) && folding.length != 2) { + var shift = 16; + for (int j = 2; j < folding.length; j++) { + value |= (long)Integer.parseInt(folding[j], 16) << shift; + shift <<= 1; + } + value = value | (long) (folding.length - 1) << 48; + } + return value; + } + + private static String genFoldingEntries(String[][] foldings) { + StringBuilder sb = new StringBuilder(); + sb.append(" private static final int[] CASE_FOLDING_CPS = {\n"); + int width = 10; + for (int i = 0; i < foldings.length; i++) { + if (i % width == 0) + sb.append(" "); + sb.append(String.format("0X%s", foldings[i][0])); + if (i < foldings.length - 1) + sb.append(", "); + if (i % width == width - 1 || i == foldings.length - 1) + sb.append("\n"); + } + sb.append(" };\n\n"); + + sb.append(" private static final long[] CASE_FOLDING_VALUES = {\n"); + width = 6; + for (int i = 0; i < foldings.length; i++) { + if (i % width == 0) + sb.append(" "); // indent + sb.append(String.format("0x%013xL", foldingToLong(foldings[i]))); + if (i < foldings.length - 1) + sb.append(", "); + if (i % width == width - 1 || i == foldings.length - 1) { + sb.append("\n"); + } + } + sb.append(" };\n"); + return sb.toString(); + } +} diff --git a/make/modules/java.base/gensrc/GensrcCharacterData.gmk b/make/modules/java.base/gensrc/GensrcCharacterData.gmk 
index c05b126299b..d7947d907e2 100644 --- a/make/modules/java.base/gensrc/GensrcCharacterData.gmk +++ b/make/modules/java.base/gensrc/GensrcCharacterData.gmk @@ -72,5 +72,22 @@ TARGETS += $(GENSRC_CHARACTERDATA) ################################################################################ + +GENSRC_STRINGCASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/lang/CaseFolding.java + +STRINGCASEFOLDING_TEMPLATE := $(MODULE_SRC)/share/classes/jdk/internal/lang/CaseFolding.java.template +CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt + +$(GENSRC_STRINGCASEFOLDING): $(BUILD_TOOLS_JDK) $(STRINGCASEFOLDING_TEMPLATE) $(CASEFOLDINGTXT) + $(call LogInfo, Generating $@) + $(call MakeTargetDir) + $(TOOL_GENERATECASEFOLDING) \ + $(STRINGCASEFOLDING_TEMPLATE) \ + $(CASEFOLDINGTXT) \ + $(GENSRC_STRINGCASEFOLDING) + +TARGETS += $(GENSRC_STRINGCASEFOLDING) + + endif # include guard include MakeIncludeEnd.gmk diff --git a/make/modules/java.base/gensrc/GensrcRegex.gmk b/make/modules/java.base/gensrc/GensrcRegex.gmk index a30f22b34d4..c46a029e2c2 100644 --- a/make/modules/java.base/gensrc/GensrcRegex.gmk +++ b/make/modules/java.base/gensrc/GensrcRegex.gmk @@ -50,22 +50,5 @@ TARGETS += $(GENSRC_INDICCONJUNCTBREAK) ################################################################################ -GENSRC_CASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/util/regex/CaseFolding.java - -CASEFOLDINGTEMP := $(MODULE_SRC)/share/classes/jdk/internal/util/regex/CaseFolding.java.template -CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt - -$(GENSRC_CASEFOLDING): $(BUILD_TOOLS_JDK) $(CASEFOLDINGTEMP) $(CASEFOLDINGTXT) - $(call LogInfo, Generating $@) - $(call MakeTargetDir) - $(TOOL_GENERATECASEFOLDING) \ - $(CASEFOLDINGTEMP) \ - $(CASEFOLDINGTXT) \ - $(GENSRC_CASEFOLDING) - -TARGETS += $(GENSRC_CASEFOLDING) - -################################################################################ - endif # include guard include 
MakeIncludeEnd.gmk diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java index 52f908c9e98..d7aef113e15 100644 --- a/src/java.base/share/classes/java/lang/String.java +++ b/src/java.base/share/classes/java/lang/String.java @@ -117,9 +117,38 @@ import sun.nio.cs.UTF_8; * Unicode code points (i.e., characters), in addition to those for * dealing with Unicode code units (i.e., {@code char} values). * - *
Unless otherwise noted, methods for comparing Strings do not take locale - * into account. The {@link java.text.Collator} class provides methods for - * finer-grain, locale-sensitive String comparison. + *
String comparison and case-insensitive matching + * + *
There are several related ways to compare {@code String} values; choose + * the one whose semantics fit your purpose: + * + *
Unless otherwise noted, methods for comparing Strings do not take locale into + * account. The {@link java.text.Collator} class provides methods for finer-grain, + * locale-sensitive String comparison. * * @implNote The implementation of the string concatenation operator is left to * the discretion of a Java compiler, as long as the compiler ultimately conforms @@ -2179,6 +2208,7 @@ public final class String * false} otherwise * * @see #equals(Object) + * @see #equalsFoldCase(String) * @see #codePoints() */ public boolean equalsIgnoreCase(String anotherString) { @@ -2188,6 +2218,57 @@ public final class String && regionMatches(true, 0, anotherString, 0, length()); } + /** + * Compares this {@code String} to another {@code String} for equality, + * using {@index "Unicode case folding"}. Two strings are considered equal + * by this method if their case-folded forms are identical. + *
+ * Case folding is defined by the Unicode Standard in + * CaseFolding.txt, + * including 1:M mappings. For example, {@code "Fuß".equalsFoldCase("FUSS")} + * returns {@code true}, since the character {@code U+00DF} (sharp s) folds + * to {@code "ss"}. + *
+ * Case folding is locale-independent and language-neutral, unlike
+ * locale-sensitive transformations such as {@link #toLowerCase()} or
+ * {@link #toUpperCase()}. It is intended for caseless matching,
+ * searching, and indexing.
+ *
+ * @apiNote
+ * This method is the Unicode-compliant alternative to
+ * {@link #equalsIgnoreCase(String)}. It implements full case folding as
+ * defined by the Unicode Standard, which may differ from the simpler
+ * per-character mapping performed by {@code equalsIgnoreCase}.
+ * For example:
+ * {@snippet lang=java :
+ * String a = "Fuß";
+ * String b = "FUSS";
+ * boolean equalsFoldCase = a.equalsFoldCase(b); // returns true
+ * boolean equalsIgnoreCase = a.equalsIgnoreCase(b); // returns false
+ * }
+ *
+ * @param anotherString
+ * The {@code String} to compare this {@code String} against
+ *
+ * @return {@code true} if the given string is not {@code null} and represents
+ * the same sequence of characters as this string under Unicode case
+ * folding; {@code false} otherwise.
+ *
+ * @spec https://www.unicode.org/versions/latest/core-spec/chapter-5/#G21790 Unicode Caseless Matching
+ * @see #compareToFoldCase(String)
+ * @see #equalsIgnoreCase(String)
+ * @since 26
+ */
+ public boolean equalsFoldCase(String anotherString) {
+ if (this == anotherString) { // same reference: equal without comparing contents
+ return true;
+ }
+ if (anotherString == null) { // null is never equal; checked before the comparator is invoked
+ return false;
+ }
+ return UNICODE_CASEFOLD_ORDER.compare(this, anotherString) == 0; // equal iff the comparator reports 0
+ }
+
/**
* Compares two strings lexicographically.
* The comparison is based on the Unicode value of each character in
@@ -2303,12 +2384,86 @@ public final class String
* than this String, ignoring case considerations.
* @see java.text.Collator
* @see #codePoints()
+ * @see #compareToFoldCase(String)
* @since 1.2
*/
public int compareToIgnoreCase(String str) {
return CASE_INSENSITIVE_ORDER.compare(this, str);
}
+ /**
+ * A Comparator that orders {@code String} objects as by
+ * {@link #compareToFoldCase(String) compareToFoldCase()}.
+ *
+ * @see #compareToFoldCase(String)
+ * @since 26
+ */
+ public static final Comparator
+ * Case folding is a locale-independent, language-neutral form of case mapping,
+ * primarily intended for caseless matching. Unlike {@link #compareToIgnoreCase(String)},
+ * which applies a simpler locale-insensitive uppercase mapping, this method
+ * follows the Unicode {@index "full"} case folding, providing stable and
+ * consistent results across all environments.
+ *
+ * Note that this method does not take locale into account, and may
+ * produce results that differ from locale-sensitive ordering. Use
+ * {@link java.text.Collator} for locale-sensitive comparison.
+ *
+ * @apiNote
+ * This method is the Unicode-compliant alternative to
+ * {@link #compareToIgnoreCase(String)}. It implements the
+ * {@index "full case folding"} as defined by the Unicode Standard, which
+ * may differ from the simpler per-character mapping performed by
+ * {@code compareToIgnoreCase}.
+ * For example:
+ * {@snippet lang=java :
+ * String a = "Fuß";
+ * String b = "FUSS";
+ * int cmpFoldCase = a.compareToFoldCase(b); // returns 0
+ * int cmpIgnoreCase = a.compareToIgnoreCase(b); // returns > 0
+ * }
+ *
+ * @param str the {@code String} to be compared.
+ * @return a negative integer, zero, or a positive integer as the specified
+ * String is greater than, equal to, or less than this String,
+ * ignoring case considerations by case folding.
+ *
+ * @spec https://www.unicode.org/versions/latest/core-spec/chapter-5/#G21790 Unicode Caseless Matching
+ * @see java.text.Collator
+ * @see #compareToIgnoreCase(String)
+ * @see #equalsFoldCase(String)
+ * @since 26
+ */
+ public int compareToFoldCase(String str) {
+ return UNICODE_CASEFOLD_ORDER.compare(this, str); // delegates entirely to the UNICODE_CASEFOLD_ORDER comparator
+ }
+
/**
* Tests if two string regions are equal.
*
diff --git a/src/java.base/share/classes/java/lang/StringLatin1.java b/src/java.base/share/classes/java/lang/StringLatin1.java
index 61c62d049bc..21a8b2dd61a 100644
--- a/src/java.base/share/classes/java/lang/StringLatin1.java
+++ b/src/java.base/share/classes/java/lang/StringLatin1.java
@@ -32,6 +32,8 @@ import java.util.function.Consumer;
import java.util.function.IntConsumer;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
+
+import jdk.internal.lang.CaseFolding;
import jdk.internal.util.ArraysSupport;
import jdk.internal.vm.annotation.IntrinsicCandidate;
@@ -179,6 +181,128 @@ final class StringLatin1 {
return len1 - len2;
}
+ private static int compareToFC0(byte[] value, int off, int last, byte[] other, int ooff, int olast) { // slow path: compares from off/ooff, expanding 0xdf into two 's'
+ int k1 = off, k2 = ooff;
+ boolean lo1 = false, lo2 = false; // true if we have a leftover 's' from u+00df -> ss
+ while ((k1 < last || lo1) && (k2 < olast || lo2)) { // each side is exhausted only when its input AND its leftover are consumed
+ int c1, c2;
+ if (lo1) {
+ c1 = 0x73; // leftover 's'
+ lo1 = false;
+ } else {
+ c1 = getChar(value, k1++);
+ if (c1 == 0xdf) { // expand: emit the first 's' now, remember the second one
+ c1 = 0x73;
+ lo1 = true;
+ }
+ }
+ if (lo2) {
+ c2 = 0x73; // leftover 's'
+ lo2 = false;
+ } else {
+ c2 = getChar(other, k2++);
+ if (c2 == 0xdf) { // same expansion for the other operand
+ c2 = 0x73;
+ lo2 = true;
+ }
+ }
+ if (!CharacterDataLatin1.equalsIgnoreCase((byte)c1, (byte)c2)) {
+ return Character.toLowerCase(c1) - Character.toLowerCase(c2); // first mismatch decides the order
+ }
+ }
+ if (k1 < last || lo1) { // value still has input (or a pending 's'): it orders after other
+ return 1;
+ }
+ if (k2 < olast || lo2) { // other still has input (or a pending 's'): value orders first
+ return -1;
+ }
+ return 0;
+ }
+
+ static int compareToFC(byte[] value, byte[] other) { // case-folding comparison of two Latin-1 byte arrays
+ int len = value.length;
+ int olen = other.length;
+ int lim = Math.min(len, olen);
+ for (int k = 0; k < lim; k++) {
+ byte b1 = value[k];
+ byte b2 = other[k];
+ if (!CharacterDataLatin1.equalsIgnoreCase(b1, b2)) {
+ int c1 = b1 & 0xff; // widen to an unsigned value so it can be tested against 0xdf
+ int c2 = b2 & 0xff;
+ if (c1 == 0xdf || c2 == 0xdf) { // 0xdf is the only 1:M in latin1 range
+ return compareToFC0(value, k, len, other, k, olen); // continue from k on the expansion-aware path
+ }
+ return Character.toLowerCase(c1) - Character.toLowerCase(c2);
+ }
+ }
+ return len - olen; // every compared position matched: negative when value is the shorter array
+ }
+
+ private static int compareToFC0_UTF16(byte[] value, int off, int last, byte[] other, int ooff, int olast) { // slow path: value is Latin-1, other is UTF-16; handles 1:M foldings
+ int f1 = 0, f2 = 0; // pending folded chars of a 1:M folding, 16 bits each, next char in the low bits
+ int k1 = off, k2 = ooff;
+ while ((k1 < last || f1 != 0) && (k2 < olast || f2 != 0)) {
+ int c1, c2;
+ if (f1 != 0) {
+ c1 = (f1 & 0xffff); f1 >>>= 16; // emit the next pending folded char
+ } else {
+ c1 = getChar(value, k1++);
+ var f = CaseFolding.fold(c1);
+ if (CaseFolding.isSingleCodePoint(f)) {
+ c1 = (int)(f & 0x1fffff); // 21 bits: a code point can be as large as U+10FFFF
+ } else {
+ c1 = (int)f & 0xffff;
+ f1 = (int)(f >>> 16); // the int cast drops the length field stored in bits 48 and up
+ }
+ }
+ if (f2 != 0) {
+ c2 = f2 & 0xffff; f2 >>>= 16; // emit the next pending folded char
+ } else {
+ c2 = StringUTF16.codePointAt(other, k2, olast, true);
+ k2 += Character.charCount(c2);
+ var f = CaseFolding.fold(c2);
+ if (CaseFolding.isSingleCodePoint(f)) {
+ c2 = (int)(f & 0x1fffff); // was 0xfffff, which truncated unmapped code points in U+100000..U+10FFFF
+ } else {
+ c2 = (int)(f & 0xffff);
+ f2 = (int)(f >>> 16); // the int cast drops the length field stored in bits 48 and up
+ }
+ }
+ if (c1 != c2) {
+ return c1 - c2; // first differing folded unit decides the order
+ }
+ }
+ if (k1 < last || f1 != 0) { // value still has input or pending folded chars
+ return 1;
+ }
+ if (k2 < olast || f2 != 0) { // other still has input or pending folded chars
+ return -1;
+ }
+ return 0;
+ }
+
+ // latin1 vs utf16: value is read with the Latin-1 getChar, other with StringUTF16.codePointAt
+ static int compareToFC_UTF16(byte[] value, byte[] other) {
+ int last = length(value);
+ int olast = StringUTF16.length(other);
+ int lim = Math.min(last, olast);
+ for (int k = 0; k < lim; k++) {
+ int cp1 = getChar(value, k);
+ int cp2 = StringUTF16.codePointAt(other, k, olast, true);
+ if (cp1 != cp2) { // fold only when the raw code points differ
+ long cf1 = CaseFolding.fold(cp1);
+ long cf2 = CaseFolding.fold(cp2);
+ if (cf1 != cf2) {
+ if (!CaseFolding.isSingleCodePoint(cf1) || !CaseFolding.isSingleCodePoint(cf2)) {
+ return compareToFC0_UTF16(value, k, last, other, k, olast); // a 1:M folding is involved: continue from k on the slow path
+ }
+ return (int)(cf1 - cf2); // both foldings are single code points
+ }
+ }
+ }
+ return last - olast; // every compared position matched: negative when value is shorter
+ }
+
static int hashCode(byte[] value) {
return ArraysSupport.hashCodeOfUnsigned(value, 0, value.length, 0);
}
diff --git a/src/java.base/share/classes/java/lang/StringUTF16.java b/src/java.base/share/classes/java/lang/StringUTF16.java
index 4e31c9728e9..75c9e8239ba 100644
--- a/src/java.base/share/classes/java/lang/StringUTF16.java
+++ b/src/java.base/share/classes/java/lang/StringUTF16.java
@@ -34,6 +34,7 @@ import java.util.function.IntConsumer;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
+import jdk.internal.lang.CaseFolding;
import jdk.internal.misc.Unsafe;
import jdk.internal.util.ArraysSupport;
import jdk.internal.vm.annotation.ForceInline;
@@ -93,7 +94,7 @@ final class StringUTF16 {
return value.length >> 1;
}
- private static int codePointAt(byte[] value, int index, int end, boolean checked) {
+ static int codePointAt(byte[] value, int index, int end, boolean checked) {
assert index < end;
if (checked) {
checkIndex(index, value);
@@ -592,6 +593,77 @@ final class StringUTF16 {
return -StringLatin1.compareToCI_UTF16(other, value);
}
+ public static int compareToFC_Latin1(byte[] value, byte[] other) {
+ return -StringLatin1.compareToFC_UTF16(other, value); // swap the operands and negate the result
+ }
+
+ private static int compareToFC0(byte[] value, int off, int last, byte[] other, int ooff, int olast) { // slow path: both operands UTF-16; handles 1:M foldings
+ int f1 = 0, f2 = 0; // pending folded chars of a 1:M folding, 16 bits each, next char in the low bits
+ int k1 = off, k2 = ooff;
+ while ((k1 < last || f1 != 0) && (k2 < olast || f2 != 0)) {
+ int c1, c2;
+ if (f1 != 0) {
+ c1 = f1 & 0xffff; f1 >>>= 16; // emit the next pending folded char
+ } else {
+ c1 = StringUTF16.codePointAt(value, k1, last, true);
+ k1 += Character.charCount(c1);
+ var f = CaseFolding.fold(c1);
+ if (CaseFolding.isSingleCodePoint(f)) {
+ c1 = (int)(f & 0x1fffff); // was 0xfffff, which truncated unmapped code points in U+100000..U+10FFFF
+ } else {
+ c1 = (int)(f & 0xffff);
+ f1 = (int)(f >>> 16); // unsigned shift, consistent with the f2 branch; the cast drops the length field
+ }
+ }
+ if (f2 != 0) {
+ c2 = f2 & 0xffff; f2 >>>= 16; // emit the next pending folded char
+ } else {
+ c2 = StringUTF16.codePointAt(other, k2, olast, true);
+ k2 += Character.charCount(c2);
+ var f = CaseFolding.fold(c2);
+ if (CaseFolding.isSingleCodePoint(f)) {
+ c2 = (int)(f & 0x1fffff); // 21 bits: a code point can be as large as U+10FFFF
+ } else {
+ c2 = (int)(f & 0xffff);
+ f2 = (int)(f >>> 16); // the int cast drops the length field stored in bits 48 and up
+ }
+ }
+ if (c1 != c2) {
+ return c1 - c2; // first differing folded unit decides the order
+ }
+ }
+ if (k1 < last || f1 != 0) { // value still has input or pending folded chars
+ return 1;
+ }
+ if (k2 < olast || f2 != 0) { // other still has input or pending folded chars
+ return -1;
+ }
+ return 0;
+ }
+
+ public static int compareToFC(byte[] value, byte[] other) { // case-folding comparison of two UTF-16 byte arrays
+ int tlast = length(value);
+ int olast = length(other);
+ int lim = Math.min(tlast, olast);
+ int k = 0;
+ while (k < lim) {
+ int cp1 = codePointAt(value, k, tlast, true);
+ int cp2 = codePointAt(other, k, olast, true);
+ if (cp1 != cp2) { // fold only when the raw code points differ
+ long cf1 = CaseFolding.fold(cp1);
+ long cf2 = CaseFolding.fold(cp2);
+ if (cf1 != cf2) {
+ if (!CaseFolding.isSingleCodePoint(cf1) || !CaseFolding.isSingleCodePoint(cf2)) {
+ return compareToFC0(value, k, tlast, other, k, olast); // a 1:M folding is involved: continue from k on the slow path
+ }
+ return (int) cf1 - (int) cf2; // both foldings are single code points
+ }
+ }
+ k += Character.charCount(cp1); // advance by the char count of the code point read from value
+ }
+ return tlast - olast; // every compared position matched: negative when value is shorter
+ }
+
static int hashCode(byte[] value) {
return ArraysSupport.hashCodeOfUTF16(value, 0, value.length >> 1, 0);
}
diff --git a/src/java.base/share/classes/java/util/regex/Pattern.java b/src/java.base/share/classes/java/util/regex/Pattern.java
index 2908370acd5..58c9186924b 100644
--- a/src/java.base/share/classes/java/util/regex/Pattern.java
+++ b/src/java.base/share/classes/java/util/regex/Pattern.java
@@ -43,8 +43,8 @@ import java.util.function.Predicate;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
+import jdk.internal.lang.CaseFolding;
import jdk.internal.util.ArraysSupport;
-import jdk.internal.util.regex.CaseFolding;
import jdk.internal.util.regex.Grapheme;
/**
diff --git a/src/java.base/share/classes/jdk/internal/lang/CaseFolding.java.template b/src/java.base/share/classes/jdk/internal/lang/CaseFolding.java.template
new file mode 100644
index 00000000000..24a183c8da0
--- /dev/null
+++ b/src/java.base/share/classes/jdk/internal/lang/CaseFolding.java.template
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package jdk.internal.lang;
+
+import java.util.Arrays;
+import java.util.Map;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import static java.util.Map.entry;
+
+/**
+ * Utility class that handles Unicode case folding properties defined in
+ * CaseFolding.txt, including 1:M full case folding.
+ */
+public final class CaseFolding {
+
+ private CaseFolding() {}
+
+ /**
+ * Tests whether the specified code point has a folding mapping entry defined.
+ *
+ * @param cp
+ * the Unicode code point to test
+ * @return {@code true} if the given code point has a case folding mapping entry
+ * defined in {@code caseFoldingMap}, {@code false} otherwise
+ */
+ public static boolean isDefined(int cp) {
+ return getDefined(cp) != -1; // a -1 result from getDefined is treated as "no mapping entry"
+ }
+
+ /**
+ * Returns the case-folded form of the specified code point according
+ * to the Unicode case folding mappings.
+ *
+ * If the code point has no case folding mapping defined, this method returns
+ * the original code point.
+ *
+ * Possible combinations of the returning case-folding form as a long value
+ *
+ * +---+---------+--------+---------+--------+--------+
+ * | 1:1 mapping | 0000 | 0000 | 000x | xxxx | 0041 => 0061 or 1E921 => 1E943
+ * +---+---------+--------+---------+--------+--------+
+ * | 1:2 mapping | 0002 | 0000 | xxxx | xxxx | FB02 => 0066 006C
+ * +---+---------+--------+---------+--------+--------+
+ * | 1:3 mapping | 0003 | xxxx | xxxx | xxxx | FB03 => 0066 0066 0069
+ * +---+---------+--------+---------+--------+--------+
+ *
+ * @param cp
+ * the Unicode code point to fold
+ * @return a long value representing the case-folded form of the input
+ * code point, encoded as shown in the table above
+ */
+ public static long fold(int cp) {
+ var fold = getDefined(cp);
+ return fold == -1 ? cp : fold; // no entry: the code point itself is returned, whatever its magnitude
+ }
+
+ public static boolean isSingleCodePoint(long fold) {
+ return (fold >> 48) == 0; // bits 48 and up are zero only for the 1:1 row of the layout table in fold()'s doc
+ }
+
+ /**
+ * Returns an expansion set to "close" a given regex Unicode character class range for case-sensitive
+ * matching, according to the
+ * Simple Loose Matches
+ * rule defined in Unicode Technical Standard #18: Unicode Regular Expressions.
+ *
+ * To conform with Level 1 of UTS #18, specifically RL1.5: Simple Loose Matches, simple case folding must
+ * be applied to literals and (optionally) to character classes. When applied to character classes, each
+ * character class is expected to be closed under simple case folding. See the standard for the
+ * detailed explanation and example of "closed".
+ *
+ * RL1.5 states: To meet this requirement, an implementation that supports case-sensitive matching should
+ *
+ * In the {@code Pattern} implementation, 5 types of constructs may be case-sensitive when matching:
+ * back-refs, string slice (sequences), single, family(char-property) and class range. Single and
+ * family may appear independently or within a class.
+ *
+ * For loose/case-insensitive matching, the back-refs, slices and singles apply {@code toUpperCase} and
+ * {@code toLowerCase} to both the pattern and the input string. This effectively 'closes' the class for
+ * matching.
+ *
+ * The family/char-properties are not "closed" and should remain unchanged. This is acceptable per RL1.5,
+ * if their behavior is clearly specified.
+ *
+ * This method addresses that requirement for the "range" construct within a character class by computing
+ * the additional characters that should be included to close the range under simple case folding:
+ *
+ * For each character in the input range {@code [start, end]} (inclusive), if the character has a simple
+ * case folding mapping in Unicode's CaseFolding.txt, the mapping is not a round-trip map, and the mapped
+ * character is not already in the range, then that mapped character (typically lowercase) is added to
+ * the expansion set.
+ *
+ * This allows regex character class "range" implementation to use the returned expansion set to support
+ * additional case-insensitive matching, without duplicating characters already covered by the existing
+ * regex range implementation. The expectation is the matching is done using both the uppercase and
+ * lowercase forms of the input character, for example
+ *
+ *
- * To conform with Level 1 of UTS #18, specifically RL1.5: Simple Loose Matches, simple case folding must
- * be applied to literals and (optionally) to character classes. When applied to character classes, each
- * character class is expected to be closed under simple case folding. See the standard for the
- * detailed explanation and example of "closed".
- *
- * RL1.5 states: To meet this requirement, an implementation that supports case-sensitive matching should
- *
- * In the {@code Pattern} implementation, 5 types of constructs maybe case-sensitive when matching:
- * back-refs, string slice (sequences), single, family(char-property) and class range. Single and
- * family may appears independently or within a class.
- *
- * For loose/case-insensitive matching, the back-refs, slices and singles apply {code toUpperCase} and
- * {@code toLowerCase} to both the pattern and the input string. This effectively 'close' the class for
- * matching.
- *
- * The family/char-properties are not "closed" and should remain unchanged. This is acceptable per RL1.5,
- * if their behavior is clearly specified.
- *
- * This method addresses that requirement for the "range" construct within in character class by computing
- * the additional characters that should be included to close the range under simple case folding:
- *
- * For each character in the input range {@code [start, end]} (inclusive), if the character has a simple
- * case folding mapping in Unicode's CaseFolding.txt, the mapping is not a round-trip map, and the mapped
- * character is not already in the range, then that mapped character (typically lowercase) is added to
- * the expansion set.
- *
- * This allows regex character class "range" implementation to use the returned expansion set to support
- * additional case-insensitive matching, without duplicating characters already covered by the existing
- * regex range implementation. The expectation is the matching is done using both the uppercase and
- * lowercase forms of the input character, for example
- *
- *
- * @spec https://www.unicode.org/reports/tr18/#Simple_Loose_Matches
- * @param start the starting code point of the character range
- * @param end the ending code point of the character range
- * @return a {@code int[]} containing the all simple case equivalents of characters in the range, excluding
- * those already in the range
- */
- public static int[] getClassRangeClosingCharacters(int start, int end) {
- int[] expanded = new int[expanded_case_cps.length];
- int off = 0;
- for (int cp : expanded_case_cps) {
- if (cp >= start && cp <= end) {
- int folding = expanded_case_map.get(cp);
- if (folding < start || folding > end) {
- expanded[off++] = folding;
- }
- }
- }
- return Arrays.copyOf(expanded, off);
- }
-}
diff --git a/test/jdk/java/lang/String/UnicodeCaseFoldingTest.java b/test/jdk/java/lang/String/UnicodeCaseFoldingTest.java
new file mode 100644
index 00000000000..86b3fba0a27
--- /dev/null
+++ b/test/jdk/java/lang/String/UnicodeCaseFoldingTest.java
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+ * @test
+ * @summary tests unicode case-folding based String comparison and equality
+ * @bug 4397357
+ * @library /lib/testlibrary/java/lang
+ * @modules java.base/jdk.internal.lang:+open
+ * @run junit/othervm
+ * UnicodeCaseFoldingTest
+ */
+
+import java.nio.file.Files;
+import java.util.stream.Stream;
+import java.util.stream.Collectors;
+import java.util.ArrayList;
+
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import jdk.internal.lang.CaseFolding;
+
+public class UnicodeCaseFoldingTest {
+
+    /**
+     * Checks every common (C) and full (F) entry of CaseFolding.txt: folding the
+     * source code point must yield the listed mapping, and
+     * {@code compareToFoldCase()} / {@code equalsFoldCase()} must treat the
+     * source and its mapping as equal in both directions.
+     */
+    @Test
+    void testAllCommnFullCodePointsListedInCaseFoldinigTxt() throws Throwable {
+        var filter = "^.*; [CF]; .*$";  // C=common, F=full, for full case folding
+        // Files.lines() keeps the file open until the stream is closed, so scope
+        // it with try-with-resources.
+        try (var lines = Files.lines(UCDFiles.CASEFOLDING)) {
+            lines.filter(line -> !line.startsWith("#") && line.matches(filter))
+                 .forEach(line -> {
+                     // Line layout: "<code>; <status>; <mapping>; # <name>"
+                     var cols = line.split("; ");
+                     var cp = Integer.parseInt(cols[0], 16);
+                     // A full mapping may consist of several space-separated code points.
+                     var mapping = cols[2].trim().split(" ");
+                     var folding = new int[mapping.length];
+                     for (int i = 0; i < folding.length; i++) {
+                         folding[i] = Integer.parseInt(mapping[i], 16);
+                     }
+                     var source = new String(Character.toChars(cp));
+                     var expected = new String(folding, 0, folding.length);
+                     // (1) Verify the folding result matches expected
+                     assertEquals(expected, foldCase(source), "CaseFolding.fold(): ");
+
+                     // (2) Verify compareToFoldCase() result
+                     assertEquals(0, source.compareToFoldCase(expected), "source.compareToFoldCase(expected)");
+                     assertEquals(0, expected.compareToFoldCase(source), "expected.compareToFoldCase(source)");
+
+                     // (3) Verify equalsFoldCase() result
+                     assertEquals(true, source.equalsFoldCase(expected), "source.equalsFoldCase(expected)");
+                     assertEquals(true, expected.equalsFoldCase(source), "expected.equalsFoldCase(source)");
+                 });
+        }
+    }
+
+    /**
+     * Checks every simple (S) entry of CaseFolding.txt: the source code point
+     * and its simple mapping must still compare as equal through
+     * {@code compareToFoldCase()} and {@code equalsFoldCase()}, in both directions.
+     */
+    @Test
+    void testAllSimpleCodePointsListedInCaseFoldinigTxt() throws Throwable {
+        // S=simple, for simple case folding. The simple case folding should still match.
+        var filter = "^.*; [S]; .*$";
+        // Files.lines() keeps the file open until the stream is closed, so scope
+        // it with try-with-resources.
+        try (var lines = Files.lines(UCDFiles.CASEFOLDING)) {
+            lines.filter(line -> !line.startsWith("#") && line.matches(filter))
+                 .forEach(line -> {
+                     // Line layout: "<code>; <status>; <mapping>; # <name>"
+                     var cols = line.split("; ");
+                     var cp = Integer.parseInt(cols[0], 16);
+                     var mapping = cols[2].trim().split(" ");
+                     var folding = new int[mapping.length];
+                     for (int i = 0; i < folding.length; i++) {
+                         folding[i] = Integer.parseInt(mapping[i], 16);
+                     }
+                     var source = new String(Character.toChars(cp));
+                     var expected = new String(folding, 0, folding.length);
+
+                     // (1) Verify compareToFoldCase() result
+                     assertEquals(0, source.compareToFoldCase(expected), "source.compareToFoldCase(expected)");
+                     assertEquals(0, expected.compareToFoldCase(source), "expected.compareToFoldCase(source)");
+
+                     // (2) Verify equalsFoldCase() result
+                     assertEquals(true, source.equalsFoldCase(expected), "source.equalsFoldCase(expected)");
+                     assertEquals(true, expected.equalsFoldCase(source), "expected.equalsFoldCase(source)");
+                 });
+        }
+    }
+
+ @Test
+ public void testAllCodePointsFoldToThemselvesIfNotListed() throws Exception {
+ // Collect all code points that appear in CaseFolding.txt
+ var listed = Files.lines(UCDFiles.CASEFOLDING)
+ .filter(line -> !line.startsWith("#") && line.matches("^.*; [CF]; .*$"))
+ .map(line -> Integer.parseInt(line.split("; ")[0], 16))
+ .collect(Collectors.toSet());
+
+ var failures = new ArrayList
+ *
+ * {@code
+ *
+ * ch -> inRange(lower, Character.toUpperCase(ch), upper) ||
+ * inRange(lower, Character.toLowerCase(ch), upper) ||
+ * additionalClosingCharacters.contains(Character.toUpperCase(ch)) ||
+ * additionalClosingCharacters.contains(Character.toLowerCase(ch))
+ * }
+ *
+ * @param start the starting code point of the character range
+ * @param end the ending code point of the character range
+ * @return an {@code int[]} containing all the simple case equivalents of characters in the range, excluding
+ * those already in the range
+ * @spec https://www.unicode.org/reports/tr18/#Simple_Loose_Matches
+ */
+    public static int[] getClassRangeClosingCharacters(int start, int end) {
+        // Each table entry contributes at most one closing character, so a
+        // buffer as long as the table is always large enough.
+        int[] closing = new int[expanded_case_cps.length];
+        int count = 0;
+        for (int i = 0; i < expanded_case_cps.length; i++) {
+            int cp = expanded_case_cps[i];
+            if (cp < start || cp > end) {
+                continue;   // source code point lies outside [start, end]
+            }
+            int folding = expanded_case_map.get(cp);
+            if (folding >= start && folding <= end) {
+                continue;   // the range already covers the mapped code point
+            }
+            closing[count++] = folding;
+        }
+        // Trim the buffer to the entries actually collected.
+        return Arrays.copyOf(closing, count);
+    }
+
+ private static final Map
- *
- * {@code
- *
- * ch -> inRange(lower, Character.toUpperCase(ch), upper) ||
- * inRange(lower, Character.toLower(ch), upper) ||
- * additionalClosingCharacters.contains(Character.toUpperCase(ch)) ||
- * additionalClosingCharacters.contains(Character.toUpperCase(ch))
- * }
- *
- *