diff --git a/src/java.base/share/classes/sun/util/locale/LanguageTag.java b/src/java.base/share/classes/sun/util/locale/LanguageTag.java index 6036c1dd04f..0b2fee7f2cd 100644 --- a/src/java.base/share/classes/sun/util/locale/LanguageTag.java +++ b/src/java.base/share/classes/sun/util/locale/LanguageTag.java @@ -34,17 +34,21 @@ package sun.util.locale; import java.text.ParsePosition; import java.util.ArrayList; import java.util.Collections; -import java.util.HashMap; import java.util.IllformedLocaleException; import java.util.List; import java.util.Locale; -import java.util.Map; +import java.util.Objects; import java.util.Set; import java.util.StringJoiner; // List fields are unmodifiable -public record LanguageTag(String language, String script, String region, String privateuse, - List extlangs, List variants, List extensions) { +public record LanguageTag(String language, + String script, + String region, + String privateuse, + List extlangs, + List variants, + List extensions) { public static final String SEP = "-"; public static final String PRIVATEUSE = "x"; @@ -53,78 +57,6 @@ public record LanguageTag(String language, String script, String region, String private static final String EMPTY_SUBTAG = ""; private static final List EMPTY_SUBTAGS = List.of(); - // Map contains legacy language tags and its preferred mappings from - // http://www.ietf.org/rfc/rfc5646.txt - // Keys are lower-case strings. 
- private static final Map LEGACY; - - static { - // grandfathered = irregular ; non-redundant tags registered - // / regular ; during the RFC 3066 era - // - // irregular = "en-GB-oed" ; irregular tags do not match - // / "i-ami" ; the 'langtag' production and - // / "i-bnn" ; would not otherwise be - // / "i-default" ; considered 'well-formed' - // / "i-enochian" ; These tags are all valid, - // / "i-hak" ; but most are deprecated - // / "i-klingon" ; in favor of more modern - // / "i-lux" ; subtags or subtag - // / "i-mingo" ; combination - // / "i-navajo" - // / "i-pwn" - // / "i-tao" - // / "i-tay" - // / "i-tsu" - // / "sgn-BE-FR" - // / "sgn-BE-NL" - // / "sgn-CH-DE" - // - // regular = "art-lojban" ; these tags match the 'langtag' - // / "cel-gaulish" ; production, but their subtags - // / "no-bok" ; are not extended language - // / "no-nyn" ; or variant subtags: their meaning - // / "zh-guoyu" ; is defined by their registration - // / "zh-hakka" ; and all of these are deprecated - // / "zh-min" ; in favor of a more modern - // / "zh-min-nan" ; subtag or sequence of subtags - // / "zh-xiang" - - final String[][] entries = { - //{"tag", "preferred"}, - {"art-lojban", "jbo"}, - {"cel-gaulish", "xtg-x-cel-gaulish"}, // fallback - {"en-GB-oed", "en-GB-x-oed"}, // fallback - {"i-ami", "ami"}, - {"i-bnn", "bnn"}, - {"i-default", "en-x-i-default"}, // fallback - {"i-enochian", "und-x-i-enochian"}, // fallback - {"i-hak", "hak"}, - {"i-klingon", "tlh"}, - {"i-lux", "lb"}, - {"i-mingo", "see-x-i-mingo"}, // fallback - {"i-navajo", "nv"}, - {"i-pwn", "pwn"}, - {"i-tao", "tao"}, - {"i-tay", "tay"}, - {"i-tsu", "tsu"}, - {"no-bok", "nb"}, - {"no-nyn", "nn"}, - {"sgn-BE-FR", "sfb"}, - {"sgn-BE-NL", "vgt"}, - {"sgn-CH-DE", "sgg"}, - {"zh-guoyu", "cmn"}, - {"zh-hakka", "hak"}, - {"zh-min", "nan-x-zh-min"}, // fallback - {"zh-min-nan", "nan"}, - {"zh-xiang", "hsn"}, - }; - LEGACY = HashMap.newHashMap(entries.length); - for (String[] e : entries) { - 
LEGACY.put(LocaleUtils.toLowerString(e[0]), e); - } - } - /* * BNF in RFC5646 * @@ -175,14 +107,10 @@ public record LanguageTag(String language, String script, String region, String StringTokenIterator itr; var errorMsg = new StringBuilder(); - // Check if the tag is a legacy language tag - String[] gfmap = LEGACY.get(LocaleUtils.toLowerString(languageTag)); - if (gfmap != null) { - // use preferred mapping - itr = new StringTokenIterator(gfmap[1], SEP); - } else { - itr = new StringTokenIterator(languageTag, SEP); - } + // Look up the lower-cased tag; pref is null unless it is a legacy tag + var pref = legacyToPreferred(LocaleUtils.toLowerString(languageTag)); + // Tokenize the preferred mapping of a legacy tag, otherwise the tag as is + itr = new StringTokenIterator(Objects.requireNonNullElse(pref, languageTag), SEP); String language = parseLanguage(itr, pp); List extlangs; @@ -400,15 +328,24 @@ public record LanguageTag(String language, String script, String region, String public static String caseFoldTag(String tag) { parse(tag, new ParsePosition(0), false); + StringBuilder bldr = new StringBuilder(tag.length()); + String[] subtags = tag.split(SEP); // Legacy tags - String potentialLegacy = tag.toLowerCase(Locale.ROOT); - if (LEGACY.containsKey(potentialLegacy)) { - return LEGACY.get(potentialLegacy)[0]; + if (legacyToPreferred(tag.toLowerCase(Locale.ROOT)) != null) { + // Legacy tag: fold the subtags of the given tag, not its preferred mapping + for (int i = 0; i < subtags.length ; i++) { + // A 2 ALPHA subtag after the first is a Region and is upper, all other subtags are lower + if (i > 0 && subtags[i].length() == 2) { + bldr.append(LocaleUtils.toUpperString(subtags[i])).append(SEP); + } else { + bldr.append(LocaleUtils.toLowerString(subtags[i])).append(SEP); + } + } + bldr.setLength(bldr.length() - 1); // Remove the SEP appended after the last subtag + return bldr.toString(); } // Non-legacy tags - StringBuilder bldr = new StringBuilder(tag.length()); - String[] subtags = 
tag.split("-"); boolean privateFound = false; boolean singletonFound = false; boolean privUseVarFound = false; @@ -435,7 +372,7 @@ 
public record LanguageTag(String language, String script, String region, String bldr.append(subtag.toLowerCase(Locale.ROOT)); } if (i != subtags.length-1) { - bldr.append("-"); + bldr.append(SEP); } } return bldr.substring(0); @@ -567,6 +504,47 @@ public record LanguageTag(String language, String script, String region, String return new LanguageTag(language, script, region, privateuse, EMPTY_SUBTAGS, variants, extensions); } + /* + * Returns the preferred mapping of a legacy tag, or null if the tag is not a legacy tag. + * The tag must already be lower case, as the case labels are; callers lower-case it first. + * See http://www.ietf.org/rfc/rfc5646.txt Section 2.1 and 2.2.8 for the + * full syntax and case accurate legacy tags. + */ + private static String legacyToPreferred(String tag) { + if (tag.length() < 5) { + return null; + } + return switch (tag) { + case "art-lojban" -> "jbo"; + case "cel-gaulish" -> "xtg-x-cel-gaulish"; // fallback + case "en-gb-oed" -> "en-GB-x-oed"; // fallback + case "i-ami" -> "ami"; + case "i-bnn" -> "bnn"; + case "i-default" -> "en-x-i-default"; // fallback + case "i-enochian" -> "und-x-i-enochian"; // fallback + case "i-hak", + "zh-hakka" -> "hak"; + case "i-klingon" -> "tlh"; + case "i-lux" -> "lb"; + case "i-mingo" -> "see-x-i-mingo"; // fallback + case "i-navajo" -> "nv"; + case "i-pwn" -> "pwn"; + case "i-tao" -> "tao"; + case "i-tay" -> "tay"; + case "i-tsu" -> "tsu"; + case "no-bok" -> "nb"; + case "no-nyn" -> "nn"; + case "sgn-be-fr" -> "sfb"; + case "sgn-be-nl" -> "vgt"; + case "sgn-ch-de" -> "sgg"; + case "zh-guoyu" -> "cmn"; + case "zh-min" -> "nan-x-zh-min"; // fallback + case "zh-min-nan" -> "nan"; + case "zh-xiang" -> "hsn"; + default -> null; + }; + } + // // Language subtag syntax checking methods // diff --git a/test/jdk/java/util/Locale/CaseFoldLanguageTagTest.java b/test/jdk/java/util/Locale/CaseFoldLanguageTagTest.java index fdee5075229..f3babb7e4c2 100644 --- a/test/jdk/java/util/Locale/CaseFoldLanguageTagTest.java +++ 
b/test/jdk/java/util/Locale/CaseFoldLanguageTagTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2023, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -23,12 +23,11 @@ /* * @test - * @bug 8159337 + * @bug 8159337 8368981 * @summary Test Locale.caseFoldLanguageTag(String languageTag) * @run junit CaseFoldLanguageTagTest */ -import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; @@ -54,24 +53,67 @@ public class CaseFoldLanguageTagTest { @ParameterizedTest @MethodSource("wellFormedTags") - public void wellFormedTags(String tag, String foldedTag) { + void wellFormedTagsTest(String tag, String foldedTag) { assertEquals(foldedTag, Locale.caseFoldLanguageTag(tag), String.format("Folded %s", tag)); } + @ParameterizedTest + @MethodSource("legacyTags") + void legacyTagsTest(String tag) { + var lowerTag = tag.toLowerCase(Locale.ROOT); + var upperTag = tag.toUpperCase(Locale.ROOT); + assertEquals(tag, Locale.caseFoldLanguageTag(lowerTag), + String.format("Folded %s", lowerTag)); + assertEquals(tag, Locale.caseFoldLanguageTag(upperTag), + String.format("Folded %s", upperTag)); + } + @ParameterizedTest @MethodSource("illFormedTags") - public void illFormedTags(String tag) { + void illFormedTagsTest(String tag) { assertThrows(IllformedLocaleException.class, () -> Locale.caseFoldLanguageTag(tag)); } @Test - public void throwNPE() { + void throwNPETest() { assertThrows(NullPointerException.class, () -> Locale.caseFoldLanguageTag(null)); } - private static Stream wellFormedTags() { + // Legacy tags in the exact case that Locale.caseFoldLanguageTag is expected to return + static Stream legacyTags() { + return Stream.of( + "art-lojban", + 
"cel-gaulish", + "en-GB-oed", 
"i-ami", + "i-bnn", + "i-default", + "i-enochian", + "i-hak", + "i-klingon", + "i-lux", + "i-mingo", + "i-navajo", + "i-pwn", + "i-tao", + "i-tay", + "i-tsu", + "no-bok", + "no-nyn", + "sgn-BE-FR", + "sgn-BE-NL", + "sgn-CH-DE", + "zh-guoyu", + "zh-hakka", + "zh-min", + "zh-min-nan", + "zh-xiang" + ); + } + + static Stream wellFormedTags() { return Stream.of( // langtag tests // language @@ -124,16 +166,6 @@ public class CaseFoldLanguageTagTest { Arguments.of("X-A-ABC", "x-a-abc"), // private w/ extended (incl. 1) Arguments.of("X-A-AB-Abcd", "x-a-ab-abcd"), // private w/ extended (incl. 1, 2, 4) - // Legacy tests - // irregular - Arguments.of("I-AMI", "i-ami"), - Arguments.of("EN-gb-OED", "en-GB-oed"), - Arguments.of("SGN-be-fr", "sgn-BE-FR"), - // regular - Arguments.of("NO-BOK", "no-bok"), - Arguments.of("CEL-GAULISH", "cel-gaulish"), - Arguments.of("ZH-MIN-NAN", "zh-min-nan"), - // Special JDK Cases (Variant and x-lvariant) Arguments.of("de-POSIX-x-URP-lvariant-Abc-Def", "de-POSIX-x-urp-lvariant-Abc-Def"), Arguments.of("JA-JPAN-JP-U-CA-JAPANESE-x-RANDOM-lvariant-JP", @@ -150,7 +182,7 @@ public class CaseFoldLanguageTagTest { ); } - private static Stream illFormedTags() { + static Stream illFormedTags() { return Stream.of( // Starts with non-language Arguments.of("xabadadoo-me"),