8368981: Case Fold Locale Legacy Tags On Demand

Reviewed-by: rriggs, naoto
This commit is contained in:
Justin Lu 2025-10-03 16:25:55 +00:00
parent f81c7c592b
commit 23a65644ae
2 changed files with 118 additions and 108 deletions

View File

@ -34,17 +34,21 @@ package sun.util.locale;
import java.text.ParsePosition;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.IllformedLocaleException;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.StringJoiner;
// List fields are unmodifiable
public record LanguageTag(String language, String script, String region, String privateuse,
List<String> extlangs, List<String> variants, List<String> extensions) {
public record LanguageTag(String language,
String script,
String region,
String privateuse,
List<String> extlangs,
List<String> variants,
List<String> extensions) {
public static final String SEP = "-";
public static final String PRIVATEUSE = "x";
@ -53,78 +57,6 @@ public record LanguageTag(String language, String script, String region, String
private static final String EMPTY_SUBTAG = "";
private static final List<String> EMPTY_SUBTAGS = List.of();
// Map contains legacy language tags and its preferred mappings from
// http://www.ietf.org/rfc/rfc5646.txt
// Keys are lower-case strings.
private static final Map<String, String[]> LEGACY;
static {
// grandfathered = irregular ; non-redundant tags registered
// / regular ; during the RFC 3066 era
//
// irregular = "en-GB-oed" ; irregular tags do not match
// / "i-ami" ; the 'langtag' production and
// / "i-bnn" ; would not otherwise be
// / "i-default" ; considered 'well-formed'
// / "i-enochian" ; These tags are all valid,
// / "i-hak" ; but most are deprecated
// / "i-klingon" ; in favor of more modern
// / "i-lux" ; subtags or subtag
// / "i-mingo" ; combination
// / "i-navajo"
// / "i-pwn"
// / "i-tao"
// / "i-tay"
// / "i-tsu"
// / "sgn-BE-FR"
// / "sgn-BE-NL"
// / "sgn-CH-DE"
//
// regular = "art-lojban" ; these tags match the 'langtag'
// / "cel-gaulish" ; production, but their subtags
// / "no-bok" ; are not extended language
// / "no-nyn" ; or variant subtags: their meaning
// / "zh-guoyu" ; is defined by their registration
// / "zh-hakka" ; and all of these are deprecated
// / "zh-min" ; in favor of a more modern
// / "zh-min-nan" ; subtag or sequence of subtags
// / "zh-xiang"
final String[][] entries = {
//{"tag", "preferred"},
{"art-lojban", "jbo"},
{"cel-gaulish", "xtg-x-cel-gaulish"}, // fallback
{"en-GB-oed", "en-GB-x-oed"}, // fallback
{"i-ami", "ami"},
{"i-bnn", "bnn"},
{"i-default", "en-x-i-default"}, // fallback
{"i-enochian", "und-x-i-enochian"}, // fallback
{"i-hak", "hak"},
{"i-klingon", "tlh"},
{"i-lux", "lb"},
{"i-mingo", "see-x-i-mingo"}, // fallback
{"i-navajo", "nv"},
{"i-pwn", "pwn"},
{"i-tao", "tao"},
{"i-tay", "tay"},
{"i-tsu", "tsu"},
{"no-bok", "nb"},
{"no-nyn", "nn"},
{"sgn-BE-FR", "sfb"},
{"sgn-BE-NL", "vgt"},
{"sgn-CH-DE", "sgg"},
{"zh-guoyu", "cmn"},
{"zh-hakka", "hak"},
{"zh-min", "nan-x-zh-min"}, // fallback
{"zh-min-nan", "nan"},
{"zh-xiang", "hsn"},
};
LEGACY = HashMap.newHashMap(entries.length);
for (String[] e : entries) {
LEGACY.put(LocaleUtils.toLowerString(e[0]), e);
}
}
/*
* BNF in RFC5646
*
@ -175,14 +107,10 @@ public record LanguageTag(String language, String script, String region, String
StringTokenIterator itr;
var errorMsg = new StringBuilder();
// Check if the tag is a legacy language tag
String[] gfmap = LEGACY.get(LocaleUtils.toLowerString(languageTag));
if (gfmap != null) {
// use preferred mapping
itr = new StringTokenIterator(gfmap[1], SEP);
} else {
itr = new StringTokenIterator(languageTag, SEP);
}
// Check if the tag is a legacy tag
var pref = legacyToPreferred(LocaleUtils.toLowerString(languageTag));
// If legacy use preferred mapping, otherwise use the tag as is
itr = new StringTokenIterator(Objects.requireNonNullElse(pref, languageTag), SEP);
String language = parseLanguage(itr, pp);
List<String> extlangs;
@ -400,15 +328,24 @@ public record LanguageTag(String language, String script, String region, String
public static String caseFoldTag(String tag) {
parse(tag, new ParsePosition(0), false);
StringBuilder bldr = new StringBuilder(tag.length());
String[] subtags = tag.split(SEP);
// Legacy tags
String potentialLegacy = tag.toLowerCase(Locale.ROOT);
if (LEGACY.containsKey(potentialLegacy)) {
return LEGACY.get(potentialLegacy)[0];
if (legacyToPreferred(tag.toLowerCase(Locale.ROOT)) != null) {
// Fold the legacy tag
for (int i = 0; i < subtags.length ; i++) {
// 2 ALPHA Region subtag(s) are upper, all other subtags are lower
if (i > 0 && subtags[i].length() == 2) {
bldr.append(LocaleUtils.toUpperString(subtags[i])).append(SEP);
} else {
bldr.append(LocaleUtils.toLowerString(subtags[i])).append(SEP);
}
}
bldr.setLength(bldr.length() - 1); // Remove trailing '-'
return bldr.toString();
}
// Non-legacy tags
StringBuilder bldr = new StringBuilder(tag.length());
String[] subtags = tag.split("-");
boolean privateFound = false;
boolean singletonFound = false;
boolean privUseVarFound = false;
@ -435,7 +372,7 @@ public record LanguageTag(String language, String script, String region, String
bldr.append(subtag.toLowerCase(Locale.ROOT));
}
if (i != subtags.length-1) {
bldr.append("-");
bldr.append(SEP);
}
}
return bldr.substring(0);
@ -567,6 +504,47 @@ public record LanguageTag(String language, String script, String region, String
return new LanguageTag(language, script, region, privateuse, EMPTY_SUBTAGS, variants, extensions);
}
/*
* Converts a legacy tag to its preferred mapping if it exists, otherwise null.
* The keys are mapped and stored as lower case. (Folded on demand).
* See http://www.ietf.org/rfc/rfc5646.txt Section 2.1 and 2.2.8 for the
* full syntax and case accurate legacy tags.
*/
private static String legacyToPreferred(String tag) {
if (tag.length() < 5) {
return null;
}
return switch (tag) {
case "art-lojban" -> "jbo";
case "cel-gaulish" -> "xtg-x-cel-gaulish"; // fallback
case "en-gb-oed" -> "en-GB-x-oed"; // fallback
case "i-ami" -> "ami";
case "i-bnn" -> "bnn";
case "i-default" -> "en-x-i-default"; // fallback
case "i-enochian" -> "und-x-i-enochian"; // fallback
case "i-hak",
"zh-hakka" -> "hak";
case "i-klingon" -> "tlh";
case "i-lux" -> "lb";
case "i-mingo" -> "see-x-i-mingo"; // fallback
case "i-navajo" -> "nv";
case "i-pwn" -> "pwn";
case "i-tao" -> "tao";
case "i-tay" -> "tay";
case "i-tsu" -> "tsu";
case "no-bok" -> "nb";
case "no-nyn" -> "nn";
case "sgn-be-fr" -> "sfb";
case "sgn-be-nl" -> "vgt";
case "sgn-ch-de" -> "sgg";
case "zh-guoyu" -> "cmn";
case "zh-min" -> "nan-x-zh-min"; // fallback
case "zh-min-nan" -> "nan";
case "zh-xiang" -> "hsn";
default -> null;
};
}
//
// Language subtag syntax checking methods
//

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2023, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -23,12 +23,11 @@
/*
* @test
* @bug 8159337
* @bug 8159337 8368981
* @summary Test Locale.caseFoldLanguageTag(String languageTag)
* @run junit CaseFoldLanguageTagTest
*/
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
@ -54,24 +53,67 @@ public class CaseFoldLanguageTagTest {
@ParameterizedTest
@MethodSource("wellFormedTags")
public void wellFormedTags(String tag, String foldedTag) {
void wellFormedTagsTest(String tag, String foldedTag) {
assertEquals(foldedTag, Locale.caseFoldLanguageTag(tag), String.format("Folded %s", tag));
}
@ParameterizedTest
@MethodSource("legacyTags")
void legacyTagsTest(String tag) {
var lowerTag = tag.toLowerCase(Locale.ROOT);
var upperTag = tag.toUpperCase(Locale.ROOT);
assertEquals(tag, Locale.caseFoldLanguageTag(lowerTag),
String.format("Folded %s", lowerTag));
assertEquals(tag, Locale.caseFoldLanguageTag(upperTag),
String.format("Folded %s", upperTag));
}
@ParameterizedTest
@MethodSource("illFormedTags")
public void illFormedTags(String tag) {
void illFormedTagsTest(String tag) {
assertThrows(IllformedLocaleException.class, () ->
Locale.caseFoldLanguageTag(tag));
}
@Test
public void throwNPE() {
void throwNPETest() {
assertThrows(NullPointerException.class, () ->
Locale.caseFoldLanguageTag(null));
}
private static Stream<Arguments> wellFormedTags() {
// Well-formed legacy tags in expected case
static Stream<String> legacyTags() {
return Stream.of(
"art-lojban",
"cel-gaulish",
"en-GB-oed",
"i-ami",
"i-bnn",
"i-default",
"i-enochian",
"i-hak",
"i-klingon",
"i-lux",
"i-mingo",
"i-navajo",
"i-pwn",
"i-tao",
"i-tay",
"i-tsu",
"no-bok",
"no-nyn",
"sgn-BE-FR",
"sgn-BE-NL",
"sgn-CH-DE",
"zh-guoyu",
"zh-hakka",
"zh-min",
"zh-min-nan",
"zh-xiang"
);
}
static Stream<Arguments> wellFormedTags() {
return Stream.of(
// langtag tests
// language
@ -124,16 +166,6 @@ public class CaseFoldLanguageTagTest {
Arguments.of("X-A-ABC", "x-a-abc"), // private w/ extended (incl. 1)
Arguments.of("X-A-AB-Abcd", "x-a-ab-abcd"), // private w/ extended (incl. 1, 2, 4)
// Legacy tests
// irregular
Arguments.of("I-AMI", "i-ami"),
Arguments.of("EN-gb-OED", "en-GB-oed"),
Arguments.of("SGN-be-fr", "sgn-BE-FR"),
// regular
Arguments.of("NO-BOK", "no-bok"),
Arguments.of("CEL-GAULISH", "cel-gaulish"),
Arguments.of("ZH-MIN-NAN", "zh-min-nan"),
// Special JDK Cases (Variant and x-lvariant)
Arguments.of("de-POSIX-x-URP-lvariant-Abc-Def", "de-POSIX-x-urp-lvariant-Abc-Def"),
Arguments.of("JA-JPAN-JP-U-CA-JAPANESE-x-RANDOM-lvariant-JP",
@ -150,7 +182,7 @@ public class CaseFoldLanguageTagTest {
);
}
private static Stream<Arguments> illFormedTags() {
static Stream<Arguments> illFormedTags() {
return Stream.of(
// Starts with non-language
Arguments.of("xabadadoo-me"),