diff --git a/src/java.base/share/classes/sun/nio/cs/UTF_32Coder.java b/src/java.base/share/classes/sun/nio/cs/UTF_32Coder.java index c6f38ec9bfc..72e59d22e2c 100644 --- a/src/java.base/share/classes/sun/nio/cs/UTF_32Coder.java +++ b/src/java.base/share/classes/sun/nio/cs/UTF_32Coder.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -185,5 +185,12 @@ class UTF_32Coder { doneBOM = !doBOM; } + public boolean canEncode(char c) { + return !Character.isSurrogate(c); + } + + public boolean canEncode(CharSequence cs) { + return Unicode.isValidUnicode(cs); + } } } diff --git a/src/java.base/share/classes/sun/nio/cs/UTF_8.java b/src/java.base/share/classes/sun/nio/cs/UTF_8.java index 2928ae6d509..fda8e5eec1f 100644 --- a/src/java.base/share/classes/sun/nio/cs/UTF_8.java +++ b/src/java.base/share/classes/sun/nio/cs/UTF_8.java @@ -424,6 +424,10 @@ public final class UTF_8 extends Unicode { return !Character.isSurrogate(c); } + public boolean canEncode(CharSequence cs) { + return Unicode.isValidUnicode(cs); + } + public boolean isLegalReplacement(byte[] repl) { return ((repl.length == 1 && repl[0] >= 0) || super.isLegalReplacement(repl)); diff --git a/src/java.base/share/classes/sun/nio/cs/Unicode.java b/src/java.base/share/classes/sun/nio/cs/Unicode.java index aac77a13ffb..06a50f125c5 100644 --- a/src/java.base/share/classes/sun/nio/cs/Unicode.java +++ b/src/java.base/share/classes/sun/nio/cs/Unicode.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, 2023, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -95,4 +95,23 @@ abstract class Unicode extends Charset || (cs.name().equals("x-Johab")) || (cs.name().equals("Shift_JIS"))); } + + static boolean isValidUnicode(CharSequence cs) { + int length = cs.length(); + for (int i = 0; i < length;) { + char c = cs.charAt(i++); + if (Character.isHighSurrogate(c)) { + if (i == length) { + return false; + } + char low = cs.charAt(i++); + if (!Character.isLowSurrogate(low)) { + return false; + } + } else if (Character.isLowSurrogate(c)) { + return false; + } + } + return true; + } } diff --git a/src/java.base/share/classes/sun/nio/cs/UnicodeEncoder.java b/src/java.base/share/classes/sun/nio/cs/UnicodeEncoder.java index 7b34fb2d512..6f7413dcbf8 100644 --- a/src/java.base/share/classes/sun/nio/cs/UnicodeEncoder.java +++ b/src/java.base/share/classes/sun/nio/cs/UnicodeEncoder.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2000, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -108,4 +108,8 @@ public abstract class UnicodeEncoder extends CharsetEncoder { public boolean canEncode(char c) { return ! Character.isSurrogate(c); } + + public boolean canEncode(CharSequence cs) { + return Unicode.isValidUnicode(cs); + } } diff --git a/test/jdk/java/nio/charset/CharsetEncoder/CanEncode.java b/test/jdk/java/nio/charset/CharsetEncoder/CanEncode.java index 8545ef61be9..d4dabf70910 100644 --- a/test/jdk/java/nio/charset/CharsetEncoder/CanEncode.java +++ b/test/jdk/java/nio/charset/CharsetEncoder/CanEncode.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2010, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -66,14 +66,21 @@ public class CanEncode { Charset cs = Charset.forName(csn); CharsetEncoder ce = cs.newEncoder(); - if (cs.name().equals("US-ASCII")) { - ck(ce, 'x', true); - ck(ce, '\u00B6', false); - ck(ce, "x", true); - ck(ce, "\u00B6", false); - ck(ce, "xyzzy", true); - ck(ce, "xy\u00B6", false); - } + // Basic multilingual plane + boolean utf = csn.startsWith("UTF-"); + ck(ce, 'x', true); + ck(ce, '\u00B6', utf); + ck(ce, "", true); + ck(ce, "x", true); + ck(ce, "\u00B6", utf); + ck(ce, "xyzzy", true); + ck(ce, "xy\u00B6", utf); + + // Paired surrogates + ck(ce, "\uD83D\uDE00", utf); + ck(ce, "XX\uD83D\uDE00", utf); + ck(ce, "\uD83D\uDE00XX", utf); + ck(ce, "X\uD83D\uDE00X", utf); // Unpaired surrogates should never be encodable ck(ce, '\ud800', false); @@ -81,15 +88,36 @@ public class CanEncode { ck(ce, '\udffe', false); ck(ce, '\udfff', false); ck(ce, "\ud800", false); + ck(ce, "XX\ud800", false); + ck(ce, "\ud800XX", false); + ck(ce, "X\ud800X", false); ck(ce, "\ud801", false); + ck(ce, "XX\ud801", false); + ck(ce, "\ud801XX", false); + ck(ce, "X\ud801X", false); ck(ce, "\udffe", false); + ck(ce, "XX\udffe", false); + ck(ce, "\udffeXX", false); + ck(ce, "X\udffeX", false); ck(ce, "\udfff", false); + ck(ce, "XX\udfff", false); + ck(ce, "\udfffXX", false); + ck(ce, "X\udfffX", false); + if (errors > 0) { + throw new RuntimeException(errors + " errors for Charset " + csn); + } } public static void main(String[] args) throws Exception { test("US-ASCII"); test("UTF-8"); + test("UTF-16"); + test("UTF-16LE"); + test("UTF-16BE"); + test("UTF-32"); + test("UTF-32LE"); + test("UTF-32BE"); } } diff --git a/test/micro/org/openjdk/bench/java/nio/CharsetCanEncode.java b/test/micro/org/openjdk/bench/java/nio/CharsetCanEncode.java index ebfbc217a95..8c08a876696 100644 --- a/test/micro/org/openjdk/bench/java/nio/CharsetCanEncode.java +++ b/test/micro/org/openjdk/bench/java/nio/CharsetCanEncode.java @@ -65,6 +65,9 @@ public class CharsetCanEncode { // sun.nio.cs.UTF_16LE private CharsetEncoder utf16le = Charset.forName("UTF-16LE").newEncoder(); + // sun.nio.cs.UTF_32LE + private CharsetEncoder utf32le = Charset.forName("UTF-32LE").newEncoder(); + @Benchmark public boolean asciiCanEncodeCharYes() { return ascii.canEncode('D'); @@ -184,4 +187,24 @@ public class CharsetCanEncode { public boolean utf16leCanEncodeStringNo() { return utf16le.canEncode(String.valueOf(Character.MIN_SURROGATE)); } + + @Benchmark + public boolean utf32leCanEncodeCharYes() { + return utf32le.canEncode('D'); + } + + @Benchmark + public boolean utf32leCanEncodeStringYes() { + return utf32le.canEncode("D"); + } + + @Benchmark + public boolean utf32leCanEncodeCharNo() { + return utf32le.canEncode(Character.MIN_SURROGATE); + } + + @Benchmark + public boolean utf32leCanEncodeStringNo() { + return utf32le.canEncode(String.valueOf(Character.MIN_SURROGATE)); + } }