8381015: CharsetEncoder.canEncode(CharSequence) is slow for UTF-8, UTF-16, UTF-32

Reviewed-by: naoto, vyazici
This commit is contained in:
Daniel Gredler 2026-03-31 21:46:05 +00:00
parent 20c3082aac
commit f46a698113
6 changed files with 97 additions and 12 deletions

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2005, 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -185,5 +185,12 @@ class UTF_32Coder {
doneBOM = !doBOM;
}
/**
 * Tells whether this encoder can encode the given single character.
 *
 * @param c the UTF-16 code unit to check
 * @return {@code false} if {@code c} is a surrogate code unit,
 *         {@code true} otherwise
 */
public boolean canEncode(char c) {
    // A surrogate on its own is only half of a pair.
    if (Character.isSurrogate(c)) {
        return false;
    }
    return true;
}
/**
 * Tells whether this encoder can encode the given character sequence.
 * The answer is whatever {@code Unicode.isValidUnicode} reports for the
 * sequence; no encoding is attempted here.
 *
 * @param cs the character sequence to check
 * @return the result of {@code Unicode.isValidUnicode(cs)}
 */
public boolean canEncode(CharSequence cs) {
    final boolean wellFormed = Unicode.isValidUnicode(cs);
    return wellFormed;
}
}
}

View File

@ -424,6 +424,10 @@ public final class UTF_8 extends Unicode {
return !Character.isSurrogate(c);
}
/**
 * Tells whether the given character sequence can be encoded, by asking
 * {@code Unicode.isValidUnicode} instead of running the encoder.
 *
 * @param cs the character sequence to check
 * @return the result of {@code Unicode.isValidUnicode(cs)}
 */
public boolean canEncode(CharSequence cs) {
    final boolean encodable = Unicode.isValidUnicode(cs);
    return encodable;
}
public boolean isLegalReplacement(byte[] repl) {
return ((repl.length == 1 && repl[0] >= 0) ||
super.isLegalReplacement(repl));

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2005, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2005, 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -95,4 +95,23 @@ abstract class Unicode extends Charset
|| (cs.name().equals("x-Johab"))
|| (cs.name().equals("Shift_JIS")));
}
/**
 * Checks that a character sequence is well-formed UTF-16: every high
 * surrogate is immediately followed by a low surrogate, and no low
 * surrogate appears without a preceding high surrogate.
 *
 * @param cs the character sequence to scan
 * @return {@code true} if the sequence is empty or contains no unpaired
 *         surrogate; {@code false} otherwise
 */
static boolean isValidUnicode(CharSequence cs) {
    final int end = cs.length();
    int pos = 0;
    while (pos < end) {
        final char unit = cs.charAt(pos);
        pos++;
        if (Character.isLowSurrogate(unit)) {
            // A low surrogate reached here was not consumed as part of a pair.
            return false;
        }
        if (!Character.isHighSurrogate(unit)) {
            continue;
        }
        // High surrogate: the very next unit must exist and be a low surrogate.
        if (pos == end) {
            return false;
        }
        final char next = cs.charAt(pos);
        pos++;
        if (!Character.isLowSurrogate(next)) {
            return false;
        }
    }
    return true;
}
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2000, 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -108,4 +108,8 @@ public abstract class UnicodeEncoder extends CharsetEncoder {
/**
 * Tells whether the given single character can be encoded.
 *
 * @param c the UTF-16 code unit to check
 * @return {@code true} unless {@code c} is a surrogate code unit
 */
public boolean canEncode(char c) {
    final boolean surrogate = Character.isSurrogate(c);
    return !surrogate;
}
/**
 * Tells whether the given character sequence can be encoded. The check
 * is delegated to {@code Unicode.isValidUnicode}.
 *
 * @param cs the character sequence to check
 * @return the result of {@code Unicode.isValidUnicode(cs)}
 */
public boolean canEncode(CharSequence cs) {
    final boolean valid = Unicode.isValidUnicode(cs);
    return valid;
}
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2010, 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -66,14 +66,21 @@ public class CanEncode {
Charset cs = Charset.forName(csn);
CharsetEncoder ce = cs.newEncoder();
if (cs.name().equals("US-ASCII")) {
ck(ce, 'x', true);
ck(ce, '\u00B6', false);
ck(ce, "x", true);
ck(ce, "\u00B6", false);
ck(ce, "xyzzy", true);
ck(ce, "xy\u00B6", false);
}
// Basic multilingual plane
boolean utf = csn.startsWith("UTF-");
ck(ce, 'x', true);
ck(ce, '\u00B6', utf);
ck(ce, "", true);
ck(ce, "x", true);
ck(ce, "\u00B6", utf);
ck(ce, "xyzzy", true);
ck(ce, "xy\u00B6", utf);
// Paired surrogates
ck(ce, "\uD83D\uDE00", utf);
ck(ce, "XX\uD83D\uDE00", utf);
ck(ce, "\uD83D\uDE00XX", utf);
ck(ce, "X\uD83D\uDE00X", utf);
// Unpaired surrogates should never be encodable
ck(ce, '\ud800', false);
@ -81,15 +88,36 @@ public class CanEncode {
ck(ce, '\udffe', false);
ck(ce, '\udfff', false);
ck(ce, "\ud800", false);
ck(ce, "XX\ud800", false);
ck(ce, "\ud800XX", false);
ck(ce, "X\ud800X", false);
ck(ce, "\ud801", false);
ck(ce, "XX\ud801", false);
ck(ce, "\ud801XX", false);
ck(ce, "X\ud801X", false);
ck(ce, "\udffe", false);
ck(ce, "XX\udffe", false);
ck(ce, "\udffeXX", false);
ck(ce, "X\udffeX", false);
ck(ce, "\udfff", false);
ck(ce, "XX\udfff", false);
ck(ce, "\udfffXX", false);
ck(ce, "X\udfffX", false);
if (errors > 0) {
throw new RuntimeException(errors + " errors for Charset " + csn);
}
}
/**
 * Test entry point: runs {@code test} once per charset name, in the
 * fixed order listed below.
 *
 * @param args ignored
 * @throws Exception if thrown by {@code test}
 */
public static void main(String[] args) throws Exception {
    final String[] charsetNames = {
        "US-ASCII",
        "UTF-8",
        "UTF-16",
        "UTF-16LE",
        "UTF-16BE",
        "UTF-32",
        "UTF-32LE",
        "UTF-32BE",
    };
    for (String charsetName : charsetNames) {
        test(charsetName);
    }
}
}

View File

@ -65,6 +65,9 @@ public class CharsetCanEncode {
// Encoder obtained for the "UTF-16LE" charset, used by the utf16le* benchmarks.
// Original note names the implementation class: sun.nio.cs.UTF_16LE
private CharsetEncoder utf16le = Charset.forName("UTF-16LE").newEncoder();
// Encoder obtained for the "UTF-32LE" charset, used by the utf32le* benchmarks.
// Original note names the implementation class: sun.nio.cs.UTF_32LE
private CharsetEncoder utf32le = Charset.forName("UTF-32LE").newEncoder();
@Benchmark
public boolean asciiCanEncodeCharYes() {
return ascii.canEncode('D');
@ -184,4 +187,24 @@ public class CharsetCanEncode {
public boolean utf16leCanEncodeStringNo() {
    // One-character string holding only Character.MIN_SURROGATE, built on every call.
    final String loneSurrogate = String.valueOf(Character.MIN_SURROGATE);
    return utf16le.canEncode(loneSurrogate);
}
/** Calls {@code canEncode(char)} on the UTF-32LE encoder with the character {@code 'D'}. */
@Benchmark
public boolean utf32leCanEncodeCharYes() {
    final char probe = 'D';
    return utf32le.canEncode(probe);
}
/** Calls {@code canEncode(CharSequence)} on the UTF-32LE encoder with the string {@code "D"}. */
@Benchmark
public boolean utf32leCanEncodeStringYes() {
    final String probe = "D";
    return utf32le.canEncode(probe);
}
/** Calls {@code canEncode(char)} on the UTF-32LE encoder with {@code Character.MIN_SURROGATE}. */
@Benchmark
public boolean utf32leCanEncodeCharNo() {
    final char loneSurrogate = Character.MIN_SURROGATE;
    return utf32le.canEncode(loneSurrogate);
}
/**
 * Calls {@code canEncode(CharSequence)} on the UTF-32LE encoder with a
 * one-character string holding only {@code Character.MIN_SURROGATE}.
 */
@Benchmark
public boolean utf32leCanEncodeStringNo() {
    // The string is built inside the method, so its creation is part of each call.
    final String loneSurrogate = String.valueOf(Character.MIN_SURROGATE);
    return utf32le.canEncode(loneSurrogate);
}
}