mirror of
https://github.com/openjdk/jdk.git
synced 2026-04-03 11:38:44 +00:00
8381015: CharsetEncoder.canEncode(CharSequence) is slow for UTF-8, UTF-16, UTF-32
Reviewed-by: naoto, vyazici
This commit is contained in:
parent
20c3082aac
commit
f46a698113
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2005, 2026, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -185,5 +185,12 @@ class UTF_32Coder {
|
||||
doneBOM = !doBOM;
|
||||
}
|
||||
|
||||
public boolean canEncode(char c) {
|
||||
return !Character.isSurrogate(c);
|
||||
}
|
||||
|
||||
public boolean canEncode(CharSequence cs) {
|
||||
return Unicode.isValidUnicode(cs);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -424,6 +424,10 @@ public final class UTF_8 extends Unicode {
|
||||
return !Character.isSurrogate(c);
|
||||
}
|
||||
|
||||
public boolean canEncode(CharSequence cs) {
|
||||
return Unicode.isValidUnicode(cs);
|
||||
}
|
||||
|
||||
public boolean isLegalReplacement(byte[] repl) {
|
||||
return ((repl.length == 1 && repl[0] >= 0) ||
|
||||
super.isLegalReplacement(repl));
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2023, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2005, 2026, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -95,4 +95,23 @@ abstract class Unicode extends Charset
|
||||
|| (cs.name().equals("x-Johab"))
|
||||
|| (cs.name().equals("Shift_JIS")));
|
||||
}
|
||||
|
||||
static boolean isValidUnicode(CharSequence cs) {
|
||||
int length = cs.length();
|
||||
for (int i = 0; i < length;) {
|
||||
char c = cs.charAt(i++);
|
||||
if (Character.isHighSurrogate(c)) {
|
||||
if (i == length) {
|
||||
return false;
|
||||
}
|
||||
char low = cs.charAt(i++);
|
||||
if (!Character.isLowSurrogate(low)) {
|
||||
return false;
|
||||
}
|
||||
} else if (Character.isLowSurrogate(c)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2000, 2026, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -108,4 +108,8 @@ public abstract class UnicodeEncoder extends CharsetEncoder {
|
||||
public boolean canEncode(char c) {
|
||||
return ! Character.isSurrogate(c);
|
||||
}
|
||||
|
||||
public boolean canEncode(CharSequence cs) {
|
||||
return Unicode.isValidUnicode(cs);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2010, 2026, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -66,14 +66,21 @@ public class CanEncode {
|
||||
Charset cs = Charset.forName(csn);
|
||||
CharsetEncoder ce = cs.newEncoder();
|
||||
|
||||
if (cs.name().equals("US-ASCII")) {
|
||||
ck(ce, 'x', true);
|
||||
ck(ce, '\u00B6', false);
|
||||
ck(ce, "x", true);
|
||||
ck(ce, "\u00B6", false);
|
||||
ck(ce, "xyzzy", true);
|
||||
ck(ce, "xy\u00B6", false);
|
||||
}
|
||||
// Basic multilingual plane
|
||||
boolean utf = csn.startsWith("UTF-");
|
||||
ck(ce, 'x', true);
|
||||
ck(ce, '\u00B6', utf);
|
||||
ck(ce, "", true);
|
||||
ck(ce, "x", true);
|
||||
ck(ce, "\u00B6", utf);
|
||||
ck(ce, "xyzzy", true);
|
||||
ck(ce, "xy\u00B6", utf);
|
||||
|
||||
// Paired surrogates
|
||||
ck(ce, "\uD83D\uDE00", utf);
|
||||
ck(ce, "XX\uD83D\uDE00", utf);
|
||||
ck(ce, "\uD83D\uDE00XX", utf);
|
||||
ck(ce, "X\uD83D\uDE00X", utf);
|
||||
|
||||
// Unpaired surrogates should never be encodable
|
||||
ck(ce, '\ud800', false);
|
||||
@ -81,15 +88,36 @@ public class CanEncode {
|
||||
ck(ce, '\udffe', false);
|
||||
ck(ce, '\udfff', false);
|
||||
ck(ce, "\ud800", false);
|
||||
ck(ce, "XX\ud800", false);
|
||||
ck(ce, "\ud800XX", false);
|
||||
ck(ce, "X\ud800X", false);
|
||||
ck(ce, "\ud801", false);
|
||||
ck(ce, "XX\ud801", false);
|
||||
ck(ce, "\ud801XX", false);
|
||||
ck(ce, "X\ud801X", false);
|
||||
ck(ce, "\udffe", false);
|
||||
ck(ce, "XX\udffe", false);
|
||||
ck(ce, "\udffeXX", false);
|
||||
ck(ce, "X\udffeX", false);
|
||||
ck(ce, "\udfff", false);
|
||||
ck(ce, "XX\udfff", false);
|
||||
ck(ce, "\udfffXX", false);
|
||||
ck(ce, "X\udfffX", false);
|
||||
|
||||
if (errors > 0) {
|
||||
throw new RuntimeException(errors + " errors for Charset " + csn);
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
test("US-ASCII");
|
||||
test("UTF-8");
|
||||
test("UTF-16");
|
||||
test("UTF-16LE");
|
||||
test("UTF-16BE");
|
||||
test("UTF-32");
|
||||
test("UTF-32LE");
|
||||
test("UTF-32BE");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -65,6 +65,9 @@ public class CharsetCanEncode {
|
||||
// sun.nio.cs.UTF_16LE
|
||||
private CharsetEncoder utf16le = Charset.forName("UTF-16LE").newEncoder();
|
||||
|
||||
// sun.nio.cs.UTF_32LE
|
||||
private CharsetEncoder utf32le = Charset.forName("UTF-32LE").newEncoder();
|
||||
|
||||
@Benchmark
|
||||
public boolean asciiCanEncodeCharYes() {
|
||||
return ascii.canEncode('D');
|
||||
@ -184,4 +187,24 @@ public class CharsetCanEncode {
|
||||
public boolean utf16leCanEncodeStringNo() {
|
||||
return utf16le.canEncode(String.valueOf(Character.MIN_SURROGATE));
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public boolean utf32leCanEncodeCharYes() {
|
||||
return utf32le.canEncode('D');
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public boolean utf32leCanEncodeStringYes() {
|
||||
return utf32le.canEncode("D");
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public boolean utf32leCanEncodeCharNo() {
|
||||
return utf32le.canEncode(Character.MIN_SURROGATE);
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public boolean utf32leCanEncodeStringNo() {
|
||||
return utf32le.canEncode(String.valueOf(Character.MIN_SURROGATE));
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user