8372353: API to compute the byte length of a String encoded in a given Charset

Reviewed-by: rriggs, naoto, vyazici
This commit is contained in:
Liam Miller-Cushon 2026-03-04 17:33:32 +00:00
parent 8b91537f10
commit 0fbf58d8ff
5 changed files with 235 additions and 13 deletions

View File

@ -1133,6 +1133,34 @@ public final class String
return Arrays.copyOf(dst, dp);
}
// Mirrors the counting done by encodeASCII and encode8859_1, without
// producing the destination array.
private static int encodedLengthASCIIor8859_1(byte coder, byte[] val) {
    if (coder == LATIN1) {
        // One output byte per stored byte.
        return val.length;
    }
    final int charCount = val.length >> 1;
    int index = 0;
    // Prefix of chars below the surrogate range: one byte each.
    while (index < charCount
            && StringUTF16.getChar(val, index) < Character.MIN_HIGH_SURROGATE) {
        index++;
    }
    int byteCount = index;
    // Remainder: each char contributes one byte, except that a
    // well-formed surrogate pair contributes a single byte for both chars.
    while (index < charCount) {
        char current = StringUTF16.getChar(val, index++);
        if (Character.isHighSurrogate(current) && index < charCount) {
            if (Character.isLowSurrogate(StringUTF16.getChar(val, index))) {
                index++;
            }
        }
        byteCount++;
    }
    return byteCount;
}
//------------------------------ utf8 ------------------------------------
/**
@ -1467,6 +1495,27 @@ public final class String
return Arrays.copyOf(dst, dp);
}
// This follows the implementation of encodeUTF8
/**
 * Computes the number of bytes needed to encode a string's contents as UTF-8
 * without materializing the encoded array.
 *
 * @param coder the string's coder; {@code UTF16} input is delegated to
 *              {@code encodedLengthUTF8_UTF16}, anything else is handled
 *              here as one stored byte per char
 * @param val   the string's backing bytes
 * @return the encoded length in bytes
 * @throws OutOfMemoryError if the encoded length does not fit in an {@code int}
 */
private static int encodedLengthUTF8(byte coder, byte[] val) {
    if (coder == UTF16) {
        return encodedLengthUTF8_UTF16(val, null);
    }
    int positives = StringCoding.countPositives(val, 0, val.length);
    if (positives == val.length) {
        // Every byte is non-negative: one output byte per input byte.
        return positives;
    }
    // Accumulate in a long: a negative byte costs two output bytes, so the
    // total can exceed Integer.MAX_VALUE for very large inputs. An int
    // accumulator would silently wrap; instead report the condition the
    // same way encodedLengthUTF8_UTF16 does.
    long dp = positives;
    for (int i = positives; i < val.length; i++) {
        dp += (val[i] < 0) ? 2 : 1;
    }
    if (dp > (long)Integer.MAX_VALUE) {
        throw new OutOfMemoryError("Required length exceeds implementation limit");
    }
    return (int) dp;
}
/**
* {@return the byte array obtained by first decoding {@code val} with
* UTF-16, and then encoding the result with UTF-8}
@ -1484,11 +1533,8 @@ public final class String
int sl = val.length >> 1;
// UTF-8 encoded can be as much as 3 times the string length
// For very large estimate, (as in overflow of 32 bit int), precompute the exact size
long allocLen = (sl * 3 < 0) ? computeSizeUTF8_UTF16(val, exClass) : sl * 3;
if (allocLen > (long)Integer.MAX_VALUE) {
throw new OutOfMemoryError("Required length exceeds implementation limit");
}
byte[] dst = new byte[(int) allocLen];
int allocLen = (sl * 3 < 0) ? encodedLengthUTF8_UTF16(val, exClass) : sl * 3;
byte[] dst = new byte[allocLen];
while (sp < sl) {
// ascii fast loop;
char c = StringUTF16.getChar(val, sp);
@ -1547,11 +1593,20 @@ public final class String
* @param <E> The exception type parameter to enable callers to avoid
* having to declare the exception
*/
private static <E extends Exception> long computeSizeUTF8_UTF16(byte[] val, Class<E> exClass) throws E {
private static <E extends Exception> int encodedLengthUTF8_UTF16(byte[] val, Class<E> exClass) throws E {
long dp = 0L;
int sp = 0;
int sl = val.length >> 1;
while (sp < sl) {
// ascii fast loop;
char c = StringUTF16.getChar(val, sp);
if (c >= '\u0080') {
break;
}
dp++;
sp++;
}
while (sp < sl) {
char c = StringUTF16.getChar(val, sp++);
if (c < 0x80) {
@ -1580,7 +1635,10 @@ public final class String
dp += 3;
}
}
return dp;
if (dp > (long)Integer.MAX_VALUE) {
throw new OutOfMemoryError("Required length exceeds implementation limit");
}
return (int) dp;
}
/**
@ -2045,6 +2103,29 @@ public final class String
return encode(Charset.defaultCharset(), coder(), value);
}
/**
 * {@return the number of bytes this {@code String} occupies when encoded with the
 * given {@link Charset}}
 *
 * <p>Malformed-input and unmappable-character sequences are counted as the charset's
 * default replacement byte array, so the result is the same value as
 * {@link #getBytes(Charset) getBytes(cs).length}.
 *
 * @apiNote This method provides equivalent or better performance than {@link #getBytes(Charset)
 * getBytes(cs).length}.
 *
 * @param cs The {@link Charset} used to compute the length
 * @throws NullPointerException if {@code cs} is {@code null}
 * @since 27
 */
public int encodedLength(Charset cs) {
    Objects.requireNonNull(cs);
    // Charsets with a dedicated counting routine are measured without encoding.
    if (cs == UTF_8.INSTANCE) {
        return encodedLengthUTF8(coder, value);
    }
    if (cs == ISO_8859_1.INSTANCE || cs == US_ASCII.INSTANCE) {
        return encodedLengthASCIIor8859_1(coder, value);
    }
    // Any other charset: encode and measure the result.
    return getBytes(cs).length;
}
boolean bytesCompatible(Charset charset, int srcIndex, int numChars) {
if (isLatin1()) {
if (charset == ISO_8859_1.INSTANCE) {

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 1999, 2006, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1999, 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -22,7 +22,7 @@
*/
/* @test
* @bug 4085160 4139951 5005831
* @bug 4085160 4139951 5005831 8372353
* @summary Test that required character encodings are supported
*/
@ -106,6 +106,10 @@ public class Encodings {
if (!equals(bs, bytes))
throw new Exception(charset + ": String.getBytes failed");
/* String.encodedLength(Charset charset) */
if (bs.length != str.encodedLength(charset))
throw new Exception(charset + ": String.encodedLength failed");
// Calls to String.getBytes(Charset) shouldn't automatically
// use the cached thread-local encoder.
if (charset.name().equals("UTF-16BE")) {

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2002, 2006, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2002, 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -23,7 +23,7 @@
/**
* @test
* @bug 4472841 4703640 4705681 4705683 4833095 5005831
* @bug 4472841 4703640 4705681 4705683 4833095 5005831 8372353
* @summary Verify that constructor exceptions are thrown as expected.
*/
@ -397,6 +397,14 @@ public class Exceptions {
}});
}
// Verifies that String.encodedLength(Charset) rejects a null charset.
private static void encodedLength() {
    System.out.println("encodedLength(Charset charset)");
    Runnable nullCharsetCall = new Runnable() {
        public void run() {
            "foo".encodedLength((Charset)null);
        }
    };
    tryCatch(" null", NullPointerException.class, nullCharsetCall);
}
private static void contentEquals() {
System.out.println("contentEquals(StringBuffer sb)");
tryCatch(" null", NullPointerException.class, new Runnable() {
@ -640,6 +648,7 @@ public class Exceptions {
// getBytes(Locale)
// getBytes(String)
// getBytes(Charset)
encodedLength(); // encodedLength(Charset)
contentEquals(); // contentEquals(StringBuffer)
compareTo(); // compareTo(String), compareTo(Object)
compareToIgnoreCase();// compareToIgnoreCase(String)

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2000, 2025, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2000, 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -22,7 +22,7 @@
*/
/* @test
* @bug 6636323 6636319 7040220 7096080 7183053 8080248 8054307
* @bug 6636323 6636319 7040220 7096080 7183053 8080248 8054307 8372353
* @summary Test if StringCoding and NIO result have the same de/encoding result
* @library /test/lib
* @modules java.base/sun.nio.cs
@ -169,6 +169,12 @@ public class TestStringCoding {
if (!Arrays.equals(baSC, baNIO)) {
throw new RuntimeException("getBytes(cs) failed -> " + cs.name());
}
//encodedLength(cs);
int encodedLength = str.encodedLength(cs);
if (baSC.length != encodedLength) {
throw new RuntimeException(String.format("encodedLength failed (%d != %d) -> %s",
baSC.length, encodedLength, cs.name()));
}
return baSC;
}

View File

@ -0,0 +1,122 @@
/*
* Copyright (c) 2026, Google LLC. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.openjdk.bench.java.lang.foreign;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.TimeUnit;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.annotations.State;
/**
 * Compares three ways of obtaining the UTF-8 encoded length of a string:
 * a hand-written char loop, {@code getBytes(UTF_8).length}, and
 * {@code String.encodedLength(UTF_8)}.
 */
@Warmup(time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(time = 1, timeUnit = TimeUnit.SECONDS)
@Fork(1)
@State(Scope.Benchmark)
public class StringLoopJmhBenchmark {
    @Param({"10", "100", "1000", "100000"})
    int stringLength;

    @Param({"ASCII", "LATIN1", "UTF16"})
    String encoding;

    String stringData;

    /**
     * Builds {@code stringLength} random lowercase ASCII letters followed by
     * one trailing character selected by {@code encoding}.
     */
    @Setup
    public void setUp() {
        // Character at the _end_ to affect if we hit
        // - ASCII = compact strings and compatible with UTF-8
        // - LATIN1 = compact strings but not compatible with UTF-8
        // - UTF16 = 2-byte char storage and not compatible with UTF-8
        String c;
        if (encoding.equals("ASCII")) {
            c = "a";
        } else if (encoding.equals("LATIN1")) {
            c = "\u00C4";
        } else if (encoding.equals("UTF16")) {
            c = "\u2603";
        } else {
            throw new IllegalArgumentException("Unknown encoding: " + encoding);
        }

        var stringDataBuilder = new StringBuilder(stringLength + 1);
        while (stringDataBuilder.length() < stringLength) {
            // Add the offset BEFORE casting: "(char) x + 'a'" is an int, which
            // would select append(int) and write decimal digits, not a letter.
            stringDataBuilder.append((char) ('a' + (int) (Math.random() * 26)));
        }
        stringData = stringDataBuilder.append(c).toString();
    }

    /**
     * Hand-rolled UTF-8 length computation over the chars of the string.
     *
     * @return the number of bytes needed to encode {@code stringData} as UTF-8
     * @throws RuntimeException if the string contains an unpaired surrogate
     */
    @Benchmark
    public int utf8LenByLoop() {
        final String s = stringData;
        final int len = s.length();

        // ASCII prefix strings.
        int idx = 0;
        while (idx < len && s.charAt(idx) < 0x80) {
            idx++;
        }

        // Entire string was ASCII.
        if (idx == len) {
            return len;
        }

        // The ASCII prefix contributes exactly one byte per char; the loop
        // below adds the full encoded size of each remaining char, so the
        // running total must start at idx (not len) to avoid double counting.
        int utf8Len = idx;
        for (; idx < len; ++idx) {
            char c = s.charAt(idx);
            if (c < 0x80) {
                utf8Len++;
            } else if (c < 0x800) {
                utf8Len += 2;
            } else if (Character.isSurrogate(c)) {
                int cp = Character.codePointAt(s, idx);
                if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
                    throw new RuntimeException("Unpaired surrogate");
                }
                // A surrogate pair is one supplementary code point: 4 bytes.
                utf8Len += 4;
                idx++;
            } else {
                utf8Len += 3;
            }
        }
        return utf8Len;
    }

    /** Baseline: encode to a byte array and take its length. */
    @Benchmark
    public int getBytes() throws Exception {
        return stringData.getBytes(StandardCharsets.UTF_8).length;
    }

    /** Measures {@code String.encodedLength} for UTF-8. */
    @Benchmark
    public int encodedLength() throws Exception {
        return stringData.encodedLength(StandardCharsets.UTF_8);
    }
}