mirror of
https://github.com/openjdk/jdk.git
synced 2026-06-08 03:25:05 +00:00
8372353: API to compute the byte length of a String encoded in a given Charset
Reviewed-by: rriggs, naoto, vyazici
This commit is contained in:
parent
8b91537f10
commit
0fbf58d8ff
@ -1133,6 +1133,34 @@ public final class String
|
||||
return Arrays.copyOf(dst, dp);
|
||||
}
|
||||
|
||||
// This follows the implementation of encodeASCII and encode8859_1
|
||||
private static int encodedLengthASCIIor8859_1(byte coder, byte[] val) {
|
||||
if (coder == LATIN1) {
|
||||
return val.length;
|
||||
}
|
||||
int len = val.length >> 1;
|
||||
int dp = 0;
|
||||
int sp = 0;
|
||||
int sl = len;
|
||||
while (sp < sl) {
|
||||
char c = StringUTF16.getChar(val, sp);
|
||||
if (c >= Character.MIN_HIGH_SURROGATE) {
|
||||
break;
|
||||
}
|
||||
dp++;
|
||||
sp++;
|
||||
}
|
||||
while (sp < sl) {
|
||||
char c = StringUTF16.getChar(val, sp++);
|
||||
if (Character.isHighSurrogate(c) && sp < sl &&
|
||||
Character.isLowSurrogate(StringUTF16.getChar(val, sp))) {
|
||||
sp++;
|
||||
}
|
||||
dp++;
|
||||
}
|
||||
return dp;
|
||||
}
|
||||
|
||||
//------------------------------ utf8 ------------------------------------
|
||||
|
||||
/**
|
||||
@ -1467,6 +1495,27 @@ public final class String
|
||||
return Arrays.copyOf(dst, dp);
|
||||
}
|
||||
|
||||
// This follows the implementation of encodeUTF8
|
||||
private static int encodedLengthUTF8(byte coder, byte[] val) {
|
||||
if (coder == UTF16) {
|
||||
return encodedLengthUTF8_UTF16(val, null);
|
||||
}
|
||||
int positives = StringCoding.countPositives(val, 0, val.length);
|
||||
if (positives == val.length) {
|
||||
return positives;
|
||||
}
|
||||
int dp = positives;
|
||||
for (int i = dp; i < val.length; i++) {
|
||||
byte c = val[i];
|
||||
if (c < 0) {
|
||||
dp += 2;
|
||||
} else {
|
||||
dp++;
|
||||
}
|
||||
}
|
||||
return dp;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@return the byte array obtained by first decoding {@code val} with
|
||||
* UTF-16, and then encoding the result with UTF-8}
|
||||
@ -1484,11 +1533,8 @@ public final class String
|
||||
int sl = val.length >> 1;
|
||||
// UTF-8 encoded can be as much as 3 times the string length
|
||||
// For very large estimate, (as in overflow of 32 bit int), precompute the exact size
|
||||
long allocLen = (sl * 3 < 0) ? computeSizeUTF8_UTF16(val, exClass) : sl * 3;
|
||||
if (allocLen > (long)Integer.MAX_VALUE) {
|
||||
throw new OutOfMemoryError("Required length exceeds implementation limit");
|
||||
}
|
||||
byte[] dst = new byte[(int) allocLen];
|
||||
int allocLen = (sl * 3 < 0) ? encodedLengthUTF8_UTF16(val, exClass) : sl * 3;
|
||||
byte[] dst = new byte[allocLen];
|
||||
while (sp < sl) {
|
||||
// ascii fast loop;
|
||||
char c = StringUTF16.getChar(val, sp);
|
||||
@ -1547,11 +1593,20 @@ public final class String
|
||||
* @param <E> The exception type parameter to enable callers to avoid
|
||||
* having to declare the exception
|
||||
*/
|
||||
private static <E extends Exception> long computeSizeUTF8_UTF16(byte[] val, Class<E> exClass) throws E {
|
||||
private static <E extends Exception> int encodedLengthUTF8_UTF16(byte[] val, Class<E> exClass) throws E {
|
||||
long dp = 0L;
|
||||
int sp = 0;
|
||||
int sl = val.length >> 1;
|
||||
|
||||
while (sp < sl) {
|
||||
// ascii fast loop;
|
||||
char c = StringUTF16.getChar(val, sp);
|
||||
if (c >= '\u0080') {
|
||||
break;
|
||||
}
|
||||
dp++;
|
||||
sp++;
|
||||
}
|
||||
while (sp < sl) {
|
||||
char c = StringUTF16.getChar(val, sp++);
|
||||
if (c < 0x80) {
|
||||
@ -1580,7 +1635,10 @@ public final class String
|
||||
dp += 3;
|
||||
}
|
||||
}
|
||||
return dp;
|
||||
if (dp > (long)Integer.MAX_VALUE) {
|
||||
throw new OutOfMemoryError("Required length exceeds implementation limit");
|
||||
}
|
||||
return (int) dp;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -2045,6 +2103,29 @@ public final class String
|
||||
return encode(Charset.defaultCharset(), coder(), value);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@return the length in bytes of this {@code String} encoded with the given {@link Charset}}
|
||||
*
|
||||
* <p>The returned length accounts for the replacement of malformed-input and unmappable-character
|
||||
* sequences with the charset's default replacement byte array. The result will be the same value
|
||||
* as {@link #getBytes(Charset) getBytes(cs).length}.
|
||||
*
|
||||
* @apiNote This method provides equivalent or better performance than {@link #getBytes(Charset)
|
||||
* getBytes(cs).length}.
|
||||
*
|
||||
* @param cs The {@link Charset} used to compute the length
|
||||
* @since 27
|
||||
*/
|
||||
public int encodedLength(Charset cs) {
|
||||
Objects.requireNonNull(cs);
|
||||
if (cs == UTF_8.INSTANCE) {
|
||||
return encodedLengthUTF8(coder, value);
|
||||
} else if (cs == ISO_8859_1.INSTANCE || cs == US_ASCII.INSTANCE) {
|
||||
return encodedLengthASCIIor8859_1(coder, value);
|
||||
}
|
||||
return getBytes(cs).length;
|
||||
}
|
||||
|
||||
boolean bytesCompatible(Charset charset, int srcIndex, int numChars) {
|
||||
if (isLatin1()) {
|
||||
if (charset == ISO_8859_1.INSTANCE) {
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 1999, 2006, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 1999, 2026, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -22,7 +22,7 @@
|
||||
*/
|
||||
|
||||
/* @test
|
||||
* @bug 4085160 4139951 5005831
|
||||
* @bug 4085160 4139951 5005831 8372353
|
||||
* @summary Test that required character encodings are supported
|
||||
*/
|
||||
|
||||
@ -106,6 +106,10 @@ public class Encodings {
|
||||
if (!equals(bs, bytes))
|
||||
throw new Exception(charset + ": String.getBytes failed");
|
||||
|
||||
/* String.encodedLength(Charset charset) */
|
||||
if (bs.length != str.encodedLength(charset))
|
||||
throw new Exception(charset + ": String.encodedLength failed");
|
||||
|
||||
// Calls to String.getBytes(Charset) shouldn't automatically
|
||||
// use the cached thread-local encoder.
|
||||
if (charset.name().equals("UTF-16BE")) {
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2002, 2006, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2002, 2026, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -23,7 +23,7 @@
|
||||
|
||||
/**
|
||||
* @test
|
||||
* @bug 4472841 4703640 4705681 4705683 4833095 5005831
|
||||
* @bug 4472841 4703640 4705681 4705683 4833095 5005831 8372353
|
||||
* @summary Verify that constructor exceptions are thrown as expected.
|
||||
*/
|
||||
|
||||
@ -397,6 +397,14 @@ public class Exceptions {
|
||||
}});
|
||||
}
|
||||
|
||||
private static void encodedLength() {
|
||||
System.out.println("encodedLength(Charset charset)");
|
||||
tryCatch(" null", NullPointerException.class, new Runnable() {
|
||||
public void run() {
|
||||
"foo".encodedLength((Charset)null);
|
||||
}});
|
||||
}
|
||||
|
||||
private static void contentEquals() {
|
||||
System.out.println("contentEquals(StringBuffer sb)");
|
||||
tryCatch(" null", NullPointerException.class, new Runnable() {
|
||||
@ -640,6 +648,7 @@ public class Exceptions {
|
||||
// getBytes(Locale)
|
||||
// getBytes(String)
|
||||
// getBytes(Charset)
|
||||
encodedLength(); // encodedLength(Charset)
|
||||
contentEquals(); // contentEquals(StringBuffer)
|
||||
compareTo(); // compareTo(String), compareTo(Object)
|
||||
compareToIgnoreCase();// compareToIgnoreCase(String)
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2000, 2025, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2000, 2026, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -22,7 +22,7 @@
|
||||
*/
|
||||
|
||||
/* @test
|
||||
* @bug 6636323 6636319 7040220 7096080 7183053 8080248 8054307
|
||||
* @bug 6636323 6636319 7040220 7096080 7183053 8080248 8054307 8372353
|
||||
* @summary Test if StringCoding and NIO result have the same de/encoding result
|
||||
* @library /test/lib
|
||||
* @modules java.base/sun.nio.cs
|
||||
@ -169,6 +169,12 @@ public class TestStringCoding {
|
||||
if (!Arrays.equals(baSC, baNIO)) {
|
||||
throw new RuntimeException("getBytes(cs) failed -> " + cs.name());
|
||||
}
|
||||
//encodedLength(cs);
|
||||
int encodedLength = str.encodedLength(cs);
|
||||
if (baSC.length != encodedLength) {
|
||||
throw new RuntimeException(String.format("encodedLength failed (%d != %d) -> %s",
|
||||
baSC.length, encodedLength, cs.name()));
|
||||
}
|
||||
return baSC;
|
||||
}
|
||||
|
||||
|
||||
@ -0,0 +1,122 @@
|
||||
/*
|
||||
* Copyright (c) 2026, Google LLC. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
package org.openjdk.bench.java.lang.foreign;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import org.openjdk.jmh.annotations.Benchmark;
|
||||
import org.openjdk.jmh.annotations.BenchmarkMode;
|
||||
import org.openjdk.jmh.annotations.Fork;
|
||||
import org.openjdk.jmh.annotations.Measurement;
|
||||
import org.openjdk.jmh.annotations.Mode;
|
||||
import org.openjdk.jmh.annotations.OutputTimeUnit;
|
||||
import org.openjdk.jmh.annotations.Param;
|
||||
import org.openjdk.jmh.annotations.Scope;
|
||||
import org.openjdk.jmh.annotations.Setup;
|
||||
import org.openjdk.jmh.annotations.Warmup;
|
||||
import org.openjdk.jmh.annotations.State;
|
||||
|
||||
@Warmup(time = 1, timeUnit = TimeUnit.SECONDS)
|
||||
@Measurement(time = 1, timeUnit = TimeUnit.SECONDS)
|
||||
@Fork(1)
|
||||
@State(Scope.Benchmark)
|
||||
public class StringLoopJmhBenchmark {
|
||||
@Param({"10", "100", "1000", "100000"})
|
||||
int stringLength;
|
||||
|
||||
@Param({"ASCII", "LATIN1", "UTF16"})
|
||||
String encoding;
|
||||
|
||||
String stringData;
|
||||
|
||||
@Setup
|
||||
public void setUp() {
|
||||
stringData = "";
|
||||
|
||||
// Character at the _end_ to affect if we hit
|
||||
// - ASCII = compact strings and compatible with UTF-8
|
||||
// - LATIN1 = compact strings but not compatible with UTF-8
|
||||
// - UTF16 = 2-byte char storage and not compatible with UTF-8
|
||||
String c;
|
||||
if (encoding.equals("ASCII")) {
|
||||
c = "a";
|
||||
} else if (encoding.equals("LATIN1")) {
|
||||
c = "\u00C4";
|
||||
} else if (encoding.equals("UTF16")) {
|
||||
c = "\u2603";
|
||||
} else {
|
||||
throw new IllegalArgumentException("Unknown encoding: " + encoding);
|
||||
}
|
||||
|
||||
var stringDataBuilder = new StringBuilder(stringLength + 1);
|
||||
while (stringDataBuilder.length() < stringLength) {
|
||||
stringDataBuilder.append((char) (Math.random() * 26) + 'a');
|
||||
}
|
||||
stringData = stringDataBuilder.append(c).toString();
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public int utf8LenByLoop() {
|
||||
final String s = stringData;
|
||||
final int len = s.length();
|
||||
|
||||
// ASCII prefix strings.
|
||||
int idx = 0;
|
||||
for (char c; idx < len && (c = s.charAt(idx)) < 0x80; ++idx) {}
|
||||
|
||||
// Entire string was ASCII.
|
||||
if (idx == len) {
|
||||
return len;
|
||||
}
|
||||
|
||||
int utf8Len = len;
|
||||
for (char c; idx < len; ++idx) {
|
||||
c = s.charAt(idx);
|
||||
if (c < 0x80) {
|
||||
utf8Len++;
|
||||
} else if (c < 0x800) {
|
||||
utf8Len += 2;
|
||||
} else {
|
||||
utf8Len += 3;
|
||||
if (Character.isSurrogate(c)) {
|
||||
int cp = Character.codePointAt(s, idx);
|
||||
if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
|
||||
throw new RuntimeException("Unpaired surrogate");
|
||||
}
|
||||
idx++;
|
||||
}
|
||||
}
|
||||
}
|
||||
return utf8Len;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public int getBytes() throws Exception {
|
||||
return stringData.getBytes(StandardCharsets.UTF_8).length;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public int encodedLength() throws Exception {
|
||||
return stringData.encodedLength(StandardCharsets.UTF_8);
|
||||
}
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user