diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java index fc05febdb45..c6c08ed4473 100644 --- a/src/java.base/share/classes/java/lang/String.java +++ b/src/java.base/share/classes/java/lang/String.java @@ -1133,6 +1133,34 @@ public final class String return Arrays.copyOf(dst, dp); } + // This follows the implementation of encodeASCII and encode8859_1 + private static int encodedLengthASCIIor8859_1(byte coder, byte[] val) { + if (coder == LATIN1) { + return val.length; + } + int len = val.length >> 1; + int dp = 0; + int sp = 0; + int sl = len; + while (sp < sl) { + char c = StringUTF16.getChar(val, sp); + if (c >= Character.MIN_HIGH_SURROGATE) { + break; + } + dp++; + sp++; + } + while (sp < sl) { + char c = StringUTF16.getChar(val, sp++); + if (Character.isHighSurrogate(c) && sp < sl && + Character.isLowSurrogate(StringUTF16.getChar(val, sp))) { + sp++; + } + dp++; + } + return dp; + } + //------------------------------ utf8 ------------------------------------ /** @@ -1467,6 +1495,27 @@ public final class String return Arrays.copyOf(dst, dp); } + // This follows the implementation of encodeUTF8 + private static int encodedLengthUTF8(byte coder, byte[] val) { + if (coder == UTF16) { + return encodedLengthUTF8_UTF16(val, null); + } + int positives = StringCoding.countPositives(val, 0, val.length); + if (positives == val.length) { + return positives; + } + int dp = positives; + for (int i = dp; i < val.length; i++) { + byte c = val[i]; + if (c < 0) { + dp += 2; + } else { + dp++; + } + } + return dp; + } + /** * {@return the byte array obtained by first decoding {@code val} with * UTF-16, and then encoding the result with UTF-8} @@ -1484,11 +1533,8 @@ public final class String int sl = val.length >> 1; // UTF-8 encoded can be as much as 3 times the string length // For very large estimate, (as in overflow of 32 bit int), precompute the exact size - long allocLen = (sl * 3 < 0) ? 
computeSizeUTF8_UTF16(val, exClass) : sl * 3; - if (allocLen > (long)Integer.MAX_VALUE) { - throw new OutOfMemoryError("Required length exceeds implementation limit"); - } - byte[] dst = new byte[(int) allocLen]; + int allocLen = (sl * 3 < 0) ? encodedLengthUTF8_UTF16(val, exClass) : sl * 3; + byte[] dst = new byte[allocLen]; while (sp < sl) { // ascii fast loop; char c = StringUTF16.getChar(val, sp); @@ -1547,11 +1593,20 @@ public final class String * @param <E> The exception type parameter to enable callers to avoid * having to declare the exception */ - private static <E extends Exception> long computeSizeUTF8_UTF16(byte[] val, Class<E> exClass) throws E { + private static <E extends Exception> int encodedLengthUTF8_UTF16(byte[] val, Class<E> exClass) throws E { long dp = 0L; int sp = 0; int sl = val.length >> 1; + while (sp < sl) { + // ascii fast loop; + char c = StringUTF16.getChar(val, sp); + if (c >= '\u0080') { + break; + } + dp++; + sp++; + } while (sp < sl) { char c = StringUTF16.getChar(val, sp++); if (c < 0x80) { @@ -1580,7 +1635,10 @@ public final class String dp += 3; } } - return dp; + if (dp > (long)Integer.MAX_VALUE) { + throw new OutOfMemoryError("Required length exceeds implementation limit"); + } + return (int) dp; } /** @@ -2045,6 +2103,29 @@ public final class String return encode(Charset.defaultCharset(), coder(), value); } + /** + * {@return the length in bytes of this {@code String} encoded with the given {@link Charset}} + * + *

<p>The returned length accounts for the replacement of malformed-input and unmappable-character + * sequences with the charset's default replacement byte array. The result will be the same value + * as {@link #getBytes(Charset) getBytes(cs).length}. + * + * @apiNote This method provides equivalent or better performance than {@link #getBytes(Charset) + * getBytes(cs).length}. + * + * @param cs The {@link Charset} used to compute the length + * @since 27 + */ + public int encodedLength(Charset cs) { + Objects.requireNonNull(cs); + if (cs == UTF_8.INSTANCE) { + return encodedLengthUTF8(coder, value); + } else if (cs == ISO_8859_1.INSTANCE || cs == US_ASCII.INSTANCE) { + return encodedLengthASCIIor8859_1(coder, value); + } + return getBytes(cs).length; + } + boolean bytesCompatible(Charset charset, int srcIndex, int numChars) { if (isLatin1()) { if (charset == ISO_8859_1.INSTANCE) { diff --git a/test/jdk/java/lang/String/Encodings.java b/test/jdk/java/lang/String/Encodings.java index 4714815026e..7974157ede0 100644 --- a/test/jdk/java/lang/String/Encodings.java +++ b/test/jdk/java/lang/String/Encodings.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999, 2006, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* * This code is free software; you can redistribute it and/or modify it @@ -22,7 +22,7 @@ */ /* @test - * @bug 4085160 4139951 5005831 + * @bug 4085160 4139951 5005831 8372353 * @summary Test that required character encodings are supported */ @@ -106,6 +106,10 @@ public class Encodings { if (!equals(bs, bytes)) throw new Exception(charset + ": String.getBytes failed"); + /* String.encodedLength(Charset charset) */ + if (bs.length != str.encodedLength(charset)) + throw new Exception(charset + ": String.encodedLength failed"); + // Calls to String.getBytes(Charset) shouldn't automatically // use the cached thread-local encoder. if (charset.name().equals("UTF-16BE")) { diff --git a/test/jdk/java/lang/String/Exceptions.java b/test/jdk/java/lang/String/Exceptions.java index 3ba7792f424..15ffe3eac20 100644 --- a/test/jdk/java/lang/String/Exceptions.java +++ b/test/jdk/java/lang/String/Exceptions.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002, 2006, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2002, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -23,7 +23,7 @@ /** * @test - * @bug 4472841 4703640 4705681 4705683 4833095 5005831 + * @bug 4472841 4703640 4705681 4705683 4833095 5005831 8372353 * @summary Verify that constructor exceptions are thrown as expected. 
*/ @@ -397,6 +397,14 @@ public class Exceptions { }}); } + private static void encodedLength() { + System.out.println("encodedLength(Charset charset)"); + tryCatch(" null", NullPointerException.class, new Runnable() { + public void run() { + "foo".encodedLength((Charset)null); + }}); + } + private static void contentEquals() { System.out.println("contentEquals(StringBuffer sb)"); tryCatch(" null", NullPointerException.class, new Runnable() { @@ -640,6 +648,7 @@ public class Exceptions { // getBytes(Locale) // getBytes(String) // getBytes(Charset) + encodedLength(); // encodedLength(Charset) contentEquals(); // contentEquals(StringBuffer) compareTo(); // compareTo(String), compareTo(Object) compareToIgnoreCase();// compareToIgnoreCase(String) diff --git a/test/jdk/sun/nio/cs/TestStringCoding.java b/test/jdk/sun/nio/cs/TestStringCoding.java index d708ef180a2..b81ffb07d20 100644 --- a/test/jdk/sun/nio/cs/TestStringCoding.java +++ b/test/jdk/sun/nio/cs/TestStringCoding.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000, 2025, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2000, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -22,7 +22,7 @@ */ /* @test - * @bug 6636323 6636319 7040220 7096080 7183053 8080248 8054307 + * @bug 6636323 6636319 7040220 7096080 7183053 8080248 8054307 8372353 * @summary Test if StringCoding and NIO result have the same de/encoding result * @library /test/lib * @modules java.base/sun.nio.cs @@ -169,6 +169,12 @@ public class TestStringCoding { if (!Arrays.equals(baSC, baNIO)) { throw new RuntimeException("getBytes(cs) failed -> " + cs.name()); } + //encodedLength(cs); + int encodedLength = str.encodedLength(cs); + if (baSC.length != encodedLength) { + throw new RuntimeException(String.format("encodedLength failed (%d != %d) -> %s", + baSC.length, encodedLength, cs.name())); + } return baSC; } diff --git a/test/micro/org/openjdk/bench/java/lang/foreign/StringLoopJmhBenchmark.java b/test/micro/org/openjdk/bench/java/lang/foreign/StringLoopJmhBenchmark.java new file mode 100644 index 00000000000..1733b73886e --- /dev/null +++ b/test/micro/org/openjdk/bench/java/lang/foreign/StringLoopJmhBenchmark.java @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2026, Google LLC. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +package org.openjdk.bench.java.lang.foreign; + +import java.nio.charset.StandardCharsets; +import java.util.concurrent.TimeUnit; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.annotations.State; + +@Warmup(time = 1, timeUnit = TimeUnit.SECONDS) +@Measurement(time = 1, timeUnit = TimeUnit.SECONDS) +@Fork(1) +@State(Scope.Benchmark) +public class StringLoopJmhBenchmark { + @Param({"10", "100", "1000", "100000"}) + int stringLength; + + @Param({"ASCII", "LATIN1", "UTF16"}) + String encoding; + + String stringData; + + @Setup + public void setUp() { + stringData = ""; + + // Character at the _end_ to affect if we hit + // - ASCII = compact strings and compatible with UTF-8 + // - LATIN1 = compact strings but not compatible with UTF-8 + // - UTF16 = 2-byte char storage and not compatible with UTF-8 + String c; + if (encoding.equals("ASCII")) { + c = "a"; + } else if (encoding.equals("LATIN1")) { + c = "\u00C4"; + } else if (encoding.equals("UTF16")) { + c = "\u2603"; + } else { + throw new IllegalArgumentException("Unknown encoding: " + encoding); + } + + var stringDataBuilder = new StringBuilder(stringLength + 1); + while (stringDataBuilder.length() < stringLength) { + stringDataBuilder.append((char) (Math.random() * 26) + 'a'); + } + stringData = stringDataBuilder.append(c).toString(); + } + + @Benchmark + public int utf8LenByLoop() { + final String s = stringData; + 
final int len = s.length(); + + // ASCII prefix strings. + int idx = 0; + for (char c; idx < len && (c = s.charAt(idx)) < 0x80; ++idx) {} + + // Entire string was ASCII. + if (idx == len) { + return len; + } + + int utf8Len = len; + for (char c; idx < len; ++idx) { + c = s.charAt(idx); + if (c < 0x80) { + utf8Len++; + } else if (c < 0x800) { + utf8Len += 2; + } else { + utf8Len += 3; + if (Character.isSurrogate(c)) { + int cp = Character.codePointAt(s, idx); + if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) { + throw new RuntimeException("Unpaired surrogate"); + } + idx++; + } + } + } + return utf8Len; + } + + @Benchmark + public int getBytes() throws Exception { + return stringData.getBytes(StandardCharsets.UTF_8).length; + } + + @Benchmark + public int encodedLength() throws Exception { + return stringData.encodedLength(StandardCharsets.UTF_8); + } +}