8372353: API to compute the byte length of a String encoded in a given Charset

Reviewed-by: rriggs, naoto, vyazici
2026-07-25 18:33:08 +00:00 · 2026-03-04 17:33:32 +00:00 · 2026-03-04 17:33:32 +00:00 · 0fbf58d8ff
commit 0fbf58d8ff
parent 8b91537f10
5 changed files with 235 additions and 13 deletions
--- a/src/java.base/share/classes/java/lang/String.java
+++ b/src/java.base/share/classes/java/lang/String.java
@ -1133,6 +1133,34 @@ public final class String
        return Arrays.copyOf(dst, dp);
    }

+    /**
+     * Computes the number of bytes the string content in {@code val} takes when
+     * encoded as US-ASCII or ISO-8859-1, without producing the bytes.
+     * This follows the implementation of encodeASCII and encode8859_1.
+     *
+     * @param coder the coder of {@code val}; anything other than {@code LATIN1}
+     *              is read as two-byte chars via {@code StringUTF16.getChar}
+     * @param val   the string's backing bytes
+     * @return {@code val.length} for a LATIN1 value; otherwise one byte per char,
+     *         with a high surrogate immediately followed by a low surrogate
+     *         counted as a single byte for the pair
+     */
+    private static int encodedLengthASCIIor8859_1(byte coder, byte[] val) {
+        if (coder == LATIN1) {
+            return val.length;
+        }
+        final int limit = val.length >> 1;
+        int index = 0;
+        // Leading run of chars below the surrogate range: one byte each.
+        while (index < limit
+                && StringUTF16.getChar(val, index) < Character.MIN_HIGH_SURROGATE) {
+            index++;
+        }
+        int count = index;
+        // Remainder: a well-formed surrogate pair is consumed as one unit.
+        while (index < limit) {
+            char ch = StringUTF16.getChar(val, index++);
+            boolean startsPair = Character.isHighSurrogate(ch)
+                    && index < limit
+                    && Character.isLowSurrogate(StringUTF16.getChar(val, index));
+            if (startsPair) {
+                index++;
+            }
+            count++;
+        }
+        return count;
+    }
+
    //------------------------------ utf8 ------------------------------------

    /**
@ -1467,6 +1495,27 @@ public final class String
        return Arrays.copyOf(dst, dp);
    }

+    /**
+     * Computes the number of bytes the string content in {@code val} takes when
+     * encoded as UTF-8, without producing the bytes.
+     * This follows the implementation of encodeUTF8.
+     *
+     * @param coder the coder of {@code val}; {@code UTF16} delegates to
+     *              {@code encodedLengthUTF8_UTF16} (with a {@code null} exception
+     *              class), any other value takes the single-byte path below
+     * @param val   the string's backing bytes
+     * @return the encoded length in bytes
+     * @throws OutOfMemoryError if the encoded length exceeds {@code Integer.MAX_VALUE}
+     */
+    private static int encodedLengthUTF8(byte coder, byte[] val) {
+        if (coder == UTF16) {
+            return encodedLengthUTF8_UTF16(val, null);
+        }
+        int positives = StringCoding.countPositives(val, 0, val.length);
+        if (positives == val.length) {
+            return positives;
+        }
+        // A negative byte costs two UTF-8 bytes, so the total can reach
+        // 2 * val.length. Accumulate in a long so a very large value cannot wrap
+        // around to a negative int.
+        long dp = positives;
+        for (int i = positives; i < val.length; i++) {
+            dp += (val[i] < 0) ? 2 : 1;
+        }
+        // Same limit and message as encodedLengthUTF8_UTF16.
+        if (dp > (long)Integer.MAX_VALUE) {
+            throw new OutOfMemoryError("Required length exceeds implementation limit");
+        }
+        return (int) dp;
+    }
+
    /**
     * {@return the byte array obtained by first decoding {@code val} with
     * UTF-16, and then encoding the result with UTF-8}
@ -1484,11 +1533,8 @@ public final class String
        int sl = val.length >> 1;
        // UTF-8 encoded can be as much as 3 times the string length
        // For very large estimate, (as in overflow of 32 bit int), precompute the exact size
-        long allocLen = (sl * 3 < 0) ? computeSizeUTF8_UTF16(val, exClass) : sl * 3;
-        if (allocLen > (long)Integer.MAX_VALUE) {
-            throw new OutOfMemoryError("Required length exceeds implementation limit");
-        }
-        byte[] dst = new byte[(int) allocLen];
+        int allocLen = (sl * 3 < 0) ? encodedLengthUTF8_UTF16(val, exClass) : sl * 3;
+        byte[] dst = new byte[allocLen];
        while (sp < sl) {
            // ascii fast loop;
            char c = StringUTF16.getChar(val, sp);
@ -1547,11 +1593,20 @@ public final class String
     * @param <E> The exception type parameter to enable callers to avoid
     *           having to declare the exception
     */
-    private static <E extends Exception> long computeSizeUTF8_UTF16(byte[] val, Class<E> exClass) throws E {
+    private static <E extends Exception> int encodedLengthUTF8_UTF16(byte[] val, Class<E> exClass) throws E {
        long dp = 0L;
        int sp = 0;
        int sl = val.length >> 1;

+        while (sp < sl) {
+            // ascii fast loop;
+            char c = StringUTF16.getChar(val, sp);
+            if (c >= '\u0080') {
+                break;
+            }
+            dp++;
+            sp++;
+        }
        while (sp < sl) {
            char c = StringUTF16.getChar(val, sp++);
            if (c < 0x80) {
@ -1580,7 +1635,10 @@ public final class String
                dp += 3;
            }
        }
-        return dp;
+        if (dp > (long)Integer.MAX_VALUE) {
+            throw new OutOfMemoryError("Required length exceeds implementation limit");
+        }
+        return (int) dp;
    }

    /**
@ -2045,6 +2103,29 @@ public final class String
        return encode(Charset.defaultCharset(), coder(), value);
    }

+    /**
+     * {@return the length in bytes of this {@code String} encoded with the given {@link Charset}}
+     *
+     * <p>The returned length accounts for the replacement of malformed-input and unmappable-character
+     * sequences with the charset's default replacement byte array. The result will be the same value
+     * as {@link #getBytes(Charset) getBytes(cs).length}.
+     *
+     * @apiNote This method provides equivalent or better performance than {@link #getBytes(Charset)
+     *          getBytes(cs).length}.
+     *
+     * @param cs The {@link Charset} used to compute the length
+     * @since 27
+     */
+    public int encodedLength(Charset cs) {
+        Objects.requireNonNull(cs);
+        // Obtain the coder through coder(), as the getBytes path does, so that both
+        // paths interpret the backing array the same way.
+        if (cs == UTF_8.INSTANCE) {
+            return encodedLengthUTF8(coder(), value);
+        }
+        if (cs == ISO_8859_1.INSTANCE || cs == US_ASCII.INSTANCE) {
+            return encodedLengthASCIIor8859_1(coder(), value);
+        }
+        // No length-only path for other charsets: encode and measure the result.
+        return getBytes(cs).length;
+    }
+
    boolean bytesCompatible(Charset charset, int srcIndex, int numChars) {
        if (isLatin1()) {
            if (charset == ISO_8859_1.INSTANCE) {
--- a/test/jdk/java/lang/String/Encodings.java
+++ b/test/jdk/java/lang/String/Encodings.java
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1999, 2006, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2026, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -22,7 +22,7 @@
 */

 /* @test
- * @bug 4085160 4139951 5005831
+ * @bug 4085160 4139951 5005831 8372353
 * @summary Test that required character encodings are supported
 */

@ -106,6 +106,10 @@ public class Encodings {
        if (!equals(bs, bytes))
            throw new Exception(charset + ": String.getBytes failed");

+        /* String.encodedLength(Charset charset) */
+        if (bs.length != str.encodedLength(charset))
+            throw new Exception(charset + ": String.encodedLength failed");
+
        // Calls to String.getBytes(Charset) shouldn't automatically
        // use the cached thread-local encoder.
        if (charset.name().equals("UTF-16BE")) {
--- a/test/jdk/java/lang/String/Exceptions.java
+++ b/test/jdk/java/lang/String/Exceptions.java
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2002, 2006, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2002, 2026, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -23,7 +23,7 @@

 /**
 * @test
- * @bug 4472841 4703640 4705681 4705683 4833095 5005831
+ * @bug 4472841 4703640 4705681 4705683 4833095 5005831 8372353
 * @summary Verify that constructor exceptions are thrown as expected.
 */

@ -397,6 +397,14 @@ public class Exceptions {
                }});
    }

+    private static void encodedLength() {
+        System.out.println("encodedLength(Charset charset)");
+        // The call under test: encodedLength invoked with a null Charset.
+        Runnable nullCharsetCall = new Runnable() {
+                public void run() {
+                    "foo".encodedLength((Charset)null);
+                }};
+        tryCatch("  null", NullPointerException.class, nullCharsetCall);
+    }
+
    private static void contentEquals() {
        System.out.println("contentEquals(StringBuffer sb)");
        tryCatch("  null", NullPointerException.class, new Runnable() {
@ -640,6 +648,7 @@ public class Exceptions {
                              //   getBytes(Locale)
                              //   getBytes(String)
                              //   getBytes(Charset)
+        encodedLength();      // encodedLength(Charset)
        contentEquals();      // contentEquals(StringBuffer)
        compareTo();          // compareTo(String), compareTo(Object)
        compareToIgnoreCase();// compareToIgnoreCase(String)
--- a/test/jdk/sun/nio/cs/TestStringCoding.java
+++ b/test/jdk/sun/nio/cs/TestStringCoding.java
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000, 2025, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2026, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -22,7 +22,7 @@
 */

 /* @test
- * @bug 6636323 6636319 7040220 7096080 7183053 8080248 8054307
+ * @bug 6636323 6636319 7040220 7096080 7183053 8080248 8054307 8372353
 * @summary Test if StringCoding and NIO result have the same de/encoding result
 * @library /test/lib
 * @modules java.base/sun.nio.cs
@ -169,6 +169,12 @@ public class TestStringCoding {
        if (!Arrays.equals(baSC, baNIO)) {
            throw new RuntimeException("getBytes(cs) failed  -> " + cs.name());
        }
+        //encodedLength(cs);
+        int encodedLength = str.encodedLength(cs);
+        if (baSC.length != encodedLength) {
+            throw new RuntimeException(String.format("encodedLength failed (%d != %d) -> %s",
+                    baSC.length, encodedLength, cs.name()));
+        }
        return baSC;
    }

--- a/test/micro/org/openjdk/bench/java/lang/foreign/StringLoopJmhBenchmark.java
+++ b/test/micro/org/openjdk/bench/java/lang/foreign/StringLoopJmhBenchmark.java
@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2026, Google LLC. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package org.openjdk.bench.java.lang.foreign;
+
+import java.nio.charset.StandardCharsets;
+import java.util.concurrent.TimeUnit;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.annotations.State;
+
+/**
+ * Compares three ways of obtaining the UTF-8 encoded length of a string: a hand-written
+ * loop over the chars, {@code getBytes(UTF_8).length}, and {@code String.encodedLength}.
+ */
+@Warmup(time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(time = 1, timeUnit = TimeUnit.SECONDS)
+@Fork(1)
+@State(Scope.Benchmark)
+public class StringLoopJmhBenchmark {
+  @Param({"10", "100", "1000", "100000"})
+  int stringLength;
+
+  @Param({"ASCII", "LATIN1", "UTF16"})
+  String encoding;
+
+  String stringData;
+
+  /**
+   * Builds {@code stringData}: {@code stringLength} random lowercase ASCII letters followed
+   * by one marker character selected by {@code encoding}.
+   *
+   * @throws IllegalArgumentException if {@code encoding} is not one of the known values
+   */
+  @Setup
+  public void setUp() {
+    // Character at the _end_ to affect if we hit
+    // - ASCII = compact strings and compatible with UTF-8
+    // - LATIN1 = compact strings but not compatible with UTF-8
+    // - UTF16 = 2-byte char storage and not compatible with UTF-8
+    String c;
+    if (encoding.equals("ASCII")) {
+      c = "a";
+    } else if (encoding.equals("LATIN1")) {
+      c = "\u00C4";
+    } else if (encoding.equals("UTF16")) {
+      c = "\u2603";
+    } else {
+      throw new IllegalArgumentException("Unknown encoding: " + encoding);
+    }
+
+    var stringDataBuilder = new StringBuilder(stringLength + 1);
+    while (stringDataBuilder.length() < stringLength) {
+      // Cast the whole sum: char + char is an int, and append(int) would write the
+      // decimal digits of the code point instead of the letter itself.
+      stringDataBuilder.append((char) ('a' + (int) (Math.random() * 26)));
+    }
+    stringData = stringDataBuilder.append(c).toString();
+  }
+
+  /**
+   * Computes the UTF-8 length of {@code stringData} with an explicit loop over its chars.
+   *
+   * @return the number of UTF-8 bytes needed to encode {@code stringData}
+   * @throws RuntimeException if the string contains an unpaired surrogate
+   */
+  @Benchmark
+  public int utf8LenByLoop() {
+    final String s = stringData;
+    final int len = s.length();
+
+    // ASCII prefix strings.
+    int idx = 0;
+    while (idx < len && s.charAt(idx) < 0x80) {
+      idx++;
+    }
+
+    // Entire string was ASCII.
+    if (idx == len) {
+      return len;
+    }
+
+    // Start from one byte per char and add only the extra bytes each char needs.
+    int utf8Len = len;
+    for (; idx < len; ++idx) {
+      final char c = s.charAt(idx);
+      if (c < 0x80) {
+        continue; // one byte, already counted
+      }
+      if (c < 0x800) {
+        utf8Len += 1; // two bytes in total
+        continue;
+      }
+      utf8Len += 2; // three bytes in total
+      if (Character.isSurrogate(c)) {
+        int cp = Character.codePointAt(s, idx);
+        if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
+          throw new RuntimeException("Unpaired surrogate");
+        }
+        // A pair is four bytes: one per char from the initial count plus the two
+        // added above. Skip the low surrogate so it is not examined again.
+        idx++;
+      }
+    }
+    return utf8Len;
+  }
+
+  /** {@return the UTF-8 length of {@code stringData} obtained by encoding it to a byte array} */
+  @Benchmark
+  public int getBytes() throws Exception {
+    return stringData.getBytes(StandardCharsets.UTF_8).length;
+  }
+
+  /** {@return the UTF-8 length of {@code stringData} obtained from {@code String.encodedLength}} */
+  @Benchmark
+  public int encodedLength() throws Exception {
+    return stringData.encodedLength(StandardCharsets.UTF_8);
+  }
+}