From 00cc9be854659f4e844d4572325d371ab05aa0bf Mon Sep 17 00:00:00 2001
From: Ehsan Behrangi <ehsan.behrangi@arm.com>
Date: Fri, 5 Jun 2026 12:22:15 +0100
Subject: [PATCH] 8381560: AArch64: Optimize String.equals intrinsic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change improves the AArch64 implementation of String.equals by
introducing SIMD-based fast paths using SVE and NEON.

SVE implementation:
- Uses predicated loads and comparisons for short lengths (len <= VL)
- Uses a full predicated loop for longer inputs
- Handles the tail via an overlapped compare at (base + len - VL)

NEON implementation:
- Uses an 8-byte pre-read to simplify tail handling and eliminate
  4/2/1-byte scalar branches
- Processes 16-byte chunks using LDP pair loads
- Uses CMP/CCMP to collapse comparisons into a single branch on mismatch

These changes reduce branch pressure and improve throughput for both
short and long strings.

Correctness:
- The implementation preserves existing semantics and matches behavior
  for all lengths

Testing:
- Updated and extended intrinsic tests to cover boundary conditions
  and mismatch positions (in this patch: new size/diff_pos parameters
  in the StringEquals JMH microbenchmark)

Benchmark:
Across evaluated macrobenchmarks (DaCapo and Renaissance), most workloads
spend <0.5% of CPU time in String.equals. DaCapo biojava is a notable
exception (~8–9%). In biojava, most String.equals calls are on very short
strings (1–2 bytes), where SVE shows ~1% end-to-end improvement, while
NEON is largely neutral or shows a small regression (~1%).

Measured using JMH on AArch64 (Arm Neoverse V2 CPU).
Values are relative (%) vs baseline. Negative values indicate regressions.
Mismatch results are reported for first (DF), middle (DM),
and last (DL) difference positions.

SVE results:
Length | L1_EQ  L1_DF  L1_DM  L1_DL | U16_EQ U16_DF U16_DM U16_DL | Avg
-------+----------------------------+-----------------------------+------
0      | 19.63                      | 20.05                      | 19.84
1      | 16.59  17.81  16.57  18.34 | 16.02   0.71   0.42   1.39 | 10.98
2      | 16.44   1.32   0.30  -0.16 | 15.90  -5.17  -4.55  -1.09 |  2.87
3      | 26.58   1.60   1.43  27.07 | 30.34  -8.86  -7.06  14.08 | 10.65
7      | 41.47  -2.94  -3.37  39.82 | 24.02  -8.82  -6.27  20.48 | 13.05
8      | 19.08  -1.16  -3.50  -0.90 | 22.49  -9.75  17.50  13.13 |  7.11
9      | 20.17  -4.12  -5.17  19.03 |  9.25  -2.24  21.35   3.39 |  7.71
15     | 19.48  -3.83  -4.50  19.01 | 29.26 -10.06  11.76  17.07 |  9.77
16     | 19.04  -3.15  16.41  16.85 | 38.37 -11.12  13.18  27.70 | 14.66
17     |  8.95  -2.40   5.68   6.38 | 16.32  -1.61   7.49  11.44 |  6.53
31     | 28.87  -0.01  19.79  23.37 | 41.43  -7.57  23.85  35.89 | 20.70
32     | 32.58   3.38  12.39  26.90 | 46.01 -10.99  20.53  44.15 | 21.87
33     | 11.62 -15.20   6.04  13.27 | 32.27  -9.38  20.33  32.28 | 11.40
63     | 44.66 -11.59  37.20  42.56 | 55.41 -10.57  43.19  55.90 | 32.10
64     | 53.99  -2.19  27.04  51.79 | 59.36  -8.72  35.41  60.32 | 34.63
65     | 33.79 -14.01  23.95  29.15 | 48.91 -11.58  36.54  50.03 | 24.60
127    | 62.10  -3.79  47.51  62.79 | 58.13  -8.89  60.68  60.90 | 42.43
128    | 67.38  -2.47  38.62  67.09 | 62.83  -0.38  51.72  61.87 | 43.33
129    | 52.02  -1.42  39.17  49.20 | 55.04  -9.52  53.23  52.81 | 36.32
256    | 66.11  -1.38  56.12  64.93 | 70.67  -3.68  53.67  74.54 | 47.62

Average:
         33.03  -2.40  17.46  30.34 | 37.60  -7.27  23.84  33.49 | 20.91

NEON results:
Length | L1_EQ  L1_DF  L1_DM  L1_DL | U16_EQ U16_DF U16_DM U16_DL | Avg
-------+----------------------------+-----------------------------+------
0      |  9.22                      |  8.69                      |  8.95
1      |  3.07   3.59   1.34   5.42 |  6.36  -6.20  -6.71 -10.59 | -0.47
2      |  3.23  -4.79  -5.67  -4.09 |  8.06  -8.43  -9.89  -9.20 | -3.85
3      | 12.80  -4.16  -3.95  11.28 | 11.94 -14.50 -14.41  11.83 |  1.36
7      | 31.00  -7.21 -12.76  33.59 |  4.73 -17.67 -17.38   1.65 |  1.99
8      |  4.43  -7.20  -4.70  -6.73 |  2.71 -18.05  -3.17  -4.05 | -4.59
9      | -9.33 -19.90 -16.27  -1.80 | 16.65 -23.72   4.26   8.78 | -5.17
15     | -6.96 -16.17 -15.60  -4.01 |  7.46 -24.60  -3.19  77.82 |  1.84
16     |  2.48 -16.38  -2.56  -3.62 |  9.08 -19.29  -5.45  77.93 |  5.27
17     |  4.88 -18.85  -0.18  19.35 | 18.43 -19.80  -8.37  84.96 | 10.05
31     |  6.92 -21.13  -4.62  60.71 | 24.42 -21.81   9.48 188.59 | 30.32
32     |  7.75 -24.20  -5.29  68.23 | 25.33 -20.57   4.17 183.65 | 29.88
33     | 20.23 -20.42 -11.33  98.60 | 23.76 -24.76   5.97 188.57 | 35.08
63     | 30.25 -22.30  14.29 152.37 | 25.02 -28.37  21.43 419.68 | 76.55
64     | 28.99 -22.91   9.03 185.51 | 38.20 -22.82  19.76 446.60 | 85.29
65     | 16.13 -21.77   1.45 211.38 | 27.94 -24.79  17.50 446.80 | 84.33
127    | 33.69 -28.94  28.75 429.23 | 41.75 -24.86  37.35 832.68 |168.71
128    | 26.28 -29.03  24.13 432.87 | 43.48 -18.53  26.44 810.20 |164.48
129    | 27.73 -20.30  20.84 439.01 | 44.09 -22.35  30.09 827.38 |168.31
256    | 53.30 -20.27  26.09 841.37 | 56.66 -21.07  47.41 1604.98|323.56

Average:
         15.30 -16.97   2.26 156.24 | 22.24 -20.12   8.17 325.70 | 59.10

Observations:
- SVE improves the per-length average at every tested length, with gains
  increasing as input size grows; first-position mismatches (DF) show
  small regressions
- NEON improves equal-string performance at most lengths (Latin-1
  lengths 9 and 15 regress)
- NEON shows regressions for short mismatched inputs due to the loss
  of the scalar tbz-based early-exit sequence, which efficiently
  detects mismatches at small sizes and at early positions
- The scalar implementation relies on a branchy 4/2/1 tbz ladder,
  which is efficient for early mismatches but suboptimal for equal
  strings
- The NEON implementation replaces this with a mostly branchless
  LDP + CMP/CCMP sequence and pre-reads the last 8 bytes up front,
  improving throughput and late-mismatch detection
---
 src/hotspot/cpu/aarch64/aarch64.ad            |  44 ++++-
 .../cpu/aarch64/c2_MacroAssembler_aarch64.cpp |  37 ++++
 .../cpu/aarch64/c2_MacroAssembler_aarch64.hpp |   4 +
 .../cpu/aarch64/macroAssembler_aarch64.cpp    | 158 +++++++++++-------
 .../cpu/aarch64/macroAssembler_aarch64.hpp    |   2 +-
 .../openjdk/bench/java/lang/StringEquals.java |  50 ++++++
 6 files changed, 225 insertions(+), 70 deletions(-)

diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad
index f31514e666c..d4fcd4c9238 100644
--- a/src/hotspot/cpu/aarch64/aarch64.ad
+++ b/src/hotspot/cpu/aarch64/aarch64.ad
@@ -1,7 +1,7 @@
 //
 // Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
 // Copyright (c) 2014, 2024, Red Hat, Inc. All rights reserved.
-// Copyright 2025 Arm Limited and/or its affiliates.
+// Copyright 2025, 2026 Arm Limited and/or its affiliates.
 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 //
 // This code is free software; you can redistribute it and/or modify it
@@ -16028,18 +16028,50 @@ instruct stringU_indexof_char_sve(iRegP_R1 str1, iRegI_R2 cnt1, iRegI_R3 ch,
   ins_pipe(pipe_class_memory);
 %}
 
-instruct string_equalsL(iRegP_R1 str1, iRegP_R3 str2, iRegI_R4 cnt,
-                        iRegI_R0 result, rFlagsReg cr)
+instruct string_equalsL_sve(
+    iRegP_R1   str1,      // str1 (kill)
+    iRegP_R3   str2,      // str2 (kill)
+    iRegI_R4   cnt,       // int length (kill)
+    iRegI_R0   result,    // boolean
+    vecA ztmp1, vecA ztmp2, // SVE z registers
+    pRegGov pg, pReg pdata, // SVE predicate registers
+    rFlagsReg  cr)
 %{
-  predicate(((StrEqualsNode*)n)->encoding() == StrIntrinsicNode::LL);
+  predicate(UseSVE > 0 && ((StrEqualsNode*)n)->encoding() == StrIntrinsicNode::LL);
   match(Set result (StrEquals (Binary str1 str2) cnt));
-  effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt, KILL cr);
+  effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt, TEMP ztmp1, TEMP ztmp2, TEMP pg, TEMP pdata, KILL cr);
+
+  format %{ "String Equals $str1,$str2,$cnt -> $result" %}
+  ins_encode %{
+    // Count is in 8-bit bytes; non-Compact chars are 16 bits.
+    __ string_equals_sve($str1$$Register, $str2$$Register,
+                     $result$$Register, $cnt$$Register,
+                     $ztmp1$$FloatRegister,
+                     $ztmp2$$FloatRegister, $pg$$PRegister,
+                     $pdata$$PRegister);
+  %}
+  ins_pipe(pipe_class_memory);
+%}
+
+instruct string_equalsL(
+    iRegP_R1   str1,      // str1 (kill)
+    iRegP_R3   str2,      // str2 (kill)
+    iRegI_R4   cnt,       // int length (kill)
+    iRegI_R0   result,    // boolean
+    iRegINoSp  str1_hi,     // temp: str1 high 8B
+    iRegINoSp  str2_hi,     // temp: str2 high 8B (reused as shift amount in SMALL)
+    rFlagsReg  cr)
+%{
+  predicate(UseSVE == 0 && ((StrEqualsNode*)n)->encoding() == StrIntrinsicNode::LL);
+  match(Set result (StrEquals (Binary str1 str2) cnt));
+  effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt, TEMP str1_hi, TEMP str2_hi, KILL cr);
 
   format %{ "String Equals $str1,$str2,$cnt -> $result" %}
   ins_encode %{
     // Count is in 8-bit bytes; non-Compact chars are 16 bits.
     __ string_equals($str1$$Register, $str2$$Register,
-                     $result$$Register, $cnt$$Register);
+                     $result$$Register, $cnt$$Register,
+                     $str1_hi$$Register, $str2_hi$$Register);
   %}
   ins_pipe(pipe_class_memory);
 %}
diff --git a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp
index cb9e308197e..16096f995f2 100644
--- a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp
@@ -1124,6 +1124,43 @@ void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
   BIND(DONE);
 }
 
+void C2_MacroAssembler::string_equals_sve(Register a1, Register a2,
+                                      Register result, Register cnt1,
+                                      FloatRegister ztmp1, FloatRegister ztmp2,
+                                      PRegister pg, PRegister pdata) {
+  Label LOOP, TAIL, END;
+  Register vec_len = rscratch1;
+  Register tmp_cnt1     = rscratch2;
+  sve_cntb(vec_len);
+  // Keep original cnt1 for the len <= VL tail decision.
+  // If length(cnt1) <= VL go to the tail
+  subs(tmp_cnt1, cnt1, vec_len);
+  br(Assembler::LE, TAIL);
+  sve_ptrue(pg, B);
+  bind(LOOP);
+    sve_ld1b(ztmp1, B, pg, Address(a1));
+    sve_ld1b(ztmp2, B, pg, Address(a2));
+    add(a1, a1, vec_len);
+    add(a2, a2, vec_len);
+    sve_cmp(Assembler::NE, pdata, B, pg, ztmp1, ztmp2);
+    br(Assembler::NE, END);
+    subs(tmp_cnt1, tmp_cnt1, vec_len);
+    br(Assembler::HI, LOOP);
+  // Final overlapped full-VL compare.
+  sve_ld1b(ztmp1, B, pg, Address(a1, tmp_cnt1));
+  sve_ld1b(ztmp2, B, pg, Address(a2, tmp_cnt1));
+  sve_cmp(Assembler::NE, pdata, B, pg, ztmp1, ztmp2);
+  b(END);
+
+  bind(TAIL);
+    sve_whilelt(pg, B, zr, cnt1);
+    sve_ld1b(ztmp1, B, pg, Address(a1));
+    sve_ld1b(ztmp2, B, pg, Address(a2));
+    sve_cmp(Assembler::NE, pdata, B, pg, ztmp1, ztmp2);
+  bind(END);
+    cset(result, Assembler::EQ);
+}
+
 // Compare strings.
 void C2_MacroAssembler::string_compare(Register str1, Register str2,
     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
diff --git a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
index f96d3ffb863..6f0b5e43a19 100644
--- a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
@@ -88,6 +88,10 @@
   void fast_lock(Register object, Register box, Register t1, Register t2, Register t3);
   void fast_unlock(Register object, Register box, Register t1, Register t2, Register t3);
 
+  void string_equals_sve(Register a1, Register a2, Register result, Register cnt1,
+                           FloatRegister ztmp1, FloatRegister ztmp2,
+                           PRegister pgtmp, PRegister ptmp);
+
   void string_compare(Register str1, Register str2,
                       Register cnt1, Register cnt2, Register result,
                       Register tmp1, Register tmp2, FloatRegister vtmp1,
diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
index eb658ba4e30..9baf1356d21 100644
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 1997, 2026, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2014, 2024, Red Hat Inc. All rights reserved.
+ * Copyright 2026 Arm Limited and/or its affiliates.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -6105,22 +6106,43 @@ address MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
 }
 
 // Compare Strings
-
-// For Strings we're passed the address of the first characters in a1
-// and a2 and the length in cnt1.
-// There are two implementations.  For arrays >= 8 bytes, all
-// comparisons (including the final one, which may overlap) are
-// performed 8 bytes at a time.  For strings < 8 bytes, we compare a
-// halfword, then a short, and then a byte.
+//
+// Inputs:
+//   a1, a2  - byte addresses of the first elements
+//   cnt1    - byte length
+//
+// Invariants and memory contract:
+//   - cnt1 is the number of bytes to compare.
+//   - The 8 bytes immediately preceding a1/a2 are readable
+//     (Java object header guarantee). This allows a pre-read at
+//     (base + len - 8) even when len < 8.
+//   - No read is performed beyond (base + len - 1).
+//
+// Strategy:
+//   1) Preload the final 8-byte window at (base + len - 8).
+//      This covers the last up to 8 bytes and serves as a fast-fail check.
+//   2) For len <= 8, handle entirely in SMALL using shift/mask logic.
+//   3) For medium sizes (9..23) and post-loop remainders,
+//      TAIL15 compares head and tail windows with overlap as needed.
+//   4) For larger inputs (>= 24), MAINLOOP processes 16-byte blocks
+//      using LDP + CMP/CCMP to allow a single branch on inequality.
+//      Any remaining <16 bytes fall back to TAIL15.
+//
+// SMALL path:
+//   For lengths <= 8, the preloaded 8-byte window is shifted right so that
+//   only the len valid bytes remain (in the low-order positions) to compare.
 
 void MacroAssembler::string_equals(Register a1, Register a2,
-                                   Register result, Register cnt1)
+                                   Register result, Register cnt1,
+                                   Register a1_hi, Register a2_hi)
 {
-  Label SAME, DONE, SHORT, NEXT_WORD;
-  Register tmp1 = rscratch1;
-  Register tmp2 = rscratch2;
+  Label MAINLOOP, TAIL15, SMALL, END, DONE, SMALL2;
+  Register a1_low = rscratch1;
+  Register a2_low = rscratch2;
 
-  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
+  assert_different_registers(a1, a2, cnt1, a1_hi, a2_hi, a1_low, a2_low);
+  assert(result != a1, "result must not alias a1");
+  assert(result != a2, "result must not alias a2");
 
 #ifndef PRODUCT
   {
@@ -6130,61 +6152,71 @@ void MacroAssembler::string_equals(Register a1, Register a2,
   }
 #endif
 
-  mov(result, false);
+  subs(cnt1, cnt1, 8);
+  ldr(a1_low, Address(a1, cnt1));       // Load last 8 bytes from a1
+  ldr(a2_low, Address(a2, cnt1));       // Load last 8 bytes from a2
+  br(Assembler::LE, SMALL);
+  subs(cnt1, cnt1, 16);
+  br(Assembler::LT, TAIL15);
+  cmp(a1_low, a2_low);
+  br(Assembler::NE, END);
+  // ---- MAINLOOP: process two 8B via ldp/ccmp ----
+  bind(MAINLOOP);
+    ldp(a1_low, a1_hi, Address(post(a1,16)));       // A1: low/high 8B
+    ldp(a2_low, a2_hi, Address(post(a2,16)));       // A2: low/high 8B
+    cmp(a1_low, a2_low);
+    ccmp(a1_hi, a2_hi, /*nzcv=*/0, Assembler::EQ);
+    br(Assembler::NE, END);
+    subs(cnt1, cnt1, 16);
+    br(Assembler::HS, MAINLOOP);           // while remaining >= 16
 
-  // Check for short strings, i.e. smaller than wordSize.
-  subs(cnt1, cnt1, wordSize);
-  br(Assembler::LT, SHORT);
-  // Main 8 byte comparison loop.
-  bind(NEXT_WORD); {
-    ldr(tmp1, Address(post(a1, wordSize)));
-    ldr(tmp2, Address(post(a2, wordSize)));
-    subs(cnt1, cnt1, wordSize);
-    eor(tmp1, tmp1, tmp2);
-    cbnz(tmp1, DONE);
-  } br(GT, NEXT_WORD);
-  // Last longword.  In the case where length == 4 we compare the
-  // same longword twice, but that's still faster than another
-  // conditional branch.
-  // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
-  // length == 4.
-  ldr(tmp1, Address(a1, cnt1));
-  ldr(tmp2, Address(a2, cnt1));
-  eor(tmp2, tmp1, tmp2);
-  cbnz(tmp2, DONE);
-  b(SAME);
+  adds(zr, cnt1, 16);           // If cnt1 == -16, skip tail handling.
+  br(Assembler::EQ, END);
 
-  bind(SHORT);
-  Label TAIL03, TAIL01;
+  // ---- TAIL15: medium sizes and post-loop tail.
+  // Entered when (initial len < 24) or when MAINLOOP leaves a <16B tail.
+  // At entry, cnt1 is in [-15 .. -1] ----
+  bind(TAIL15);
+    // cnt1 := unchecked length (excluding the preloaded last 8B) - 8; if unchecked <= 8 goto SMALL2
+    adds(cnt1, cnt1, 8);
+    br(Assembler::LE, SMALL2);
+    cmp(a1_low, a2_low);
 
-  tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
-  {
-    ldrw(tmp1, Address(post(a1, 4)));
-    ldrw(tmp2, Address(post(a2, 4)));
-    eorw(tmp1, tmp1, tmp2);
-    cbnzw(tmp1, DONE);
-  }
-  bind(TAIL03);
-  tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
-  {
-    ldrh(tmp1, Address(post(a1, 2)));
-    ldrh(tmp2, Address(post(a2, 2)));
-    eorw(tmp1, tmp1, tmp2);
-    cbnzw(tmp1, DONE);
-  }
-  bind(TAIL01);
-  tbz(cnt1, 0, SAME); // 0-1 bytes left.
-    {
-    ldrb(tmp1, a1);
-    ldrb(tmp2, a2);
-    eorw(tmp1, tmp1, tmp2);
-    cbnzw(tmp1, DONE);
-  }
-  // Arrays are equal.
-  bind(SAME);
-  mov(result, true);
+    // More than 8 bytes are still unchecked, and the final 8 bytes were read earlier.
+    // One ldp pair (16 bytes) covers all remaining unchecked bytes.
+    ldp(a1_low, a1_hi, Address(a1));         // A1 high 8B
+    ldp(a2_low, a2_hi, Address(a2));         // A2 high 8B
+    ccmp(a1_hi, a2_hi, 0, Assembler::EQ);
+    ccmp(a1_low, a2_low, /*nzcv=*/0, Assembler::EQ);
+    b(END);
+  // Tail <= 16B case: compare head 8 bytes and tail 8 bytes (tail 8 bytes was preloaded).
+  bind(SMALL2);
+    ldr(a1_hi, Address(a1));
+    ldr(a2_hi, Address(a2));
+    cmp(a1_low, a2_low);
+    ccmp(a1_hi, a2_hi, /*nzcv=*/0, Assembler::EQ);
+    b(END);
+  // For lengths <= 8 we avoid 4/2/1-byte tail branches and extra loads.
+  // Compute shift = (8 - len) * 8 and right-shift the preloaded 8B window
+  // so that only the valid low-order len bytes remain for comparison.
+  //
+  // The load at (base + len - 8) produces an 8B window ending at the last
+  // string byte. When len < 8, the leading bytes in this window are
+  // outside the logical string. On little-endian AArch64, lower-address
+  // bytes occupy the least significant bits of the 64-bit word, so a
+  // logical right shift cleanly discards those unused prefix bytes.
+  //
+  // a2_hi is reused as a temporary register holding the shift amount.
+  bind(SMALL);
+    neg(a2_hi, cnt1, LSL, 3);
+    lsrv(a1_low, a1_low, a2_hi);
+    lsrv(a2_low, a2_low, a2_hi);
+    adds(zr, cnt1, 8);         // Prepare flags for length==0 handling
+    ccmp(a1_low, a2_low, /*nzcv=*/4, Assembler::NE);
+
+  bind(END);
+    cset(result, Assembler::EQ);
 
-  // That's it.
   bind(DONE);
   BLOCK_COMMENT("} string_equals");
 }
diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
index b1050b45731..c39b129a43d 100644
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
@@ -1519,7 +1519,7 @@ public:
            vpowm  == v13, "registers must match aarch64.ad"); \
   } while (0)
 
-  void string_equals(Register a1, Register a2, Register result, Register cnt1);
+  void string_equals(Register a1, Register a2, Register result, Register cnt1, Register a1_hi, Register a2_hi);
 
   void fill_words(Register base, Register cnt, Register value);
   address zero_words(Register base, uint64_t cnt);
diff --git a/test/micro/org/openjdk/bench/java/lang/StringEquals.java b/test/micro/org/openjdk/bench/java/lang/StringEquals.java
index b0db6a7037e..b3fa02569e1 100644
--- a/test/micro/org/openjdk/bench/java/lang/StringEquals.java
+++ b/test/micro/org/openjdk/bench/java/lang/StringEquals.java
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2026 Arm Limited and/or its affiliates.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -43,6 +44,45 @@ public class StringEquals {
     public String test5 = new String(test4); // equal to test4, but not same
     public String test6 = new String("0123456780");
     public String test7 = new String("0123\u01FE");
+    // string with parameterizable size
+    public String test8;
+    // same chars as test8, but different object; forces the intrinsic to read
+    // the entire string to check equality
+    public String test9;
+    // same chars as test8, except at length + diff_pos; set diff_pos to the
+    // worst case for the intrinsic being tested (usually -1, but could be -9
+    // if the intrinsic reads the last 8B first, or -length if the intrinsic
+    // reads the string backwards)
+    public String test10;
+
+    @Param({"30"})  // can be used at runtime to define a length sweep
+    public int size;
+
+    @Param({"-1"})  // set to the worst location for the intrinsic under test
+    public int diff_pos;
+
+    @Setup
+    public void setup() {
+        if(size > 0) {
+            test8 = "a".repeat(size);
+            // NOTE 1: can't do test9 = new String(test8) or they'll share byte
+            // arrays, which improves cache hit rate of the equal-string case
+            test9 = "a".repeat(size);
+            StringBuilder sb = new StringBuilder("a".repeat(size));
+            sb.setCharAt(Math.max(test8.length() + diff_pos, 0), 'b');
+            test10 = sb.toString();
+        }
+        else {
+            // NOTE 2: can't use "a".repeat(0) or it returns the "" literal,
+            // which will early-exit from String.equals()
+            // NOTE 3: can't use no-arg String ctor or they'll share the byte
+            // array of the "" literal, which improves cache hit rate for
+            // intrinsics that read backwards into the object header
+            test8 = new String(new char [] {});
+            test9 = new String(new char [] {});
+            test10 = new String(new char [] {});
+        }
+    }
 
     @Benchmark
     public boolean different() {
@@ -54,6 +94,16 @@ public class StringEquals {
         return test.equals(test3);
     }
 
+    @Benchmark
+    public boolean differentParam() {
+        return test8.equals(test10);
+    }
+
+    @Benchmark
+    public boolean equalParam() {
+        return test8.equals(test9);
+    }
+
     @Benchmark
     public boolean almostEqual() {
         return test.equals(test6);