AArch64: rework the C2 String.equals (Latin-1) intrinsic

Purpose of this patch:
  * aarch64.ad: split the StrEquals (LL) match rule into string_equalsL_sve
    (UseSVE > 0) and string_equalsL (UseSVE == 0, two extra integer temps).
  * C2_MacroAssembler::string_equals_sve: new SVE byte-compare loop with an
    overlapping final full-vector compare and a predicated path for short input.
  * MacroAssembler::string_equals: rewritten around a preloaded final 8-byte
    window, 16-byte ldp/cmp/ccmp blocks and a shift-based path for len <= 8.
  * StringEquals.java: parameterized equal/different benchmarks.

Review amendments folded into this revision (hunk counts updated to match):
  * string_equals_sve: header parameter names now match the definition,
    assert_different_registers added for the scratch registers, comments
    describe the loop invariant.
  * string_equals: TAIL15/SMALL2/SMALL comments corrected and extended.
  * StringEquals.java: strings are built from fresh char[]s ("a".repeat(1)
    returns the receiver, so test8 and test9 were the same object for size 1);
    the mismatch index is clamped so diff_pos >= 0 no longer throws in @Setup.
  * Whitespace of context lines was reconstructed; the "index" blob hashes are
    kept as submitted and no longer describe the amended content.

diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad
index f31514e666c..d4fcd4c9238 100644
--- a/src/hotspot/cpu/aarch64/aarch64.ad
+++ b/src/hotspot/cpu/aarch64/aarch64.ad
@@ -1,7 +1,7 @@
 //
 // Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
 // Copyright (c) 2014, 2024, Red Hat, Inc. All rights reserved.
-// Copyright 2025 Arm Limited and/or its affiliates.
+// Copyright 2025, 2026 Arm Limited and/or its affiliates.
 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 //
 // This code is free software; you can redistribute it and/or modify it
@@ -16028,18 +16028,50 @@ instruct stringU_indexof_char_sve(iRegP_R1 str1, iRegI_R2 cnt1, iRegI_R3 ch,
   ins_pipe(pipe_class_memory);
 %}
 
-instruct string_equalsL(iRegP_R1 str1, iRegP_R3 str2, iRegI_R4 cnt,
-                        iRegI_R0 result, rFlagsReg cr)
+instruct string_equalsL_sve(
+    iRegP_R1 str1,            // str1 (kill)
+    iRegP_R3 str2,            // str2 (kill)
+    iRegI_R4 cnt,             // int length (kill)
+    iRegI_R0 result,          // boolean
+    vecA ztmp1, vecA ztmp2,   // SVE z registers
+    pRegGov pg, pReg pdata,   // SVE predicate registers
+    rFlagsReg cr)
 %{
-  predicate(((StrEqualsNode*)n)->encoding() == StrIntrinsicNode::LL);
+  predicate(UseSVE > 0 && ((StrEqualsNode*)n)->encoding() == StrIntrinsicNode::LL);
   match(Set result (StrEquals (Binary str1 str2) cnt));
-  effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt, KILL cr);
+  effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt, TEMP ztmp1, TEMP ztmp2, TEMP pg, TEMP pdata, KILL cr);
+
+  format %{ "String Equals $str1,$str2,$cnt -> $result" %}
+  ins_encode %{
+    // Count is in 8-bit bytes; non-Compact chars are 16 bits.
+    __ string_equals_sve($str1$$Register, $str2$$Register,
+                         $result$$Register, $cnt$$Register,
+                         $ztmp1$$FloatRegister,
+                         $ztmp2$$FloatRegister, $pg$$PRegister,
+                         $pdata$$PRegister);
+  %}
+  ins_pipe(pipe_class_memory);
+%}
+
+instruct string_equalsL(
+    iRegP_R1 str1,        // str1 (kill)
+    iRegP_R3 str2,        // str2 (kill)
+    iRegI_R4 cnt,         // int length (kill)
+    iRegI_R0 result,      // boolean
+    iRegINoSp str1_hi,    // temp: str1 high 8B
+    iRegINoSp str2_hi,    // temp: str2 high 8B (reused as shift amount in SMALL)
+    rFlagsReg cr)
+%{
+  predicate(UseSVE == 0 && ((StrEqualsNode*)n)->encoding() == StrIntrinsicNode::LL);
+  match(Set result (StrEquals (Binary str1 str2) cnt));
+  effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt, TEMP str1_hi, TEMP str2_hi, KILL cr);
 
   format %{ "String Equals $str1,$str2,$cnt -> $result" %}
   ins_encode %{
     // Count is in 8-bit bytes; non-Compact chars are 16 bits.
     __ string_equals($str1$$Register, $str2$$Register,
-                      $result$$Register, $cnt$$Register);
+                     $result$$Register, $cnt$$Register,
+                     $str1_hi$$Register, $str2_hi$$Register);
   %}
   ins_pipe(pipe_class_memory);
 %}
diff --git a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp
index cb9e308197e..16096f995f2 100644
--- a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp
@@ -1124,6 +1124,57 @@ void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
   BIND(DONE);
 }
 
+// Compare cnt1 bytes at a1 and a2 for equality using SVE.
+//
+//   a1, a2       - addresses of the first bytes; advanced by the main loop
+//   cnt1         - length in bytes; read only
+//   result       - set to 1 if all bytes match, 0 otherwise
+//   ztmp1, ztmp2 - vector temporaries holding the loaded data
+//   pg, pdata    - governing predicate and compare-result predicate
+//
+// rscratch1 (vector length in bytes) and rscratch2 (running count) are clobbered.
+void C2_MacroAssembler::string_equals_sve(Register a1, Register a2,
+                                          Register result, Register cnt1,
+                                          FloatRegister ztmp1, FloatRegister ztmp2,
+                                          PRegister pg, PRegister pdata) {
+  Label LOOP, TAIL, END;
+  Register vec_len = rscratch1;
+  Register tmp_cnt1 = rscratch2;
+  assert_different_registers(a1, a2, cnt1, vec_len, tmp_cnt1);
+
+  sve_cntb(vec_len);
+  // tmp_cnt1 := cnt1 - VL. cnt1 itself is left untouched because the TAIL path
+  // builds its predicate from the original length.
+  subs(tmp_cnt1, cnt1, vec_len);
+  br(Assembler::LE, TAIL);   // cnt1 <= VL: one predicated compare is enough
+  sve_ptrue(pg, B);
+  // Invariant at LOOP: tmp_cnt1 == (bytes left starting at a1) - VL > 0.
+  bind(LOOP);
+  sve_ld1b(ztmp1, B, pg, Address(a1));
+  sve_ld1b(ztmp2, B, pg, Address(a2));
+  add(a1, a1, vec_len);
+  add(a2, a2, vec_len);
+  sve_cmp(Assembler::NE, pdata, B, pg, ztmp1, ztmp2);
+  br(Assembler::NE, END);    // some byte differs
+  subs(tmp_cnt1, tmp_cnt1, vec_len);
+  br(Assembler::HI, LOOP);   // more than VL bytes still left
+  // Final full-VL compare of the last VL bytes. tmp_cnt1 <= 0 here, so the
+  // window ends at the last byte and may overlap bytes already checked.
+  sve_ld1b(ztmp1, B, pg, Address(a1, tmp_cnt1));
+  sve_ld1b(ztmp2, B, pg, Address(a2, tmp_cnt1));
+  sve_cmp(Assembler::NE, pdata, B, pg, ztmp1, ztmp2);
+  b(END);
+
+  bind(TAIL);
+  sve_whilelt(pg, B, zr, cnt1);  // activate only the first cnt1 byte lanes
+  sve_ld1b(ztmp1, B, pg, Address(a1));
+  sve_ld1b(ztmp2, B, pg, Address(a2));
+  sve_cmp(Assembler::NE, pdata, B, pg, ztmp1, ztmp2);
+  bind(END);
+  // The compare leaves EQ set when no active lane differs.
+  cset(result, Assembler::EQ);
+}
+
 // Compare strings.
 void C2_MacroAssembler::string_compare(Register str1, Register str2,
     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
diff --git a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
index f96d3ffb863..6f0b5e43a19 100644
--- a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
@@ -88,6 +88,10 @@
   void fast_lock(Register object, Register box, Register t1, Register t2, Register t3);
   void fast_unlock(Register object, Register box, Register t1, Register t2, Register t3);
 
+  void string_equals_sve(Register a1, Register a2, Register result, Register cnt1,
+                         FloatRegister ztmp1, FloatRegister ztmp2,
+                         PRegister pg, PRegister pdata);
+
   void string_compare(Register str1, Register str2,
                       Register cnt1, Register cnt2, Register result,
                       Register tmp1, Register tmp2, FloatRegister vtmp1,
diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
index eb658ba4e30..9baf1356d21 100644
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 1997, 2026, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2014, 2024, Red Hat Inc. All rights reserved.
+ * Copyright 2026 Arm Limited and/or its affiliates.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -6105,22 +6106,43 @@ address MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
 }
 
 // Compare Strings
-
-// For Strings we're passed the address of the first characters in a1
-// and a2 and the length in cnt1.
-// There are two implementations.  For arrays >= 8 bytes, all
-// comparisons (including the final one, which may overlap) are
-// performed 8 bytes at a time.  For strings < 8 bytes, we compare a
-// halfword, then a short, and then a byte.
+//
+// Inputs:
+//   a1, a2  - byte addresses of the first elements
+//   cnt1    - byte length
+//
+// Invariants and memory contract:
+//   - cnt1 is the number of bytes to compare.
+//   - The 8 bytes immediately preceding a1/a2 are readable
+//     (Java object header guarantee). This allows a pre-read at
+//     (base + len - 8) even when len < 8.
+//   - No read is performed beyond (base + len - 1).
+//
+// Strategy:
+//   1) Preload the final 8-byte window at (base + len - 8).
+//      This covers the last up to 8 bytes and serves as a fast-fail check.
+//   2) For len <= 8, handle entirely in SMALL using shift/mask logic.
+//   3) For medium sizes (9..23) and post-loop remainders,
+//      TAIL15 compares head and tail windows with overlap as needed.
+//   4) For larger inputs (>= 24), MAINLOOP processes 16-byte blocks
+//      using LDP + CMP/CCMP to allow a single branch on inequality.
+//      Any remaining <16 bytes fall back to TAIL15.
+//
+// SMALL path:
+//   For lengths <= 8, the preloaded 8-byte window is shifted
+//   so that only the valid low-order bytes participate in comparison.
 
 void MacroAssembler::string_equals(Register a1, Register a2,
-                                   Register result, Register cnt1)
+                                   Register result, Register cnt1,
+                                   Register a1_hi, Register a2_hi)
 {
-  Label SAME, DONE, SHORT, NEXT_WORD;
-  Register tmp1 = rscratch1;
-  Register tmp2 = rscratch2;
+  Label MAINLOOP, TAIL15, SMALL, END, DONE, SMALL2;
+  Register a1_low = rscratch1;
+  Register a2_low = rscratch2;
 
-  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
+  assert_different_registers(a1, a2, cnt1, a1_hi, a2_hi, a1_low, a2_low);
+  assert(result != a1, "result must not alias a1");
+  assert(result != a2, "result must not alias a2");
 
 #ifndef PRODUCT
   {
@@ -6130,61 +6152,80 @@ void MacroAssembler::string_equals(Register a1, Register a2,
   }
 #endif
 
-  mov(result, false);
+  // cnt1 := len - 8. The loads below do not touch the flags, so the branch
+  // to SMALL still tests the result of this subtraction.
+  subs(cnt1, cnt1, 8);
+  ldr(a1_low, Address(a1, cnt1)); // Load last 8 bytes from a1
+  ldr(a2_low, Address(a2, cnt1)); // Load last 8 bytes from a2
+  br(Assembler::LE, SMALL);       // len <= 8
+  subs(cnt1, cnt1, 16);           // cnt1 := len - 24
+  br(Assembler::LT, TAIL15);      // len in [9, 23]
+  cmp(a1_low, a2_low);            // fast-fail on the final 8-byte window
+  br(Assembler::NE, END);
+  // ---- MAINLOOP: compare 16 bytes per iteration via ldp/cmp/ccmp ----
+  bind(MAINLOOP);
+  ldp(a1_low, a1_hi, Address(post(a1,16))); // A1: low/high 8B
+  ldp(a2_low, a2_hi, Address(post(a2,16))); // A2: low/high 8B
+  cmp(a1_low, a2_low);
+  ccmp(a1_hi, a2_hi, /*nzcv=*/0, Assembler::EQ);
+  br(Assembler::NE, END);
+  subs(cnt1, cnt1, 16);
+  br(Assembler::HS, MAINLOOP); // while remaining >= 16
 
-  // Check for short strings, i.e. smaller than wordSize.
-  subs(cnt1, cnt1, wordSize);
-  br(Assembler::LT, SHORT);
-  // Main 8 byte comparison loop.
-  bind(NEXT_WORD); {
-    ldr(tmp1, Address(post(a1, wordSize)));
-    ldr(tmp2, Address(post(a2, wordSize)));
-    subs(cnt1, cnt1, wordSize);
-    eor(tmp1, tmp1, tmp2);
-    cbnz(tmp1, DONE);
-  } br(GT, NEXT_WORD);
-  // Last longword.  In the case where length == 4 we compare the
-  // same longword twice, but that's still faster than another
-  // conditional branch.
-  // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
-  // length == 4.
-  ldr(tmp1, Address(a1, cnt1));
-  ldr(tmp2, Address(a2, cnt1));
-  eor(tmp2, tmp1, tmp2);
-  cbnz(tmp2, DONE);
-  b(SAME);
+  // cnt1 == -16 means only the final 8-byte window is left, and that was
+  // already verified before MAINLOOP; the EQ flags double as the result.
+  adds(zr, cnt1, 16);
+  br(Assembler::EQ, END);
 
-  bind(SHORT);
-  Label TAIL03, TAIL01;
+  // ---- TAIL15: medium sizes and post-loop tail ----
+  // Entered directly when 9 <= len <= 23, or after MAINLOOP. At entry cnt1 is
+  // in [-15 .. -1] and rem = cnt1 + 24 bytes (9..23) remain starting at a1.
+  // On direct entry a1_low/a2_low hold the still unchecked final window. After
+  // MAINLOOP they hold the (equal) low halves of the last block and the final
+  // window has already been verified, so the cmp below is then trivially EQ.
+  bind(TAIL15);
+  // cnt1 := rem - 16; if rem <= 16 go to SMALL2
+  adds(cnt1, cnt1, 8);
+  br(Assembler::LE, SMALL2);
+  cmp(a1_low, a2_low);
 
-  tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
-  {
-    ldrw(tmp1, Address(post(a1, 4)));
-    ldrw(tmp2, Address(post(a2, 4)));
-    eorw(tmp1, tmp1, tmp2);
-    cbnzw(tmp1, DONE);
-  }
-  bind(TAIL03);
-  tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
-  {
-    ldrh(tmp1, Address(post(a1, 2)));
-    ldrh(tmp2, Address(post(a2, 2)));
-    eorw(tmp1, tmp1, tmp2);
-    cbnzw(tmp1, DONE);
-  }
-  bind(TAIL01);
-  tbz(cnt1, 0, SAME); // 0-1 bytes left.
-  {
-    ldrb(tmp1, a1);
-    ldrb(tmp2, a2);
-    eorw(tmp1, tmp1, tmp2);
-    cbnzw(tmp1, DONE);
-  }
-  // Arrays are equal.
-  bind(SAME);
-  mov(result, true);
+  // 17 <= rem <= 23: one ldp covers the first 16 bytes; the rest lies inside
+  // the final 8-byte window compared above.
+  ldp(a1_low, a1_hi, Address(a1)); // A1: first 16B of the tail
+  ldp(a2_low, a2_hi, Address(a2)); // A2: first 16B of the tail
+  ccmp(a1_hi, a2_hi, /*nzcv=*/0, Assembler::EQ);
+  ccmp(a1_low, a2_low, /*nzcv=*/0, Assembler::EQ);
+  b(END);
+  // 9 <= rem <= 16: compare the first 8 bytes and the final 8-byte window.
+  bind(SMALL2);
+  ldr(a1_hi, Address(a1));
+  ldr(a2_hi, Address(a2));
+  cmp(a1_low, a2_low);
+  ccmp(a1_hi, a2_hi, /*nzcv=*/0, Assembler::EQ);
+  b(END);
+  // For lengths <= 8 we avoid 4/2/1-byte tail branches and extra loads.
+  // Compute shift = (8 - len) * 8 and right-shift the preloaded 8B window
+  // so that only the valid low-order len bytes remain for comparison.
+  //
+  // The load at (base + len - 8) produces an 8B window ending at the last
+  // string byte. When len < 8, the leading bytes in this window are
+  // outside the logical string. On little-endian AArch64, lower-address
+  // bytes occupy the least significant bits of the 64-bit word, so a
+  // logical right shift cleanly discards those unused prefix bytes.
+  //
+  // a2_hi is reused as a temporary register holding the shift amount.
+  bind(SMALL);
+  neg(a2_hi, cnt1, LSL, 3);
+  lsrv(a1_low, a1_low, a2_hi);
+  lsrv(a2_low, a2_low, a2_hi);
+  // len == 0 gives a shift of 64, which LSRV reduces modulo 64 to 0 and so
+  // leaves the window unmasked; force EQ (nzcv = Z) in that case instead.
+  adds(zr, cnt1, 8); // Z set iff len == 0
+  ccmp(a1_low, a2_low, /*nzcv=*/4, Assembler::NE);
+
+  bind(END);
+  cset(result, Assembler::EQ);
 
-  // That's it.
   bind(DONE);
   BLOCK_COMMENT("} string_equals");
 }
diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
index b1050b45731..c39b129a43d 100644
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
@@ -1519,7 +1519,8 @@ public:
            vpowm == v13, "registers must match aarch64.ad"); \
   } while (0)
 
-  void string_equals(Register a1, Register a2, Register result, Register cnt1);
+  void string_equals(Register a1, Register a2, Register result, Register cnt1,
+                     Register a1_hi, Register a2_hi);
 
   void fill_words(Register base, Register cnt, Register value);
   address zero_words(Register base, uint64_t cnt);
diff --git a/test/micro/org/openjdk/bench/java/lang/StringEquals.java b/test/micro/org/openjdk/bench/java/lang/StringEquals.java
index b0db6a7037e..b3fa02569e1 100644
--- a/test/micro/org/openjdk/bench/java/lang/StringEquals.java
+++ b/test/micro/org/openjdk/bench/java/lang/StringEquals.java
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2026 Arm Limited and/or its affiliates.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -43,6 +44,48 @@ public class StringEquals {
     public String test5 = new String(test4); // equal to test4, but not same
     public String test6 = new String("0123456780");
     public String test7 = new String("0123\u01FE");
+    // string with parameterizable size
+    public String test8;
+    // same chars as test8, but different object; forces the intrinsic to read
+    // the entire string to check equality
+    public String test9;
+    // same chars as test8, except at length + diff_pos; set diff_pos to the
+    // worst case for the intrinsic being tested (usually -1, but could be -9
+    // if the intrinsic reads the last 8B first, or -length if the intrinsic
+    // reads the string backwards); the index is clamped to [0, length - 1]
+    public String test10;
+
+    @Param({"30"}) // can be used at runtime to define a length sweep
+    public int size;
+
+    @Param({"-1"}) // set to the worst location for the intrinsic under test
+    public int diff_pos;
+
+    @Setup
+    public void setup() {
+        if (size > 0) {
+            // NOTE 1: every string needs its own backing array. new String(test8)
+            // would share test8's byte array (improving the cache hit rate of the
+            // equal-string case), and "a".repeat(1) returns the "a" literal itself,
+            // which would let String.equals() exit early on the identity check.
+            test8 = new String("a".repeat(size).toCharArray());
+            test9 = new String("a".repeat(size).toCharArray());
+            StringBuilder sb = new StringBuilder(test8);
+            sb.setCharAt(Math.min(Math.max(size + diff_pos, 0), size - 1), 'b');
+            test10 = sb.toString();
+        } else {
+            // NOTE 2: can't use "a".repeat(0) or it returns the "" literal,
+            // which will early-exit from String.equals()
+            // NOTE 3: can't use no-arg String ctor or they'll share the byte
+            // array of the "" literal, which improves cache hit rate for
+            // intrinsics that read backwards into the object header
+            // NOTE(review): confirm that String(char[]) does not also reuse the
+            // "" literal's byte array for empty input, which would defeat NOTE 3
+            test8 = new String(new char[] {});
+            test9 = new String(new char[] {});
+            test10 = new String(new char[] {});
+        }
+    }
 
     @Benchmark
     public boolean different() {
@@ -54,6 +97,16 @@ public class StringEquals {
         return test.equals(test3);
     }
 
+    @Benchmark
+    public boolean differentParam() {
+        return test8.equals(test10);
+    }
+
+    @Benchmark
+    public boolean equalParam() {
+        return test8.equals(test9);
+    }
+
     @Benchmark
     public boolean almostEqual() {
         return test.equals(test6);