From e91a6ec49c80ea53bb6f1eb43c924f188803de7e Mon Sep 17 00:00:00 2001 From: Fei Yang Date: Tue, 4 Feb 2025 14:03:07 +0000 Subject: [PATCH] 8347489: RISC-V: Misaligned memory access with COH Reviewed-by: mli, vkempik --- .../cpu/riscv/c2_MacroAssembler_riscv.cpp | 103 ++++++++++++++++-- .../cpu/riscv/macroAssembler_riscv.cpp | 2 +- src/hotspot/cpu/riscv/stubGenerator_riscv.cpp | 51 ++++++--- 3 files changed, 130 insertions(+), 26 deletions(-) diff --git a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp index 7564df6b861..c3296aeb76b 100644 --- a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp +++ b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp @@ -1409,6 +1409,14 @@ void C2_MacroAssembler::string_compare(Register str1, Register str2, load_chr_insn str1_load_chr = str1_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu; load_chr_insn str2_load_chr = str2_isL ? (load_chr_insn)&MacroAssembler::lbu : (load_chr_insn)&MacroAssembler::lhu; + int base_offset1 = arrayOopDesc::base_offset_in_bytes(T_BYTE); + int base_offset2 = arrayOopDesc::base_offset_in_bytes(T_CHAR); + + assert((base_offset1 % (UseCompactObjectHeaders ? 4 : + (UseCompressedClassPointers ? 8 : 4))) == 0, "Must be"); + assert((base_offset2 % (UseCompactObjectHeaders ? 4 : + (UseCompressedClassPointers ? 8 : 4))) == 0, "Must be"); + BLOCK_COMMENT("string_compare {"); // Bizarrely, the counts are passed in bytes, regardless of whether they @@ -1426,6 +1434,24 @@ void C2_MacroAssembler::string_compare(Register str1, Register str2, mv(cnt2, cnt1); bind(L); + // Load 4 bytes once to compare for alignment before main loop. Note that this + // is only possible for LL/UU case. We need to resort to load_long_misaligned + // for both LU and UL cases. + if (str1_isL == str2_isL) { // LL or UU + beq(str1, str2, DONE); + int base_offset = isLL ? base_offset1 : base_offset2; + if (AvoidUnalignedAccesses && (base_offset % 8) != 0) { + mv(t0, minCharsInWord / 2); + ble(cnt2, t0, SHORT_STRING); + lwu(tmp1, Address(str1)); + lwu(tmp2, Address(str2)); + bne(tmp1, tmp2, DIFFERENCE); + addi(str1, str1, 4); + addi(str2, str2, 4); + subi(cnt2, cnt2, minCharsInWord / 2); + } + } + // A very short string mv(t0, minCharsInWord); ble(cnt2, t0, SHORT_STRING); @@ -1434,8 +1460,14 @@ void C2_MacroAssembler::string_compare(Register str1, Register str2, // load first parts of strings and finish initialization while loading { if (str1_isL == str2_isL) { // LL or UU - // check if str1 and str2 is same pointer - beq(str1, str2, DONE); +#ifdef ASSERT + Label align_ok; + orr(t0, str1, str2); + andi(t0, t0, 0x7); + beqz(t0, align_ok); + stop("bad alignment"); + bind(align_ok); +#endif // load 8 bytes once to compare ld(tmp1, Address(str1)); ld(tmp2, Address(str2)); @@ -1452,7 +1484,7 @@ void C2_MacroAssembler::string_compare(Register str1, Register str2, sub(cnt2, zr, cnt2); } else if (isLU) { // LU case lwu(tmp1, Address(str1)); - ld(tmp2, Address(str2)); + load_long_misaligned(tmp2, Address(str2), tmp3, (base_offset2 % 8) != 0 ? 4 : 8); mv(t0, STUB_THRESHOLD); bge(cnt2, t0, STUB); subi(cnt2, cnt2, 4); @@ -1465,11 +1497,11 @@ void C2_MacroAssembler::string_compare(Register str1, Register str2, sub(cnt2, zr, cnt2); addi(cnt1, cnt1, 4); } else { // UL case - ld(tmp1, Address(str1)); + load_long_misaligned(tmp1, Address(str1), tmp3, (base_offset2 % 8) != 0 ? 4 : 8); lwu(tmp2, Address(str2)); mv(t0, STUB_THRESHOLD); bge(cnt2, t0, STUB); - addi(cnt2, cnt2, -4); + subi(cnt2, cnt2, 4); slli(t0, cnt2, 1); sub(cnt1, zr, t0); add(str1, str1, t0); @@ -1486,6 +1518,7 @@ void C2_MacroAssembler::string_compare(Register str1, Register str2, // main loop bind(NEXT_WORD); if (str1_isL == str2_isL) { // LL or UU + // both of the two loads are 8-byte aligned add(t0, str1, cnt2); ld(tmp1, Address(t0)); add(t0, str2, cnt2); @@ -1495,7 +1528,7 @@ void C2_MacroAssembler::string_compare(Register str1, Register str2, add(t0, str1, cnt1); lwu(tmp1, Address(t0)); add(t0, str2, cnt2); - ld(tmp2, Address(t0)); + load_long_misaligned(tmp2, Address(t0), tmp3, (base_offset2 % 8) != 0 ? 4 : 8); addi(cnt1, cnt1, 4); inflate_lo32(tmp3, tmp1); mv(tmp1, tmp3); @@ -1504,7 +1537,7 @@ void C2_MacroAssembler::string_compare(Register str1, Register str2, add(t0, str2, cnt2); lwu(tmp2, Address(t0)); add(t0, str1, cnt1); - ld(tmp1, Address(t0)); + load_long_misaligned(tmp1, Address(t0), tmp3, (base_offset2 % 8) != 0 ? 4 : 8); inflate_lo32(tmp3, tmp2); mv(tmp2, tmp3); addi(cnt1, cnt1, 8); @@ -1637,6 +1670,9 @@ void C2_MacroAssembler::arrays_equals(Register a1, Register a2, int length_offset = arrayOopDesc::length_offset_in_bytes(); int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE); + assert((base_offset % (UseCompactObjectHeaders ? 4 : + (UseCompressedClassPointers ? 8 : 4))) == 0, "Must be"); + Register cnt1 = tmp3; Register cnt2 = tmp1; // cnt2 only used in array length compare Label DONE, SAME, NEXT_WORD, SHORT, TAIL03, TAIL01; @@ -1660,10 +1696,31 @@ void C2_MacroAssembler::arrays_equals(Register a1, Register a2, la(a1, Address(a1, base_offset)); la(a2, Address(a2, base_offset)); + + // Load 4 bytes once to compare for alignment before main loop. + if (AvoidUnalignedAccesses && (base_offset % 8) != 0) { + subi(cnt1, cnt1, elem_per_word / 2); + bltz(cnt1, TAIL03); + lwu(tmp1, Address(a1)); + lwu(tmp2, Address(a2)); + addi(a1, a1, 4); + addi(a2, a2, 4); + bne(tmp1, tmp2, DONE); + } + // Check for short strings, i.e. smaller than wordSize. subi(cnt1, cnt1, elem_per_word); bltz(cnt1, SHORT); +#ifdef ASSERT + Label align_ok; + orr(t0, a1, a2); + andi(t0, t0, 0x7); + beqz(t0, align_ok); + stop("bad alignment"); + bind(align_ok); +#endif + // Main 8 byte comparison loop. bind(NEXT_WORD); { ld(tmp1, Address(a1)); @@ -1729,20 +1786,45 @@ void C2_MacroAssembler::arrays_equals(Register a1, Register a2, void C2_MacroAssembler::string_equals(Register a1, Register a2, Register result, Register cnt1) { - Label SAME, DONE, SHORT, NEXT_WORD; + Label SAME, DONE, SHORT, NEXT_WORD, TAIL03, TAIL01; Register tmp1 = t0; Register tmp2 = t1; assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2); + int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE); + + assert((base_offset % (UseCompactObjectHeaders ? 4 : + (UseCompressedClassPointers ? 8 : 4))) == 0, "Must be"); + BLOCK_COMMENT("string_equals {"); mv(result, false); + // Load 4 bytes once to compare for alignment before main loop. + if (AvoidUnalignedAccesses && (base_offset % 8) != 0) { + subi(cnt1, cnt1, 4); + bltz(cnt1, TAIL03); + lwu(tmp1, Address(a1)); + lwu(tmp2, Address(a2)); + addi(a1, a1, 4); + addi(a2, a2, 4); + bne(tmp1, tmp2, DONE); + } + // Check for short strings, i.e. smaller than wordSize. subi(cnt1, cnt1, wordSize); bltz(cnt1, SHORT); +#ifdef ASSERT + Label align_ok; + orr(t0, a1, a2); + andi(t0, t0, 0x7); + beqz(t0, align_ok); + stop("bad alignment"); + bind(align_ok); +#endif + // Main 8 byte comparison loop. bind(NEXT_WORD); { ld(tmp1, Address(a1)); @@ -1757,8 +1839,6 @@ void C2_MacroAssembler::string_equals(Register a1, Register a2, beqz(tmp1, SAME); bind(SHORT); - Label TAIL03, TAIL01; - // 0-7 bytes left. test_bit(tmp1, cnt1, 2); beqz(tmp1, TAIL03); @@ -2512,6 +2592,9 @@ void C2_MacroAssembler::arrays_equals_v(Register a1, Register a2, Register resul int length_offset = arrayOopDesc::length_offset_in_bytes(); int base_offset = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE); + assert((base_offset % (UseCompactObjectHeaders ? 4 : + (UseCompressedClassPointers ? 8 : 4))) == 0, "Must be"); + BLOCK_COMMENT("arrays_equals_v {"); // if (a1 == a2), return true diff --git a/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp b/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp index 61261a17213..9b53dd0f0ef 100644 --- a/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp +++ b/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp @@ -4307,7 +4307,7 @@ void MacroAssembler::population_count(Register dst, Register src, { bind(loop); addi(dst, dst, 1); - addi(tmp2, tmp1, -1); + subi(tmp2, tmp1, 1); andr(tmp1, tmp1, tmp2); bnez(tmp1, loop); } diff --git a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp index d9880b1f85b..bb536c3e111 100644 --- a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp +++ b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp @@ -2449,17 +2449,28 @@ class StubGenerator: public StubCodeGenerator { } // code for comparing 8 characters of strings with Latin1 and Utf16 encoding - void compare_string_8_x_LU(Register tmpL, Register tmpU, Register strL, Register strU, Label& DIFF) { + void compare_string_8_x_LU(Register tmpL, Register tmpU, + Register strL, Register strU, Label& DIFF) { const Register tmp = x30, tmpLval = x12; + + int base_offset = arrayOopDesc::base_offset_in_bytes(T_CHAR); + + assert((base_offset % (UseCompactObjectHeaders ? 4 : + (UseCompressedClassPointers ? 8 : 4))) == 0, "Must be"); + + // strL is 8-byte aligned __ ld(tmpLval, Address(strL)); __ addi(strL, strL, wordSize); - __ ld(tmpU, Address(strU)); + + // compare first 4 characters + __ load_long_misaligned(tmpU, Address(strU), tmp, (base_offset % 8) != 0 ? 4 : 8); __ addi(strU, strU, wordSize); __ inflate_lo32(tmpL, tmpLval); __ xorr(tmp, tmpU, tmpL); __ bnez(tmp, DIFF); - __ ld(tmpU, Address(strU)); + // compare second 4 characters + __ load_long_misaligned(tmpU, Address(strU), tmp, (base_offset % 8) != 0 ? 4 : 8); __ addi(strU, strU, wordSize); __ inflate_hi32(tmpL, tmpLval); __ xorr(tmp, tmpU, tmpL); @@ -2493,6 +2504,14 @@ class StubGenerator: public StubCodeGenerator { const Register result = x10, str1 = x11, str2 = x13, cnt2 = x14, tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x12; + int base_offset1 = arrayOopDesc::base_offset_in_bytes(T_BYTE); + int base_offset2 = arrayOopDesc::base_offset_in_bytes(T_CHAR); + + assert((base_offset1 % (UseCompactObjectHeaders ? 4 : + (UseCompressedClassPointers ? 8 : 4))) == 0, "Must be"); + assert((base_offset2 % (UseCompactObjectHeaders ? 4 : + (UseCompressedClassPointers ? 8 : 4))) == 0, "Must be"); + // cnt2 == amount of characters left to compare // Check already loaded first 4 symbols __ inflate_lo32(tmp3, isLU ? tmp1 : tmp2); @@ -2509,17 +2528,19 @@ class StubGenerator: public StubCodeGenerator { tmpU = isLU ? tmp2 : tmp1, // where to keep U for comparison tmpL = isLU ? tmp1 : tmp2; // where to keep L for comparison - // make sure main loop is 8 byte-aligned, we should load another 4 bytes from strL - // cnt2 is >= 68 here, no need to check it for >= 0 - __ lwu(tmpL, Address(strL)); - __ addi(strL, strL, wordSize / 2); - __ ld(tmpU, Address(strU)); - __ addi(strU, strU, wordSize); - __ inflate_lo32(tmp3, tmpL); - __ mv(tmpL, tmp3); - __ xorr(tmp3, tmpU, tmpL); - __ bnez(tmp3, CALCULATE_DIFFERENCE); - __ addi(cnt2, cnt2, -wordSize / 2); + if (AvoidUnalignedAccesses && (base_offset1 % 8) == 0) { + // Load another 4 bytes from strL to make sure main loop is 8-byte aligned + // cnt2 is >= 68 here, no need to check it for >= 0 + __ lwu(tmpL, Address(strL)); + __ addi(strL, strL, wordSize / 2); + __ load_long_misaligned(tmpU, Address(strU), tmp4, (base_offset2 % 8) != 0 ? 4 : 8); + __ addi(strU, strU, wordSize); + __ inflate_lo32(tmp3, tmpL); + __ mv(tmpL, tmp3); + __ xorr(tmp3, tmpU, tmpL); + __ bnez(tmp3, CALCULATE_DIFFERENCE); + __ subi(cnt2, cnt2, wordSize / 2); + } // we are now 8-bytes aligned on strL __ subi(cnt2, cnt2, wordSize * 2); @@ -4493,7 +4514,7 @@ class StubGenerator: public StubCodeGenerator { if (multi_block) { int total_adds = vset_sew == Assembler::e32 ? 240 : 608; - __ addi(consts, consts, -total_adds); + __ subi(consts, consts, total_adds); __ addi(ofs, ofs, vset_sew == Assembler::e32 ? 64 : 128); __ ble(ofs, limit, multi_block_loop); __ mv(c_rarg0, ofs); // return ofs