From 52c2ea65c4ab6a2d25e9de465fbf20034fe79028 Mon Sep 17 00:00:00 2001 From: Hamlin Li Date: Wed, 2 Oct 2024 07:48:22 +0000 Subject: [PATCH] 8340732: RISC-V: Refactor crc32 scalar version Reviewed-by: fyang --- .../cpu/riscv/macroAssembler_riscv.cpp | 60 ++++++++----------- src/hotspot/cpu/riscv/stubGenerator_riscv.cpp | 17 ++---- 2 files changed, 30 insertions(+), 47 deletions(-) diff --git a/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp b/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp index 32a446959a2..b99ba542423 100644 --- a/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp +++ b/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp @@ -1564,7 +1564,10 @@ void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table0, Register table1, Register table2, Register table3, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register tmp6) { assert_different_registers(crc, buf, len, table0, table1, table2, table3, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); - Label L_by16_loop, L_vector_entry, L_unroll_loop, L_unroll_loop_entry, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit; + Label L_vector_entry, + L_unroll_loop, + L_by4_loop_entry, L_by4_loop, + L_by1_loop, L_exit; const int64_t single_table_size = 256; const int64_t unroll = 16; @@ -1585,21 +1588,17 @@ void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, bge(len, tmp1, L_vector_entry); } #endif // COMPILER2 - subw(len, len, unroll_words); - bge(len, zr, L_unroll_loop_entry); - addiw(len, len, unroll_words-4); - bge(len, zr, L_by4_loop); - addiw(len, len, 4); - bgt(len, zr, L_by1_loop); - j(L_exit); + mv(tmp1, unroll_words); + blt(len, tmp1, L_by4_loop_entry); + + const Register loop_buf_end = tmp3; align(CodeEntryAlignment); - bind(L_unroll_loop_entry); - const Register buf_end = tmp3; - add(buf_end, buf, len); // buf_end will be used as endpoint for loop below + // Entry for L_unroll_loop + add(loop_buf_end, buf, len); // loop_buf_end will be used as endpoint for loop below andi(len, len, unroll_words-1); // len = (len % unroll_words) - sub(len, len, unroll_words); // Length after all iterations + sub(loop_buf_end, loop_buf_end, len); bind(L_unroll_loop); for (int i = 0; i < unroll; i++) { ld(tmp1, Address(buf, i*wordSize)); @@ -1608,57 +1607,50 @@ void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, } addi(buf, buf, unroll_words); - ble(buf, buf_end, L_unroll_loop); - addiw(len, len, unroll_words-4); - bge(len, zr, L_by4_loop); - addiw(len, len, 4); - bgt(len, zr, L_by1_loop); - j(L_exit); + blt(buf, loop_buf_end, L_unroll_loop); + bind(L_by4_loop_entry); + mv(tmp1, 4); + blt(len, tmp1, L_by1_loop); + add(loop_buf_end, buf, len); // loop_buf_end will be used as endpoint for loop below + andi(len, len, 3); + sub(loop_buf_end, loop_buf_end, len); bind(L_by4_loop); lwu(tmp1, Address(buf)); update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, false); - subw(len, len, 4); addi(buf, buf, 4); - bge(len, zr, L_by4_loop); - addiw(len, len, 4); - ble(len, zr, L_exit); + blt(buf, loop_buf_end, L_by4_loop); bind(L_by1_loop); + beqz(len, L_exit); + subw(len, len, 1); lwu(tmp1, Address(buf)); andi(tmp2, tmp1, right_8_bits); update_byte_crc32(crc, tmp2, table0); - ble(len, zr, L_exit); + beqz(len, L_exit); subw(len, len, 1); srli(tmp2, tmp1, 8); andi(tmp2, tmp2, right_8_bits); update_byte_crc32(crc, tmp2, table0); - ble(len, zr, L_exit); + beqz(len, L_exit); subw(len, len, 1); srli(tmp2, tmp1, 16); andi(tmp2, tmp2, right_8_bits); update_byte_crc32(crc, tmp2, table0); - ble(len, zr, L_exit); - - srli(tmp2, tmp1, 24); - andi(tmp2, tmp2, right_8_bits); - update_byte_crc32(crc, tmp2, table0); #ifdef COMPILER2 // put vector code here, otherwise "offset is too large" error occurs. if (UseRVV) { - j(L_exit); // only need to jump exit when UseRVV == true, it's a jump from end of block `L_by1_loop`. + // only need to jump exit when UseRVV == true, it's a jump from end of block `L_by1_loop`. + j(L_exit); bind(L_vector_entry); vector_update_crc32(crc, buf, len, tmp1, tmp2, tmp3, tmp4, tmp6, table0, table3); - addiw(len, len, -4); - bge(len, zr, L_by4_loop); - addiw(len, len, 4); - bgt(len, zr, L_by1_loop); + bgtz(len, L_by4_loop_entry); } #endif // COMPILER2 diff --git a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp index d4ec76da943..5970111088f 100644 --- a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp +++ b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp @@ -6092,26 +6092,17 @@ static const int64_t right_3_bits = right_n_bits(3); address start = __ pc(); + // input parameters const Register crc = c_rarg0; // crc const Register buf = c_rarg1; // source java byte array address const Register len = c_rarg2; // length - const Register table0 = c_rarg3; // crc_table address - const Register table1 = c_rarg4; - const Register table2 = c_rarg5; - const Register table3 = c_rarg6; - - const Register tmp1 = c_rarg7; - const Register tmp2 = t2; - const Register tmp3 = x28; // t3 - const Register tmp4 = x29; // t4 - const Register tmp5 = x30; // t5 - const Register tmp6 = x31; // t6 BLOCK_COMMENT("Entry:"); __ enter(); // required for proper stackwalking of RuntimeStub frame - __ kernel_crc32(crc, buf, len, table0, table1, table2, - table3, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); + __ kernel_crc32(crc, buf, len, + c_rarg3, c_rarg4, c_rarg5, c_rarg6, // tmp's for tables + c_rarg7, t2, x28, x29, x30, x31); // misc tmps __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret();