From 08d563ba15047020fd5f5fea80547e18898bbab2 Mon Sep 17 00:00:00 2001
From: Fei Yang
Date: Fri, 29 Nov 2024 07:50:14 +0000
Subject: [PATCH] 8345110: RISC-V: Optimize and clean up byte reverse assembler routines

Reviewed-by: mli, rehn
---
 .../cpu/riscv/macroAssembler_riscv.cpp        | 93 +++----------------
 .../cpu/riscv/macroAssembler_riscv.hpp        |  6 +-
 src/hotspot/cpu/riscv/riscv_b.ad              | 24 +++--
 src/hotspot/cpu/riscv/templateTable_riscv.cpp | 28 +++---
 4 files changed, 51 insertions(+), 100 deletions(-)

diff --git a/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp b/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp
index c39a086838d..46e6bc1b534 100644
--- a/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp
+++ b/src/hotspot/cpu/riscv/macroAssembler_riscv.cpp
@@ -2461,41 +2461,6 @@ void MacroAssembler::load_long_misaligned(Register dst, Address src, Register tm
   }
 }
-
-// reverse bytes in halfword in lower 16 bits and sign-extend
-// Rd[15:0] = Rs[7:0] Rs[15:8] (sign-extend to 64 bits)
-void MacroAssembler::revb_h_h(Register Rd, Register Rs, Register tmp) {
-  if (UseZbb) {
-    rev8(Rd, Rs);
-    srai(Rd, Rd, 48);
-    return;
-  }
-  assert_different_registers(Rs, tmp);
-  assert_different_registers(Rd, tmp);
-  srli(tmp, Rs, 8);
-  andi(tmp, tmp, 0xFF);
-  slli(Rd, Rs, 56);
-  srai(Rd, Rd, 48); // sign-extend
-  orr(Rd, Rd, tmp);
-}
-
-// reverse bytes in lower word and sign-extend
-// Rd[31:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] (sign-extend to 64 bits)
-void MacroAssembler::revb_w_w(Register Rd, Register Rs, Register tmp1, Register tmp2) {
-  if (UseZbb) {
-    rev8(Rd, Rs);
-    srai(Rd, Rd, 32);
-    return;
-  }
-  assert_different_registers(Rs, tmp1, tmp2);
-  assert_different_registers(Rd, tmp1, tmp2);
-  revb_h_w_u(Rd, Rs, tmp1, tmp2);
-  slli(tmp2, Rd, 48);
-  srai(tmp2, tmp2, 32); // sign-extend
-  srli(Rd, Rd, 16);
-  orr(Rd, Rd, tmp2);
-}
-
 // reverse bytes in halfword in lower 16 bits and zero-extend
 // Rd[15:0] = Rs[7:0] Rs[15:8] (zero-extend to 64 bits)
 void MacroAssembler::revb_h_h_u(Register Rd, Register Rs, Register tmp) {
@@ -2532,56 +2497,28 @@ void MacroAssembler::revb_h_w_u(Register Rd, Register Rs, Register tmp1, Registe
   orr(Rd, Rd, tmp2);
 }
 
-// This method is only used for revb_h
-// Rd = Rs[47:0] Rs[55:48] Rs[63:56]
-void MacroAssembler::revb_h_helper(Register Rd, Register Rs, Register tmp1, Register tmp2) {
-  assert_different_registers(Rs, tmp1, tmp2);
-  assert_different_registers(Rd, tmp1);
-  srli(tmp1, Rs, 48);
-  andi(tmp2, tmp1, 0xFF);
-  slli(tmp2, tmp2, 8);
-  srli(tmp1, tmp1, 8);
-  orr(tmp1, tmp1, tmp2);
-  slli(Rd, Rs, 16);
-  orr(Rd, Rd, tmp1);
-}
-
-// reverse bytes in each halfword
-// Rd[63:0] = Rs[55:48] Rs[63:56] Rs[39:32] Rs[47:40] Rs[23:16] Rs[31:24] Rs[7:0] Rs[15:8]
-void MacroAssembler::revb_h(Register Rd, Register Rs, Register tmp1, Register tmp2) {
-  if (UseZbb) {
-    assert_different_registers(Rs, tmp1);
-    assert_different_registers(Rd, tmp1);
-    rev8(Rd, Rs);
-    zero_extend(tmp1, Rd, 32);
-    roriw(tmp1, tmp1, 16);
-    slli(tmp1, tmp1, 32);
-    srli(Rd, Rd, 32);
-    roriw(Rd, Rd, 16);
-    zero_extend(Rd, Rd, 32);
-    orr(Rd, Rd, tmp1);
-    return;
-  }
-  assert_different_registers(Rs, tmp1, tmp2);
-  assert_different_registers(Rd, tmp1, tmp2);
-  revb_h_helper(Rd, Rs, tmp1, tmp2);
-  for (int i = 0; i < 3; ++i) {
-    revb_h_helper(Rd, Rd, tmp1, tmp2);
-  }
-}
-
-// reverse bytes in each word
-// Rd[63:0] = Rs[39:32] Rs[47:40] Rs[55:48] Rs[63:56] Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24]
+// reverse bytes in lower word, sign-extend
+// Rd[31:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24]
 void MacroAssembler::revb_w(Register Rd, Register Rs, Register tmp1, Register tmp2) {
   if (UseZbb) {
     rev8(Rd, Rs);
-    rori(Rd, Rd, 32);
+    srai(Rd, Rd, 32);
     return;
   }
   assert_different_registers(Rs, tmp1, tmp2);
   assert_different_registers(Rd, tmp1, tmp2);
-  revb(Rd, Rs, tmp1, tmp2);
-  ror_imm(Rd, Rd, 32);
+  andi(tmp1, Rs, 0xFF);
+  slli(tmp1, tmp1, 8);
+  for (int step = 8; step < 24; step += 8) {
+    srli(tmp2, Rs, step);
+    andi(tmp2, tmp2, 0xFF);
+    orr(tmp1, tmp1, tmp2);
+    slli(tmp1, tmp1, 8);
+  }
+  srli(Rd, Rs, 24);
+  andi(Rd, Rd, 0xFF);
+  orr(Rd, tmp1, Rd);
+  sign_extend(Rd, Rd, 32);
 }
 
 // reverse bytes in doubleword
diff --git a/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp b/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp
index cbf69e93c5c..9c38e8424d7 100644
--- a/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp
+++ b/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp
@@ -913,13 +913,9 @@ public:
   void orn(Register Rd, Register Rs1, Register Rs2);
 
   // revb
-  void revb_h_h(Register Rd, Register Rs, Register tmp = t0);                         // reverse bytes in halfword in lower 16 bits, sign-extend
-  void revb_w_w(Register Rd, Register Rs, Register tmp1 = t0, Register tmp2 = t1);    // reverse bytes in lower word, sign-extend
   void revb_h_h_u(Register Rd, Register Rs, Register tmp = t0);                       // reverse bytes in halfword in lower 16 bits, zero-extend
   void revb_h_w_u(Register Rd, Register Rs, Register tmp1 = t0, Register tmp2 = t1);  // reverse bytes in halfwords in lower 32 bits, zero-extend
-  void revb_h_helper(Register Rd, Register Rs, Register tmp1 = t0, Register tmp2= t1); // reverse bytes in upper 16 bits (48:63) and move to lower
-  void revb_h(Register Rd, Register Rs, Register tmp1 = t0, Register tmp2= t1);       // reverse bytes in each halfword
-  void revb_w(Register Rd, Register Rs, Register tmp1 = t0, Register tmp2= t1);       // reverse bytes in each word
+  void revb_w(Register Rd, Register Rs, Register tmp1 = t0, Register tmp2= t1);       // reverse bytes in lower word, sign-extend
   void revb(Register Rd, Register Rs, Register tmp1 = t0, Register tmp2 = t1);        // reverse bytes in doubleword
 
   void ror_imm(Register dst, Register src, uint32_t shift, Register tmp = t0);
diff --git a/src/hotspot/cpu/riscv/riscv_b.ad b/src/hotspot/cpu/riscv/riscv_b.ad
index 92e616a3063..6b7645858d8 100644
--- a/src/hotspot/cpu/riscv/riscv_b.ad
+++ b/src/hotspot/cpu/riscv/riscv_b.ad
@@ -181,11 +181,15 @@ instruct bytes_reverse_int_b(iRegINoSp dst, iRegIorL2I src) %{
   match(Set dst (ReverseBytesI src));
 
   ins_cost(ALU_COST * 2);
-  format %{ "revb_w_w  $dst, $src\t#@bytes_reverse_int_b" %}
+  format %{
+    "rev8  $dst, $src\t#@bytes_reverse_int_b\t\n"
+    "srai  $dst, $dst, 32\t\n"
+  %}
 
   ins_encode %{
     assert(UseZbb, "must be");
-    __ revb_w_w(as_Register($dst$$reg), as_Register($src$$reg));
+    __ rev8(as_Register($dst$$reg), as_Register($src$$reg));
+    __ srai(as_Register($dst$$reg), as_Register($dst$$reg), 32);
   %}
 
   ins_pipe(ialu_reg);
@@ -209,11 +213,15 @@ instruct bytes_reverse_unsigned_short_b(iRegINoSp dst, iRegIorL2I src) %{
   match(Set dst (ReverseBytesUS src));
 
   ins_cost(ALU_COST * 2);
-  format %{ "revb_h_h_u  $dst, $src\t#@bytes_reverse_unsigned_short_b" %}
+  format %{
+    "rev8  $dst, $src\t#@bytes_reverse_unsigned_short_b\t\n"
+    "srli  $dst, $dst, 48\t\n"
+  %}
 
   ins_encode %{
     assert(UseZbb, "must be");
-    __ revb_h_h_u(as_Register($dst$$reg), as_Register($src$$reg));
+    __ rev8(as_Register($dst$$reg), as_Register($src$$reg));
+    __ srli(as_Register($dst$$reg), as_Register($dst$$reg), 48);
   %}
 
   ins_pipe(ialu_reg);
@@ -223,11 +231,15 @@ instruct bytes_reverse_short_b(iRegINoSp dst, iRegIorL2I src) %{
   match(Set dst (ReverseBytesS src));
 
   ins_cost(ALU_COST * 2);
-  format %{ "revb_h_h  $dst, $src\t#@bytes_reverse_short_b" %}
+  format %{
+    "rev8  $dst, $src\t#@bytes_reverse_short_b\t\n"
+    "srai  $dst, $dst, 48\t\n"
+  %}
 
   ins_encode %{
     assert(UseZbb, "must be");
-    __ revb_h_h(as_Register($dst$$reg), as_Register($src$$reg));
+    __ rev8(as_Register($dst$$reg), as_Register($src$$reg));
+    __ srai(as_Register($dst$$reg), as_Register($dst$$reg), 48);
   %}
 
   ins_pipe(ialu_reg);
diff --git a/src/hotspot/cpu/riscv/templateTable_riscv.cpp b/src/hotspot/cpu/riscv/templateTable_riscv.cpp
index 62dc952bde0..5ae6805872a 100644
--- a/src/hotspot/cpu/riscv/templateTable_riscv.cpp
+++ b/src/hotspot/cpu/riscv/templateTable_riscv.cpp
@@ -1621,13 +1621,14 @@ void TemplateTable::branch(bool is_jsr, bool is_wide) {
 
   // load branch displacement
   if (!is_wide) {
+    // Convert the 16-bit value into native byte-ordering and sign-extend
     __ lb(x12, at_bcp(1));
     __ lbu(t1, at_bcp(2));
     __ slli(x12, x12, 8);
     __ add(x12, x12, t1);
   } else {
     __ lwu(x12, at_bcp(1));
-    __ revb_w_w(x12, x12); // reverse bytes in word and sign-extend
+    __ revb_w(x12, x12);
   }
 
   // Handle all the JSR stuff here, then exit.
@@ -1892,8 +1893,8 @@ void TemplateTable::tableswitch() {
   // load lo & hi
   __ lwu(x12, Address(x11, BytesPerInt));
   __ lwu(x13, Address(x11, 2 * BytesPerInt));
-  __ revb_w_w(x12, x12); // reverse bytes in word (32bit) and sign-extend
-  __ revb_w_w(x13, x13); // reverse bytes in word (32bit) and sign-extend
+  __ revb_w(x12, x12);
+  __ revb_w(x13, x13);
   // check against lo & hi
   __ blt(x10, x12, default_case);
   __ bgt(x10, x13, default_case);
@@ -1904,7 +1905,7 @@ void TemplateTable::tableswitch() {
   __ profile_switch_case(x10, x11, x12);
   // continue execution
   __ bind(continue_execution);
-  __ revb_w_w(x13, x13); // reverse bytes in word (32bit) and sign-extend
+  __ revb_w(x13, x13);
   __ add(xbcp, xbcp, x13);
   __ load_unsigned_byte(t0, Address(xbcp));
   __ dispatch_only(vtos, /*generate_poll*/true);
@@ -1924,7 +1925,7 @@ void TemplateTable::fast_linearswitch() {
   transition(itos, vtos);
   Label loop_entry, loop, found, continue_execution;
   // bswap x10 so we can avoid bswapping the table entries
-  __ revb_w_w(x10, x10); // reverse bytes in word (32bit) and sign-extend
+  __ revb_w(x10, x10);
   // align xbcp
   __ la(x9, at_bcp(BytesPerInt)); // btw: should be able to get rid of
                                   // this instruction (change offsets
@@ -1932,6 +1933,9 @@ void TemplateTable::fast_linearswitch() {
   __ andi(x9, x9, -BytesPerInt);
   // set counter
   __ lwu(x11, Address(x9, BytesPerInt));
+  // Convert the 32-bit npairs (number of pairs) into native byte-ordering
+  // We can use sign-extension here because npairs must be greater than or
+  // equal to 0 per JVM spec on 'lookupswitch' bytecode.
   __ revb_w(x11, x11);
   __ j(loop_entry);
   // table search
@@ -1953,7 +1957,7 @@ void TemplateTable::fast_linearswitch() {
   __ profile_switch_case(x11, x10, x9);
   // continue execution
   __ bind(continue_execution);
-  __ revb_w_w(x13, x13); // reverse bytes in word (32bit) and sign-extend
+  __ revb_w(x13, x13);
   __ add(xbcp, xbcp, x13);
   __ lbu(t0, Address(xbcp, 0));
   __ dispatch_only(vtos, /*generate_poll*/true);
@@ -2005,7 +2009,9 @@ void TemplateTable::fast_binaryswitch() {
   __ mv(i, zr);                            // i = 0
   __ lwu(j, Address(array, -BytesPerInt)); // j = length(array)
 
-  // Convert j into native byteordering
+  // Convert the 32-bit npairs (number of pairs) into native byte-ordering
+  // We can use sign-extension here because npairs must be greater than or
+  // equal to 0 per JVM spec on 'lookupswitch' bytecode.
   __ revb_w(j, j);
 
   // And start
@@ -2024,7 +2030,7 @@ void TemplateTable::fast_binaryswitch() {
     // Convert array[h].match to native byte-ordering before compare
     __ shadd(temp, h, array, temp, 3);
     __ lwu(temp, Address(temp, 0));
-    __ revb_w_w(temp, temp); // reverse bytes in word (32bit) and sign-extend
+    __ revb_w(temp, temp);
 
     Label L_done, L_greater;
     __ bge(key, temp, L_greater);
@@ -2047,14 +2053,14 @@ void TemplateTable::fast_binaryswitch() {
   // Convert array[i].match to native byte-ordering before compare
   __ shadd(temp, i, array, temp, 3);
   __ lwu(temp, Address(temp, 0));
-  __ revb_w_w(temp, temp); // reverse bytes in word (32bit) and sign-extend
+  __ revb_w(temp, temp);
   __ bne(key, temp, default_case);
 
   // entry found -> j = offset
   __ shadd(temp, i, array, temp, 3);
   __ lwu(j, Address(temp, BytesPerInt));
   __ profile_switch_case(i, key, array);
-  __ revb_w_w(j, j); // reverse bytes in word (32bit) and sign-extend
+  __ revb_w(j, j);
   __ add(temp, xbcp, j);
   __ load_unsigned_byte(t0, Address(temp, 0));
 
@@ -2067,7 +2073,7 @@ void TemplateTable::fast_binaryswitch() {
   __ bind(default_case);
   __ profile_switch_default(i);
   __ lwu(j, Address(array, -2 * BytesPerInt));
-  __ revb_w_w(j, j); // reverse bytes in word (32bit) and sign-extend
+  __ revb_w(j, j);
   __ add(temp, xbcp, j);
   __ load_unsigned_byte(t0, Address(temp, 0));