From 133419177d8ddcfafe0b2bd25ee918bdb3b16d3f Mon Sep 17 00:00:00 2001 From: Hamlin Li Date: Mon, 25 Nov 2024 13:54:44 +0000 Subject: [PATCH] 8334474: RISC-V: verify perf of ExpandBits/CompressBits (rvv) Reviewed-by: fyang, rehn, luhenry --- .../cpu/riscv/c2_MacroAssembler_riscv.cpp | 77 ------------ .../cpu/riscv/c2_MacroAssembler_riscv.hpp | 10 -- src/hotspot/cpu/riscv/riscv.ad | 63 ---------- src/hotspot/cpu/riscv/riscv_v.ad | 110 ------------------ .../intrinsics/TestBitShuffleOpers.java | 3 +- 5 files changed, 1 insertion(+), 262 deletions(-) diff --git a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp index bf553b35770..49efb619093 100644 --- a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp +++ b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp @@ -2339,83 +2339,6 @@ void C2_MacroAssembler::signum_fp_v(VectorRegister dst, VectorRegister one, Basi vfsgnj_vv(dst, one, dst, v0_t); } -void C2_MacroAssembler::compress_bits_v(Register dst, Register src, Register mask, bool is_long) { - Assembler::SEW sew = is_long ? Assembler::e64 : Assembler::e32; - // intrinsic is enabled when MaxVectorSize >= 16 - Assembler::LMUL lmul = is_long ? Assembler::m4 : Assembler::m2; - long len = is_long ? 64 : 32; - - // load the src data(in bits) to be compressed. - vsetivli(x0, 1, sew, Assembler::m1); - vmv_s_x(v0, src); - // reset the src data(in bytes) to zero. - mv(t0, len); - vsetvli(x0, t0, Assembler::e8, lmul); - vmv_v_i(v4, 0); - // convert the src data from bits to bytes. - vmerge_vim(v4, v4, 1); // v0 as the implicit mask register - // reset the dst data(in bytes) to zero. - vmv_v_i(v8, 0); - // load the mask data(in bits). - vsetivli(x0, 1, sew, Assembler::m1); - vmv_s_x(v0, mask); - // compress the src data(in bytes) to dst(in bytes). - vsetvli(x0, t0, Assembler::e8, lmul); - vcompress_vm(v8, v4, v0); - // convert the dst data from bytes to bits. - vmseq_vi(v0, v8, 1); - // store result back. - vsetivli(x0, 1, sew, Assembler::m1); - vmv_x_s(dst, v0); -} - -void C2_MacroAssembler::compress_bits_i_v(Register dst, Register src, Register mask) { - compress_bits_v(dst, src, mask, /* is_long */ false); -} - -void C2_MacroAssembler::compress_bits_l_v(Register dst, Register src, Register mask) { - compress_bits_v(dst, src, mask, /* is_long */ true); -} - -void C2_MacroAssembler::expand_bits_v(Register dst, Register src, Register mask, bool is_long) { - Assembler::SEW sew = is_long ? Assembler::e64 : Assembler::e32; - // intrinsic is enabled when MaxVectorSize >= 16 - Assembler::LMUL lmul = is_long ? Assembler::m4 : Assembler::m2; - long len = is_long ? 64 : 32; - - // load the src data(in bits) to be expanded. - vsetivli(x0, 1, sew, Assembler::m1); - vmv_s_x(v0, src); - // reset the src data(in bytes) to zero. - mv(t0, len); - vsetvli(x0, t0, Assembler::e8, lmul); - vmv_v_i(v4, 0); - // convert the src data from bits to bytes. - vmerge_vim(v4, v4, 1); // v0 as implicit mask register - // reset the dst data(in bytes) to zero. - vmv_v_i(v12, 0); - // load the mask data(in bits). - vsetivli(x0, 1, sew, Assembler::m1); - vmv_s_x(v0, mask); - // expand the src data(in bytes) to dst(in bytes). - vsetvli(x0, t0, Assembler::e8, lmul); - viota_m(v8, v0); - vrgather_vv(v12, v4, v8, VectorMask::v0_t); // v0 as implicit mask register - // convert the dst data from bytes to bits. - vmseq_vi(v0, v12, 1); - // store result back. - vsetivli(x0, 1, sew, Assembler::m1); - vmv_x_s(dst, v0); -} - -void C2_MacroAssembler::expand_bits_i_v(Register dst, Register src, Register mask) { - expand_bits_v(dst, src, mask, /* is_long */ false); -} - -void C2_MacroAssembler::expand_bits_l_v(Register dst, Register src, Register mask) { - expand_bits_v(dst, src, mask, /* is_long */ true); -} - // j.l.Math.round(float) // Returns the closest int to the argument, with ties rounding to positive infinity. // We need to handle 3 special cases defined by java api spec: diff --git a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp index 8736294e72c..2d14f98780d 100644 --- a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp +++ b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp @@ -39,9 +39,6 @@ VectorRegister vrs, bool is_latin, Label& DONE, Assembler::LMUL lmul); - void compress_bits_v(Register dst, Register src, Register mask, bool is_long); - void expand_bits_v(Register dst, Register src, Register mask, bool is_long); - public: // Code used by cmpFastLock and cmpFastUnlock mach instructions in .ad file. void fast_lock(Register object, Register box, @@ -184,13 +181,6 @@ // intrinsic methods implemented by rvv instructions - // compress bits, i.e. j.l.Integer/Long::compress. - void compress_bits_i_v(Register dst, Register src, Register mask); - void compress_bits_l_v(Register dst, Register src, Register mask); - // expand bits, i.e. j.l.Integer/Long::expand. - void expand_bits_i_v(Register dst, Register src, Register mask); - void expand_bits_l_v(Register dst, Register src, Register mask); - void java_round_float_v(VectorRegister dst, VectorRegister src, FloatRegister ftmp, BasicType bt, uint vector_length); void java_round_double_v(VectorRegister dst, VectorRegister src, FloatRegister ftmp, BasicType bt, uint vector_length); diff --git a/src/hotspot/cpu/riscv/riscv.ad b/src/hotspot/cpu/riscv/riscv.ad index ae8565e1bcf..d0085125c76 100644 --- a/src/hotspot/cpu/riscv/riscv.ad +++ b/src/hotspot/cpu/riscv/riscv.ad @@ -942,26 +942,6 @@ reg_class v11_reg( V11, V11_H, V11_J, V11_K ); -// class for vector register v12 -reg_class v12_reg( - V12, V12_H, V12_J, V12_K -); - -// class for vector register v13 -reg_class v13_reg( - V13, V13_H, V13_J, V13_K -); - -// class for vector register v14 -reg_class v14_reg( - V14, V14_H, V14_J, V14_K -); - -// class for vector register v15 -reg_class v15_reg( - V15, V15_H, V15_J, V15_K -); - // class for condition codes reg_class reg_flags(RFLAGS); @@ -1896,9 +1876,6 @@ bool Matcher::match_rule_supported(int opcode) { } break; - case Op_ExpandBits: // fall through - case Op_CompressBits: // fall through - guarantee(UseRVV == (MaxVectorSize >= 16), "UseRVV and MaxVectorSize not matched"); case Op_StrCompressedCopy: // fall through case Op_StrInflatedCopy: // fall through case Op_CountPositives: // fall through @@ -3541,46 +3518,6 @@ operand vReg_V11() interface(REG_INTER); %} -operand vReg_V12() -%{ - constraint(ALLOC_IN_RC(v12_reg)); - match(VecA); - match(vReg); - op_cost(0); - format %{ %} - interface(REG_INTER); -%} - -operand vReg_V13() -%{ - constraint(ALLOC_IN_RC(v13_reg)); - match(VecA); - match(vReg); - op_cost(0); - format %{ %} - interface(REG_INTER); -%} - -operand vReg_V14() -%{ - constraint(ALLOC_IN_RC(v14_reg)); - match(VecA); - match(vReg); - op_cost(0); - format %{ %} - interface(REG_INTER); -%} - -operand vReg_V15() -%{ - constraint(ALLOC_IN_RC(v15_reg)); - match(VecA); - match(vReg); - op_cost(0); - format %{ %} - interface(REG_INTER); -%} - operand vRegMask() %{ constraint(ALLOC_IN_RC(vmask_reg)); diff --git a/src/hotspot/cpu/riscv/riscv_v.ad b/src/hotspot/cpu/riscv/riscv_v.ad index 510c0ff5d46..6894f3ce9fd 100644 --- a/src/hotspot/cpu/riscv/riscv_v.ad +++ b/src/hotspot/cpu/riscv/riscv_v.ad @@ -3843,116 +3843,6 @@ instruct vclearArray_reg_reg(iRegL_R29 cnt, iRegP_R28 base, Universe dummy, ins_pipe(pipe_class_memory); %} -// CompressBits of Long & Integer - -instruct compressBitsI(iRegINoSp dst, iRegIorL2I src, iRegIorL2I mask, vRegMask_V0 v0, - vReg_V4 v4, vReg_V5 v5, vReg_V8 v8, vReg_V9 v9) %{ - match(Set dst (CompressBits src mask)); - effect(TEMP v0, TEMP v4, TEMP v5, TEMP v8, TEMP v9); - format %{ "vsetivli x0, 1, e32, m1, tu, mu\t#@compressBitsI\n\t" - "vmv.s.x $v0, $src\n\t" - "mv t0, 32\n\t" - "vsetvli x0, t0, e8, m2, tu, mu\n\t" - "vmv.v.i $v4, 0\n\t" - "vmerge.vim $v4, $v4, 1, $v0\n\t" - "vmv.v.i $v8, 0\n\t" - "vsetivli x0, 1, e32, m1, tu, mu\n\t" - "vmv.s.x $v0, $mask\n\t" - "vsetvli x0, t0, e8, m2, tu, mu\n\t" - "vcompress.vm $v8, $v4, $v0\n\t" - "vmseq.vi $v0, $v8, 1\n\t" - "vsetivli x0, 1, e32, m1, tu, mu\n\t" - "vmv.x.s $dst, $v0\t#@compressBitsI\n\t" - %} - ins_encode %{ - __ compress_bits_i_v(as_Register($dst$$reg), as_Register($src$$reg), as_Register($mask$$reg)); - %} - ins_pipe(pipe_slow); -%} - -instruct compressBitsL(iRegLNoSp dst, iRegL src, iRegL mask, vRegMask_V0 v0, - vReg_V4 v4, vReg_V5 v5, vReg_V6 v6, vReg_V7 v7, - vReg_V8 v8, vReg_V9 v9, vReg_V10 v10, vReg_V11 v11) %{ - match(Set dst (CompressBits src mask)); - effect(TEMP v0, TEMP v4, TEMP v5, TEMP v6, TEMP v7, TEMP v8, TEMP v9, TEMP v10, TEMP v11); - format %{ "vsetivli x0, 1, e64, m1, tu, mu\t#@compressBitsL\n\t" - "vmv.s.x $v0, $src\n\t" - "mv t0, 64\n\t" - "vsetvli x0, t0, e8, m4, tu, mu\n\t" - "vmv.v.i $v4, 0\n\t" - "vmerge.vim $v4, $v4, 1, $v0\n\t" - "vmv.v.i $v8, 0\n\t" - "vsetivli x0, 1, e64, m1, tu, mu\n\t" - "vmv.s.x $v0, $mask\n\t" - "vsetvli x0, t0, e8, m4, tu, mu\n\t" - "vcompress.vm $v8, $v4, $v0\n\t" - "vmseq.vi $v0, $v8, 1\n\t" - "vsetivli x0, 1, e64, m1, tu, mu\n\t" - "vmv.x.s $dst, $v0\t#@compressBitsL\n\t" - %} - ins_encode %{ - __ compress_bits_l_v(as_Register($dst$$reg), as_Register($src$$reg), as_Register($mask$$reg)); - %} - ins_pipe(pipe_slow); -%} - -// ExpandBits of Long & Integer - -instruct expandBitsI(iRegINoSp dst, iRegIorL2I src, iRegIorL2I mask, vRegMask_V0 v0, - vReg_V4 v4, vReg_V5 v5, vReg_V8 v8, vReg_V9 v9, vReg_V12 v12, vReg_V13 v13) %{ - match(Set dst (ExpandBits src mask)); - effect(TEMP v0, TEMP v4, TEMP v5, TEMP v8, TEMP v9, TEMP v12, TEMP v13); - format %{ "vsetivli x0, 1, e32, m1, tu, mu\t#@expandBitsI\n\t" - "vmv.s.x $v0, $src\n\t" - "mv t0, 32\n\t" - "vsetvli x0, t0, e8, m2, tu, mu\n\t" - "vmv.v.i $v4, 0\n\t" - "vmerge.vim $v4, $v4, 1, $v0\n\t" - "vmv.v.i $v12, 0\n\t" - "vsetivli x0, 1, e32, m1, tu, mu\n\t" - "vmv.s.x $v0, $mask\n\t" - "vsetvli x0, t0, e8, m2, tu, mu\n\t" - "viota.m $v8, $v0\n\t" - "vrgather.vv $v12, $v4, $v8, $v0.t\n\t" - "vmseq.vi $v0, $v12, 1\n\t" - "vsetivli x0, 1, e32, m1, tu, mu\n\t" - "vmv.x.s $dst, $v0\t#@expandBitsI\n\t" - %} - ins_encode %{ - __ expand_bits_i_v(as_Register($dst$$reg), as_Register($src$$reg), as_Register($mask$$reg)); - %} - ins_pipe(pipe_slow); -%} - -instruct expandBitsL(iRegLNoSp dst, iRegL src, iRegL mask, vRegMask_V0 v0, - vReg_V4 v4, vReg_V5 v5, vReg_V6 v6, vReg_V7 v7, - vReg_V8 v8, vReg_V9 v9, vReg_V10 v10, vReg_V11 v11, - vReg_V12 v12, vReg_V13 v13, vReg_V14 v14, vReg_V15 v15) %{ - match(Set dst (ExpandBits src mask)); - effect(TEMP v0, TEMP v4, TEMP v5, TEMP v6, TEMP v7, TEMP v8, TEMP v9, TEMP v10, TEMP v11, - TEMP v12, TEMP v13, TEMP v14, TEMP v15); - format %{ "vsetivli x0, 1, e64, m1, tu, mu\t#@expandBitsL\n\t" - "vmv.s.x $v0, $src\n\t" - "mv t0, 64\n\t" - "vsetvli x0, t0, e8, m4, tu, mu\n\t" - "vmv.v.i $v4, 0\n\t" - "vmerge.vim $v4, $v4, 1, $v0\n\t" - "vmv.v.i $v12, 0\n\t" - "vsetivli x0, 1, e64, m1, tu, mu\n\t" - "vmv.s.x $v0, $mask\n\t" - "vsetvli x0, t0, e8, m4, tu, mu\n\t" - "viota.m $v8, $v0\n\t" - "vrgather.vv $v12, $v4, $v8, $v0.t\n\t" - "vmseq.vi $v0, $v12, 1\n\t" - "vsetivli x0, 1, e64, m1, tu, mu\n\t" - "vmv.x.s $dst, $v0\t#@expandBitsL\n\t" - %} - ins_encode %{ - __ expand_bits_l_v(as_Register($dst$$reg), as_Register($src$$reg), as_Register($mask$$reg)); - %} - ins_pipe(pipe_slow); -%} - // Vector Load Const instruct vloadcon(vReg dst, immI0 src) %{ match(Set dst (VectorLoadConst src)); diff --git a/test/hotspot/jtreg/compiler/intrinsics/TestBitShuffleOpers.java b/test/hotspot/jtreg/compiler/intrinsics/TestBitShuffleOpers.java index 064ffeb41fb..e94d7bcc95b 100644 --- a/test/hotspot/jtreg/compiler/intrinsics/TestBitShuffleOpers.java +++ b/test/hotspot/jtreg/compiler/intrinsics/TestBitShuffleOpers.java @@ -30,8 +30,7 @@ * @requires (((os.arch=="x86" | os.arch=="amd64" | os.arch=="x86_64") & * (vm.cpu.features ~= ".*bmi2.*" & vm.cpu.features ~= ".*bmi1.*" & * vm.cpu.features ~= ".*sse2.*")) | - * (os.arch=="aarch64" & vm.cpu.features ~= ".*svebitperm.*") | - * (os.arch=="riscv64" & vm.cpu.features ~= ".*rvv.*")) + * (os.arch=="aarch64" & vm.cpu.features ~= ".*svebitperm.*")) * @library /test/lib / * @run driver compiler.intrinsics.TestBitShuffleOpers */