From 133419177d8ddcfafe0b2bd25ee918bdb3b16d3f Mon Sep 17 00:00:00 2001
From: Hamlin Li <mli@openjdk.org>
Date: Mon, 25 Nov 2024 13:54:44 +0000
Subject: [PATCH] 8334474: RISC-V: verify perf of ExpandBits/CompressBits (rvv)

Reviewed-by: fyang, rehn, luhenry
---
 .../cpu/riscv/c2_MacroAssembler_riscv.cpp     |  77 ------------
 .../cpu/riscv/c2_MacroAssembler_riscv.hpp     |  10 --
 src/hotspot/cpu/riscv/riscv.ad                |  63 ----------
 src/hotspot/cpu/riscv/riscv_v.ad              | 110 ------------------
 .../intrinsics/TestBitShuffleOpers.java       |   3 +-
 5 files changed, 1 insertion(+), 262 deletions(-)

diff --git a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp
index bf553b35770..49efb619093 100644
--- a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp
+++ b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp
@@ -2339,83 +2339,6 @@ void C2_MacroAssembler::signum_fp_v(VectorRegister dst, VectorRegister one, Basi
   vfsgnj_vv(dst, one, dst, v0_t);
 }
 
-void C2_MacroAssembler::compress_bits_v(Register dst, Register src, Register mask, bool is_long) {
-  Assembler::SEW sew = is_long ? Assembler::e64 : Assembler::e32;
-  // intrinsic is enabled when MaxVectorSize >= 16
-  Assembler::LMUL lmul = is_long ? Assembler::m4 : Assembler::m2;
-  long len = is_long ? 64 : 32;
-
-  // load the src data(in bits) to be compressed.
-  vsetivli(x0, 1, sew, Assembler::m1);
-  vmv_s_x(v0, src);
-  // reset the src data(in bytes) to zero.
-  mv(t0, len);
-  vsetvli(x0, t0, Assembler::e8, lmul);
-  vmv_v_i(v4, 0);
-  // convert the src data from bits to bytes.
-  vmerge_vim(v4, v4, 1); // v0 as the implicit mask register
-  // reset the dst data(in bytes) to zero.
-  vmv_v_i(v8, 0);
-  // load the mask data(in bits).
-  vsetivli(x0, 1, sew, Assembler::m1);
-  vmv_s_x(v0, mask);
-  // compress the src data(in bytes) to dst(in bytes).
-  vsetvli(x0, t0, Assembler::e8, lmul);
-  vcompress_vm(v8, v4, v0);
-  // convert the dst data from bytes to bits.
-  vmseq_vi(v0, v8, 1);
-  // store result back.
-  vsetivli(x0, 1, sew, Assembler::m1);
-  vmv_x_s(dst, v0);
-}
-
-void C2_MacroAssembler::compress_bits_i_v(Register dst, Register src, Register mask) {
-  compress_bits_v(dst, src, mask, /* is_long */ false);
-}
-
-void C2_MacroAssembler::compress_bits_l_v(Register dst, Register src, Register mask) {
-  compress_bits_v(dst, src, mask, /* is_long */ true);
-}
-
-void C2_MacroAssembler::expand_bits_v(Register dst, Register src, Register mask, bool is_long) {
-  Assembler::SEW sew = is_long ? Assembler::e64 : Assembler::e32;
-  // intrinsic is enabled when MaxVectorSize >= 16
-  Assembler::LMUL lmul = is_long ? Assembler::m4 : Assembler::m2;
-  long len = is_long ? 64 : 32;
-
-  // load the src data(in bits) to be expanded.
-  vsetivli(x0, 1, sew, Assembler::m1);
-  vmv_s_x(v0, src);
-  // reset the src data(in bytes) to zero.
-  mv(t0, len);
-  vsetvli(x0, t0, Assembler::e8, lmul);
-  vmv_v_i(v4, 0);
-  // convert the src data from bits to bytes.
-  vmerge_vim(v4, v4, 1); // v0 as implicit mask register
-  // reset the dst data(in bytes) to zero.
-  vmv_v_i(v12, 0);
-  // load the mask data(in bits).
-  vsetivli(x0, 1, sew, Assembler::m1);
-  vmv_s_x(v0, mask);
-  // expand the src data(in bytes) to dst(in bytes).
-  vsetvli(x0, t0, Assembler::e8, lmul);
-  viota_m(v8, v0);
-  vrgather_vv(v12, v4, v8, VectorMask::v0_t); // v0 as implicit mask register
-  // convert the dst data from bytes to bits.
-  vmseq_vi(v0, v12, 1);
-  // store result back.
-  vsetivli(x0, 1, sew, Assembler::m1);
-  vmv_x_s(dst, v0);
-}
-
-void C2_MacroAssembler::expand_bits_i_v(Register dst, Register src, Register mask) {
-  expand_bits_v(dst, src, mask, /* is_long */ false);
-}
-
-void C2_MacroAssembler::expand_bits_l_v(Register dst, Register src, Register mask) {
-  expand_bits_v(dst, src, mask, /* is_long */ true);
-}
-
 // j.l.Math.round(float)
 //  Returns the closest int to the argument, with ties rounding to positive infinity.
 // We need to handle 3 special cases defined by java api spec:
diff --git a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp
index 8736294e72c..2d14f98780d 100644
--- a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp
+++ b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp
@@ -39,9 +39,6 @@
                        VectorRegister vrs,
                        bool is_latin, Label& DONE, Assembler::LMUL lmul);
 
-  void compress_bits_v(Register dst, Register src, Register mask, bool is_long);
-  void expand_bits_v(Register dst, Register src, Register mask, bool is_long);
-
  public:
   // Code used by cmpFastLock and cmpFastUnlock mach instructions in .ad file.
   void fast_lock(Register object, Register box,
@@ -184,13 +181,6 @@
 
   // intrinsic methods implemented by rvv instructions
 
-  // compress bits, i.e. j.l.Integer/Long::compress.
-  void compress_bits_i_v(Register dst, Register src, Register mask);
-  void compress_bits_l_v(Register dst, Register src, Register mask);
-  // expand bits, i.e. j.l.Integer/Long::expand.
-  void expand_bits_i_v(Register dst, Register src, Register mask);
-  void expand_bits_l_v(Register dst, Register src, Register mask);
-
   void java_round_float_v(VectorRegister dst, VectorRegister src, FloatRegister ftmp, BasicType bt, uint vector_length);
   void java_round_double_v(VectorRegister dst, VectorRegister src, FloatRegister ftmp, BasicType bt, uint vector_length);
 
diff --git a/src/hotspot/cpu/riscv/riscv.ad b/src/hotspot/cpu/riscv/riscv.ad
index ae8565e1bcf..d0085125c76 100644
--- a/src/hotspot/cpu/riscv/riscv.ad
+++ b/src/hotspot/cpu/riscv/riscv.ad
@@ -942,26 +942,6 @@ reg_class v11_reg(
     V11, V11_H, V11_J, V11_K
 );
 
-// class for vector register v12
-reg_class v12_reg(
-    V12, V12_H, V12_J, V12_K
-);
-
-// class for vector register v13
-reg_class v13_reg(
-    V13, V13_H, V13_J, V13_K
-);
-
-// class for vector register v14
-reg_class v14_reg(
-    V14, V14_H, V14_J, V14_K
-);
-
-// class for vector register v15
-reg_class v15_reg(
-    V15, V15_H, V15_J, V15_K
-);
-
 // class for condition codes
 reg_class reg_flags(RFLAGS);
 
@@ -1896,9 +1876,6 @@ bool Matcher::match_rule_supported(int opcode) {
       }
       break;
 
-    case Op_ExpandBits:        // fall through
-    case Op_CompressBits:      // fall through
-      guarantee(UseRVV == (MaxVectorSize >= 16), "UseRVV and MaxVectorSize not matched");
     case Op_StrCompressedCopy: // fall through
     case Op_StrInflatedCopy:   // fall through
     case Op_CountPositives:    // fall through
@@ -3541,46 +3518,6 @@ operand vReg_V11()
   interface(REG_INTER);
 %}
 
-operand vReg_V12()
-%{
-  constraint(ALLOC_IN_RC(v12_reg));
-  match(VecA);
-  match(vReg);
-  op_cost(0);
-  format %{ %}
-  interface(REG_INTER);
-%}
-
-operand vReg_V13()
-%{
-  constraint(ALLOC_IN_RC(v13_reg));
-  match(VecA);
-  match(vReg);
-  op_cost(0);
-  format %{ %}
-  interface(REG_INTER);
-%}
-
-operand vReg_V14()
-%{
-  constraint(ALLOC_IN_RC(v14_reg));
-  match(VecA);
-  match(vReg);
-  op_cost(0);
-  format %{ %}
-  interface(REG_INTER);
-%}
-
-operand vReg_V15()
-%{
-  constraint(ALLOC_IN_RC(v15_reg));
-  match(VecA);
-  match(vReg);
-  op_cost(0);
-  format %{ %}
-  interface(REG_INTER);
-%}
-
 operand vRegMask()
 %{
   constraint(ALLOC_IN_RC(vmask_reg));
diff --git a/src/hotspot/cpu/riscv/riscv_v.ad b/src/hotspot/cpu/riscv/riscv_v.ad
index 510c0ff5d46..6894f3ce9fd 100644
--- a/src/hotspot/cpu/riscv/riscv_v.ad
+++ b/src/hotspot/cpu/riscv/riscv_v.ad
@@ -3843,116 +3843,6 @@ instruct vclearArray_reg_reg(iRegL_R29 cnt, iRegP_R28 base, Universe dummy,
   ins_pipe(pipe_class_memory);
 %}
 
-// CompressBits of Long & Integer
-
-instruct compressBitsI(iRegINoSp dst, iRegIorL2I src, iRegIorL2I mask, vRegMask_V0 v0,
-                       vReg_V4 v4, vReg_V5 v5, vReg_V8 v8, vReg_V9 v9) %{
-  match(Set dst (CompressBits src mask));
-  effect(TEMP v0, TEMP v4, TEMP v5, TEMP v8, TEMP v9);
-  format %{ "vsetivli x0, 1, e32, m1, tu, mu\t#@compressBitsI\n\t"
-            "vmv.s.x $v0, $src\n\t"
-            "mv t0, 32\n\t"
-            "vsetvli x0, t0, e8, m2, tu, mu\n\t"
-            "vmv.v.i $v4, 0\n\t"
-            "vmerge.vim $v4, $v4, 1, $v0\n\t"
-            "vmv.v.i $v8, 0\n\t"
-            "vsetivli x0, 1, e32, m1, tu, mu\n\t"
-            "vmv.s.x $v0, $mask\n\t"
-            "vsetvli x0, t0, e8, m2, tu, mu\n\t"
-            "vcompress.vm $v8, $v4, $v0\n\t"
-            "vmseq.vi $v0, $v8, 1\n\t"
-            "vsetivli x0, 1, e32, m1, tu, mu\n\t"
-            "vmv.x.s $dst, $v0\t#@compressBitsI\n\t"
-          %}
-  ins_encode %{
-    __ compress_bits_i_v(as_Register($dst$$reg), as_Register($src$$reg), as_Register($mask$$reg));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct compressBitsL(iRegLNoSp dst, iRegL src, iRegL mask, vRegMask_V0 v0,
-                       vReg_V4 v4, vReg_V5 v5, vReg_V6 v6, vReg_V7 v7,
-                       vReg_V8 v8, vReg_V9 v9, vReg_V10 v10, vReg_V11 v11) %{
-  match(Set dst (CompressBits src mask));
-  effect(TEMP v0, TEMP v4, TEMP v5, TEMP v6, TEMP v7, TEMP v8, TEMP v9, TEMP v10, TEMP v11);
-  format %{ "vsetivli x0, 1, e64, m1, tu, mu\t#@compressBitsL\n\t"
-            "vmv.s.x $v0, $src\n\t"
-            "mv t0, 64\n\t"
-            "vsetvli x0, t0, e8, m4, tu, mu\n\t"
-            "vmv.v.i $v4, 0\n\t"
-            "vmerge.vim $v4, $v4, 1, $v0\n\t"
-            "vmv.v.i $v8, 0\n\t"
-            "vsetivli x0, 1, e64, m1, tu, mu\n\t"
-            "vmv.s.x $v0, $mask\n\t"
-            "vsetvli x0, t0, e8, m4, tu, mu\n\t"
-            "vcompress.vm $v8, $v4, $v0\n\t"
-            "vmseq.vi $v0, $v8, 1\n\t"
-            "vsetivli x0, 1, e64, m1, tu, mu\n\t"
-            "vmv.x.s $dst, $v0\t#@compressBitsL\n\t"
-          %}
-  ins_encode %{
-    __ compress_bits_l_v(as_Register($dst$$reg), as_Register($src$$reg), as_Register($mask$$reg));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-// ExpandBits of Long & Integer
-
-instruct expandBitsI(iRegINoSp dst, iRegIorL2I src, iRegIorL2I mask, vRegMask_V0 v0,
-                     vReg_V4 v4, vReg_V5 v5, vReg_V8 v8, vReg_V9 v9, vReg_V12 v12, vReg_V13 v13) %{
-  match(Set dst (ExpandBits src mask));
-  effect(TEMP v0, TEMP v4, TEMP v5, TEMP v8, TEMP v9, TEMP v12, TEMP v13);
-  format %{ "vsetivli x0, 1, e32, m1, tu, mu\t#@expandBitsI\n\t"
-            "vmv.s.x $v0, $src\n\t"
-            "mv t0, 32\n\t"
-            "vsetvli x0, t0, e8, m2, tu, mu\n\t"
-            "vmv.v.i $v4, 0\n\t"
-            "vmerge.vim $v4, $v4, 1, $v0\n\t"
-            "vmv.v.i $v12, 0\n\t"
-            "vsetivli x0, 1, e32, m1, tu, mu\n\t"
-            "vmv.s.x $v0, $mask\n\t"
-            "vsetvli x0, t0, e8, m2, tu, mu\n\t"
-            "viota.m $v8, $v0\n\t"
-            "vrgather.vv $v12, $v4, $v8, $v0.t\n\t"
-            "vmseq.vi $v0, $v12, 1\n\t"
-            "vsetivli x0, 1, e32, m1, tu, mu\n\t"
-            "vmv.x.s $dst, $v0\t#@expandBitsI\n\t"
-          %}
-  ins_encode %{
-    __ expand_bits_i_v(as_Register($dst$$reg), as_Register($src$$reg), as_Register($mask$$reg));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct expandBitsL(iRegLNoSp dst, iRegL src, iRegL mask, vRegMask_V0 v0,
-                      vReg_V4 v4, vReg_V5 v5, vReg_V6 v6, vReg_V7 v7,
-                      vReg_V8 v8, vReg_V9 v9, vReg_V10 v10, vReg_V11 v11,
-                      vReg_V12 v12, vReg_V13 v13, vReg_V14 v14, vReg_V15 v15) %{
-  match(Set dst (ExpandBits src mask));
-  effect(TEMP v0, TEMP v4, TEMP v5, TEMP v6, TEMP v7, TEMP v8, TEMP v9, TEMP v10, TEMP v11,
-         TEMP v12, TEMP v13, TEMP v14, TEMP v15);
-  format %{ "vsetivli x0, 1, e64, m1, tu, mu\t#@expandBitsL\n\t"
-            "vmv.s.x $v0, $src\n\t"
-            "mv t0, 64\n\t"
-            "vsetvli x0, t0, e8, m4, tu, mu\n\t"
-            "vmv.v.i $v4, 0\n\t"
-            "vmerge.vim $v4, $v4, 1, $v0\n\t"
-            "vmv.v.i $v12, 0\n\t"
-            "vsetivli x0, 1, e64, m1, tu, mu\n\t"
-            "vmv.s.x $v0, $mask\n\t"
-            "vsetvli x0, t0, e8, m4, tu, mu\n\t"
-            "viota.m $v8, $v0\n\t"
-            "vrgather.vv $v12, $v4, $v8, $v0.t\n\t"
-            "vmseq.vi $v0, $v12, 1\n\t"
-            "vsetivli x0, 1, e64, m1, tu, mu\n\t"
-            "vmv.x.s $dst, $v0\t#@expandBitsL\n\t"
-          %}
-  ins_encode %{
-    __ expand_bits_l_v(as_Register($dst$$reg), as_Register($src$$reg), as_Register($mask$$reg));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
 // Vector Load Const
 instruct vloadcon(vReg dst, immI0 src) %{
   match(Set dst (VectorLoadConst src));
diff --git a/test/hotspot/jtreg/compiler/intrinsics/TestBitShuffleOpers.java b/test/hotspot/jtreg/compiler/intrinsics/TestBitShuffleOpers.java
index 064ffeb41fb..e94d7bcc95b 100644
--- a/test/hotspot/jtreg/compiler/intrinsics/TestBitShuffleOpers.java
+++ b/test/hotspot/jtreg/compiler/intrinsics/TestBitShuffleOpers.java
@@ -30,8 +30,7 @@
  * @requires (((os.arch=="x86" | os.arch=="amd64" | os.arch=="x86_64") &
  *            (vm.cpu.features ~= ".*bmi2.*" & vm.cpu.features ~= ".*bmi1.*" &
  *             vm.cpu.features ~= ".*sse2.*")) |
- *            (os.arch=="aarch64" & vm.cpu.features ~= ".*svebitperm.*") |
- *            (os.arch=="riscv64" & vm.cpu.features ~= ".*rvv.*"))
+ *            (os.arch=="aarch64" & vm.cpu.features ~= ".*svebitperm.*"))
  * @library /test/lib /
  * @run driver compiler.intrinsics.TestBitShuffleOpers
  */