8366333: AArch64: Enhance SVE subword type implementation of vector compress

Co-authored-by: Jatin Bhateja <jbhateja@openjdk.org>
Reviewed-by: jbhateja, xgong, galder, vlivanov
This commit is contained in:
erifan 2025-10-21 01:20:38 +00:00 committed by Xiaohong Gong
parent 0522cf2ed9
commit 2de8d58552
10 changed files with 408 additions and 115 deletions

View File

@ -7081,29 +7081,31 @@ instruct vcompress(vReg dst, vReg src, pRegGov pg) %{
%}
// Byte vector compress (CompressV, T_BYTE). SVE COMPACT is used on widened
// elements in the macro assembler (sve_compress_byte widens, compacts,
// narrows and splices the halves), so byte compress needs three vector
// temps plus two predicate temps.
instruct vcompressB(vReg dst, vReg src, pReg pg, vReg tmp1, vReg tmp2,
                    vReg tmp3, pReg ptmp, pRegGov pgtmp) %{
  predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_BYTE);
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP ptmp, TEMP pgtmp);
  match(Set dst (CompressV src pg));
  format %{ "vcompressB $dst, $src, $pg\t# KILL $tmp1, $tmp2, $tmp3, $ptmp, $pgtmp" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    __ sve_compress_byte($dst$$FloatRegister, $src$$FloatRegister, $pg$$PRegister,
                         $tmp1$$FloatRegister, $tmp2$$FloatRegister, $tmp3$$FloatRegister,
                         $ptmp$$PRegister, $pgtmp$$PRegister, length_in_bytes);
  %}
  ins_pipe(pipe_slow);
%}
// Short vector compress (CompressV, T_SHORT). tmp1 is pre-zeroed here and
// passed as the "vzr" (all-zero lanes) argument of sve_compress_short.
instruct vcompressS(vReg dst, vReg src, pReg pg, vReg tmp1, vReg tmp2, pRegGov pgtmp) %{
  predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_SHORT);
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP pgtmp);
  match(Set dst (CompressV src pg));
  format %{ "vcompressS $dst, $src, $pg\t# KILL $tmp1, $tmp2, $pgtmp" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    __ sve_dup($tmp1$$FloatRegister, __ H, 0);
    __ sve_compress_short($dst$$FloatRegister, $src$$FloatRegister, $pg$$PRegister,
                          $tmp1$$FloatRegister, $tmp2$$FloatRegister, $pgtmp$$PRegister,
                          length_in_bytes);
  %}
  ins_pipe(pipe_slow);
%}

View File

@ -5069,29 +5069,31 @@ instruct vcompress(vReg dst, vReg src, pRegGov pg) %{
%}
// Byte vector compress (CompressV, T_BYTE). SVE COMPACT is used on widened
// elements in the macro assembler (sve_compress_byte widens, compacts,
// narrows and splices the halves), so byte compress needs three vector
// temps plus two predicate temps.
instruct vcompressB(vReg dst, vReg src, pReg pg, vReg tmp1, vReg tmp2,
                    vReg tmp3, pReg ptmp, pRegGov pgtmp) %{
  predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_BYTE);
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP ptmp, TEMP pgtmp);
  match(Set dst (CompressV src pg));
  format %{ "vcompressB $dst, $src, $pg\t# KILL $tmp1, $tmp2, $tmp3, $ptmp, $pgtmp" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    __ sve_compress_byte($dst$$FloatRegister, $src$$FloatRegister, $pg$$PRegister,
                         $tmp1$$FloatRegister, $tmp2$$FloatRegister, $tmp3$$FloatRegister,
                         $ptmp$$PRegister, $pgtmp$$PRegister, length_in_bytes);
  %}
  ins_pipe(pipe_slow);
%}
// Short vector compress (CompressV, T_SHORT). tmp1 is pre-zeroed here and
// passed as the "vzr" (all-zero lanes) argument of sve_compress_short.
instruct vcompressS(vReg dst, vReg src, pReg pg, vReg tmp1, vReg tmp2, pRegGov pgtmp) %{
  predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_SHORT);
  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP pgtmp);
  match(Set dst (CompressV src pg));
  format %{ "vcompressS $dst, $src, $pg\t# KILL $tmp1, $tmp2, $pgtmp" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    __ sve_dup($tmp1$$FloatRegister, __ H, 0);
    __ sve_compress_short($dst$$FloatRegister, $src$$FloatRegister, $pg$$PRegister,
                          $tmp1$$FloatRegister, $tmp2$$FloatRegister, $pgtmp$$PRegister,
                          length_in_bytes);
  %}
  ins_pipe(pipe_slow);
%}

View File

@ -3486,6 +3486,7 @@ public:
INSN(sve_smaxv, 0b00000100, 0b001000001); // signed maximum reduction to scalar
INSN(sve_smin, 0b00000100, 0b001010000); // signed minimum vectors
INSN(sve_sminv, 0b00000100, 0b001010001); // signed minimum reduction to scalar
INSN(sve_splice,0b00000101, 0b101100100); // splice two vectors under predicate control, destructive
INSN(sve_sub, 0b00000100, 0b000001000); // vector sub
INSN(sve_uaddv, 0b00000100, 0b000001001); // unsigned add reduction to scalar
INSN(sve_umax, 0b00000100, 0b001001000); // unsigned maximum vectors

View File

@ -2203,114 +2203,117 @@ void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t l
// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// Clobbers: rscratch1
// Preserves: src, mask
// Preserves: mask, vzr
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
FloatRegister vtmp1, FloatRegister vtmp2,
PRegister pgtmp) {
FloatRegister vzr, FloatRegister vtmp,
PRegister pgtmp, unsigned vector_length_in_bytes) {
assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
assert_different_registers(dst, src, vtmp1, vtmp2);
// When called by sve_compress_byte, src and vtmp may be the same register.
assert_different_registers(dst, src, vzr);
assert_different_registers(dst, vtmp, vzr);
assert_different_registers(mask, pgtmp);
// Example input: src = 8888 7777 6666 5555 4444 3333 2222 1111
// mask = 0001 0000 0000 0001 0001 0000 0001 0001
// Expected result: dst = 0000 0000 0000 8888 5555 4444 2222 1111
sve_dup(vtmp2, H, 0);
// high <-- low
// Example input: src = hh gg ff ee dd cc bb aa, one character is 8 bits.
// mask = 01 00 00 01 01 00 01 01, one character is 1 bit.
// Expected result: dst = 00 00 00 hh ee dd bb aa
// Extend lowest half to type INT.
// dst = 00004444 00003333 00002222 00001111
// dst = 00dd 00cc 00bb 00aa
sve_uunpklo(dst, S, src);
// pgtmp = 00000001 00000000 00000001 00000001
// pgtmp = 0001 0000 0001 0001
sve_punpklo(pgtmp, mask);
// Pack the active elements in size of type INT to the right,
// and fill the remainings with zero.
// dst = 00000000 00004444 00002222 00001111
// dst = 0000 00dd 00bb 00aa
sve_compact(dst, S, dst, pgtmp);
// Narrow the result back to type SHORT.
// dst = 0000 0000 0000 0000 0000 4444 2222 1111
sve_uzp1(dst, H, dst, vtmp2);
// dst = 00 00 00 00 00 dd bb aa
sve_uzp1(dst, H, dst, vzr);
// Return if the vector length is no more than MaxVectorSize/2, since the
// highest half is invalid.
if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
return;
}
// Count the active elements of lowest half.
// rscratch1 = 3
sve_cntp(rscratch1, S, ptrue, pgtmp);
// Repeat to the highest half.
// pgtmp = 00000001 00000000 00000000 00000001
// pgtmp = 0001 0000 0000 0001
sve_punpkhi(pgtmp, mask);
// vtmp1 = 00008888 00007777 00006666 00005555
sve_uunpkhi(vtmp1, S, src);
// vtmp1 = 00000000 00000000 00008888 00005555
sve_compact(vtmp1, S, vtmp1, pgtmp);
// vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
sve_uzp1(vtmp1, H, vtmp1, vtmp2);
// vtmp = 00hh 00gg 00ff 00ee
sve_uunpkhi(vtmp, S, src);
// vtmp = 0000 0000 00hh 00ee
sve_compact(vtmp, S, vtmp, pgtmp);
// vtmp = 00 00 00 00 00 00 hh ee
sve_uzp1(vtmp, H, vtmp, vzr);
// Compressed low: dst = 0000 0000 0000 0000 0000 4444 2222 1111
// Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
// Left shift(cross lane) compressed high with TRUE_CNT lanes,
// TRUE_CNT is the number of active elements in the compressed low.
neg(rscratch1, rscratch1);
// vtmp2 = {4 3 2 1 0 -1 -2 -3}
sve_index(vtmp2, H, rscratch1, 1);
// vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
sve_tbl(vtmp1, H, vtmp1, vtmp2);
// Combine the compressed high(after shifted) with the compressed low.
// dst = 0000 0000 0000 8888 5555 4444 2222 1111
sve_orr(dst, dst, vtmp1);
// pgtmp = 00 00 00 00 00 01 01 01
sve_whilelt(pgtmp, H, zr, rscratch1);
// Compressed low: dst = 00 00 00 00 00 dd bb aa
// Compressed high: vtmp = 00 00 00 00 00 00 hh ee
// Combine the compressed low with the compressed high:
// dst = 00 00 00 hh ee dd bb aa
sve_splice(dst, H, pgtmp, vtmp);
}
// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
FloatRegister vtmp1, FloatRegister vtmp2,
FloatRegister vtmp3, FloatRegister vtmp4,
PRegister ptmp, PRegister pgtmp) {
FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3);
assert_different_registers(mask, ptmp, pgtmp);
// Example input: src = 88 77 66 55 44 33 22 11
// mask = 01 00 00 01 01 00 01 01
// Expected result: dst = 00 00 00 88 55 44 22 11
// high <-- low
// Example input: src = q p n m l k j i h g f e d c b a, one character is 8 bits.
// mask = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
// Expected result: dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
FloatRegister vzr = vtmp3;
sve_dup(vzr, B, 0);
sve_dup(vtmp4, B, 0);
// Extend lowest half to type SHORT.
// vtmp1 = 0044 0033 0022 0011
// vtmp1 = 0h 0g 0f 0e 0d 0c 0b 0a
sve_uunpklo(vtmp1, H, src);
// ptmp = 0001 0000 0001 0001
// ptmp = 00 01 00 00 00 01 00 01
sve_punpklo(ptmp, mask);
// Pack the active elements in size of type SHORT to the right,
// and fill the remainings with zero.
// dst = 00 00 00 00 00 0g 0c 0a
unsigned extended_size = vector_length_in_bytes << 1;
sve_compress_short(dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
// Narrow the result back to type BYTE.
// dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
sve_uzp1(dst, B, dst, vzr);
// Return if the vector length is no more than MaxVectorSize/2, since the
// highest half is invalid.
if (vector_length_in_bytes <= (MaxVectorSize >> 1)) {
return;
}
// Count the active elements of lowest half.
// rscratch2 = 3
sve_cntp(rscratch2, H, ptrue, ptmp);
// Pack the active elements in size of type SHORT to the right,
// and fill the remainings with zero.
// dst = 0000 0044 0022 0011
sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
// Narrow the result back to type BYTE.
// dst = 00 00 00 00 00 44 22 11
sve_uzp1(dst, B, dst, vtmp4);
// Repeat to the highest half.
// ptmp = 0001 0000 0000 0001
// ptmp = 00 01 00 00 00 00 00 01
sve_punpkhi(ptmp, mask);
// vtmp1 = 0088 0077 0066 0055
// vtmp2 = 0q 0p 0n 0m 0l 0k 0j 0i
sve_uunpkhi(vtmp2, H, src);
// vtmp1 = 0000 0000 0088 0055
sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
// vtmp1 = 00 00 00 00 00 00 0p 0i
sve_compress_short(vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
// vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
sve_uzp1(vtmp1, B, vtmp1, vzr);
sve_dup(vtmp4, B, 0);
// vtmp1 = 00 00 00 00 00 00 88 55
sve_uzp1(vtmp1, B, vtmp1, vtmp4);
// Compressed low: dst = 00 00 00 00 00 44 22 11
// Compressed high: vtmp1 = 00 00 00 00 00 00 88 55
// Left shift(cross lane) compressed high with TRUE_CNT lanes,
// TRUE_CNT is the number of active elements in the compressed low.
neg(rscratch2, rscratch2);
// vtmp2 = {4 3 2 1 0 -1 -2 -3}
sve_index(vtmp2, B, rscratch2, 1);
// vtmp1 = 00 00 00 88 55 00 00 00
sve_tbl(vtmp1, B, vtmp1, vtmp2);
// Combine the compressed high(after shifted) with the compressed low.
// dst = 00 00 00 88 55 44 22 11
sve_orr(dst, dst, vtmp1);
// ptmp = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
sve_whilelt(ptmp, B, zr, rscratch2);
// Compressed low: dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
// Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
// Combine the compressed low with the compressed high:
// dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
sve_splice(dst, B, ptmp, vtmp1);
}
void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {

View File

@ -173,13 +173,12 @@
// lowest-numbered elements of dst. Any remaining elements of dst will
// be filled with zero.
void sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
FloatRegister vtmp1, FloatRegister vtmp2,
FloatRegister vtmp3, FloatRegister vtmp4,
PRegister ptmp, PRegister pgtmp);
FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes);
void sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
FloatRegister vtmp1, FloatRegister vtmp2,
PRegister pgtmp);
FloatRegister vzr, FloatRegister vtmp,
PRegister pgtmp, unsigned vector_length_in_bytes);
void neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ);

View File

@ -2143,6 +2143,10 @@ generate(SpecialCases, [["ccmn", "__ ccmn(zr, zr, 3u, Assembler::LE);",
["facge", "__ sve_fac(Assembler::GE, p1, __ H, p2, z4, z5);", "facge\tp1.h, p2/z, z4.h, z5.h"],
["facge", "__ sve_fac(Assembler::GE, p1, __ S, p2, z4, z5);", "facge\tp1.s, p2/z, z4.s, z5.s"],
["facge", "__ sve_fac(Assembler::GE, p1, __ D, p2, z4, z5);", "facge\tp1.d, p2/z, z4.d, z5.d"],
["splice", "__ sve_splice(z0, __ B, p0, z1);", "splice\tz0.b, p0, z0.b, z1.b"],
["splice", "__ sve_splice(z0, __ H, p0, z1);", "splice\tz0.h, p0, z0.h, z1.h"],
["splice", "__ sve_splice(z0, __ S, p0, z1);", "splice\tz0.s, p0, z0.s, z1.s"],
["splice", "__ sve_splice(z0, __ D, p0, z1);", "splice\tz0.d, p0, z0.d, z1.d"],
# SVE2 instructions
["histcnt", "__ sve_histcnt(z16, __ S, p0, z16, z16);", "histcnt\tz16.s, p0/z, z16.s, z16.s"],
["histcnt", "__ sve_histcnt(z17, __ D, p0, z17, z17);", "histcnt\tz17.d, p0/z, z17.d, z17.d"],

View File

@ -1156,6 +1156,10 @@
__ sve_fac(Assembler::GE, p1, __ H, p2, z4, z5); // facge p1.h, p2/z, z4.h, z5.h
__ sve_fac(Assembler::GE, p1, __ S, p2, z4, z5); // facge p1.s, p2/z, z4.s, z5.s
__ sve_fac(Assembler::GE, p1, __ D, p2, z4, z5); // facge p1.d, p2/z, z4.d, z5.d
__ sve_splice(z0, __ B, p0, z1); // splice z0.b, p0, z0.b, z1.b
__ sve_splice(z0, __ H, p0, z1); // splice z0.h, p0, z0.h, z1.h
__ sve_splice(z0, __ S, p0, z1); // splice z0.s, p0, z0.s, z1.s
__ sve_splice(z0, __ D, p0, z1); // splice z0.d, p0, z0.d, z1.d
__ sve_histcnt(z16, __ S, p0, z16, z16); // histcnt z16.s, p0/z, z16.s, z16.s
__ sve_histcnt(z17, __ D, p0, z17, z17); // histcnt z17.d, p0/z, z17.d, z17.d
@ -1445,30 +1449,30 @@
0x9101a1a0, 0xb10a5cc8, 0xd10810aa, 0xf10fd061,
0x120cb166, 0x321764bc, 0x52174681, 0x720c0227,
0x9241018e, 0xb25a2969, 0xd278b411, 0xf26aad01,
0x14000000, 0x17ffffd7, 0x140004b7, 0x94000000,
0x97ffffd4, 0x940004b4, 0x3400000a, 0x34fffa2a,
0x3400962a, 0x35000008, 0x35fff9c8, 0x350095c8,
0xb400000b, 0xb4fff96b, 0xb400956b, 0xb500001d,
0xb5fff91d, 0xb500951d, 0x10000013, 0x10fff8b3,
0x100094b3, 0x90000013, 0x36300016, 0x3637f836,
0x36309436, 0x3758000c, 0x375ff7cc, 0x375893cc,
0x14000000, 0x17ffffd7, 0x140004bb, 0x94000000,
0x97ffffd4, 0x940004b8, 0x3400000a, 0x34fffa2a,
0x340096aa, 0x35000008, 0x35fff9c8, 0x35009648,
0xb400000b, 0xb4fff96b, 0xb40095eb, 0xb500001d,
0xb5fff91d, 0xb500959d, 0x10000013, 0x10fff8b3,
0x10009533, 0x90000013, 0x36300016, 0x3637f836,
0x363094b6, 0x3758000c, 0x375ff7cc, 0x3758944c,
0x128313a0, 0x528a32c7, 0x7289173b, 0x92ab3acc,
0xd2a0bf94, 0xf2c285e8, 0x9358722f, 0x330e652f,
0x53067f3b, 0x93577c53, 0xb34a1aac, 0xd35a4016,
0x13946c63, 0x93c3dbc8, 0x54000000, 0x54fff5a0,
0x540091a0, 0x54000001, 0x54fff541, 0x54009141,
0x54000002, 0x54fff4e2, 0x540090e2, 0x54000002,
0x54fff482, 0x54009082, 0x54000003, 0x54fff423,
0x54009023, 0x54000003, 0x54fff3c3, 0x54008fc3,
0x54000004, 0x54fff364, 0x54008f64, 0x54000005,
0x54fff305, 0x54008f05, 0x54000006, 0x54fff2a6,
0x54008ea6, 0x54000007, 0x54fff247, 0x54008e47,
0x54000008, 0x54fff1e8, 0x54008de8, 0x54000009,
0x54fff189, 0x54008d89, 0x5400000a, 0x54fff12a,
0x54008d2a, 0x5400000b, 0x54fff0cb, 0x54008ccb,
0x5400000c, 0x54fff06c, 0x54008c6c, 0x5400000d,
0x54fff00d, 0x54008c0d, 0x5400000e, 0x54ffefae,
0x54008bae, 0x5400000f, 0x54ffef4f, 0x54008b4f,
0x54009220, 0x54000001, 0x54fff541, 0x540091c1,
0x54000002, 0x54fff4e2, 0x54009162, 0x54000002,
0x54fff482, 0x54009102, 0x54000003, 0x54fff423,
0x540090a3, 0x54000003, 0x54fff3c3, 0x54009043,
0x54000004, 0x54fff364, 0x54008fe4, 0x54000005,
0x54fff305, 0x54008f85, 0x54000006, 0x54fff2a6,
0x54008f26, 0x54000007, 0x54fff247, 0x54008ec7,
0x54000008, 0x54fff1e8, 0x54008e68, 0x54000009,
0x54fff189, 0x54008e09, 0x5400000a, 0x54fff12a,
0x54008daa, 0x5400000b, 0x54fff0cb, 0x54008d4b,
0x5400000c, 0x54fff06c, 0x54008cec, 0x5400000d,
0x54fff00d, 0x54008c8d, 0x5400000e, 0x54ffefae,
0x54008c2e, 0x5400000f, 0x54ffef4f, 0x54008bcf,
0xd40658e1, 0xd4014d22, 0xd4046543, 0xd4273f60,
0xd44cad80, 0xd503201f, 0xd503203f, 0xd503205f,
0xd503209f, 0xd50320bf, 0xd503219f, 0xd50323bf,
@ -1689,7 +1693,8 @@
0x05a14c00, 0x05e14c00, 0x05304001, 0x05314001,
0x05a18610, 0x05e18610, 0x0420bc31, 0x05271e11,
0x6545e891, 0x6585e891, 0x65c5e891, 0x6545c891,
0x6585c891, 0x65c5c891, 0x45b0c210, 0x45f1c231,
0x6585c891, 0x65c5c891, 0x052c8020, 0x056c8020,
0x05ac8020, 0x05ec8020, 0x45b0c210, 0x45f1c231,
0x1e601000, 0x1e603000, 0x1e621000, 0x1e623000,
0x1e641000, 0x1e643000, 0x1e661000, 0x1e663000,
0x1e681000, 0x1e683000, 0x1e6a1000, 0x1e6a3000,

View File

@ -2840,6 +2840,36 @@ public class IRNode {
vectorNode(EXPAND_BITS_VL, "ExpandBitsV", TYPE_LONG);
}
// IR node name constants for the vector "CompressV" node, one per element
// type. Each is registered via vectorNode(...) so @IR rules can match
// compress operations of a specific element type in C2's ideal graph.
public static final String COMPRESS_VB = VECTOR_PREFIX + "COMPRESS_VB" + POSTFIX;
static {
    vectorNode(COMPRESS_VB, "CompressV", TYPE_BYTE);
}

public static final String COMPRESS_VS = VECTOR_PREFIX + "COMPRESS_VS" + POSTFIX;
static {
    vectorNode(COMPRESS_VS, "CompressV", TYPE_SHORT);
}

public static final String COMPRESS_VI = VECTOR_PREFIX + "COMPRESS_VI" + POSTFIX;
static {
    vectorNode(COMPRESS_VI, "CompressV", TYPE_INT);
}

public static final String COMPRESS_VL = VECTOR_PREFIX + "COMPRESS_VL" + POSTFIX;
static {
    vectorNode(COMPRESS_VL, "CompressV", TYPE_LONG);
}

public static final String COMPRESS_VF = VECTOR_PREFIX + "COMPRESS_VF" + POSTFIX;
static {
    vectorNode(COMPRESS_VF, "CompressV", TYPE_FLOAT);
}

public static final String COMPRESS_VD = VECTOR_PREFIX + "COMPRESS_VD" + POSTFIX;
static {
    vectorNode(COMPRESS_VD, "CompressV", TYPE_DOUBLE);
}
public static final String EXPAND_VB = VECTOR_PREFIX + "EXPAND_VB" + POSTFIX;
static {
vectorNode(EXPAND_VB, "ExpandV", TYPE_BYTE);

View File

@ -106,6 +106,7 @@ public class IREncodingPrinter {
"avx512_fp16",
"avx512_vnni",
"avx512_vbmi",
"avx512_vbmi2",
"avx10_2",
"bmi2",
// AArch64

View File

@ -0,0 +1,246 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package compiler.vectorapi;
import compiler.lib.generators.*;
import compiler.lib.ir_framework.*;
import jdk.incubator.vector.*;
import jdk.test.lib.Asserts;
/**
* @test
* @bug 8366333
* @key randomness
* @library /test/lib /
* @summary IR test for VectorAPI compress
* @modules jdk.incubator.vector
*
* @run driver compiler.vectorapi.VectorCompressTest
*/
// IR and correctness test for the Vector API compress() operation.
// Each @Test method compresses one MAX-species vector under a shared
// boolean mask; the matching @DontInline verifier checks that the active
// lanes were packed to the lowest indices and the remaining lanes are zero.
// @IR rules assert that exactly one CompressV node of the right element
// type is generated on SVE and on the relevant AVX-512 feature sets.
public class VectorCompressTest {
    // MAX-length species so the test exercises the platform's full
    // vector width (full-width SVE / AVX-512 code paths).
    static final VectorSpecies<Byte> B_SPECIES = ByteVector.SPECIES_MAX;
    static final VectorSpecies<Short> S_SPECIES = ShortVector.SPECIES_MAX;
    static final VectorSpecies<Integer> I_SPECIES = IntVector.SPECIES_MAX;
    static final VectorSpecies<Float> F_SPECIES = FloatVector.SPECIES_MAX;
    static final VectorSpecies<Long> L_SPECIES = LongVector.SPECIES_MAX;
    static final VectorSpecies<Double> D_SPECIES = DoubleVector.SPECIES_MAX;

    // Backing-array length; large enough for any species length.
    static final int LENGTH = 512;
    static final Generators RD = Generators.G;

    // Per element type: xa = random input data, xb = compressed output.
    static byte[] ba, bb;
    static short[] sa, sb;
    static int[] ia, ib;
    static long[] la, lb;
    static float[] fa, fb;
    static double[] da, db;
    // Mask lanes shared by all element types.
    static boolean[] ma;

    static {
        ba = new byte[LENGTH];
        bb = new byte[LENGTH];
        sa = new short[LENGTH];
        sb = new short[LENGTH];
        ia = new int[LENGTH];
        ib = new int[LENGTH];
        la = new long[LENGTH];
        lb = new long[LENGTH];
        fa = new float[LENGTH];
        fb = new float[LENGTH];
        da = new double[LENGTH];
        db = new double[LENGTH];
        ma = new boolean[LENGTH];

        Generator<Integer> iGen = RD.ints();
        Generator<Long> lGen = RD.longs();
        Generator<Float> fGen = RD.floats();
        Generator<Double> dGen = RD.doubles();
        // byte/short inputs and the mask are derived from the int generator;
        // the mask is true for (roughly half of) the generated even values.
        for (int i = 0; i < LENGTH; i++) {
            ba[i] = iGen.next().byteValue();
            sa[i] = iGen.next().shortValue();
            ma[i] = iGen.next() % 2 == 0;
        }
        RD.fill(iGen, ia);
        RD.fill(lGen, la);
        RD.fill(fGen, fa);
        RD.fill(dGen, da);
    }

    // Checks bb[0..vlen): active lanes of ba packed left, rest zero.
    @DontInline
    static void verifyVectorCompressByte(int vlen) {
        int index = 0;
        for (int i = 0; i < vlen; i++) {
            if (ma[i]) {
                Asserts.assertEquals(ba[i], bb[index++]);
            }
        }
        for (int i = index; i < vlen; i++) {
            Asserts.assertEquals((byte)0, bb[i]);
        }
    }

    // Checks sb[0..vlen): active lanes of sa packed left, rest zero.
    @DontInline
    static void verifyVectorCompressShort(int vlen) {
        int index = 0;
        for (int i = 0; i < vlen; i++) {
            if (ma[i]) {
                Asserts.assertEquals(sa[i], sb[index++]);
            }
        }
        for (int i = index; i < vlen; i++) {
            Asserts.assertEquals((short)0, sb[i]);
        }
    }

    // Checks ib[0..vlen): active lanes of ia packed left, rest zero.
    @DontInline
    static void verifyVectorCompressInteger(int vlen) {
        int index = 0;
        for (int i = 0; i < vlen; i++) {
            if (ma[i]) {
                Asserts.assertEquals(ia[i], ib[index++]);
            }
        }
        for (int i = index; i < vlen; i++) {
            Asserts.assertEquals(0, ib[i]);
        }
    }

    // Checks lb[0..vlen): active lanes of la packed left, rest zero.
    @DontInline
    static void verifyVectorCompressLong(int vlen) {
        int index = 0;
        for (int i = 0; i < vlen; i++) {
            if (ma[i]) {
                Asserts.assertEquals(la[i], lb[index++]);
            }
        }
        for (int i = index; i < vlen; i++) {
            Asserts.assertEquals(0L, lb[i]);
        }
    }

    // Checks fb[0..vlen): active lanes of fa packed left, rest zero.
    @DontInline
    static void verifyVectorCompressFloat(int vlen) {
        int index = 0;
        for (int i = 0; i < vlen; i++) {
            if (ma[i]) {
                Asserts.assertEquals(fa[i], fb[index++]);
            }
        }
        for (int i = index; i < vlen; i++) {
            Asserts.assertEquals(0.0f, fb[i]);
        }
    }

    // Checks db[0..vlen): active lanes of da packed left, rest zero.
    @DontInline
    static void verifyVectorCompressDouble(int vlen) {
        int index = 0;
        for (int i = 0; i < vlen; i++) {
            if (ma[i]) {
                Asserts.assertEquals(da[i], db[index++]);
            }
        }
        for (int i = index; i < vlen; i++) {
            Asserts.assertEquals(0.0, db[i]);
        }
    }

    // Byte compress: on x86 this needs AVX-512 VBMI2 (byte/short compress).
    @Test
    @IR(counts = { IRNode.COMPRESS_VB, "= 1" },
        applyIfCPUFeature = { "sve", "true" })
    @IR(counts = { IRNode.COMPRESS_VB, "= 1" },
        applyIfCPUFeatureAnd = {"avx512_vbmi2", "true", "avx512vl", "true"})
    public static void testVectorCompressByte() {
        ByteVector av = ByteVector.fromArray(B_SPECIES, ba, 0);
        VectorMask<Byte> m = VectorMask.fromArray(B_SPECIES, ma, 0);
        av.compress(m).intoArray(bb, 0);
        verifyVectorCompressByte(B_SPECIES.length());
    }

    // Short compress: on x86 this needs AVX-512 VBMI2 (byte/short compress).
    @Test
    @IR(counts = { IRNode.COMPRESS_VS, "= 1" },
        applyIfCPUFeature = { "sve", "true" })
    @IR(counts = { IRNode.COMPRESS_VS, "= 1" },
        applyIfCPUFeatureAnd = {"avx512_vbmi2", "true", "avx512vl", "true"})
    public static void testVectorCompressShort() {
        ShortVector av = ShortVector.fromArray(S_SPECIES, sa, 0);
        VectorMask<Short> m = VectorMask.fromArray(S_SPECIES, ma, 0);
        av.compress(m).intoArray(sb, 0);
        verifyVectorCompressShort(S_SPECIES.length());
    }

    @Test
    @IR(counts = { IRNode.COMPRESS_VI, "= 1" },
        applyIfCPUFeature = { "sve", "true" })
    @IR(counts = { IRNode.COMPRESS_VI, "= 1" },
        applyIfCPUFeatureAnd = {"avx512f", "true", "avx512vl", "true"})
    public static void testVectorCompressInt() {
        IntVector av = IntVector.fromArray(I_SPECIES, ia, 0);
        VectorMask<Integer> m = VectorMask.fromArray(I_SPECIES, ma, 0);
        av.compress(m).intoArray(ib, 0);
        verifyVectorCompressInteger(I_SPECIES.length());
    }

    @Test
    @IR(counts = { IRNode.COMPRESS_VL, "= 1" },
        applyIfCPUFeature = { "sve", "true" })
    @IR(counts = { IRNode.COMPRESS_VL, "= 1" },
        applyIfCPUFeatureAnd = {"avx512f", "true", "avx512vl", "true"})
    public static void testVectorCompressLong() {
        LongVector av = LongVector.fromArray(L_SPECIES, la, 0);
        VectorMask<Long> m = VectorMask.fromArray(L_SPECIES, ma, 0);
        av.compress(m).intoArray(lb, 0);
        verifyVectorCompressLong(L_SPECIES.length());
    }

    @Test
    @IR(counts = { IRNode.COMPRESS_VF, "= 1" },
        applyIfCPUFeature = { "sve", "true" })
    @IR(counts = { IRNode.COMPRESS_VF, "= 1" },
        applyIfCPUFeatureAnd = {"avx512f", "true", "avx512vl", "true"})
    public static void testVectorCompressFloat() {
        FloatVector av = FloatVector.fromArray(F_SPECIES, fa, 0);
        VectorMask<Float> m = VectorMask.fromArray(F_SPECIES, ma, 0);
        av.compress(m).intoArray(fb, 0);
        verifyVectorCompressFloat(F_SPECIES.length());
    }

    @Test
    @IR(counts = { IRNode.COMPRESS_VD, "= 1" },
        applyIfCPUFeature = { "sve", "true" })
    @IR(counts = { IRNode.COMPRESS_VD, "= 1" },
        applyIfCPUFeatureAnd = {"avx512f", "true", "avx512vl", "true"})
    public static void testVectorCompressDouble() {
        DoubleVector av = DoubleVector.fromArray(D_SPECIES, da, 0);
        VectorMask<Double> m = VectorMask.fromArray(D_SPECIES, ma, 0);
        av.compress(m).intoArray(db, 0);
        verifyVectorCompressDouble(D_SPECIES.length());
    }

    public static void main(String[] args) {
        // Large warmup so C2 compiles the @Test methods and the @IR rules
        // can be checked against the generated ideal graph.
        TestFramework testFramework = new TestFramework();
        testFramework.setDefaultWarmup(10000)
                     .addFlags("--add-modules=jdk.incubator.vector")
                     .start();
    }
}