From 676e6fd8d5152f4e0d14ae59ddd7aa0a7127ea58 Mon Sep 17 00:00:00 2001 From: Xiaohong Gong Date: Thu, 13 Nov 2025 01:33:21 +0000 Subject: [PATCH] 8367292: VectorAPI: Optimize VectorMask.fromLong/toLong() for SVE Reviewed-by: epeter, psandoz, haosun, sviswanathan --- src/hotspot/cpu/aarch64/aarch64_vector.ad | 67 ++++-- src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 | 67 ++++-- .../cpu/aarch64/c2_MacroAssembler_aarch64.cpp | 218 +++++++++--------- .../cpu/aarch64/c2_MacroAssembler_aarch64.hpp | 20 +- src/hotspot/cpu/arm/arm.ad | 4 + src/hotspot/cpu/ppc/ppc.ad | 4 + src/hotspot/cpu/riscv/riscv_v.ad | 5 + src/hotspot/cpu/s390/s390.ad | 4 + src/hotspot/cpu/x86/x86.ad | 5 + src/hotspot/share/opto/matcher.hpp | 6 + src/hotspot/share/opto/vectorIntrinsics.cpp | 6 +- src/hotspot/share/opto/vectornode.cpp | 16 +- .../compiler/lib/ir_framework/IRNode.java | 10 + .../ir_framework/test/IREncodingPrinter.java | 1 + .../vectorapi/VectorMaskFromLongTest.java | 147 ++++++------ .../vectorapi/VectorMaskToLongTest.java | 146 ++++++++++-- 16 files changed, 470 insertions(+), 256 deletions(-) diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad index 9809d096233..842784d1a29 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector.ad +++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad @@ -393,6 +393,32 @@ source %{ return false; } + bool Matcher::mask_op_prefers_predicate(int opcode, const TypeVect* vt) { + // Only SVE supports the predicate feature. + if (UseSVE == 0) { + // On architectures that do not support predicate, masks are stored in + // general vector registers (TypeVect) with sizes ranging from TypeVectA + // to TypeVectX based on the vector size in bytes. + assert(vt->isa_vectmask() == nullptr, "mask type is not matched"); + return false; + } + + assert(vt->isa_vectmask() != nullptr, "expected TypeVectMask on SVE"); + switch (opcode) { + case Op_VectorMaskToLong: + case Op_VectorLongToMask: + // These operations lack native SVE predicate instructions and are + // implemented using general vector instructions instead. Use vector + // registers rather than predicate registers to save the mask for + // better performance. + return false; + default: + // By default, the mask operations are implemented with predicate + // instructions with a predicate input/output. + return true; + } + } + // Assert that the given node is not a variable shift. bool assert_not_var_shift(const Node* n) { assert(!n->as_ShiftV()->is_var_shift(), "illegal variable shift"); @@ -6249,31 +6275,44 @@ instruct vmask_tolong_neon(iRegLNoSp dst, vReg src) %{ ins_pipe(pipe_slow); %} -instruct vmask_tolong_sve(iRegLNoSp dst, pReg src, vReg tmp1, vReg tmp2) %{ - predicate(UseSVE > 0); +instruct vmask_tolong_sve(iRegLNoSp dst, vReg src, vReg tmp) %{ + predicate(UseSVE > 0 && !VM_Version::supports_svebitperm()); + match(Set dst (VectorMaskToLong src)); + effect(TEMP tmp); + format %{ "vmask_tolong_sve $dst, $src\t# KILL $tmp" %} + ins_encode %{ + // Input "src" is a vector of boolean represented as + // bytes with 0x00/0x01 as element values. 
+ __ sve_vmask_tolong($dst$$Register, $src$$FloatRegister, + $tmp$$FloatRegister, Matcher::vector_length(this, $src)); + %} + ins_pipe(pipe_slow); +%} + +instruct vmask_tolong_sve2(iRegLNoSp dst, vReg src, vReg tmp1, vReg tmp2) %{ + predicate(VM_Version::supports_svebitperm()); match(Set dst (VectorMaskToLong src)); effect(TEMP tmp1, TEMP tmp2); - format %{ "vmask_tolong_sve $dst, $src\t# KILL $tmp1, $tmp2" %} + format %{ "vmask_tolong_sve2 $dst, $src\t# KILL $tmp1, $tmp2" %} ins_encode %{ - __ sve_vmask_tolong($dst$$Register, $src$$PRegister, - Matcher::vector_element_basic_type(this, $src), - Matcher::vector_length(this, $src), - $tmp1$$FloatRegister, $tmp2$$FloatRegister); + // Input "src" is a vector of boolean represented as + // bytes with 0x00/0x01 as element values. + __ sve2_vmask_tolong($dst$$Register, $src$$FloatRegister, + $tmp1$$FloatRegister, $tmp2$$FloatRegister, + Matcher::vector_length(this, $src)); %} ins_pipe(pipe_slow); %} // fromlong -instruct vmask_fromlong(pReg dst, iRegL src, vReg tmp1, vReg tmp2) %{ +instruct vmask_fromlong(vReg dst, iRegL src, vReg tmp) %{ match(Set dst (VectorLongToMask src)); - effect(TEMP tmp1, TEMP tmp2); - format %{ "vmask_fromlong $dst, $src\t# vector (sve2). KILL $tmp1, $tmp2" %} + effect(TEMP_DEF dst, TEMP tmp); + format %{ "vmask_fromlong $dst, $src\t# vector (sve2). KILL $tmp" %} ins_encode %{ - __ sve_vmask_fromlong($dst$$PRegister, $src$$Register, - Matcher::vector_element_basic_type(this), - Matcher::vector_length(this), - $tmp1$$FloatRegister, $tmp2$$FloatRegister); + __ sve_vmask_fromlong($dst$$FloatRegister, $src$$Register, + $tmp$$FloatRegister, Matcher::vector_length(this)); %} ins_pipe(pipe_slow); %} diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 index a9f42e1bc08..dff82ce95ac 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 +++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 @@ -383,6 +383,32 @@ source %{ return false; } + bool Matcher::mask_op_prefers_predicate(int opcode, const TypeVect* vt) { + // Only SVE supports the predicate feature. + if (UseSVE == 0) { + // On architectures that do not support predicate, masks are stored in + // general vector registers (TypeVect) with sizes ranging from TypeVectA + // to TypeVectX based on the vector size in bytes. + assert(vt->isa_vectmask() == nullptr, "mask type is not matched"); + return false; + } + + assert(vt->isa_vectmask() != nullptr, "expected TypeVectMask on SVE"); + switch (opcode) { + case Op_VectorMaskToLong: + case Op_VectorLongToMask: + // These operations lack native SVE predicate instructions and are + // implemented using general vector instructions instead. Use vector + // registers rather than predicate registers to save the mask for + // better performance. + return false; + default: + // By default, the mask operations are implemented with predicate + // instructions with a predicate input/output. + return true; + } + } + // Assert that the given node is not a variable shift. 
bool assert_not_var_shift(const Node* n) { assert(!n->as_ShiftV()->is_var_shift(), "illegal variable shift"); @@ -4303,31 +4329,44 @@ instruct vmask_tolong_neon(iRegLNoSp dst, vReg src) %{ ins_pipe(pipe_slow); %} -instruct vmask_tolong_sve(iRegLNoSp dst, pReg src, vReg tmp1, vReg tmp2) %{ - predicate(UseSVE > 0); +instruct vmask_tolong_sve(iRegLNoSp dst, vReg src, vReg tmp) %{ + predicate(UseSVE > 0 && !VM_Version::supports_svebitperm()); + match(Set dst (VectorMaskToLong src)); + effect(TEMP tmp); + format %{ "vmask_tolong_sve $dst, $src\t# KILL $tmp" %} + ins_encode %{ + // Input "src" is a vector of boolean represented as + // bytes with 0x00/0x01 as element values. + __ sve_vmask_tolong($dst$$Register, $src$$FloatRegister, + $tmp$$FloatRegister, Matcher::vector_length(this, $src)); + %} + ins_pipe(pipe_slow); +%} + +instruct vmask_tolong_sve2(iRegLNoSp dst, vReg src, vReg tmp1, vReg tmp2) %{ + predicate(VM_Version::supports_svebitperm()); match(Set dst (VectorMaskToLong src)); effect(TEMP tmp1, TEMP tmp2); - format %{ "vmask_tolong_sve $dst, $src\t# KILL $tmp1, $tmp2" %} + format %{ "vmask_tolong_sve2 $dst, $src\t# KILL $tmp1, $tmp2" %} ins_encode %{ - __ sve_vmask_tolong($dst$$Register, $src$$PRegister, - Matcher::vector_element_basic_type(this, $src), - Matcher::vector_length(this, $src), - $tmp1$$FloatRegister, $tmp2$$FloatRegister); + // Input "src" is a vector of boolean represented as + // bytes with 0x00/0x01 as element values. + __ sve2_vmask_tolong($dst$$Register, $src$$FloatRegister, + $tmp1$$FloatRegister, $tmp2$$FloatRegister, + Matcher::vector_length(this, $src)); %} ins_pipe(pipe_slow); %} // fromlong -instruct vmask_fromlong(pReg dst, iRegL src, vReg tmp1, vReg tmp2) %{ +instruct vmask_fromlong(vReg dst, iRegL src, vReg tmp) %{ match(Set dst (VectorLongToMask src)); - effect(TEMP tmp1, TEMP tmp2); - format %{ "vmask_fromlong $dst, $src\t# vector (sve2). KILL $tmp1, $tmp2" %} + effect(TEMP_DEF dst, TEMP tmp); + format %{ "vmask_fromlong $dst, $src\t# vector (sve2). KILL $tmp" %} ins_encode %{ - __ sve_vmask_fromlong($dst$$PRegister, $src$$Register, - Matcher::vector_element_basic_type(this), - Matcher::vector_length(this), - $tmp1$$FloatRegister, $tmp2$$FloatRegister); + __ sve_vmask_fromlong($dst$$FloatRegister, $src$$Register, + $tmp$$FloatRegister, Matcher::vector_length(this)); %} ins_pipe(pipe_slow); %} diff --git a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp index ebb4a897906..5e57044dcba 100644 --- a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp @@ -1399,137 +1399,125 @@ void C2_MacroAssembler::bytemask_compress(Register dst) { andr(dst, dst, 0xff); // dst = 0x8D } -// Pack the lowest-numbered bit of each mask element in src into a long value -// in dst, at most the first 64 lane elements. -// Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM. -void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt, - FloatRegister vtmp1, FloatRegister vtmp2) { +// Pack the value of each mask element in "src" into a long value in "dst", at most +// the first 64 lane elements. The input "src" is a vector of boolean represented as +// bytes with 0x00/0x01 as element values. Each lane value from "src" is packed into +// one bit in "dst". 
+// +// Example: src = 0x0001010000010001 0100000001010001, lane_cnt = 16 +// Expected: dst = 0x658D +// +// Clobbers: rscratch1 +void C2_MacroAssembler::sve_vmask_tolong(Register dst, FloatRegister src, + FloatRegister vtmp, int lane_cnt) { assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count"); assert_different_registers(dst, rscratch1); - assert_different_registers(vtmp1, vtmp2); + assert_different_registers(src, vtmp); + assert(UseSVE > 0, "must be"); - Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); - // Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16 - // Expected: dst = 0x658D + // Compress the lowest 8 bytes. + fmovd(dst, src); + bytemask_compress(dst); + if (lane_cnt <= 8) return; - // Convert the mask into vector with sequential bytes. - // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001 - sve_cpy(vtmp1, size, src, 1, false); - if (bt != T_BYTE) { - sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2); - } - - if (UseSVE > 1 && VM_Version::supports_svebitperm()) { - // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea - // is to compress each significant bit of the byte in a cross-lane way. Due - // to the lack of a cross-lane bit-compress instruction, we use BEXT - // (bit-compress in each lane) with the biggest lane size (T = D) then - // concatenate the results. - - // The second source input of BEXT, initialized with 0x01 in each byte. - // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 - sve_dup(vtmp2, B, 1); - - // BEXT vtmp1.D, vtmp1.D, vtmp2.D - // vtmp1 = 0x0001010000010001 | 0x0100000001010001 - // vtmp2 = 0x0101010101010101 | 0x0101010101010101 - // --------------------------------------- - // vtmp1 = 0x0000000000000065 | 0x000000000000008D - sve_bext(vtmp1, D, vtmp1, vtmp2); - - // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the - // result to dst. - // vtmp1 = 0x0000000000000000 | 0x000000000000658D - // dst = 0x658D - if (lane_cnt <= 8) { - // No need to concatenate. - umov(dst, vtmp1, B, 0); - } else if (lane_cnt <= 16) { - ins(vtmp1, B, vtmp1, 1, 8); - umov(dst, vtmp1, H, 0); - } else { - // As the lane count is 64 at most, the final expected value must be in - // the lowest 64 bits after narrowing vtmp1 from D to B. - sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2); - umov(dst, vtmp1, D, 0); - } - } else if (UseSVE > 0) { - // Compress the lowest 8 bytes. - fmovd(dst, vtmp1); - bytemask_compress(dst); - if (lane_cnt <= 8) return; - - // Repeat on higher bytes and join the results. - // Compress 8 bytes in each iteration. - for (int idx = 1; idx < (lane_cnt / 8); idx++) { - sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2); - bytemask_compress(rscratch1); - orr(dst, dst, rscratch1, Assembler::LSL, idx << 3); - } - } else { - assert(false, "unsupported"); - ShouldNotReachHere(); + // Repeat on higher bytes and join the results. + // Compress 8 bytes in each iteration. + for (int idx = 1; idx < (lane_cnt / 8); idx++) { + sve_extract_integral(rscratch1, T_LONG, src, idx, vtmp); + bytemask_compress(rscratch1); + orr(dst, dst, rscratch1, Assembler::LSL, idx << 3); } } -// Unpack the mask, a long value in src, into predicate register dst based on the -// corresponding data type. Note that dst can support at most 64 lanes. -// Below example gives the expected dst predicate register in different types, with -// a valid src(0x658D) on a 1024-bit vector size machine. 
-// BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
-// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
-// INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
-// LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
-//
-// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
-// has 24 significant bits would be an invalid input if dst predicate register refers to
-// a LONG type 1024-bit vector, which has at most 16 lanes.
-void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
-                                           FloatRegister vtmp1, FloatRegister vtmp2) {
-  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
-         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
-  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
-  // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
-  // Expected: dst = 0b01101001 10001101
+// This function is the same as "sve_vmask_tolong" above, but it uses SVE2's BEXT
+// instruction, which requires the FEAT_BITPERM feature.
+void C2_MacroAssembler::sve2_vmask_tolong(Register dst, FloatRegister src,
+                                          FloatRegister vtmp1, FloatRegister vtmp2,
+                                          int lane_cnt) {
+  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
+  assert_different_registers(src, vtmp1, vtmp2);
+  assert(UseSVE > 1 && VM_Version::supports_svebitperm(), "must be");
 
-  // Put long value from general purpose register into the first lane of vector.
-  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
-  sve_dup(vtmp1, B, 0);
-  mov(vtmp1, D, 0, src);
+  // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
+  // is to compress each significant bit of the byte in a cross-lane way. Due
+  // to the lack of a cross-lane bit-compress instruction, we use BEXT
+  // (bit-compress in each lane) with the biggest lane size (T = D) then
+  // concatenate the results.
 
-  // As sve_cmp generates mask value with the minimum unit in byte, we should
-  // transform the value in the first lane which is mask in bit now to the
-  // mask in byte, which can be done by SVE2's BDEP instruction.
-
-  // The first source input of BDEP instruction. Deposite each byte in every 8 bytes.
-  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
-  if (lane_cnt <= 8) {
-    // Nothing. As only one byte exsits.
-  } else if (lane_cnt <= 16) {
-    ins(vtmp1, B, vtmp1, 8, 1);
-    mov(vtmp1, B, 1, zr);
-  } else {
-    sve_vector_extend(vtmp1, D, vtmp1, B);
-  }
-
-  // The second source input of BDEP instruction, initialized with 0x01 for each byte.
+  // The second source input of BEXT, initialized with 0x01 in each byte.
   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
   sve_dup(vtmp2, B, 1);
 
-  // BDEP vtmp1.D, vtmp1.D, vtmp2.D
-  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
+  // BEXT vtmp1.D, src.D, vtmp2.D
+  // src   = 0x0001010000010001 | 0x0100000001010001
   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
   // ---------------------------------------
-  // vtmp1 = 0x0001010000010001 | 0x0100000001010001
-  sve_bdep(vtmp1, D, vtmp1, vtmp2);
+  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
+  sve_bext(vtmp1, D, src, vtmp2);
 
-  if (bt != T_BYTE) {
-    sve_vector_extend(vtmp1, size, vtmp1, B);
+  // Concatenate the least significant 8 bits of each 8-byte group, and extract
+  // the result to dst.
+  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
+  // dst   = 0x658D
+  if (lane_cnt <= 8) {
+    // No need to concatenate.
+    umov(dst, vtmp1, B, 0);
+  } else if (lane_cnt <= 16) {
+    ins(vtmp1, B, vtmp1, 1, 8);
+    umov(dst, vtmp1, H, 0);
+  } else {
+    // As the lane count is 64 at most, the final expected value must be in
+    // the lowest 64 bits after narrowing vtmp1 from D to B.
+    sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
+    umov(dst, vtmp1, D, 0);
   }
+}
+
+// Unpack the mask, a long value in "src", into a vector register of boolean
+// represented as bytes with 0x00/0x01 as element values in "dst". Each bit in
+// "src" is unpacked into one byte lane in "dst". Note that "dst" can support at
+// most 64 lanes.
+//
+// The example below gives the expected dst vector register, with a valid
+// src (0x658D) on a 128-bit vector size machine.
+// dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
+void C2_MacroAssembler::sve_vmask_fromlong(FloatRegister dst, Register src,
+                                           FloatRegister vtmp, int lane_cnt) {
+  assert_different_registers(dst, vtmp);
+  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
+         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
+
+  // Example:  src = 0x658D, lane_cnt = 16
+  // Expected: dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
+
+  // Put the long value from the general purpose register into the first lane
+  // of the vector.
+  // vtmp = 0x0000000000000000 | 0x000000000000658D
+  sve_dup(vtmp, B, 0);
+  mov(vtmp, D, 0, src);
+
+  // Transform the value in the first lane from a mask in bits to a mask in
+  // bytes, which can be done with SVE2's BDEP instruction.
+
+  // The first source input of the BDEP instruction. Deposit one byte into each
+  // group of 8 bytes.
+  // vtmp = 0x0000000000000065 | 0x000000000000008D
+  if (lane_cnt <= 8) {
+    // Nothing to do, as only one byte exists.
+  } else if (lane_cnt <= 16) {
+    ins(vtmp, B, vtmp, 8, 1);
+  } else {
+    sve_vector_extend(vtmp, D, vtmp, B);
+  }
+
+  // The second source input of the BDEP instruction, initialized with 0x01 for
+  // each byte.
+  // dst = 0x01010101 0x01010101 0x01010101 0x01010101
+  sve_dup(dst, B, 1);
+
+  // BDEP dst.D, vtmp.D, dst.D
+  // vtmp = 0x0000000000000065 | 0x000000000000008D
+  // dst  = 0x0101010101010101 | 0x0101010101010101
+  // ---------------------------------------
+  // dst  = 0x0001010000010001 | 0x0100000001010001
+  sve_bdep(dst, D, vtmp, dst);
 }
 
 // Clobbers: rflags
diff --git a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
index ccd091938a3..412f0f37e9e 100644
--- a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
@@ -85,15 +85,19 @@
   // the higher garbage bits.
   void bytemask_compress(Register dst);
 
-  // Pack the lowest-numbered bit of each mask element in src into a long value
-  // in dst, at most the first 64 lane elements.
-  void sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
-                        FloatRegister vtmp1, FloatRegister vtmp2);
+  // Pack the value of each mask element in "src" into a long value in "dst", at most the
+  // first 64 lane elements. The input "src" is a vector of boolean represented as bytes
+  // with 0x00/0x01 as element values. Each lane value from "src" is packed into one bit in
+  // "dst".
+ void sve_vmask_tolong(Register dst, FloatRegister src, FloatRegister vtmp, int lane_cnt); - // Unpack the mask, a long value in src, into predicate register dst based on the - // corresponding data type. Note that dst can support at most 64 lanes. - void sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt, - FloatRegister vtmp1, FloatRegister vtmp2); + void sve2_vmask_tolong(Register dst, FloatRegister src, FloatRegister vtmp1, + FloatRegister vtmp2, int lane_cnt); + + // Unpack the mask, a long value in "src", into vector register "dst" with boolean type. + // Each bit in "src" is unpacked into one byte lane in "dst". Note that "dst" can support + // at most 64 lanes. + void sve_vmask_fromlong(FloatRegister dst, Register src, FloatRegister vtmp, int lane_cnt); // SIMD&FP comparison void neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1, diff --git a/src/hotspot/cpu/arm/arm.ad b/src/hotspot/cpu/arm/arm.ad index 31a442be624..92c0df68deb 100644 --- a/src/hotspot/cpu/arm/arm.ad +++ b/src/hotspot/cpu/arm/arm.ad @@ -1003,6 +1003,10 @@ bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen return false; } +bool Matcher::mask_op_prefers_predicate(int opcode, const TypeVect* vt) { + return false; +} + const RegMask* Matcher::predicate_reg_mask(void) { return nullptr; } diff --git a/src/hotspot/cpu/ppc/ppc.ad b/src/hotspot/cpu/ppc/ppc.ad index 36326e5fdb7..7fcd096d2ad 100644 --- a/src/hotspot/cpu/ppc/ppc.ad +++ b/src/hotspot/cpu/ppc/ppc.ad @@ -2292,6 +2292,10 @@ bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen return false; } +bool Matcher::mask_op_prefers_predicate(int opcode, const TypeVect* vt) { + return false; +} + const RegMask* Matcher::predicate_reg_mask(void) { return nullptr; } diff --git a/src/hotspot/cpu/riscv/riscv_v.ad b/src/hotspot/cpu/riscv/riscv_v.ad index fe323474d60..d162280106a 100644 --- a/src/hotspot/cpu/riscv/riscv_v.ad +++ b/src/hotspot/cpu/riscv/riscv_v.ad @@ -164,6 +164,11 @@ source %{ bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen) { return false; } + + bool Matcher::mask_op_prefers_predicate(int opcode, const TypeVect* vt) { + // Prefer predicate if the mask type is "TypeVectMask". + return vt->isa_vectmask() != nullptr; + } %} // All VEC instructions diff --git a/src/hotspot/cpu/s390/s390.ad b/src/hotspot/cpu/s390/s390.ad index 2b2ce713491..cab3965ecfa 100644 --- a/src/hotspot/cpu/s390/s390.ad +++ b/src/hotspot/cpu/s390/s390.ad @@ -1809,6 +1809,10 @@ bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen return false; } +bool Matcher::mask_op_prefers_predicate(int opcode, const TypeVect* vt) { + return false; +} + const RegMask* Matcher::predicate_reg_mask(void) { return nullptr; } diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad index 9a0bbdc27a0..a9748617e1f 100644 --- a/src/hotspot/cpu/x86/x86.ad +++ b/src/hotspot/cpu/x86/x86.ad @@ -3736,6 +3736,11 @@ bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen } } +bool Matcher::mask_op_prefers_predicate(int opcode, const TypeVect* vt) { + // Prefer predicate if the mask type is "TypeVectMask". 
+ return vt->isa_vectmask() != nullptr; +} + MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) { assert(Matcher::is_generic_vector(generic_opnd), "not generic"); bool legacy = (generic_opnd->opcode() == LEGVEC); diff --git a/src/hotspot/share/opto/matcher.hpp b/src/hotspot/share/opto/matcher.hpp index e4396b423ac..01f11b1fdc9 100644 --- a/src/hotspot/share/opto/matcher.hpp +++ b/src/hotspot/share/opto/matcher.hpp @@ -333,6 +333,12 @@ public: static bool vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen); + // Identify if a vector mask operation prefers the input/output mask to be + // saved with a predicate type or not. + // - Return true if it prefers a predicate type (i.e. TypeVectMask). + // - Return false if it prefers a general vector type (i.e. TypeVectA to TypeVectZ). + static bool mask_op_prefers_predicate(int opcode, const TypeVect* vt); + static const RegMask* predicate_reg_mask(void); // Vector width in bytes diff --git a/src/hotspot/share/opto/vectorIntrinsics.cpp b/src/hotspot/share/opto/vectorIntrinsics.cpp index 85d9790c0eb..b48b5f2cd05 100644 --- a/src/hotspot/share/opto/vectorIntrinsics.cpp +++ b/src/hotspot/share/opto/vectorIntrinsics.cpp @@ -622,7 +622,7 @@ bool LibraryCallKit::inline_vector_mask_operation() { return false; } - if (mask_vec->bottom_type()->isa_vectmask() == nullptr) { + if (!Matcher::mask_op_prefers_predicate(mopc, mask_vec->bottom_type()->is_vect())) { mask_vec = gvn().transform(VectorStoreMaskNode::make(gvn(), mask_vec, elem_bt, num_elem)); } const Type* maskoper_ty = mopc == Op_VectorMaskToLong ? (const Type*)TypeLong::LONG : (const Type*)TypeInt::INT; @@ -708,7 +708,7 @@ bool LibraryCallKit::inline_vector_frombits_coerced() { if (opc == Op_VectorLongToMask) { const TypeVect* vt = TypeVect::makemask(elem_bt, num_elem); - if (vt->isa_vectmask()) { + if (Matcher::mask_op_prefers_predicate(opc, vt)) { broadcast = gvn().transform(new VectorLongToMaskNode(elem, vt)); } else { const TypeVect* mvt = TypeVect::make(T_BOOLEAN, num_elem); @@ -2545,7 +2545,7 @@ bool LibraryCallKit::inline_vector_extract() { return false; } // VectorMaskToLongNode requires the input is either a mask or a vector with BOOLEAN type. - if (opd->bottom_type()->isa_vectmask() == nullptr) { + if (!Matcher::mask_op_prefers_predicate(Op_VectorMaskToLong, opd->bottom_type()->is_vect())) { opd = gvn().transform(VectorStoreMaskNode::make(gvn(), opd, elem_bt, num_elem)); } // ((toLong() >>> pos) & 1L diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index 6ae8bbe8aa0..a49f3d24fd4 100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -1403,7 +1403,7 @@ Node* ReductionNode::Ideal(PhaseGVN* phase, bool can_reshape) { } // Convert fromLong to maskAll if the input sets or unsets all lanes. -Node* convertFromLongToMaskAll(PhaseGVN* phase, const TypeLong* bits_type, bool is_mask, const TypeVect* vt) { +static Node* convertFromLongToMaskAll(PhaseGVN* phase, const TypeLong* bits_type, const TypeVect* vt) { uint vlen = vt->length(); BasicType bt = vt->element_basic_type(); // The "maskAll" API uses the corresponding integer types for floating-point data. 
@@ -1418,7 +1418,7 @@ Node* convertFromLongToMaskAll(PhaseGVN* phase, const TypeLong* bits_type, bool } else { con = phase->intcon(con_value); } - Node* res = VectorNode::scalar2vector(con, vlen, maskall_bt, is_mask); + Node* res = VectorNode::scalar2vector(con, vlen, maskall_bt, vt->isa_vectmask() != nullptr); // Convert back to the original floating-point data type. if (is_floating_point_type(bt)) { res = new VectorMaskCastNode(phase->transform(res), vt); @@ -1432,7 +1432,7 @@ Node* VectorLoadMaskNode::Ideal(PhaseGVN* phase, bool can_reshape) { // VectorLoadMask(VectorLongToMask(-1/0)) => Replicate(-1/0) if (in(1)->Opcode() == Op_VectorLongToMask) { const TypeVect* vt = bottom_type()->is_vect(); - Node* res = convertFromLongToMaskAll(phase, in(1)->in(1)->bottom_type()->isa_long(), false, vt); + Node* res = convertFromLongToMaskAll(phase, in(1)->in(1)->bottom_type()->isa_long(), vt); if (res != nullptr) { return res; } @@ -1900,10 +1900,12 @@ Node* VectorMaskCastNode::Identity(PhaseGVN* phase) { // l is -1 or 0. Node* VectorMaskToLongNode::Ideal_MaskAll(PhaseGVN* phase) { Node* in1 = in(1); - // VectorMaskToLong follows a VectorStoreMask if predicate is not supported. + // VectorMaskToLong follows a VectorStoreMask if it doesn't require the mask + // saved with a predicate type. if (in1->Opcode() == Op_VectorStoreMask) { - assert(!in1->in(1)->bottom_type()->isa_vectmask(), "sanity"); - in1 = in1->in(1); + Node* mask = in1->in(1); + assert(!Matcher::mask_op_prefers_predicate(Opcode(), mask->bottom_type()->is_vect()), "sanity"); + in1 = mask; } if (VectorNode::is_all_ones_vector(in1)) { int vlen = in1->bottom_type()->is_vect()->length(); @@ -1960,7 +1962,7 @@ Node* VectorLongToMaskNode::Ideal(PhaseGVN* phase, bool can_reshape) { // VectorLongToMask(-1/0) => MaskAll(-1/0) const TypeLong* bits_type = in(1)->bottom_type()->isa_long(); if (bits_type && is_mask) { - Node* res = convertFromLongToMaskAll(phase, bits_type, true, dst_type); + Node* res = convertFromLongToMaskAll(phase, bits_type, dst_type); if (res != nullptr) { return res; } diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java index 80429ad868a..25ebcc94844 100644 --- a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java +++ b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java @@ -2060,6 +2060,16 @@ public class IRNode { beforeMatchingNameRegex(STORE_VECTOR_SCATTER_MASKED, "StoreVectorScatterMasked"); } + public static final String VECTOR_LOAD_MASK = PREFIX + "VECTOR_LOAD_MASK" + POSTFIX; + static { + beforeMatchingNameRegex(VECTOR_LOAD_MASK, "VectorLoadMask"); + } + + public static final String VECTOR_STORE_MASK = PREFIX + "VECTOR_STORE_MASK" + POSTFIX; + static { + beforeMatchingNameRegex(VECTOR_STORE_MASK, "VectorStoreMask"); + } + public static final String SUB = PREFIX + "SUB" + POSTFIX; static { beforeMatchingNameRegex(SUB, "Sub(I|L|F|D|HF)"); diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/test/IREncodingPrinter.java b/test/hotspot/jtreg/compiler/lib/ir_framework/test/IREncodingPrinter.java index daa2b9765f8..a24cfbd3e37 100644 --- a/test/hotspot/jtreg/compiler/lib/ir_framework/test/IREncodingPrinter.java +++ b/test/hotspot/jtreg/compiler/lib/ir_framework/test/IREncodingPrinter.java @@ -114,6 +114,7 @@ public class IREncodingPrinter { "asimd", "sve", "sve2", + "svebitperm", "fphp", "asimdhp", // RISCV64 diff --git a/test/hotspot/jtreg/compiler/vectorapi/VectorMaskFromLongTest.java 
b/test/hotspot/jtreg/compiler/vectorapi/VectorMaskFromLongTest.java index eaa6211efc5..c4feb97ebf3 100644 --- a/test/hotspot/jtreg/compiler/vectorapi/VectorMaskFromLongTest.java +++ b/test/hotspot/jtreg/compiler/vectorapi/VectorMaskFromLongTest.java @@ -22,14 +22,14 @@ */ /* -* @test -* @bug 8356760 8367391 -* @library /test/lib / -* @summary Optimize VectorMask.fromLong for all-true/all-false cases -* @modules jdk.incubator.vector -* -* @run driver compiler.vectorapi.VectorMaskFromLongTest -*/ + * @test + * @bug 8356760 8367391 8367292 + * @library /test/lib / + * @summary IR test for VectorMask.fromLong() + * @modules jdk.incubator.vector + * + * @run driver compiler.vectorapi.VectorMaskFromLongTest + */ package compiler.vectorapi; @@ -47,11 +47,6 @@ public class VectorMaskFromLongTest { static boolean[] mr = new boolean[B_SPECIES.length()]; - @ForceInline - public static void maskFromLongKernel(VectorSpecies species, long inputLong) { - VectorMask.fromLong(species, inputLong).intoArray(mr, 0); - } - @DontInline public static void verifyMaskFromLong(VectorSpecies species, long inputLong) { for (int i = 0; i < species.length(); i++) { @@ -63,9 +58,11 @@ public class VectorMaskFromLongTest { } } + // Tests for "VectorLongToMask(-1/0) => MaskAll(-1/0)" + @ForceInline - public static void testMaskFromLong(VectorSpecies species, long inputLong ) { - maskFromLongKernel(species, inputLong); + public static void fromLongMaskAllKernel(VectorSpecies species, long inputLong ) { + VectorMask.fromLong(species, inputLong).intoArray(mr, 0); verifyMaskFromLong(species, inputLong); } @@ -73,16 +70,16 @@ public class VectorMaskFromLongTest { public static void testMaskFromLongMaskAll(VectorSpecies species) { int vlen = species.length(); long inputLong = 0L; - testMaskFromLong(species, inputLong); + fromLongMaskAllKernel(species, inputLong); inputLong = vlen >= 64 ? 0L : (0x1L << vlen); - testMaskFromLong(species, inputLong); + fromLongMaskAllKernel(species, inputLong); inputLong = -1L; - testMaskFromLong(species, inputLong); + fromLongMaskAllKernel(species, inputLong); inputLong = (-1L >>> (64 - vlen)); - testMaskFromLong(species, inputLong); + fromLongMaskAllKernel(species, inputLong); } @Test @@ -169,102 +166,104 @@ public class VectorMaskFromLongTest { testMaskFromLongMaskAll(D_SPECIES); } - // Tests for general input long values + // Tests for general input long values. The purpose is to test the IRs + // for API VectorMask.fromLong(). To avoid any IR being optimized out by + // compiler, we insert a VectorMask.not() after fromLong(). 
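For reference, the expected lane values checked by the kernel below can be modeled with plain scalar code. This is a minimal sketch, not part of the patch, and the helper name is illustrative only:

    // Scalar model of VectorMask.fromLong(species, x).not(): lane i holds
    // bit i of the bitwise complement of x, for 0 <= i < species.length().
    static boolean expectedLane(long inputLong, int i) {
        long complement = inputLong ^ -1L;      // what not() applies lane-wise
        return ((complement >>> i) & 1L) == 1L; // lane i <- bit i
    }

This is why the kernel below verifies against "inputLong ^ -1L".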
+ + @ForceInline + public static void fromLongGeneralKernel(VectorSpecies species, long inputLong) { + VectorMask.fromLong(species, inputLong).not().intoArray(mr, 0); + verifyMaskFromLong(species, inputLong ^ -1L); + } + + @ForceInline + public static void testMaskFromLongGeneral(VectorSpecies species) { + fromLongGeneralKernel(species, (-1L >>> (64 - species.length())) - 1); + fromLongGeneralKernel(species, (-1L >>> (64 - species.length())) >>> 1); + } @Test - @IR(counts = { IRNode.MASK_ALL, "= 0", + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 0", IRNode.VECTOR_LONG_TO_MASK, "= 2" }, - applyIfCPUFeatureOr = { "sve2", "true", "avx512", "true", "rvv", "true" }) - @IR(counts = { IRNode.REPLICATE_B, "= 0", - IRNode.VECTOR_LONG_TO_MASK, "= 0" }, - applyIfCPUFeatureAnd = { "asimd", "true", "sve", "false" }) - @IR(counts = { IRNode.REPLICATE_B, "= 0", + applyIfCPUFeatureOr = { "avx512", "true", "rvv", "true" }) + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 2", IRNode.VECTOR_LONG_TO_MASK, "= 2" }, applyIfCPUFeatureAnd = { "avx2", "true", "avx512", "false" }) + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 2", + IRNode.VECTOR_LONG_TO_MASK, "= 2" }, + applyIfCPUFeature = { "svebitperm", "true" }) public static void testMaskFromLongByte() { - // Test cases where some but not all bits are set. - testMaskFromLong(B_SPECIES, (-1L >>> (64 - B_SPECIES.length())) - 1); - testMaskFromLong(B_SPECIES, (-1L >>> (64 - B_SPECIES.length())) >>> 1); + testMaskFromLongGeneral(B_SPECIES); } @Test - @IR(counts = { IRNode.MASK_ALL, "= 0", + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 0", IRNode.VECTOR_LONG_TO_MASK, "= 2" }, - applyIfCPUFeatureOr = { "sve2", "true", "avx512", "true", "rvv", "true" }) - @IR(counts = { IRNode.REPLICATE_S, "= 0", - IRNode.VECTOR_LONG_TO_MASK, "= 0" }, - applyIfCPUFeatureAnd = { "asimd", "true", "sve", "false" }) - @IR(counts = { IRNode.REPLICATE_S, "= 0", + applyIfCPUFeatureOr = { "avx512", "true", "rvv", "true" }) + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 2", IRNode.VECTOR_LONG_TO_MASK, "= 2" }, applyIfCPUFeatureAnd = { "avx2", "true", "avx512", "false" }) + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 2", + IRNode.VECTOR_LONG_TO_MASK, "= 2" }, + applyIfCPUFeature = { "svebitperm", "true" }) public static void testMaskFromLongShort() { - // Test cases where some but not all bits are set. - testMaskFromLong(S_SPECIES, (-1L >>> (64 - S_SPECIES.length())) - 1); - testMaskFromLong(S_SPECIES, (-1L >>> (64 - S_SPECIES.length())) >>> 1); + testMaskFromLongGeneral(S_SPECIES); } @Test - @IR(counts = { IRNode.MASK_ALL, "= 0", + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 0", IRNode.VECTOR_LONG_TO_MASK, "= 2" }, - applyIfCPUFeatureOr = { "sve2", "true", "avx512", "true", "rvv", "true" }) - @IR(counts = { IRNode.REPLICATE_I, "= 0", - IRNode.VECTOR_LONG_TO_MASK, "= 0" }, - applyIfCPUFeatureAnd = { "asimd", "true", "sve", "false" }) - @IR(counts = { IRNode.REPLICATE_I, "= 0", + applyIfCPUFeatureOr = { "avx512", "true", "rvv", "true" }) + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 2", IRNode.VECTOR_LONG_TO_MASK, "= 2" }, applyIfCPUFeatureAnd = { "avx2", "true", "avx512", "false" }) + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 2", + IRNode.VECTOR_LONG_TO_MASK, "= 2" }, + applyIfCPUFeature = { "svebitperm", "true" }) public static void testMaskFromLongInt() { - // Test cases where some but not all bits are set. 
- testMaskFromLong(I_SPECIES, (-1L >>> (64 - I_SPECIES.length())) - 1); - testMaskFromLong(I_SPECIES, (-1L >>> (64 - I_SPECIES.length())) >>> 1); + testMaskFromLongGeneral(I_SPECIES); } @Test - @IR(counts = { IRNode.MASK_ALL, "= 0", + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 0", IRNode.VECTOR_LONG_TO_MASK, "= 2" }, - applyIfCPUFeatureOr = { "sve2", "true", "avx512", "true", "rvv", "true" }) - @IR(counts = { IRNode.REPLICATE_L, "= 0", - IRNode.VECTOR_LONG_TO_MASK, "= 0" }, - applyIfCPUFeatureAnd = { "asimd", "true", "sve", "false" }) - @IR(counts = { IRNode.REPLICATE_L, "= 0", + applyIfCPUFeatureOr = { "avx512", "true", "rvv", "true" }) + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 2", IRNode.VECTOR_LONG_TO_MASK, "= 2" }, applyIfCPUFeatureAnd = { "avx2", "true", "avx512", "false" }) + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 2", + IRNode.VECTOR_LONG_TO_MASK, "= 2" }, + applyIfCPUFeature = { "svebitperm", "true" }) public static void testMaskFromLongLong() { - // Test cases where some but not all bits are set. - testMaskFromLong(L_SPECIES, (-1L >>> (64 - L_SPECIES.length())) - 1); - testMaskFromLong(L_SPECIES, (-1L >>> (64 - L_SPECIES.length())) >>> 1); + testMaskFromLongGeneral(L_SPECIES); } @Test - @IR(counts = { IRNode.MASK_ALL, "= 0", + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 0", IRNode.VECTOR_LONG_TO_MASK, "= 2" }, - applyIfCPUFeatureOr = { "sve2", "true", "avx512", "true", "rvv", "true" }) - @IR(counts = { IRNode.REPLICATE_I, "= 0", - IRNode.VECTOR_LONG_TO_MASK, "= 0" }, - applyIfCPUFeatureAnd = { "asimd", "true", "sve", "false" }) - @IR(counts = { IRNode.REPLICATE_I, "= 0", + applyIfCPUFeatureOr = { "avx512", "true", "rvv", "true" }) + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 2", IRNode.VECTOR_LONG_TO_MASK, "= 2" }, applyIfCPUFeatureAnd = { "avx2", "true", "avx512", "false" }) + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 2", + IRNode.VECTOR_LONG_TO_MASK, "= 2" }, + applyIfCPUFeature = { "svebitperm", "true" }) public static void testMaskFromLongFloat() { - // Test cases where some but not all bits are set. - testMaskFromLong(F_SPECIES, (-1L >>> (64 - F_SPECIES.length())) - 1); - testMaskFromLong(F_SPECIES, (-1L >>> (64 - F_SPECIES.length())) >>> 1); + testMaskFromLongGeneral(F_SPECIES); } @Test - @IR(counts = { IRNode.MASK_ALL, "= 0", + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 0", IRNode.VECTOR_LONG_TO_MASK, "= 2" }, - applyIfCPUFeatureOr = { "sve2", "true", "avx512", "true", "rvv", "true" }) - @IR(counts = { IRNode.REPLICATE_L, "= 0", - IRNode.VECTOR_LONG_TO_MASK, "= 0" }, - applyIfCPUFeatureAnd = { "asimd", "true", "sve", "false" }) - @IR(counts = { IRNode.REPLICATE_L, "= 0", + applyIfCPUFeatureOr = { "avx512", "true", "rvv", "true" }) + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 2", IRNode.VECTOR_LONG_TO_MASK, "= 2" }, applyIfCPUFeatureAnd = { "avx2", "true", "avx512", "false" }) + @IR(counts = { IRNode.VECTOR_LOAD_MASK, "= 2", + IRNode.VECTOR_LONG_TO_MASK, "= 2" }, + applyIfCPUFeature = { "svebitperm", "true" }) public static void testMaskFromLongDouble() { - // Test cases where some but not all bits are set. 
- testMaskFromLong(D_SPECIES, (-1L >>> (64 - D_SPECIES.length())) - 1); - testMaskFromLong(D_SPECIES, (-1L >>> (64 - D_SPECIES.length())) >>> 1); + testMaskFromLongGeneral(D_SPECIES); } public static void main(String[] args) { diff --git a/test/hotspot/jtreg/compiler/vectorapi/VectorMaskToLongTest.java b/test/hotspot/jtreg/compiler/vectorapi/VectorMaskToLongTest.java index 3201d593efe..35a5aca966a 100644 --- a/test/hotspot/jtreg/compiler/vectorapi/VectorMaskToLongTest.java +++ b/test/hotspot/jtreg/compiler/vectorapi/VectorMaskToLongTest.java @@ -22,18 +22,19 @@ */ /* -* @test -* @bug 8356760 -* @library /test/lib / -* @summary Optimize VectorMask.fromLong for all-true/all-false cases -* @modules jdk.incubator.vector -* -* @run driver compiler.vectorapi.VectorMaskToLongTest -*/ + * @test + * @bug 8356760 8367292 + * @library /test/lib / + * @summary IR test for VectorMask.toLong() + * @modules jdk.incubator.vector + * + * @run driver compiler.vectorapi.VectorMaskToLongTest + */ package compiler.vectorapi; import compiler.lib.ir_framework.*; +import java.util.Arrays; import jdk.incubator.vector.*; import jdk.test.lib.Asserts; @@ -45,12 +46,21 @@ public class VectorMaskToLongTest { static final VectorSpecies L_SPECIES = LongVector.SPECIES_MAX; static final VectorSpecies D_SPECIES = DoubleVector.SPECIES_MAX; + private static boolean[] m; + + static { + m = new boolean[B_SPECIES.length()]; + Arrays.fill(m, true); + } + @DontInline public static void verifyMaskToLong(VectorSpecies species, long inputLong, long got) { long expected = inputLong & (-1L >>> (64 - species.length())); Asserts.assertEquals(expected, got, "for input long " + inputLong); } + // Tests for "VectorMaskToLong(MaskAll(0/-1)) => ((0/-1) & (-1ULL >> (64 - vlen)))" + @ForceInline public static void testMaskAllToLong(VectorSpecies species) { int vlen = species.length(); @@ -173,12 +183,12 @@ public class VectorMaskToLongTest { @Test @IR(counts = { IRNode.VECTOR_LONG_TO_MASK, "= 0", IRNode.VECTOR_MASK_TO_LONG, "= 0" }, - applyIfCPUFeatureOr = { "sve2", "true", "avx2", "true", "rvv", "true" }) + applyIfCPUFeatureOr = { "svebitperm", "true", "avx2", "true", "rvv", "true" }) @IR(counts = { IRNode.VECTOR_LONG_TO_MASK, "= 0", IRNode.VECTOR_MASK_TO_LONG, "= 1" }, - applyIfCPUFeatureAnd = { "asimd", "true", "sve", "false" }) + applyIfCPUFeatureAnd = { "asimd", "true", "svebitperm", "false" }) public static void testFromLongToLongByte() { - // Test the case where some but not all bits are set. + // Test the case where some but not all bits are set. long inputLong = (-1L >>> (64 - B_SPECIES.length()))-1; long got = VectorMask.fromLong(B_SPECIES, inputLong).toLong(); verifyMaskToLong(B_SPECIES, inputLong, got); @@ -187,10 +197,10 @@ public class VectorMaskToLongTest { @Test @IR(counts = { IRNode.VECTOR_LONG_TO_MASK, "= 0", IRNode.VECTOR_MASK_TO_LONG, "= 0" }, - applyIfCPUFeatureOr = { "sve2", "true", "avx2", "true", "rvv", "true" }) + applyIfCPUFeatureOr = { "svebitperm", "true", "avx2", "true", "rvv", "true" }) @IR(counts = { IRNode.VECTOR_LONG_TO_MASK, "= 0", IRNode.VECTOR_MASK_TO_LONG, "= 1" }, - applyIfCPUFeatureAnd = { "asimd", "true", "sve", "false" }) + applyIfCPUFeatureAnd = { "asimd", "true", "svebitperm", "false" }) public static void testFromLongToLongShort() { // Test the case where some but not all bits are set. 
long inputLong = (-1L >>> (64 - S_SPECIES.length()))-1; @@ -201,10 +211,10 @@ public class VectorMaskToLongTest { @Test @IR(counts = { IRNode.VECTOR_LONG_TO_MASK, "= 0", IRNode.VECTOR_MASK_TO_LONG, "= 0" }, - applyIfCPUFeatureOr = { "sve2", "true", "avx2", "true", "rvv", "true" }) + applyIfCPUFeatureOr = { "svebitperm", "true", "avx2", "true", "rvv", "true" }) @IR(counts = { IRNode.VECTOR_LONG_TO_MASK, "= 0", IRNode.VECTOR_MASK_TO_LONG, "= 1" }, - applyIfCPUFeatureAnd = { "asimd", "true", "sve", "false" }) + applyIfCPUFeatureAnd = { "asimd", "true", "svebitperm", "false" }) public static void testFromLongToLongInt() { // Test the case where some but not all bits are set. long inputLong = (-1L >>> (64 - I_SPECIES.length()))-1; @@ -215,10 +225,10 @@ public class VectorMaskToLongTest { @Test @IR(counts = { IRNode.VECTOR_LONG_TO_MASK, "= 0", IRNode.VECTOR_MASK_TO_LONG, "= 0" }, - applyIfCPUFeatureOr = { "sve2", "true", "avx2", "true", "rvv", "true" }) + applyIfCPUFeatureOr = { "svebitperm", "true", "avx2", "true", "rvv", "true" }) @IR(counts = { IRNode.VECTOR_LONG_TO_MASK, "= 0", IRNode.VECTOR_MASK_TO_LONG, "= 1" }, - applyIfCPUFeatureAnd = { "asimd", "true", "sve", "false" }) + applyIfCPUFeatureAnd = { "asimd", "true", "svebitperm", "false" }) public static void testFromLongToLongLong() { // Test the case where some but not all bits are set. long inputLong = (-1L >>> (64 - L_SPECIES.length()))-1; @@ -229,10 +239,10 @@ public class VectorMaskToLongTest { @Test @IR(counts = { IRNode.VECTOR_LONG_TO_MASK, "= 1", IRNode.VECTOR_MASK_TO_LONG, "= 1" }, - applyIfCPUFeatureOr = { "sve2", "true", "avx2", "true", "rvv", "true" }) + applyIfCPUFeatureOr = { "svebitperm", "true", "avx2", "true", "rvv", "true" }) @IR(counts = { IRNode.VECTOR_LONG_TO_MASK, "= 0", IRNode.VECTOR_MASK_TO_LONG, "= 1" }, - applyIfCPUFeatureAnd = { "asimd", "true", "sve", "false" }) + applyIfCPUFeatureAnd = { "asimd", "true", "svebitperm", "false" }) public static void testFromLongToLongFloat() { // Test the case where some but not all bits are set. long inputLong = (-1L >>> (64 - F_SPECIES.length()))-1; @@ -243,10 +253,10 @@ public class VectorMaskToLongTest { @Test @IR(counts = { IRNode.VECTOR_LONG_TO_MASK, "= 1", IRNode.VECTOR_MASK_TO_LONG, "= 1" }, - applyIfCPUFeatureOr = { "sve2", "true", "avx2", "true", "rvv", "true" }) + applyIfCPUFeatureOr = { "svebitperm", "true", "avx2", "true", "rvv", "true" }) @IR(counts = { IRNode.VECTOR_LONG_TO_MASK, "= 0", IRNode.VECTOR_MASK_TO_LONG, "= 1" }, - applyIfCPUFeatureAnd = { "asimd", "true", "sve", "false" }) + applyIfCPUFeatureAnd = { "asimd", "true", "svebitperm", "false" }) public static void testFromLongToLongDouble() { // Test the case where some but not all bits are set. long inputLong = (-1L >>> (64 - D_SPECIES.length()))-1; @@ -254,6 +264,100 @@ public class VectorMaskToLongTest { verifyMaskToLong(D_SPECIES, inputLong, got); } + // General cases for VectorMask.toLong(). The main purpose is to test the IRs + // for API VectorMask.toLong(). To avoid the IRs being optimized out by compiler, + // we insert a VectorMask.not() before toLong(). 
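Because the shared array "m" is filled with true, an all-true mask is complemented to an all-false one, so the packed result must be zero for every species. A minimal scalar sketch of that arithmetic (not part of the patch; the method name is illustrative only):

    // verifyMaskToLong computes: expected = inputLong & (-1L >>> (64 - vlen)).
    // With inputLong == 0 this is 0 for any vlen in [1, 64], which is why the
    // kernel below passes 0 as the expected input.
    static long expectedToLong(int vlen) {
        long allTrue = -1L >>> (64 - vlen);      // toLong() of an all-true mask
        return ~allTrue & (-1L >>> (64 - vlen)); // after not(): always 0
    }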
+ + @ForceInline + public static void testToLongGeneral(VectorSpecies species) { + long got = VectorMask.fromArray(species, m, 0).not().toLong(); + verifyMaskToLong(species, 0, got); + } + + @Test + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 0", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeatureOr = { "avx512", "true", "rvv", "true" }) + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 1", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeatureAnd = { "avx2", "true", "avx512", "false" }) + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 1", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeature = { "asimd", "true" }) + public static void testToLongByte() { + testToLongGeneral(B_SPECIES); + } + + @Test + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 0", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeatureOr = { "avx512", "true", "rvv", "true" }) + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 1", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeatureAnd = { "avx2", "true", "avx512", "false" }) + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 1", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeature = { "asimd", "true" }) + public static void testToLongShort() { + testToLongGeneral(S_SPECIES); + } + + @Test + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 0", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeatureOr = { "avx512", "true", "rvv", "true" }) + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 1", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeatureAnd = { "avx2", "true", "avx512", "false" }) + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 1", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeature = { "asimd", "true" }) + public static void testToLongInt() { + testToLongGeneral(I_SPECIES); + } + + @Test + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 0", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeatureOr = { "avx512", "true", "rvv", "true" }) + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 1", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeatureAnd = { "avx2", "true", "avx512", "false" }) + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 1", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeature = { "asimd", "true" }) + public static void testToLongLong() { + testToLongGeneral(L_SPECIES); + } + + @Test + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 0", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeatureOr = { "avx512", "true", "rvv", "true" }) + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 1", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeature = { "asimd", "true" }) + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 1", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeatureAnd = { "avx2", "true", "avx512", "false" }) + public static void testToLongFloat() { + testToLongGeneral(F_SPECIES); + } + + @Test + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 0", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeatureOr = { "avx512", "true", "rvv", "true" }) + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 1", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeatureAnd = { "avx2", "true", "avx512", "false" }) + @IR(counts = { IRNode.VECTOR_STORE_MASK, "= 1", + IRNode.VECTOR_MASK_TO_LONG, "= 1" }, + applyIfCPUFeature = { "asimd", "true" }) + public static void testToLongDouble() { + testToLongGeneral(D_SPECIES); + } + public static void main(String[] args) { TestFramework testFramework = new TestFramework(); testFramework.setDefaultWarmup(10000)
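For readers less familiar with the SVE2 BEXT/BDEP instructions used in this change: with an all-0x01 selector they behave, per 64-bit lane, like java.lang.Long.compress and Long.expand (available since JDK 19). A minimal scalar sketch of the byte-mask/bit-mask conversion that sve2_vmask_tolong and sve_vmask_fromlong perform on each 8-byte group (method names are illustrative only):

    // Pack eight 0x00/0x01 boolean bytes into the low 8 bits of the result,
    // mirroring BEXT with the 0x0101010101010101 selector in one 64-bit lane.
    static long packBooleanBytes(long eightBytes) {
        return Long.compress(eightBytes, 0x0101010101010101L);
    }

    // Unpack the low 8 bits into eight 0x00/0x01 bytes, mirroring BDEP with
    // the same selector in one 64-bit lane.
    static long unpackToBooleanBytes(long lowBits) {
        return Long.expand(lowBits, 0x0101010101010101L);
    }

Plugging in the values from the patch comments: packBooleanBytes(0x0100000001010001L) returns 0x8D, packBooleanBytes(0x0001010000010001L) returns 0x65, and unpackToBooleanBytes(0x8DL) returns 0x0100000001010001L.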