8382052: VectorAPI: Optimize the lanewise BITWISE_BLEND for AArch64

Reviewed-by: xgong, epeter, aph
2026-07-02 15:20:27 +00:00 · 2026-06-17 06:30:18 +00:00 · 2026-06-17 06:30:18 +00:00 · 5fbce068bd
commit 5fbce068bd
parent 2d65ea61d9
13 changed files with 477 additions and 75 deletions
--- a/src/hotspot/cpu/aarch64/aarch64_vector.ad
+++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad
@ -317,6 +317,13 @@ source %{
          return false; // NEON only, since SLI/USHR are not available in SVE
        }
        break;
+      case Op_VectorBitwiseBlend:
+        // Use NEON BSL when UseSVE < 2; SVE1 has no BSL so larger vectors are
+        // not supported on UseSVE == 1 machines.
+        if (UseSVE < 2 && length_in_bytes > 16) {
+          return false;
+        }
+        break;
      default:
        break;
    }
@ -340,6 +347,7 @@ source %{
      case Op_MulReductionVL:
      case Op_CompressBitsV:
      case Op_ExpandBitsV:
+      case Op_VectorBitwiseBlend:
        return false;
      case Op_SaturatingAddV:
      case Op_SaturatingSubV:
@ -7051,6 +7059,31 @@ instruct vblend_sve(vReg dst, vReg src1, vReg src2, pReg pg) %{
  ins_pipe(pipe_slow);
 %}

+// ------------------------------ Vector bitwise blend -------------------------
+
+instruct vbitwise_blend_neon_sve1(vReg src1, vReg src2, vReg dst_src3) %{
+  predicate(UseSVE < 2 &&
+            VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)));
+  match(Set dst_src3 (VectorBitwiseBlend (Binary src1 src2) dst_src3));
+  format %{ "vbitwise_blend_neon_sve1 $src1, $src2, $dst_src3" %}
+  ins_encode %{
+    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
+    Assembler::SIMD_Arrangement T = length_in_bytes == 16 ? __ T16B : __ T8B;  // 16-byte vectors use the 16B arrangement, anything else 8B
+    __ bsl($dst_src3$$FloatRegister, T, $src2$$FloatRegister, $src1$$FloatRegister);  // dst_src3 holds sel on entry: result takes src2 (vec_true) bits where sel is 1 and src1 (vec_false) bits where sel is 0
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vbitwise_blend_sve2(vReg src1, vReg dst_src2, vReg src3) %{
+  predicate(UseSVE == 2);
+  match(Set dst_src2 (VectorBitwiseBlend (Binary src1 dst_src2) src3));
+  format %{ "vbitwise_blend_sve2 $src1, $dst_src2, $src3" %}
+  ins_encode %{
+    __ sve_bsl($dst_src2$$FloatRegister, $src1$$FloatRegister, $src3$$FloatRegister);  // dst_src2 is vec_true on entry: its bits are kept where src3 (sel) is 1 and replaced by src1 (vec_false) bits where sel is 0
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 // ------------------------------ Vector round ---------------------------------

 // vector Math.round
--- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
+++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
@ -307,6 +307,13 @@ source %{
          return false; // NEON only, since SLI/USHR are not available in SVE
        }
        break;
+      case Op_VectorBitwiseBlend:
+        // Use NEON BSL when UseSVE < 2; SVE1 has no BSL so larger vectors are
+        // not supported on UseSVE == 1 machines.
+        if (UseSVE < 2 && length_in_bytes > 16) {
+          return false;
+        }
+        break;
      default:
        break;
    }
@ -330,6 +337,7 @@ source %{
      case Op_MulReductionVL:
      case Op_CompressBitsV:
      case Op_ExpandBitsV:
+      case Op_VectorBitwiseBlend:
        return false;
      case Op_SaturatingAddV:
      case Op_SaturatingSubV:
@ -4754,6 +4762,31 @@ instruct vblend_sve(vReg dst, vReg src1, vReg src2, pReg pg) %{
  ins_pipe(pipe_slow);
 %}

+// ------------------------------ Vector bitwise blend -------------------------
+
+instruct vbitwise_blend_neon_sve1(vReg src1, vReg src2, vReg dst_src3) %{
+  predicate(UseSVE < 2 &&
+            VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)));
+  match(Set dst_src3 (VectorBitwiseBlend (Binary src1 src2) dst_src3));
+  format %{ "vbitwise_blend_neon_sve1 $src1, $src2, $dst_src3" %}
+  ins_encode %{
+    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
+    Assembler::SIMD_Arrangement T = length_in_bytes == 16 ? __ T16B : __ T8B;  // 16-byte vectors use the 16B arrangement, anything else 8B
+    __ bsl($dst_src3$$FloatRegister, T, $src2$$FloatRegister, $src1$$FloatRegister);  // dst_src3 holds sel on entry: result takes src2 (vec_true) bits where sel is 1 and src1 (vec_false) bits where sel is 0
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vbitwise_blend_sve2(vReg src1, vReg dst_src2, vReg src3) %{
+  predicate(UseSVE == 2);
+  match(Set dst_src2 (VectorBitwiseBlend (Binary src1 dst_src2) src3));
+  format %{ "vbitwise_blend_sve2 $src1, $dst_src2, $src3" %}
+  ins_encode %{
+    __ sve_bsl($dst_src2$$FloatRegister, $src1$$FloatRegister, $src3$$FloatRegister);  // dst_src2 is vec_true on entry: its bits are kept where src3 (sel) is 1 and replaced by src1 (vec_false) bits where sel is 0
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 // ------------------------------ Vector round ---------------------------------

 // vector Math.round
--- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
@ -4292,14 +4292,15 @@ public:
 #undef INSN

 // SVE2 bitwise ternary operations (destructive: Zdn is both the first source and the destination)
-#define INSN(NAME, opc)                                               \
-  void NAME(FloatRegister Zdn, FloatRegister Zm, FloatRegister Zk) {  \
-    starti;                                                           \
-    f(0b00000100, 31, 24), f(opc, 23, 21), rf(Zm, 16);                \
-    f(0b001110, 15, 10), rf(Zk, 5), rf(Zdn, 0);                       \
+#define INSN(NAME, op1, op2)                                           \
+  void NAME(FloatRegister Zdn, FloatRegister Zm, FloatRegister Zk) {   \
+    starti;                                                            \
+    f(0b00000100, 31, 24), f(op1, 23, 21), rf(Zm, 16);                 \
+    f(0b00111, 15, 11), f(op2, 10), rf(Zk, 5), rf(Zdn, 0);             \
  }

-  INSN(sve_eor3, 0b001); // Bitwise exclusive OR of three vectors
+  INSN(sve_eor3, 0b001, 0b0); // Bitwise exclusive OR of three vectors; op2 (bit 10) is clear
+  INSN(sve_bsl,  0b001, 0b1); // Bitwise select: Zdn = (Zdn & Zk) | (Zm & ~Zk); same op1 as eor3, op2 (bit 10) is set
 #undef INSN

 // SVE2 saturating operations - predicate
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
@ -1843,13 +1843,19 @@ public:

 #undef SVE_DESTRUCTIVE_TERNARY_INS

-  using Assembler::sve_eor3;
-  void sve_eor3(FloatRegister Zd, FloatRegister Zm, FloatRegister Zk) {
-    if (Zd != Zm && Zd != Zk) {
-      try_to_replace_prev_vector_copy_with_movprfx(Zd);
-    }
-    Assembler::sve_eor3(Zd, Zm, Zk);
+#define SVE_DESTRUCTIVE_TERNARY_UNPRED_INS(NAME)                               \
+  using Assembler::NAME;                                                       \
+  void NAME(FloatRegister Zd, FloatRegister Zm, FloatRegister Zk) {            \
+    if (Zd != Zm && Zd != Zk) {                                                \
+      try_to_replace_prev_vector_copy_with_movprfx(Zd);                        \
+    }                                                                          \
+    Assembler::NAME(Zd, Zm, Zk);                                               \
  }
+
+  SVE_DESTRUCTIVE_TERNARY_UNPRED_INS(sve_bsl);   // wrapper: when Zd differs from both Zm and Zk it first calls try_to_replace_prev_vector_copy_with_movprfx(Zd), then emits Assembler::sve_bsl
+  SVE_DESTRUCTIVE_TERNARY_UNPRED_INS(sve_eor3);  // same wrapper shape as the hand-written sve_eor3 it replaces; the Assembler overload stays visible through the using-declaration
+
+#undef SVE_DESTRUCTIVE_TERNARY_UNPRED_INS
 };

 #ifdef ASSERT
--- a/src/hotspot/share/opto/classes.hpp
+++ b/src/hotspot/share/opto/classes.hpp
@ -512,6 +512,7 @@ macro(VectorMaskWrapper)
 macro(VectorMaskCmp)
 macro(VectorMaskCast)
 macro(VectorTest)
+macro(VectorBitwiseBlend)
 macro(VectorBlend)
 macro(VectorRearrange)
 macro(VectorLoadMask)
--- a/src/hotspot/share/opto/matcher.cpp
+++ b/src/hotspot/share/opto/matcher.cpp
@ -2385,7 +2385,8 @@ void Matcher::find_shared_post_visit(Node* n, uint opcode) {
      break;
    }
    case Op_VectorBlend:
-    case Op_VectorInsert: {
+    case Op_VectorInsert:
+    case Op_VectorBitwiseBlend: {
      Node* pair = new BinaryNode(n->in(1), n->in(2));
      n->set_req(1, pair);
      n->set_req(2, n->in(3));
--- a/src/hotspot/share/opto/vectornode.cpp
+++ b/src/hotspot/share/opto/vectornode.cpp
@ -874,6 +874,7 @@ VectorNode* VectorNode::make(int vopc, Node* n1, Node* n2, Node* n3, const TypeV
  case Op_SignumVD: return new SignumVDNode(n1, n2, n3, vt);
  case Op_SignumVF: return new SignumVFNode(n1, n2, n3, vt);
  case Op_VectorBlend: return new VectorBlendNode(n1, n2, n3);
+  case Op_VectorBitwiseBlend: return new VectorBitwiseBlendNode(n1, n2, n3, vt);
  default:
    fatal("Missed vector creation for '%s'", NodeClassNames[vopc]);
    return nullptr;
@ -2768,6 +2769,70 @@ Node* XorVNode::Ideal_XorV_VectorMaskCmp(PhaseGVN* phase, bool can_reshape) {
  return res;
 }

+// XorV(a, AndV(sel, XorV(a, b)))       => VectorBitwiseBlend(a, b, sel)
+// XorV(a, AndV(sel, XorV(a, b)), mask) =>
+//   VectorBlend(a, VectorBitwiseBlend(a, b, sel), mask)
+Node* XorVNode::Ideal_XorV_to_VectorBitwiseBlend(PhaseGVN* phase, bool can_reshape) {  // returns the replacement node, or nullptr when the rewrite does not apply; can_reshape is not used here
+  const TypeVect* vt = vect_type();
+  BasicType bt = vt->element_basic_type();
+  uint vlen = vt->length();
+  if (!Matcher::match_rule_supported_vector(Op_VectorBitwiseBlend, vlen, bt)) {  // the backend must have a rule for this length and element type
+    return nullptr;
+  }
+
+  bool is_masked = is_predicated_vector();  // the masked form carries its mask in in(3), see the VectorBlend built at the end
+  if (is_masked &&
+      !Matcher::match_rule_supported_vector(Op_VectorBlend, vlen, bt)) {  // the masked form is rebuilt around a VectorBlend, so that must be supported too
+    return nullptr;
+  }
+
+  // For the predicated case in(1) is fixed as the merge source. Otherwise the
+  // outer XorV is commutative.
+  Node* a = nullptr;
+  Node* andv = nullptr;
+  if (is_masked || in(2)->Opcode() == Op_AndV) {  // NOTE(review): when both inputs are AndV only in(2) is tried as the AndV operand, so that shape can be missed
+    andv = in(2);
+    a = in(1);
+  } else {
+    andv = in(1);
+    a = in(2);
+  }
+  if (andv->Opcode() != Op_AndV || andv->is_predicated_vector()) {  // only an unpredicated AndV qualifies
+    return nullptr;
+  }
+
+  Node* sel = nullptr;
+  Node* inner_xor = nullptr;
+  if (andv->in(2)->Opcode() == Op_XorV) {  // the AndV operands may come in either order; in(2) is checked first
+    inner_xor = andv->in(2);
+    sel = andv->in(1);
+  } else if (andv->in(1)->Opcode() == Op_XorV) {
+    inner_xor = andv->in(1);
+    sel = andv->in(2);
+  } else {
+    return nullptr;
+  }
+  if (inner_xor->is_predicated_vector()) {  // the inner XorV must be unpredicated as well
+    return nullptr;
+  }
+
+  Node* b = nullptr;
+  if (inner_xor->in(1) == a) {  // a must be one operand of the inner XorV; the other operand becomes b
+    b = inner_xor->in(2);
+  } else if (inner_xor->in(2) == a) {
+    b = inner_xor->in(1);
+  } else {
+    return nullptr;
+  }
+
+  Node* blend = new VectorBitwiseBlendNode(a, b, sel, vt);  // a ^ (sel & (a ^ b)) yields b bits where sel is 1 and a bits where sel is 0
+  if (!is_masked) {
+    return blend;
+  }
+  blend = phase->transform(blend);  // the intermediate node goes through the phase before it is used as an input
+  return new VectorBlendNode(a, blend, in(3));  // a stays the merge source, the original mask in(3) is reused
+}
+
 Node* XorVNode::Ideal(PhaseGVN* phase, bool can_reshape) {
  // (XorV src src)      => (Replicate zero)
  // (XorVMask src src)  => (MaskAll zero)
@ -2786,6 +2851,11 @@ Node* XorVNode::Ideal(PhaseGVN* phase, bool can_reshape) {
  if (res != nullptr) {
    return res;
  }
+
+  res = Ideal_XorV_to_VectorBitwiseBlend(phase, can_reshape);
+  if (res != nullptr) {
+    return res;
+  }
  return VectorNode::Ideal(phase, can_reshape);
 }

--- a/src/hotspot/share/opto/vectornode.hpp
+++ b/src/hotspot/share/opto/vectornode.hpp
@ -1075,6 +1075,7 @@ class XorVNode : public VectorNode {
  virtual int Opcode() const;
  virtual Node* Ideal(PhaseGVN* phase, bool can_reshape);
  Node* Ideal_XorV_VectorMaskCmp(PhaseGVN* phase, bool can_reshape);
+  Node* Ideal_XorV_to_VectorBitwiseBlend(PhaseGVN* phase, bool can_reshape);
 };

 // Vector xor byte, short, int, long as a reduction
@ -1802,6 +1803,24 @@ class VectorBlendNode : public VectorNode {
  Node* vec_mask() const { return in(3); }
 };

+// Vector bitwise blend (bit-select): (sel & vec_true) | (~sel & vec_false), evaluated per bit rather than per lane.
+class VectorBitwiseBlendNode : public VectorNode {
+ public:
+  VectorBitwiseBlendNode(Node* vec_false, Node* vec_true, Node* sel, const TypeVect* vt)
+    : VectorNode(vec_false, vec_true, sel, vt) {  // input order: vec_false, vec_true, sel; vt is the result vector type
+    assert(vec_false->bottom_type()->isa_vect() != nullptr &&
+           vec_true->bottom_type()->isa_vect() != nullptr &&
+           sel->bottom_type()->isa_vect() != nullptr,
+           "inputs must all be vectors");
+    uint vlen = vt->length();  // every input must have the same lane count as the result type
+    assert(vec_false->bottom_type()->is_vect()->length() == vlen &&
+           vec_true->bottom_type()->is_vect()->length() == vlen &&
+           sel->bottom_type()->is_vect()->length() == vlen,
+           "mismatched vector length");
+  }
+  virtual int Opcode() const;
+};
+
 // Rearrange lane elements from a source vector under the control of a shuffle
 // (indexes) vector. Each lane in the shuffle vector specifies which lane from
 // the source vector to select for the corresponding output lane. All indexes
--- a/test/hotspot/gtest/aarch64/aarch64-asmtest.py
+++ b/test/hotspot/gtest/aarch64/aarch64-asmtest.py
@ -1121,7 +1121,7 @@ class SVEVectorOp(Instruction):
        self._bitwiseop = False
        if name[0] == 'f':
            self._width = RegVariant(2, 3)
-        elif not self._isPredicated and (name in ["and", "eor", "orr", "bic", "eor3"]):
+        elif not self._isPredicated and (name in ["and", "bic", "bsl", "eor", "eor3", "orr"]):
            self._width = RegVariant(3, 3)
            self._bitwiseop = True
        elif name == "revb":
@ -1150,7 +1150,7 @@ class SVEVectorOp(Instruction):
                        width +
                        [str(self.reg[i]) for i in range(1, self.numRegs)]))
    def astr(self):
-        firstArg = 0 if self._name == "eor3" else 1
+        firstArg = 0 if self._name in ["bsl", "eor3"] else 1
        formatStr = "%s%s" + ''.join([", %s" for i in range(firstArg, self.numRegs)])
        if self._dnm == 'dn':
            formatStr += ", %s"
@ -2258,6 +2258,7 @@ generate(SVEVectorOp, [["add", "ZZZ"],
                       # SVE2 instructions
                       ["bext", "ZZZ"],
                       ["bdep", "ZZZ"],
+                       ["bsl", "ZZZ"],
                       ["eor3", "ZZZ"],
                       ["sqadd", "ZPZ", "m", "dn"],
                       ["sqsub", "ZPZ", "m", "dn"],
--- a/test/hotspot/gtest/aarch64/asmtest.out.h
+++ b/test/hotspot/gtest/aarch64/asmtest.out.h
@ -1419,38 +1419,39 @@
    __ sve_fabd(z14, __ S, p5, z22);                   //       fabd    z14.s, p5/m, z14.s, z22.s
    __ sve_bext(z5, __ H, z18, z0);                    //       bext    z5.h, z18.h, z0.h
    __ sve_bdep(z9, __ D, z2, z3);                     //       bdep    z9.d, z2.d, z3.d
-    __ sve_eor3(z14, z4, z29);                         //       eor3    z14.d, z14.d, z4.d, z29.d
-    __ sve_sqadd(z14, __ D, p5, z4);                   //       sqadd   z14.d, p5/m, z14.d, z4.d
-    __ sve_sqsub(z27, __ S, p3, z22);                  //       sqsub   z27.s, p3/m, z27.s, z22.s
-    __ sve_uqadd(z31, __ S, p6, z11);                  //       uqadd   z31.s, p6/m, z31.s, z11.s
-    __ sve_uqsub(z12, __ B, p4, z28);                  //       uqsub   z12.b, p4/m, z12.b, z28.b
+    __ sve_bsl(z14, z4, z29);                          //       bsl     z14.d, z14.d, z4.d, z29.d
+    __ sve_eor3(z14, z22, z4);                         //       eor3    z14.d, z14.d, z22.d, z4.d
+    __ sve_sqadd(z27, __ S, p3, z22);                  //       sqadd   z27.s, p3/m, z27.s, z22.s
+    __ sve_sqsub(z31, __ S, p6, z11);                  //       sqsub   z31.s, p6/m, z31.s, z11.s
+    __ sve_uqadd(z12, __ B, p4, z28);                  //       uqadd   z12.b, p4/m, z12.b, z28.b
+    __ sve_uqsub(z28, __ D, p4, z4);                   //       uqsub   z28.d, p4/m, z28.d, z4.d

 // SVEReductionOp
-    __ sve_andv(v28, __ D, p4, z4);                    //       andv d28, p4, z4.d
-    __ sve_orv(v6, __ S, p0, z15);                     //       orv s6, p0, z15.s
-    __ sve_eorv(v1, __ S, p5, z18);                    //       eorv s1, p5, z18.s
-    __ sve_smaxv(v2, __ H, p2, z4);                    //       smaxv h2, p2, z4.h
-    __ sve_sminv(v11, __ S, p2, z28);                  //       sminv s11, p2, z28.s
-    __ sve_umaxv(v3, __ H, p5, z31);                   //       umaxv h3, p5, z31.h
-    __ sve_uminv(v24, __ H, p5, z15);                  //       uminv h24, p5, z15.h
-    __ sve_fminv(v6, __ S, p3, z8);                    //       fminv s6, p3, z8.s
-    __ sve_fmaxv(v21, __ D, p7, z4);                   //       fmaxv d21, p7, z4.d
-    __ sve_fadda(v24, __ S, p5, z6);                   //       fadda s24, p5, s24, z6.s
-    __ sve_uaddv(v4, __ D, p2, z9);                    //       uaddv d4, p2, z9.d
+    __ sve_andv(v6, __ S, p0, z15);                    //       andv s6, p0, z15.s
+    __ sve_orv(v1, __ S, p5, z18);                     //       orv s1, p5, z18.s
+    __ sve_eorv(v2, __ H, p2, z4);                     //       eorv h2, p2, z4.h
+    __ sve_smaxv(v11, __ S, p2, z28);                  //       smaxv s11, p2, z28.s
+    __ sve_sminv(v3, __ H, p5, z31);                   //       sminv h3, p5, z31.h
+    __ sve_umaxv(v24, __ H, p5, z15);                  //       umaxv h24, p5, z15.h
+    __ sve_uminv(v6, __ H, p3, z8);                    //       uminv h6, p3, z8.h
+    __ sve_fminv(v21, __ D, p7, z4);                   //       fminv d21, p7, z4.d
+    __ sve_fmaxv(v24, __ S, p5, z6);                   //       fmaxv s24, p5, z6.s
+    __ sve_fadda(v4, __ D, p2, z9);                    //       fadda d4, p2, d4, z9.d
+    __ sve_uaddv(v10, __ S, p1, z31);                  //       uaddv d10, p1, z31.s

 // AddWideNEONOp
-    __ saddwv(v10, v11, __ T8H, v12, __ T8B);          //       saddw   v10.8H, v11.8H, v12.8B
-    __ saddwv2(v5, v6, __ T8H, v7, __ T16B);           //       saddw2  v5.8H, v6.8H, v7.16B
-    __ saddwv(v31, v0, __ T4S, v1, __ T4H);            //       saddw   v31.4S, v0.4S, v1.4H
-    __ saddwv2(v22, v23, __ T4S, v24, __ T8H);         //       saddw2  v22.4S, v23.4S, v24.8H
-    __ saddwv(v25, v26, __ T2D, v27, __ T2S);          //       saddw   v25.2D, v26.2D, v27.2S
-    __ saddwv2(v15, v16, __ T2D, v17, __ T4S);         //       saddw2  v15.2D, v16.2D, v17.4S
-    __ uaddwv(v3, v4, __ T8H, v5, __ T8B);             //       uaddw   v3.8H, v4.8H, v5.8B
-    __ uaddwv2(v18, v19, __ T8H, v20, __ T16B);        //       uaddw2  v18.8H, v19.8H, v20.16B
-    __ uaddwv(v14, v15, __ T4S, v16, __ T4H);          //       uaddw   v14.4S, v15.4S, v16.4H
-    __ uaddwv2(v10, v11, __ T4S, v12, __ T8H);         //       uaddw2  v10.4S, v11.4S, v12.8H
-    __ uaddwv(v2, v3, __ T2D, v4, __ T2S);             //       uaddw   v2.2D, v3.2D, v4.2S
-    __ uaddwv2(v10, v11, __ T2D, v12, __ T4S);         //       uaddw2  v10.2D, v11.2D, v12.4S
+    __ saddwv(v25, v26, __ T8H, v27, __ T8B);          //       saddw   v25.8H, v26.8H, v27.8B
+    __ saddwv2(v15, v16, __ T8H, v17, __ T16B);        //       saddw2  v15.8H, v16.8H, v17.16B
+    __ saddwv(v3, v4, __ T4S, v5, __ T4H);             //       saddw   v3.4S, v4.4S, v5.4H
+    __ saddwv2(v18, v19, __ T4S, v20, __ T8H);         //       saddw2  v18.4S, v19.4S, v20.8H
+    __ saddwv(v14, v15, __ T2D, v16, __ T2S);          //       saddw   v14.2D, v15.2D, v16.2S
+    __ saddwv2(v10, v11, __ T2D, v12, __ T4S);         //       saddw2  v10.2D, v11.2D, v12.4S
+    __ uaddwv(v2, v3, __ T8H, v4, __ T8B);             //       uaddw   v2.8H, v3.8H, v4.8B
+    __ uaddwv2(v10, v11, __ T8H, v12, __ T16B);        //       uaddw2  v10.8H, v11.8H, v12.16B
+    __ uaddwv(v8, v9, __ T4S, v10, __ T4H);            //       uaddw   v8.4S, v9.4S, v10.4H
+    __ uaddwv2(v11, v12, __ T4S, v13, __ T8H);         //       uaddw2  v11.4S, v12.4S, v13.8H
+    __ uaddwv(v22, v23, __ T2D, v24, __ T2S);          //       uaddw   v22.2D, v23.2D, v24.2S
+    __ uaddwv2(v3, v4, __ T2D, v5, __ T4S);            //       uaddw2  v3.2D, v4.2D, v5.4S

    __ bind(forth);

@ -1469,30 +1470,30 @@
    0x9101a1a0,     0xb10a5cc8,     0xd10810aa,     0xf10fd061,
    0x120cb166,     0x321764bc,     0x52174681,     0x720c0227,
    0x9241018e,     0xb25a2969,     0xd278b411,     0xf26aad01,
-    0x14000000,     0x17ffffd7,     0x140004cb,     0x94000000,
-    0x97ffffd4,     0x940004c8,     0x3400000a,     0x34fffa2a,
-    0x340098aa,     0x35000008,     0x35fff9c8,     0x35009848,
-    0xb400000b,     0xb4fff96b,     0xb40097eb,     0xb500001d,
-    0xb5fff91d,     0xb500979d,     0x10000013,     0x10fff8b3,
-    0x10009733,     0x90000013,     0x36300016,     0x3637f836,
-    0x363096b6,     0x3758000c,     0x375ff7cc,     0x3758964c,
+    0x14000000,     0x17ffffd7,     0x140004cc,     0x94000000,
+    0x97ffffd4,     0x940004c9,     0x3400000a,     0x34fffa2a,
+    0x340098ca,     0x35000008,     0x35fff9c8,     0x35009868,
+    0xb400000b,     0xb4fff96b,     0xb400980b,     0xb500001d,
+    0xb5fff91d,     0xb50097bd,     0x10000013,     0x10fff8b3,
+    0x10009753,     0x90000013,     0x36300016,     0x3637f836,
+    0x363096d6,     0x3758000c,     0x375ff7cc,     0x3758966c,
    0x128313a0,     0x528a32c7,     0x7289173b,     0x92ab3acc,
    0xd2a0bf94,     0xf2c285e8,     0x9358722f,     0x330e652f,
    0x53067f3b,     0x93577c53,     0xb34a1aac,     0xd35a4016,
    0x13946c63,     0x93c3dbc8,     0x54000000,     0x54fff5a0,
-    0x54009420,     0x54000001,     0x54fff541,     0x540093c1,
-    0x54000002,     0x54fff4e2,     0x54009362,     0x54000002,
-    0x54fff482,     0x54009302,     0x54000003,     0x54fff423,
-    0x540092a3,     0x54000003,     0x54fff3c3,     0x54009243,
-    0x54000004,     0x54fff364,     0x540091e4,     0x54000005,
-    0x54fff305,     0x54009185,     0x54000006,     0x54fff2a6,
-    0x54009126,     0x54000007,     0x54fff247,     0x540090c7,
-    0x54000008,     0x54fff1e8,     0x54009068,     0x54000009,
-    0x54fff189,     0x54009009,     0x5400000a,     0x54fff12a,
-    0x54008faa,     0x5400000b,     0x54fff0cb,     0x54008f4b,
-    0x5400000c,     0x54fff06c,     0x54008eec,     0x5400000d,
-    0x54fff00d,     0x54008e8d,     0x5400000e,     0x54ffefae,
-    0x54008e2e,     0x5400000f,     0x54ffef4f,     0x54008dcf,
+    0x54009440,     0x54000001,     0x54fff541,     0x540093e1,
+    0x54000002,     0x54fff4e2,     0x54009382,     0x54000002,
+    0x54fff482,     0x54009322,     0x54000003,     0x54fff423,
+    0x540092c3,     0x54000003,     0x54fff3c3,     0x54009263,
+    0x54000004,     0x54fff364,     0x54009204,     0x54000005,
+    0x54fff305,     0x540091a5,     0x54000006,     0x54fff2a6,
+    0x54009146,     0x54000007,     0x54fff247,     0x540090e7,
+    0x54000008,     0x54fff1e8,     0x54009088,     0x54000009,
+    0x54fff189,     0x54009029,     0x5400000a,     0x54fff12a,
+    0x54008fca,     0x5400000b,     0x54fff0cb,     0x54008f6b,
+    0x5400000c,     0x54fff06c,     0x54008f0c,     0x5400000d,
+    0x54fff00d,     0x54008ead,     0x5400000e,     0x54ffefae,
+    0x54008e4e,     0x5400000f,     0x54ffef4f,     0x54008def,
    0xd40658e1,     0xd4014d22,     0xd4046543,     0xd4273f60,
    0xd44cad80,     0xd503201f,     0xd503203f,     0xd503205f,
    0xd503209f,     0xd50320bf,     0xd503219f,     0xd50323bf,
@ -1535,7 +1536,7 @@
    0x39598921,     0x795d3077,     0x399d0675,     0x7998d8f3,
    0x79dbd02a,     0xb99d068a,     0xfd5d11a0,     0xbd58d76b,
    0xfd1ac72d,     0xbd1d9c14,     0x5800001a,     0x18ffda33,
-    0xf8991100,     0xd8007880,     0xf8a758e0,     0xf9989d80,
+    0xf8991100,     0xd80078a0,     0xf8a758e0,     0xf9989d80,
    0x1a0b0298,     0x3a1c01a0,     0x5a0400ea,     0x7a02020f,
    0x9a1d028c,     0xba0e01ad,     0xda140186,     0xfa19022c,
    0x0b2b877e,     0x2b21c8ee,     0xcb3ba47d,     0x6b3ae9a0,
@ -1769,13 +1770,13 @@
    0x65b45aff,     0x65e07fa2,     0x04454097,     0x044d6e3c,
    0x04283148,     0x04bd3013,     0x047731b0,     0x04ed33d7,
    0x05606ad9,     0x056b6fd9,     0x658896ce,     0x4540b245,
-    0x45c3b449,     0x04243bae,     0x44d8948e,     0x449a8edb,
-    0x4499997f,     0x441b938c,     0x04da309c,     0x049821e6,
-    0x04993641,     0x04482882,     0x048a2b8b,     0x044937e3,
-    0x044b35f8,     0x65872d06,     0x65c63c95,     0x659834d8,
-    0x04c12924,     0x0e2c116a,     0x4e2710c5,     0x0e61101f,
-    0x4e7812f6,     0x0ebb1359,     0x4eb1120f,     0x2e251083,
-    0x6e341272,     0x2e7011ee,     0x6e6c116a,     0x2ea41062,
-    0x6eac116a,
+    0x45c3b449,     0x04243fae,     0x0436388e,     0x44988edb,
+    0x449a997f,     0x4419938c,     0x44db909c,     0x049a21e6,
+    0x04983641,     0x04592882,     0x04882b8b,     0x044a37e3,
+    0x044935f8,     0x044b2d06,     0x65c73c95,     0x658634d8,
+    0x65d82924,     0x048127ea,     0x0e3b1359,     0x4e31120f,
+    0x0e651083,     0x4e741272,     0x0eb011ee,     0x4eac116a,
+    0x2e241062,     0x6e2c116a,     0x2e6a1128,     0x6e6d118b,
+    0x2eb812f6,     0x6ea51083,
  };
 // END  Generated code -- do not edit
--- a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
+++ b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
@ -2442,6 +2442,12 @@ public class IRNode {
        vectorNode(VECTOR_BLEND_D, "VectorBlend", TYPE_DOUBLE);
    }

+    public static final String VECTOR_BITWISE_BLEND = PREFIX + "VECTOR_BITWISE_BLEND" + POSTFIX;  // placeholder for VectorBitwiseBlend nodes; built with PREFIX and no element type, unlike the typed VECTOR_BLEND_* entries
+    static {
+        String regex = START + "VectorBitwiseBlend" + MID + END;  // matches the node by its name only
+        afterBarrierExpansionToBeforeMatching(VECTOR_BITWISE_BLEND, regex);  // NOTE(review): phase range presumably as the helper name says (after barrier expansion up to before matching) - confirm
+    }
+
    public static final String VECTOR_MASK_CMP_I = VECTOR_PREFIX + "VECTOR_MASK_CMP_I" + POSTFIX;
    static {
        vectorNode(VECTOR_MASK_CMP_I, "VectorMaskCmp", TYPE_INT);
--- a/test/hotspot/jtreg/compiler/vectorapi/VectorBitwiseBlendTest.java
+++ b/test/hotspot/jtreg/compiler/vectorapi/VectorBitwiseBlendTest.java
@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ * @test
+ * @bug 8382052
+ * @key randomness
+ * @library /test/lib /
+ * @summary IR tests for Vector BITWISE_BLEND optimization
+ * @modules jdk.incubator.vector
+ *
+ * @run driver ${test.main.class}
+ */
+
+package compiler.vectorapi;
+
+import compiler.lib.generators.*;
+import compiler.lib.ir_framework.*;
+import jdk.incubator.vector.*;
+
+public class VectorBitwiseBlendTest {  // IR-shape test only: the values written to the *r arrays are never verified
+
+    private static final Generators RD = Generators.G;
+
+    private static final VectorSpecies<Byte> B_SPECIES = ByteVector.SPECIES_MAX;  // SPECIES_MAX, so the vector size depends on the machine and MaxVectorSize
+    private static final VectorSpecies<Short> S_SPECIES = ShortVector.SPECIES_MAX;
+    private static final VectorSpecies<Integer> I_SPECIES = IntVector.SPECIES_MAX;
+    private static final VectorSpecies<Long> L_SPECIES = LongVector.SPECIES_MAX;
+
+    private static final int BUF_LEN = 256;  // element count of every input and output array below
+
+    private static final byte[] ba = new byte[BUF_LEN];  // naming: <type>a / <type>b / <type>c are the three operands, <type>r the result
+    private static final byte[] bb = new byte[BUF_LEN];
+    private static final byte[] bc = new byte[BUF_LEN];
+    private static final byte[] br = new byte[BUF_LEN];
+
+    private static final short[] sa = new short[BUF_LEN];
+    private static final short[] sb = new short[BUF_LEN];
+    private static final short[] sc = new short[BUF_LEN];
+    private static final short[] sr = new short[BUF_LEN];
+
+    private static final int[] ia = new int[BUF_LEN];
+    private static final int[] ib = new int[BUF_LEN];
+    private static final int[] ic = new int[BUF_LEN];
+    private static final int[] ir = new int[BUF_LEN];
+
+    private static final long[] la = new long[BUF_LEN];
+    private static final long[] lb = new long[BUF_LEN];
+    private static final long[] lc = new long[BUF_LEN];
+    private static final long[] lr = new long[BUF_LEN];
+
+    private static final boolean[] mask_arr = new boolean[BUF_LEN];  // shared by all masked tests
+
+    static {
+        Generator<Integer> iGen = RD.ints();
+        Generator<Long> lGen = RD.longs();
+
+        for (int i = 0; i < BUF_LEN; i++) {
+            mask_arr[i] = (i & 1) != 0;  // odd indices are set, even indices are unset
+            ba[i] = iGen.next().byteValue();  // byte and short inputs are narrowed from generated ints
+            bb[i] = iGen.next().byteValue();
+            bc[i] = iGen.next().byteValue();
+            sa[i] = iGen.next().shortValue();
+            sb[i] = iGen.next().shortValue();
+            sc[i] = iGen.next().shortValue();
+        }
+        RD.fill(iGen, ia);
+        RD.fill(iGen, ib);
+        RD.fill(iGen, ic);
+        RD.fill(lGen, la);
+        RD.fill(lGen, lb);
+        RD.fill(lGen, lc);
+    }
+
+    @Test
+    @IR(counts = { IRNode.VECTOR_BITWISE_BLEND, "= 1" },  // rule 1: asimd without sve2, applied only when MaxVectorSize <= 16
+        applyIfCPUFeatureAnd = { "asimd", "true", "sve2", "false" },
+        applyIf = { "MaxVectorSize", "<= 16" })
+    @IR(counts = { IRNode.VECTOR_BITWISE_BLEND, "= 1" },  // rule 2: sve2 present, no vector size restriction
+        applyIfCPUFeature = { "sve2", "true" })
+    public static void testUnmaskedBlendByte() {
+        ByteVector va = ByteVector.fromArray(B_SPECIES, ba, 0);
+        ByteVector vb = ByteVector.fromArray(B_SPECIES, bb, 0);
+        ByteVector vc = ByteVector.fromArray(B_SPECIES, bc, 0);
+        va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc).intoArray(br, 0);  // unmasked: exactly one VectorBitwiseBlend node is expected
+    }
+
+    @Test
+    @IR(counts = { IRNode.VECTOR_BITWISE_BLEND, "= 1" },
+        applyIfCPUFeatureAnd = { "asimd", "true", "sve2", "false" },
+        applyIf = { "MaxVectorSize", "<= 16" })
+    @IR(counts = { IRNode.VECTOR_BITWISE_BLEND, "= 1" },
+        applyIfCPUFeature = { "sve2", "true" })
+    public static void testUnmaskedBlendShort() {
+        ShortVector va = ShortVector.fromArray(S_SPECIES, sa, 0);
+        ShortVector vb = ShortVector.fromArray(S_SPECIES, sb, 0);
+        ShortVector vc = ShortVector.fromArray(S_SPECIES, sc, 0);
+        va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc).intoArray(sr, 0);
+    }
+
+    @Test
+    @IR(counts = { IRNode.VECTOR_BITWISE_BLEND, "= 1" },
+        applyIfCPUFeatureAnd = { "asimd", "true", "sve2", "false" },
+        applyIf = { "MaxVectorSize", "<= 16" })
+    @IR(counts = { IRNode.VECTOR_BITWISE_BLEND, "= 1" },
+        applyIfCPUFeature = { "sve2", "true" })
+    public static void testUnmaskedBlendInt() {
+        IntVector va = IntVector.fromArray(I_SPECIES, ia, 0);
+        IntVector vb = IntVector.fromArray(I_SPECIES, ib, 0);
+        IntVector vc = IntVector.fromArray(I_SPECIES, ic, 0);
+        va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc).intoArray(ir, 0);
+    }
+
+    @Test
+    @IR(counts = { IRNode.VECTOR_BITWISE_BLEND, "= 1" },
+        applyIfCPUFeatureAnd = { "asimd", "true", "sve2", "false" },
+        applyIf = { "MaxVectorSize", "<= 16" })
+    @IR(counts = { IRNode.VECTOR_BITWISE_BLEND, "= 1" },
+        applyIfCPUFeature = { "sve2", "true" })
+    public static void testUnmaskedBlendLong() {
+        LongVector va = LongVector.fromArray(L_SPECIES, la, 0);
+        LongVector vb = LongVector.fromArray(L_SPECIES, lb, 0);
+        LongVector vc = LongVector.fromArray(L_SPECIES, lc, 0);
+        va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc).intoArray(lr, 0);
+    }
+
+    @Test
+    @IR(counts = { IRNode.VECTOR_BLEND_B, "= 1",  // masked variants expect one VectorBlend in addition to the VectorBitwiseBlend
+                   IRNode.VECTOR_BITWISE_BLEND, "= 1" },
+        applyIfCPUFeatureAnd = { "asimd", "true", "sve2", "false" },
+        applyIf = { "MaxVectorSize", "<= 16" })
+    @IR(counts = { IRNode.VECTOR_BLEND_B, "= 1",
+                   IRNode.VECTOR_BITWISE_BLEND, "= 1" },
+        applyIfCPUFeature = { "sve2", "true" })
+    public static void testMaskedBlendByte() {
+        VectorMask<Byte> mask = VectorMask.fromArray(B_SPECIES, mask_arr, 0);
+        ByteVector va = ByteVector.fromArray(B_SPECIES, ba, 0);
+        ByteVector vb = ByteVector.fromArray(B_SPECIES, bb, 0);
+        ByteVector vc = ByteVector.fromArray(B_SPECIES, bc, 0);
+        va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc, mask).intoArray(br, 0);  // masked form of the same operation
+    }
+
+    @Test
+    @IR(counts = { IRNode.VECTOR_BLEND_S, "= 1",
+                   IRNode.VECTOR_BITWISE_BLEND, "= 1" },
+        applyIfCPUFeatureAnd = { "asimd", "true", "sve2", "false" },
+        applyIf = { "MaxVectorSize", "<= 16" })
+    @IR(counts = { IRNode.VECTOR_BLEND_S, "= 1",
+                   IRNode.VECTOR_BITWISE_BLEND, "= 1" },
+        applyIfCPUFeature = { "sve2", "true" })
+    public static void testMaskedBlendShort() {
+        VectorMask<Short> mask = VectorMask.fromArray(S_SPECIES, mask_arr, 0);
+        ShortVector va = ShortVector.fromArray(S_SPECIES, sa, 0);
+        ShortVector vb = ShortVector.fromArray(S_SPECIES, sb, 0);
+        ShortVector vc = ShortVector.fromArray(S_SPECIES, sc, 0);
+        va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc, mask).intoArray(sr, 0);
+    }
+
+    @Test
+    @IR(counts = { IRNode.VECTOR_BLEND_I, "= 1",
+                   IRNode.VECTOR_BITWISE_BLEND, "= 1" },
+        applyIfCPUFeatureAnd = { "asimd", "true", "sve2", "false" },
+        applyIf = { "MaxVectorSize", "<= 16" })
+    @IR(counts = { IRNode.VECTOR_BLEND_I, "= 1",
+                   IRNode.VECTOR_BITWISE_BLEND, "= 1" },
+        applyIfCPUFeature = { "sve2", "true" })
+    public static void testMaskedBlendInt() {
+        VectorMask<Integer> mask = VectorMask.fromArray(I_SPECIES, mask_arr, 0);
+        IntVector va = IntVector.fromArray(I_SPECIES, ia, 0);
+        IntVector vb = IntVector.fromArray(I_SPECIES, ib, 0);
+        IntVector vc = IntVector.fromArray(I_SPECIES, ic, 0);
+        va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc, mask).intoArray(ir, 0);
+    }
+
+    @Test
+    @IR(counts = { IRNode.VECTOR_BLEND_L, "= 1",
+                   IRNode.VECTOR_BITWISE_BLEND, "= 1" },
+        applyIfCPUFeatureAnd = { "asimd", "true", "sve2", "false" },
+        applyIf = { "MaxVectorSize", "<= 16" })
+    @IR(counts = { IRNode.VECTOR_BLEND_L, "= 1",
+                   IRNode.VECTOR_BITWISE_BLEND, "= 1" },
+        applyIfCPUFeature = { "sve2", "true" })
+    public static void testMaskedBlendLong() {
+        VectorMask<Long> mask = VectorMask.fromArray(L_SPECIES, mask_arr, 0);
+        LongVector va = LongVector.fromArray(L_SPECIES, la, 0);
+        LongVector vb = LongVector.fromArray(L_SPECIES, lb, 0);
+        LongVector vc = LongVector.fromArray(L_SPECIES, lc, 0);
+        va.lanewise(VectorOperators.BITWISE_BLEND, vb, vc, mask).intoArray(lr, 0);
+    }
+
+    public static void main(String[] args) {
+        TestFramework testFramework = new TestFramework();
+        testFramework.setDefaultWarmup(10000)  // warm-up count is raised explicitly for every test in this class
+                     .addFlags("--add-modules=jdk.incubator.vector")  // the Vector API lives in an incubator module
+                     .start();
+    }
+}
--- a/test/micro/org/openjdk/bench/jdk/incubator/vector/MaskedLogicOpts.java
+++ b/test/micro/org/openjdk/bench/jdk/incubator/vector/MaskedLogicOpts.java
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, 2025, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2022, 2026, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -55,8 +55,9 @@ public class MaskedLogicOpts {
    int int512_arr_idx;
    int int256_arr_idx;
    int int128_arr_idx;
-    int long256_arr_idx;
    int long512_arr_idx;
+    int long256_arr_idx;
+    int long128_arr_idx;

    private Random r = new Random(1024);

@ -65,8 +66,9 @@ public class MaskedLogicOpts {
        int512_arr_idx = -16;
        int256_arr_idx = -8;
        int128_arr_idx = -4;
-        long256_arr_idx = -4;
        long512_arr_idx = -8;
+        long256_arr_idx = -4;
+        long128_arr_idx = -2;

        mask_arr = new boolean[ARRAYLEN];
        i1 = new int[ARRAYLEN];
@ -106,6 +108,7 @@ public class MaskedLogicOpts {
        int128_arr_idx = (((ARRAYLEN & ~3) - int128_arr_idx) <= 4)  ? 0 : int128_arr_idx + 4;
        long512_arr_idx = (((ARRAYLEN & ~7) - long512_arr_idx) <= 8) ? 0 : long512_arr_idx + 8;
        long256_arr_idx = (((ARRAYLEN & ~3) - long256_arr_idx) <= 4) ? 0 : long256_arr_idx + 4;
+        long128_arr_idx = (((ARRAYLEN & ~1) - long128_arr_idx) <= 2) ? 0 : long128_arr_idx + 2;
    }

    @CompilerControl(CompilerControl.Mode.INLINE)
@ -278,6 +281,11 @@ public class MaskedLogicOpts {
        partiallyMaskedLogicOperationsLongKernel(LongVector.SPECIES_256, long256_arr_idx);
    }

+    @Benchmark // SPECIES_128 counterpart of the SPECIES_256 call just above
+    public void partiallyMaskedLogicOperationsLong128() {
+        partiallyMaskedLogicOperationsLongKernel(LongVector.SPECIES_128, long128_arr_idx); // long128_arr_idx starts at -2 and is stepped by 2 elsewhere in this class
+    }
+
    @CompilerControl(CompilerControl.Mode.INLINE)
    public void bitwiseBlendOperationLongKernel(VectorSpecies<Long> SPECIES, int index) {
        VectorMask<Long> lmask = VectorMask.fromArray(SPECIES, mask_arr, index);
@ -305,4 +313,9 @@ public class MaskedLogicOpts {
    public void bitwiseBlendOperationLong256() {
        bitwiseBlendOperationLongKernel(LongVector.SPECIES_256, long256_arr_idx);
    }
+
+    @Benchmark // SPECIES_128 counterpart of bitwiseBlendOperationLong256
+    public void bitwiseBlendOperationLong128() {
+        bitwiseBlendOperationLongKernel(LongVector.SPECIES_128, long128_arr_idx); // long128_arr_idx starts at -2 and is stepped by 2 elsewhere in this class
+    }
 }