diff --git a/src/hotspot/share/opto/subnode.hpp b/src/hotspot/share/opto/subnode.hpp
index 387c1c46ba9..29ec25b41f8 100644
--- a/src/hotspot/share/opto/subnode.hpp
+++ b/src/hotspot/share/opto/subnode.hpp
@@ -520,7 +520,12 @@ class SqrtDNode : public Node {
 public:
   SqrtDNode(Compile* C, Node *c, Node *in1) : Node(c, in1) {
     init_flags(Flag_is_expensive);
-    C->add_expensive_node(this);
+    // Register the node as expensive only when it has a control input: it may
+    // be created by VectorNode::push_through_replicate from a SqrtVDNode, which
+    // has no control input to hand over.
+    if (c != nullptr) {
+      C->add_expensive_node(this);
+    }
   }
   virtual int Opcode() const;
   const Type *bottom_type() const { return Type::DOUBLE; }
diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp
index 651a27af9c7..a54fe6e3a73 100644
--- a/src/hotspot/share/opto/vectornode.cpp
+++ b/src/hotspot/share/opto/vectornode.cpp
@@ -22,10 +22,12 @@
  */
 
 #include "memory/allocation.inline.hpp"
+#include "opto/addnode.hpp"
 #include "opto/c2_globals.hpp"
 #include "opto/compile.hpp"
 #include "opto/connode.hpp"
 #include "opto/convertnode.hpp"
+#include "opto/divnode.hpp"
 #include "opto/mulnode.hpp"
 #include "opto/subnode.hpp"
 #include "opto/vectornode.hpp"
@@ -290,7 +292,146 @@ int VectorNode::opcode(int sopc, BasicType bt) {
     assert(!VectorNode::is_convert_opcode(sopc),
            "Convert node %s should be processed by VectorCastNode::opcode()",
            NodeClassNames[sopc]);
-    return 0; // Unimplemented
+    return 0;  // not handled
+  }
+}
+
+// Return the scalar opcode matching vector opcode 'vopc'. 'bt' is consulted only for the
+// opcodes without a type suffix (AndV, OrV, XorV, MinV, MaxV). Returns 0 if not handled.
+int VectorNode::scalar_opcode(int vopc, BasicType bt) {
+  switch (vopc) {
+    case Op_AddVB:
+    case Op_AddVS:
+    case Op_AddVI:
+      return Op_AddI;  // byte, short and int lanes all map to the int operation
+    case Op_AddVL:
+      return Op_AddL;
+    case Op_AddVF:
+      return Op_AddF;
+    case Op_AddVD:
+      return Op_AddD;
+
+    case Op_SubVB:
+    case Op_SubVS:
+    case Op_SubVI:
+      return Op_SubI;
+    case Op_SubVL:
+      return Op_SubL;
+    case Op_SubVF:
+      return Op_SubF;
+    case Op_SubVD:
+      return Op_SubD;
+
+    case Op_MulVB:
+    case Op_MulVS:
+    case Op_MulVI:
+      return Op_MulI;
+    case Op_MulVL:
+      return Op_MulL;
+    case Op_MulVF:
+      return Op_MulF;
+    case Op_MulVD:
+      return Op_MulD;
+
+    case Op_DivVF:
+      return Op_DivF;
+    case Op_DivVD:
+      return Op_DivD;
+
+    case Op_AndV:
+      switch (bt) {  // the element type selects between the int and long scalar form
+        case T_BOOLEAN:
+        case T_CHAR:
+        case T_BYTE:
+        case T_SHORT:
+        case T_INT:
+          return Op_AndI;
+        case T_LONG:
+          return Op_AndL;
+        default:
+          return 0;
+      }
+
+    case Op_OrV:
+      switch (bt) {
+        case T_BOOLEAN:
+        case T_CHAR:
+        case T_BYTE:
+        case T_SHORT:
+        case T_INT:
+          return Op_OrI;
+        case T_LONG:
+          return Op_OrL;
+        default:
+          return 0;
+      }
+
+    case Op_XorV:
+      switch (bt) {
+        case T_BOOLEAN:
+        case T_CHAR:
+        case T_BYTE:
+        case T_SHORT:
+        case T_INT:
+          return Op_XorI;
+        case T_LONG:
+          return Op_XorL;
+        default:
+          return 0;
+      }
+
+    case Op_MinV:
+      switch (bt) {
+        case T_BOOLEAN:
+        case T_CHAR:
+          // unsigned, not supported for Min
+          return 0;
+        case T_BYTE:
+        case T_SHORT:
+        case T_INT:
+          return Op_MinI;  // NOTE(review): MinI compares all 32 bits; confirm subword operands are sign-extended
+        case T_LONG:
+          return Op_MinL;
+        case T_FLOAT:
+          return Op_MinF;
+        case T_DOUBLE:
+          return Op_MinD;
+        default:
+          return 0;
+      }
+
+    case Op_MaxV:
+      switch (bt) {
+        case T_BOOLEAN:
+        case T_CHAR:
+          // unsigned, not supported for Max
+          return 0;
+        case T_BYTE:
+        case T_SHORT:
+        case T_INT:
+          return Op_MaxI;  // NOTE(review): MaxI compares all 32 bits; confirm subword operands are sign-extended
+        case T_LONG:
+          return Op_MaxL;
+        case T_FLOAT:
+          return Op_MaxF;
+        case T_DOUBLE:
+          return Op_MaxD;
+        default:
+          return 0;
+      }
+
+    case Op_SqrtVD:
+      return Op_SqrtD;
+    case Op_SqrtVF:
+      return Op_SqrtF;
+
+    case Op_FmaVF:
+      return Op_FmaF;
+    case Op_FmaVD:
+      return Op_FmaD;
+
+    default:
+      return 0;  // not handled
   }
 }
 
@@ -984,17 +1125,9 @@ static Node* ideal_partial_operations(PhaseGVN* phase, Node* node, const TypeVec
   }
 }
 
-bool VectorNode::should_swap_inputs_to_help_global_value_numbering() {
-  // Predicated vector operations are sensitive to ordering of inputs.
-  // When the mask corresponding to a vector lane is false then
-  // the result of the operation is corresponding lane of its first operand.
-  //   i.e. RES = VEC1.lanewise(OPER, VEC2, MASK) is semantically equivalent to
-  //        RES = BLEND(VEC1, VEC1.lanewise(OPER, VEC2), MASK)
-  if (is_predicated_vector()) {
-    return false;
-  }
-
-  switch(Opcode()) {
+// Check if the vector operation is commutative (assuming that it is not predicated/masked).
+static bool is_commutative_vector_operation(int opcode) {
+  switch(opcode) {
     case Op_AddVB:
     case Op_AddVS:
     case Op_AddVI:
@@ -1022,18 +1155,228 @@ bool VectorNode::should_swap_inputs_to_help_global_value_numbering() {
     case Op_XorVMask:
 
     case Op_SaturatingAddV:
-      assert(req() == 3, "Must be a binary operation");
-      // For non-predicated commutative operations, sort the inputs in
-      // increasing order of node indices.
-      if (in(1)->_idx > in(2)->_idx) {
-        return true;
-      }
-      // fallthrough
+      return true;
     default:
       return false;
   }
 }
 
+bool VectorNode::should_swap_inputs_to_help_global_value_numbering() {
+  // Predicated vector operations are sensitive to ordering of inputs.
+  // When the mask corresponding to a vector lane is false then
+  // the result of the operation is corresponding lane of its first operand.
+  //   i.e. RES = VEC1.lanewise(OPER, VEC2, MASK) is semantically equivalent to
+  //        RES = BLEND(VEC1, VEC1.lanewise(OPER, VEC2), MASK)
+  if (is_predicated_vector()) {
+    return false;
+  }
+
+  if (is_commutative_vector_operation(Opcode())) {
+    assert(req() == 3, "Must be a binary operation");
+    // For non-predicated commutative operations, sort the inputs in
+    // increasing order of node indices.
+    if (in(1)->_idx > in(2)->_idx) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Check whether this vector op can be pushed through Replicate (see the conditions below).
+bool VectorNode::can_push_through_replicate(BasicType bt) {
+  if (scalar_opcode(Opcode(), bt) == 0) {  // no scalar equivalent for this opcode/type
+    return false;
+  }
+
+  // Skip predicated vector operations for now: for masked-off lanes the contents of the
+  // destination/first source vector are preserved.
+  if (is_predicated_vector()) {
+    return false;
+  }
+
+  for (uint i = 1; i < req(); i++) {  // every non-control input must be a Replicate
+    if (in(i)->Opcode() != Op_Replicate) {
+      return false;
+    }
+  }
+  return true;
+}
+
+Node* VectorNode::make_scalar(Compile* c, int vopc, BasicType bt, Node* control, Node* in1, Node* in2, Node* in3) {
+  int sopc = scalar_opcode(vopc, bt);  // 0 means there is no scalar equivalent
+  assert(sopc != 0, "unhandled vector opcode %s", NodeClassNames[vopc]);
+  assert(opcode(sopc, bt) == vopc, "scalar_opcode and opcode must agree for %s", NodeClassNames[vopc]);
+  switch (sopc) {  // the returned node is newly allocated and not yet transformed
+    case Op_AddI:
+      return new AddINode(in1, in2);
+    case Op_AddL:
+      return new AddLNode(in1, in2);
+    case Op_AddF:
+      return new AddFNode(in1, in2);
+    case Op_AddD:
+      return new AddDNode(in1, in2);
+    case Op_MulI:
+      return new MulINode(in1, in2);
+    case Op_MulL:
+      return new MulLNode(in1, in2);
+    case Op_MulF:
+      return new MulFNode(in1, in2);
+    case Op_MulD:
+      return new MulDNode(in1, in2);
+    case Op_AndI:
+      return new AndINode(in1, in2);
+    case Op_AndL:
+      return new AndLNode(in1, in2);
+    case Op_DivF:
+      return new DivFNode(control, in1, in2);  // Div constructors take the control input
+    case Op_DivD:
+      return new DivDNode(control, in1, in2);
+    case Op_OrI:
+      return new OrINode(in1, in2);
+    case Op_OrL:
+      return new OrLNode(in1, in2);
+    case Op_XorI:
+      return new XorINode(in1, in2);
+    case Op_XorL:
+      return new XorLNode(in1, in2);
+    case Op_SubI:
+      return new SubINode(in1, in2);
+    case Op_SubL:
+      return new SubLNode(in1, in2);
+    case Op_SubF:
+      return new SubFNode(in1, in2);
+    case Op_SubD:
+      return new SubDNode(in1, in2);
+    case Op_MinI:
+      return new MinINode(in1, in2);
+    case Op_MinL:
+      return new MinLNode(c, in1, in2);  // long Min/Max constructors take the Compile object
+    case Op_MinF:
+      return new MinFNode(in1, in2);
+    case Op_MinD:
+      return new MinDNode(in1, in2);
+    case Op_MaxI:
+      return new MaxINode(in1, in2);
+    case Op_MaxL:
+      return new MaxLNode(c, in1, in2);
+    case Op_MaxF:
+      return new MaxFNode(in1, in2);
+    case Op_MaxD:
+      return new MaxDNode(in1, in2);
+    case Op_SqrtF:
+      return new SqrtFNode(c, control, in1);  // unary: in2 and in3 are not used
+    case Op_SqrtD:
+      return new SqrtDNode(c, control, in1);
+    case Op_FmaF:
+      return new FmaFNode(in1, in2, in3);  // the only cases that use in3
+    case Op_FmaD:
+      return new FmaDNode(in1, in2, in3);
+    default:
+      assert(false, "unexpected scalar opcode");
+      return nullptr;  // reached only when the assert above is compiled out
+  }
+}
+
+// Clones 'child' and 'parent' and re-wires the clones into the following shape:
+//   parent(child(cinput1, cinput2), pinput2)
+Node* VectorNode::create_reassociated_node(Node* parent, Node* child, Node* cinput1, Node* cinput2,
+                                           Node* pinput2, PhaseGVN* phase) {
+  Node* cloned_child = child->clone();
+  cloned_child->set_req(1, cinput1);
+  cloned_child->set_req(2, cinput2);
+  cloned_child = phase->transform(cloned_child);  // may come back as a different node
+  Node* cloned_parent = parent->clone();
+  cloned_parent->set_req(1, cloned_child);
+  cloned_parent->set_req(2, pinput2);
+  return cloned_parent;  // the new parent is not transformed here
+}
+
+// Try to reassociate commutative vector operations using the following ideal transformation,
+// this will facilitate strength reducing a vector operation with all replicated inputs to
+// a scalar operation.
+//
+// VectorOp (Replicate INP1) (VectorOp (Replicate INP2) INP3) =>
+//    VectorOp (VectorOp (Replicate INP1) (Replicate INP2)) INP3
+//
+Node* VectorNode::reassociate_vector_operation(PhaseGVN* phase) {
+  // Re-associate only integral, commutative operations that have a scalar equivalent.
+  // Without one (scalar_opcode() == 0) the all-Replicate child built below can never be
+  // folded, so the same Replicate operands would be re-associated over and over; this
+  // also rejects SaturatingAddV, whose signed form is commutative but not associative.
+  const BasicType bt = vect_type()->element_basic_type();
+  if (!is_integral_type(bt) || !is_commutative_vector_operation(Opcode()) ||
+      scalar_opcode(Opcode(), bt) == 0) {
+    return nullptr;
+  }
+
+  Node* in1 = in(1);
+  Node* in2 = in(2);
+  if (in2->Opcode() == Op_Replicate && in1->Opcode() == Opcode()) {
+    swap(in1, in2);
+  }
+
+  if (in1->Opcode() != Op_Replicate || in2->Opcode() != Opcode()) {
+    return nullptr;
+  }
+
+  // Skip predicated vector operations, mask semantics prevent reassociation.
+  if (is_predicated_vector() || in2->as_Vector()->is_predicated_vector()) {
+    return nullptr;
+  }
+
+  Node* in2_1 = in2->in(1);
+  Node* in2_2 = in2->in(2);
+  if (in2_1->Opcode() == Op_Replicate) {
+    return create_reassociated_node(this, in2, in1, in2_1, in2_2, phase);
+  } else if (in2_2->Opcode() == Op_Replicate) {
+    return create_reassociated_node(this, in2, in1, in2_2, in2_1, phase);
+  }
+
+  return nullptr;
+}
+
+// Convert vector operation with all Replicate inputs to scalar operation using following
+// ideal transformation.
+//
+// VectorOp (Replicate INP1, Replicate INP2) =>
+//   Replicate (ScalarOp INP1, INP2)
+//
+Node* VectorNode::push_through_replicate(PhaseGVN* phase) {
+  BasicType bt = vect_type()->element_basic_type();
+  if (!can_push_through_replicate(bt)) {
+    return nullptr;
+  }
+
+  assert(req() >= 2 && req() <= 4, "unexpected req() %u for %s", req(), NodeClassNames[Opcode()]);
+
+  // Replicate keeps only the low bits of a byte/short scalar, whereas MinI/MaxI compare
+  // all 32 bits (e.g. the untruncated AddI result of an earlier push). Sign-extend the
+  // subword Min/Max operands so that the scalar comparison matches the lane comparison.
+  const bool narrow = (bt == T_BYTE || bt == T_SHORT) && (Opcode() == Op_MinV || Opcode() == Op_MaxV);
+  Node* shift = narrow ? phase->intcon(bt == T_BYTE ? 24 : 16) : nullptr;
+
+  // Scalar operands; slots beyond the arity of this operation stay null.
+  Node* sinp[3] = { nullptr, nullptr, nullptr };
+  for (uint i = 1; i < req() && i <= 3; i++) {
+    assert(in(i)->Opcode() == Op_Replicate, "");
+    sinp[i - 1] = in(i)->in(1);
+    if (narrow) {
+      Node* shl = phase->transform(new LShiftINode(sinp[i - 1], shift));
+      sinp[i - 1] = phase->transform(new RShiftINode(shl, shift));
+    }
+  }
+
+  Node* sop = make_scalar(phase->C, Opcode(), bt, in(0), sinp[0], sinp[1], sinp[2]);
+  if (sop == nullptr) {
+    return nullptr;
+  }
+
+  sop = phase->transform(sop);
+
+  return new ReplicateNode(sop, vect_type());
+}
+
 Node* VectorNode::Ideal(PhaseGVN* phase, bool can_reshape) {
   Node* n = ideal_partial_operations(phase, this, vect_type());
   if (n != nullptr) {
@@ -1044,7 +1387,13 @@ Node* VectorNode::Ideal(PhaseGVN* phase, bool can_reshape) {
   if (should_swap_inputs_to_help_global_value_numbering()) {
     swap_edges(1, 2);
   }
-  return nullptr;
+
+  n = push_through_replicate(phase);
+  if (n != nullptr) {
+    return n;
+  }
+
+  return reassociate_vector_operation(phase);
 }
 
 // Traverses a chain of VectorMaskCast and returns the first non VectorMaskCast node.
@@ -2094,7 +2443,7 @@ Node* FmaVNode::Ideal(PhaseGVN* phase, bool can_reshape) {
     swap_edges(1, 2);
     return this;
   }
-  return nullptr;
+  return VectorNode::Ideal(phase, can_reshape);
 }
 
 // Generate other vector nodes to implement the masked/non-masked vector negation.
diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp
index 897cedd6a1b..6bcb7702d13 100644
--- a/src/hotspot/share/opto/vectornode.hpp
+++ b/src/hotspot/share/opto/vectornode.hpp
@@ -146,12 +146,20 @@ class VectorNode : public TypeNode {
   static bool is_minmax_opcode(int opc);
 
   bool should_swap_inputs_to_help_global_value_numbering();
+  Node* reassociate_vector_operation(PhaseGVN* phase);
+  static Node* create_reassociated_node(Node* parent, Node* child, Node* cinput1, Node* cinput2,
+                                        Node* pinput2, PhaseGVN* phase);
 
   static bool is_vshift_cnt_opcode(int opc);
 
   static bool is_rotate_opcode(int opc);
 
   static int opcode(int sopc, BasicType bt);         // scalar_opc -> vector_opc
+  static int scalar_opcode(int vopc, BasicType bt);  // vector_opc -> scalar_opc, 0 if not handled
+  static Node* make_scalar(Compile* c, int vopc, BasicType bt, Node* control, Node* in1, Node* in2, Node* in3);
+
+  bool can_push_through_replicate(BasicType bt);
+  Node* push_through_replicate(PhaseGVN* phase);
 
   static int shift_count_opcode(int opc);
 
diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
index 55d591acdb3..4f7869f444a 100644
--- a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
+++ b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
@@ -233,6 +233,11 @@ public class IRNode {
         beforeMatchingNameRegex(ADD_P, "AddP");
     }
 
+    public static final String ADD_D = PREFIX + "ADD_D" + POSTFIX;
+    static {
+        beforeMatchingNameRegex(ADD_D, "AddD");
+    }
+
     public static final String ADD_VD = VECTOR_PREFIX + "ADD_VD" + POSTFIX;
     static {
         vectorNode(ADD_VD, "AddVD", TYPE_DOUBLE);
@@ -763,11 +768,21 @@ public class IRNode {
         vectorNode(DIV_VHF, "DivVHF", TYPE_SHORT);
     }
 
+    public static final String DIV_F = PREFIX + "DIV_F" + POSTFIX;
+    static {
+        beforeMatchingNameRegex(DIV_F, "DivF");
+    }
+
     public static final String DIV_VF = VECTOR_PREFIX + "DIV_VF" + POSTFIX;
     static {
         vectorNode(DIV_VF, "DivVF", TYPE_FLOAT);
     }
 
+    public static final String DIV_D = PREFIX + "DIV_D" + POSTFIX;
+    static {
+        beforeMatchingNameRegex(DIV_D, "DivD");
+    }
+
     public static final String DIV_VD = VECTOR_PREFIX + "DIV_VD" + POSTFIX;
     static {
         vectorNode(DIV_VD, "DivVD", TYPE_DOUBLE);
diff --git a/test/hotspot/jtreg/compiler/vectorapi/TestVectorBroadcastTransforms.java b/test/hotspot/jtreg/compiler/vectorapi/TestVectorBroadcastTransforms.java
new file mode 100644
index 00000000000..c58a6710c86
--- /dev/null
+++ b/test/hotspot/jtreg/compiler/vectorapi/TestVectorBroadcastTransforms.java
@@ -0,0 +1,1100 @@
+/*
+ * Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ * @test
+ * @bug 8358521
+ * @summary Optimize vector operations by reassociating broadcasted inputs
+ * @modules jdk.incubator.vector
+ * @library /test/lib /
+ * @run driver compiler.vectorapi.TestVectorBroadcastTransforms
+ */
+
+package compiler.vectorapi;
+
+import compiler.lib.ir_framework.*;
+import compiler.lib.verify.Verify;
+import jdk.incubator.vector.*;
+
+import jdk.test.lib.Utils;
+import java.util.Random;
+
+public class TestVectorBroadcastTransforms {
+
+    public static void main(String[] args) {
+        TestFramework.runWithFlags("--add-modules=jdk.incubator.vector");
+    }
+
+    private static final Random R = Utils.getRandomInstance();
+
+    /* =======================
+     * INT
+     * ======================= */
+
+    static final VectorSpecies<Integer> ISP = IntVector.SPECIES_PREFERRED;
+
+    @Test
+    @IR(failOn = IRNode.ADD_VI,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.ADD_I, ">= 1", IRNode.REPLICATE_I, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static int int_add(int ia, int ib) {
+        return IntVector.broadcast(ISP, ia)
+                .add(IntVector.broadcast(ISP, ib))
+                .lane(0);
+    }
+
+    @Run(test = "int_add")
+    static void run_int_add() {
+        int ia = R.nextInt();
+        int ib = R.nextInt();
+        int ir = int_add(ia, ib);
+        Verify.checkEQ(ir, ia + ib);
+    }
+
+    @Test
+    @IR(failOn = IRNode.SUB_VI,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.SUB_I, ">= 1", IRNode.REPLICATE_I, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static int int_sub(int ia, int ib) {
+        return IntVector.broadcast(ISP, ia)
+                .sub(IntVector.broadcast(ISP, ib))
+                .lane(0);
+    }
+
+    @Run(test = "int_sub")
+    static void run_int_sub() {
+        int ia = R.nextInt();
+        int ib = R.nextInt();
+        int ir = int_sub(ia, ib);
+        Verify.checkEQ(ir, ia - ib);
+    }
+
+    @Test
+    @IR(failOn = IRNode.MUL_VI,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MUL_I, ">= 1", IRNode.REPLICATE_I, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static int int_mul(int ia, int ib) {
+        return IntVector.broadcast(ISP, ia)
+                .mul(IntVector.broadcast(ISP, ib))
+                .lane(0);
+    }
+
+    @Run(test = "int_mul")
+    static void run_int_mul() {
+        int ia = R.nextInt();
+        int ib = R.nextInt();
+        int ir = int_mul(ia, ib);
+        Verify.checkEQ(ir, ia * ib);
+    }
+
+    @Test
+    @IR(failOn = IRNode.AND_VI,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.AND_I, ">= 1", IRNode.REPLICATE_I, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static int int_and(int ia, int ib) {
+        return IntVector.broadcast(ISP, ia)
+                .and(IntVector.broadcast(ISP, ib))
+                .lane(0);
+    }
+
+    @Run(test = "int_and")
+    static void run_int_and() {
+        int ia = R.nextInt();
+        int ib = R.nextInt();
+        int ir = int_and(ia, ib);
+        Verify.checkEQ(ir, ia & ib);
+    }
+
+    @Test
+    @IR(failOn = IRNode.OR_VI,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.OR_I, ">= 1", IRNode.REPLICATE_I, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static int int_or(int ia, int ib) {
+        return IntVector.broadcast(ISP, ia)
+                .or(IntVector.broadcast(ISP, ib))
+                .lane(0);
+    }
+
+    @Run(test = "int_or")
+    static void run_int_or() {
+        int ia = R.nextInt();
+        int ib = R.nextInt();
+        int ir = int_or(ia, ib);
+        Verify.checkEQ(ir, ia | ib);
+    }
+
+    @Test
+    @IR(failOn = IRNode.XOR_VI,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.XOR_I, ">= 1", IRNode.REPLICATE_I, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static int int_xor(int ia, int ib) {
+        return IntVector.broadcast(ISP, ia)
+                .lanewise(VectorOperators.XOR, IntVector.broadcast(ISP, ib))
+                .lane(0);
+    }
+
+    @Run(test = "int_xor")
+    static void run_int_xor() {
+        int ia = R.nextInt();
+        int ib = R.nextInt();
+        int ir = int_xor(ia, ib);
+        Verify.checkEQ(ir, ia ^ ib);
+    }
+
+    @Test
+    @IR(failOn = IRNode.MIN_VI,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MIN_I, ">= 1", IRNode.REPLICATE_I, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static int int_min(int ia, int ib) {
+        return IntVector.broadcast(ISP, ia)
+                .min(IntVector.broadcast(ISP, ib))
+                .lane(0);
+    }
+
+    @Run(test = "int_min")
+    static void run_int_min() {
+        int ia = R.nextInt();
+        int ib = R.nextInt();
+        int ir = int_min(ia, ib);
+        Verify.checkEQ(ir, Math.min(ia, ib));
+    }
+
+    @Test
+    @IR(failOn = IRNode.MAX_VI,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MAX_I, ">= 1", IRNode.REPLICATE_I, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static int int_max(int ia, int ib) {
+        return IntVector.broadcast(ISP, ia)
+                .max(IntVector.broadcast(ISP, ib))
+                .lane(0);
+    }
+
+    @Run(test = "int_max")
+    static void run_int_max() {
+        int ia = R.nextInt();
+        int ib = R.nextInt();
+        int ir = int_max(ia, ib);
+        Verify.checkEQ(ir, Math.max(ia, ib));
+    }
+
+    /* =======================
+     * LONG
+     * ======================= */
+
+    static final VectorSpecies<Long> LSP = LongVector.SPECIES_PREFERRED;
+
+    @Test
+    @IR(failOn = IRNode.ADD_VL,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.ADD_L, ">= 1", IRNode.REPLICATE_L, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static long long_add(long la, long lb) {
+        return LongVector.broadcast(LSP, la)
+                .add(LongVector.broadcast(LSP, lb))
+                .lane(0);
+    }
+
+    @Run(test = "long_add")
+    static void run_long_add() {
+        long la = R.nextLong();
+        long lb = R.nextLong();
+        long lr = long_add(la, lb);
+        Verify.checkEQ(lr, la + lb);
+    }
+
+    @Test
+    @IR(failOn = IRNode.SUB_VL,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.SUB_L, ">= 1", IRNode.REPLICATE_L, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static long long_sub(long la, long lb) {
+        return LongVector.broadcast(LSP, la)
+                .sub(LongVector.broadcast(LSP, lb))
+                .lane(0);
+    }
+
+    @Run(test = "long_sub")
+    static void run_long_sub() {
+        long la = R.nextLong();
+        long lb = R.nextLong();
+        long lr = long_sub(la, lb);
+        Verify.checkEQ(lr, la - lb);
+    }
+
+    @Test
+    @IR(failOn = IRNode.MUL_VL,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MUL_L, ">= 1", IRNode.REPLICATE_L, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static long long_mul(long la, long lb) {
+        return LongVector.broadcast(LSP, la)
+                .mul(LongVector.broadcast(LSP, lb))
+                .lane(0);
+    }
+
+    @Run(test = "long_mul")
+    static void run_long_mul() {
+        long la = R.nextLong();
+        long lb = R.nextLong();
+        long lr = long_mul(la, lb);
+        Verify.checkEQ(lr, la * lb);
+    }
+
+    @Test
+    @IR(failOn = IRNode.AND_VL,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.AND_L, ">= 1", IRNode.REPLICATE_L, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static long long_and(long la, long lb) {
+        return LongVector.broadcast(LSP, la)
+                .and(LongVector.broadcast(LSP, lb))
+                .lane(0);
+    }
+
+    @Run(test = "long_and")
+    static void run_long_and() {
+        long la = R.nextLong();
+        long lb = R.nextLong();
+        long lr = long_and(la, lb);
+        Verify.checkEQ(lr, la & lb);
+    }
+
+    @Test
+    @IR(failOn = IRNode.OR_VL,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.OR_L, ">= 1", IRNode.REPLICATE_L, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static long long_or(long la, long lb) {
+        return LongVector.broadcast(LSP, la)
+                .or(LongVector.broadcast(LSP, lb))
+                .lane(0);
+    }
+
+    @Run(test = "long_or")
+    static void run_long_or() {
+        long la = R.nextLong();
+        long lb = R.nextLong();
+        long lr = long_or(la, lb);
+        Verify.checkEQ(lr, la | lb);
+    }
+
+    @Test
+    @IR(failOn = IRNode.XOR_VL,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.XOR_L, ">= 1", IRNode.REPLICATE_L, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static long long_xor(long la, long lb) {
+        return LongVector.broadcast(LSP, la)
+                .lanewise(VectorOperators.XOR, LongVector.broadcast(LSP, lb))
+                .lane(0);
+    }
+
+    @Run(test = "long_xor")
+    static void run_long_xor() {
+        long la = R.nextLong();
+        long lb = R.nextLong();
+        long lr = long_xor(la, lb);
+        Verify.checkEQ(lr, la ^ lb);
+    }
+
+    @Test
+    @IR(failOn = IRNode.MIN_VL,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = {IRNode.REPLICATE_L, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static long long_min(long la, long lb) {
+        return LongVector.broadcast(LSP, la)
+                .min(LongVector.broadcast(LSP, lb))
+                .lane(0);
+    }
+
+    @Run(test = "long_min")
+    static void run_long_min() {
+        long la = R.nextLong();
+        long lb = R.nextLong();
+        long lr = long_min(la, lb);
+        Verify.checkEQ(lr, Math.min(la, lb));
+    }
+
+    @Test
+    @IR(failOn = IRNode.MAX_VL,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = {IRNode.REPLICATE_L, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static long long_max(long la, long lb) {
+        return LongVector.broadcast(LSP, la)
+                .max(LongVector.broadcast(LSP, lb))
+                .lane(0);
+    }
+
+    @Run(test = "long_max")
+    static void run_long_max() {
+        long la = R.nextLong();
+        long lb = R.nextLong();
+        long lr = long_max(la, lb);
+        Verify.checkEQ(lr, Math.max(la, lb));
+    }
+
+    /* =======================
+     * FLOAT
+     * ======================= */
+
+    static final VectorSpecies<Float> FSP = FloatVector.SPECIES_PREFERRED;
+
+    @Test
+    @IR(failOn = IRNode.ADD_VF,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.ADD_F, ">= 1", IRNode.REPLICATE_F, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static float float_add(float fa, float fb) {
+        return FloatVector.broadcast(FSP, fa)
+                .add(FloatVector.broadcast(FSP, fb))
+                .lane(0);
+    }
+
+    @Run(test = "float_add")
+    static void run_float_add() {
+        float fa = R.nextFloat();
+        float fb = R.nextFloat();
+        float fr = float_add(fa, fb);
+        Verify.checkEQ(fr, fa + fb);
+    }
+
+    @Test
+    @IR(failOn = IRNode.SUB_VF,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.SUB_F, ">= 1", IRNode.REPLICATE_F, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static float float_sub(float fa, float fb) {
+        return FloatVector.broadcast(FSP, fa)
+                .sub(FloatVector.broadcast(FSP, fb))
+                .lane(0);
+    }
+
+    @Run(test = "float_sub")
+    static void run_float_sub() {
+        float fa = R.nextFloat();
+        float fb = R.nextFloat();
+        float fr = float_sub(fa, fb);
+        Verify.checkEQ(fr, fa - fb);
+    }
+
+    @Test
+    @IR(failOn = IRNode.MUL_VF,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MUL_F, ">= 1", IRNode.REPLICATE_F, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static float float_mul(float fa, float fb) {
+        return FloatVector.broadcast(FSP, fa)
+                .mul(FloatVector.broadcast(FSP, fb))
+                .lane(0);
+    }
+
+    @Run(test = "float_mul")
+    static void run_float_mul() {
+        float fa = R.nextFloat();
+        float fb = R.nextFloat();
+        float fr = float_mul(fa, fb);
+        Verify.checkEQ(fr, fa * fb);
+    }
+
+    @Test
+    @IR(failOn = IRNode.DIV_VF,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.DIV_F, ">= 1", IRNode.REPLICATE_F, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static float float_div(float fa, float fb) {
+        return FloatVector.broadcast(FSP, fa)
+                .div(FloatVector.broadcast(FSP, fb))
+                .lane(0);
+    }
+
+    @Run(test = "float_div")
+    static void run_float_div() {
+        float fa = R.nextFloat();
+        float fb = R.nextFloat();
+        if (fb == 0f) fb = 1f;
+        float fr = float_div(fa, fb);
+        Verify.checkEQ(fr, fa / fb);
+    }
+
+    @Test
+    @IR(failOn = IRNode.MIN_VF,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MIN_F, ">= 1", IRNode.REPLICATE_F, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static float float_min(float fa, float fb) {
+        return FloatVector.broadcast(FSP, fa)
+                .min(FloatVector.broadcast(FSP, fb))
+                .lane(0);
+    }
+
+    @Run(test = "float_min")
+    static void run_float_min() {
+        float fa = R.nextFloat();
+        float fb = R.nextFloat();
+        float fr = float_min(fa, fb);
+        Verify.checkEQ(fr, Math.min(fa, fb));
+    }
+
+    @Test
+    @IR(failOn = IRNode.MAX_VF,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MAX_F, ">= 1", IRNode.REPLICATE_F, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static float float_max(float fa, float fb) {
+        return FloatVector.broadcast(FSP, fa)
+                .max(FloatVector.broadcast(FSP, fb))
+                .lane(0);
+    }
+
+    @Run(test = "float_max")
+    static void run_float_max() {
+        float fa = R.nextFloat();
+        float fb = R.nextFloat();
+        float fr = float_max(fa, fb);
+        Verify.checkEQ(fr, Math.max(fa, fb));
+    }
+
+    @Test  // sqrt of a broadcast: the rule rejects IRNode.SQRT_VF and requires IRNode.SQRT_F and IRNode.REPLICATE_F
+    @IR(failOn = IRNode.SQRT_VF,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.SQRT_F, ">= 1", IRNode.REPLICATE_F, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static float float_sqrt(float fa) {
+        return FloatVector.broadcast(FSP, fa)
+                .sqrt()
+                .lane(0);  // the operand is a broadcast, so every lane is identical; lane 0 is returned
+    }
+
+    @Run(test = "float_sqrt")  // feeds |R.nextFloat()| + Float.MIN_VALUE and compares with Math.sqrt narrowed to float
+    static void run_float_sqrt() {
+        final float radicand = Float.MIN_VALUE + Math.abs(R.nextFloat());
+        final float expected = (float) Math.sqrt(radicand);
+        Verify.checkEQ(float_sqrt(radicand), expected);
+    }
+
+    @Test  // fma of three broadcasts: the rule rejects IRNode.FMA_VF and requires IRNode.FMA_F and IRNode.REPLICATE_F
+    @IR(failOn = IRNode.FMA_VF,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.FMA_F, ">= 1", IRNode.REPLICATE_F, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static float float_fma(float fa, float fb, float fc) {
+        return FloatVector.broadcast(FSP, fa)
+                .fma(FloatVector.broadcast(FSP, fb),
+                     FloatVector.broadcast(FSP, fc))
+                .lane(0);  // all three operands are broadcasts, so every lane is identical; lane 0 is returned
+    }
+
+    @Run(test = "float_fma")  // drives float_fma with three values drawn from R and checks it against Math.fma
+    static void run_float_fma() {
+        final float mulLhs = R.nextFloat();
+        final float mulRhs = R.nextFloat();
+        final float addend = R.nextFloat();
+        final float expected = Math.fma(mulLhs, mulRhs, addend);
+        Verify.checkEQ(float_fma(mulLhs, mulRhs, addend), expected);
+    }
+
+    /* =======================
+     * DOUBLE
+     * ======================= */
+
+    static final VectorSpecies<Double> DSP = DoubleVector.SPECIES_PREFERRED;  // preferred double species, used by every double kernel below
+
+    @Test  // add of two broadcasts: the rule rejects IRNode.ADD_VD and requires IRNode.ADD_D and IRNode.REPLICATE_D
+    @IR(failOn = IRNode.ADD_VD,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.ADD_D, ">= 1", IRNode.REPLICATE_D, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static double double_add(double da, double db) {
+        return DoubleVector.broadcast(DSP, da)
+                .add(DoubleVector.broadcast(DSP, db))
+                .lane(0);  // both operands are broadcasts, so every lane is identical; lane 0 is returned
+    }
+
+    @Run(test = "double_add")  // drives double_add with two values drawn from R and checks it against scalar +
+    static void run_double_add() {
+        final double lhs = R.nextDouble();
+        final double rhs = R.nextDouble();
+        final double expected = lhs + rhs;
+        Verify.checkEQ(double_add(lhs, rhs), expected);
+    }
+
+    @Test  // sub of two broadcasts: the rule rejects IRNode.SUB_VD and requires IRNode.SUB_D and IRNode.REPLICATE_D
+    @IR(failOn = IRNode.SUB_VD,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.SUB_D, ">= 1", IRNode.REPLICATE_D, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static double double_sub(double da, double db) {
+        return DoubleVector.broadcast(DSP, da)
+                .sub(DoubleVector.broadcast(DSP, db))
+                .lane(0);  // both operands are broadcasts, so every lane is identical; lane 0 is returned
+    }
+
+    @Run(test = "double_sub")  // drives double_sub with two values drawn from R and checks it against scalar -
+    static void run_double_sub() {
+        final double minuend = R.nextDouble();
+        final double subtrahend = R.nextDouble();
+        final double expected = minuend - subtrahend;
+        Verify.checkEQ(double_sub(minuend, subtrahend), expected);
+    }
+
+    @Test  // mul of two broadcasts: the rule rejects IRNode.MUL_VD and requires IRNode.MUL_D and IRNode.REPLICATE_D
+    @IR(failOn = IRNode.MUL_VD,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MUL_D, ">= 1", IRNode.REPLICATE_D, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static double double_mul(double da, double db) {
+        return DoubleVector.broadcast(DSP, da)
+                .mul(DoubleVector.broadcast(DSP, db))
+                .lane(0);  // both operands are broadcasts, so every lane is identical; lane 0 is returned
+    }
+
+    @Run(test = "double_mul")  // drives double_mul with two values drawn from R and checks it against scalar *
+    static void run_double_mul() {
+        final double lhs = R.nextDouble();
+        final double rhs = R.nextDouble();
+        final double expected = lhs * rhs;
+        Verify.checkEQ(double_mul(lhs, rhs), expected);
+    }
+
+    @Test  // div of two broadcasts: the rule rejects IRNode.DIV_VD and requires IRNode.DIV_D and IRNode.REPLICATE_D
+    @IR(failOn = IRNode.DIV_VD,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.DIV_D, ">= 1", IRNode.REPLICATE_D, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static double double_div(double da, double db) {
+        return DoubleVector.broadcast(DSP, da)
+                .div(DoubleVector.broadcast(DSP, db))
+                .lane(0);  // both operands are broadcasts, so every lane is identical; lane 0 is returned
+    }
+
+    @Run(test = "double_div")  // drives double_div with two values drawn from R and checks it against scalar division
+    static void run_double_div() {
+        final double dividend = R.nextDouble();
+        final double drawn = R.nextDouble();
+        final double divisor = (drawn == 0d) ? 1d : drawn;  // a zero divisor is replaced by 1
+        final double expected = dividend / divisor;
+        Verify.checkEQ(double_div(dividend, divisor), expected);
+    }
+
+    @Test  // min of two broadcasts: the rule rejects IRNode.MIN_VD and requires IRNode.MIN_D and IRNode.REPLICATE_D
+    @IR(failOn = IRNode.MIN_VD,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MIN_D, ">= 1", IRNode.REPLICATE_D, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static double double_min(double da, double db) {
+        return DoubleVector.broadcast(DSP, da)
+                .min(DoubleVector.broadcast(DSP, db))
+                .lane(0);  // both operands are broadcasts, so every lane is identical; lane 0 is returned
+    }
+
+    @Run(test = "double_min")  // drives double_min with two values drawn from R and checks it against Math.min
+    static void run_double_min() {
+        final double lhs = R.nextDouble();
+        final double rhs = R.nextDouble();
+        final double expected = Math.min(lhs, rhs);
+        Verify.checkEQ(double_min(lhs, rhs), expected);
+    }
+
+    @Test  // max of two broadcasts: the rule rejects IRNode.MAX_VD and requires IRNode.MAX_D and IRNode.REPLICATE_D
+    @IR(failOn = IRNode.MAX_VD,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MAX_D, ">= 1", IRNode.REPLICATE_D, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static double double_max(double da, double db) {
+        return DoubleVector.broadcast(DSP, da)
+                .max(DoubleVector.broadcast(DSP, db))
+                .lane(0);  // both operands are broadcasts, so every lane is identical; lane 0 is returned
+    }
+
+    @Run(test = "double_max")  // drives double_max with two values drawn from R and checks it against Math.max
+    static void run_double_max() {
+        final double lhs = R.nextDouble();
+        final double rhs = R.nextDouble();
+        final double expected = Math.max(lhs, rhs);
+        Verify.checkEQ(double_max(lhs, rhs), expected);
+    }
+
+    @Test  // sqrt of a broadcast: the rule rejects IRNode.SQRT_VD and requires IRNode.SQRT_D and IRNode.REPLICATE_D
+    @IR(failOn = IRNode.SQRT_VD,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.SQRT_D, ">= 1", IRNode.REPLICATE_D, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static double double_sqrt(double da) {
+        return DoubleVector.broadcast(DSP, da)
+                .sqrt()
+                .lane(0);  // the operand is a broadcast, so every lane is identical; lane 0 is returned
+    }
+
+    @Run(test = "double_sqrt")  // feeds |R.nextDouble()| + Double.MIN_VALUE and compares with Math.sqrt
+    static void run_double_sqrt() {
+        final double radicand = Double.MIN_VALUE + Math.abs(R.nextDouble());
+        final double expected = Math.sqrt(radicand);
+        Verify.checkEQ(double_sqrt(radicand), expected);
+    }
+
+    @Test  // fma of three broadcasts: the rule rejects IRNode.FMA_VD and requires IRNode.FMA_D and IRNode.REPLICATE_D
+    @IR(failOn = IRNode.FMA_VD,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.FMA_D, ">= 1", IRNode.REPLICATE_D, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static double double_fma(double da, double db, double dc) {
+        return DoubleVector.broadcast(DSP, da)
+                .fma(DoubleVector.broadcast(DSP, db),
+                     DoubleVector.broadcast(DSP, dc))
+                .lane(0);  // all three operands are broadcasts, so every lane is identical; lane 0 is returned
+    }
+
+    @Run(test = "double_fma")  // drives double_fma with three values drawn from R and checks it against Math.fma
+    static void run_double_fma() {
+        final double mulLhs = R.nextDouble();
+        final double mulRhs = R.nextDouble();
+        final double addend = R.nextDouble();
+        final double expected = Math.fma(mulLhs, mulRhs, addend);
+        Verify.checkEQ(double_fma(mulLhs, mulRhs, addend), expected);
+    }
+
+    /* =======================
+     * BYTE
+     * ======================= */
+
+    static final VectorSpecies<Byte> BSP = ByteVector.SPECIES_PREFERRED;  // preferred byte species, used by every byte kernel below
+    static byte B_MAX = Byte.MAX_VALUE, B_MIN = Byte.MIN_VALUE;  // range limits fed to the byte overflow/underflow kernels
+    static byte B_ONE = (byte) 1, B_NEG_ONE = (byte) -1;  // NOTE(review): presumably non-final so the kernels are not constant-folded - confirm
+
+    @Test  // add of two byte broadcasts: the rule rejects IRNode.ADD_VB and requires an int IRNode.ADD_I and IRNode.REPLICATE_B
+    @IR(failOn = IRNode.ADD_VB,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.ADD_I, ">= 1",
+                   IRNode.REPLICATE_B, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static byte byte_add(byte ba, byte bb) {
+        return ByteVector.broadcast(BSP, ba)
+                .add(ByteVector.broadcast(BSP, bb))
+                .lane(0);  // both operands are broadcasts, so every lane is identical; lane 0 is returned
+    }
+
+    @Run(test = "byte_add")  // drives byte_add with two narrowed R.nextInt() values; reference is the int sum cast to byte
+    static void run_byte_add() {
+        final byte lhs = (byte) R.nextInt();
+        final byte rhs = (byte) R.nextInt();
+        final byte expected = (byte) (lhs + rhs);
+        Verify.checkEQ(byte_add(lhs, rhs), expected);
+    }
+
+    @Test  // sub of two byte broadcasts: the rule rejects IRNode.SUB_VB and requires an int IRNode.SUB_I and IRNode.REPLICATE_B
+    @IR(failOn = IRNode.SUB_VB,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.SUB_I, ">= 1",
+                   IRNode.REPLICATE_B, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static byte byte_sub(byte ba, byte bb) {
+        return ByteVector.broadcast(BSP, ba)
+                .sub(ByteVector.broadcast(BSP, bb))
+                .lane(0);  // both operands are broadcasts, so every lane is identical; lane 0 is returned
+    }
+
+    @Run(test = "byte_sub")  // drives byte_sub with two narrowed R.nextInt() values; reference is the int difference cast to byte
+    static void run_byte_sub() {
+        final byte minuend = (byte) R.nextInt();
+        final byte subtrahend = (byte) R.nextInt();
+        final byte expected = (byte) (minuend - subtrahend);
+        Verify.checkEQ(byte_sub(minuend, subtrahend), expected);
+    }
+
+    @Test  // B_MAX + B_ONE through broadcasts: same IR expectations as byte_add (no ADD_VB; ADD_I and REPLICATE_B present)
+    @IR(failOn = IRNode.ADD_VB,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.ADD_I, ">= 1",
+                   IRNode.REPLICATE_B, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static byte byte_add_overflow() {
+        return ByteVector.broadcast(BSP, B_MAX)
+                .add(ByteVector.broadcast(BSP, B_ONE))
+                .lane(0);  // the sum exceeds the byte range; the byte-typed lane value is returned
+    }
+
+    @Run(test = "byte_add_overflow")  // the kernel result must equal the int sum B_MAX + B_ONE narrowed to byte
+    static void run_byte_add_overflow() {
+        Verify.checkEQ(byte_add_overflow(),
+                       (byte) (B_MAX + B_ONE));
+    }
+
+    @Test  // B_MIN + B_NEG_ONE through broadcasts: same IR expectations as byte_add (no ADD_VB; ADD_I and REPLICATE_B present)
+    @IR(failOn = IRNode.ADD_VB,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.ADD_I, ">= 1",
+                   IRNode.REPLICATE_B, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static byte byte_add_underflow() {
+        return ByteVector.broadcast(BSP, B_MIN)
+                .add(ByteVector.broadcast(BSP, B_NEG_ONE))
+                .lane(0);  // the sum falls below the byte range; the byte-typed lane value is returned
+    }
+
+    @Run(test = "byte_add_underflow")  // the kernel result must equal the int sum B_MIN + B_NEG_ONE narrowed to byte
+    static void run_byte_add_underflow() {
+        Verify.checkEQ(byte_add_underflow(),
+                       (byte) (B_MIN + B_NEG_ONE));
+    }
+
+    @Test  // B_MAX - B_NEG_ONE through broadcasts: same IR expectations as byte_sub (no SUB_VB; SUB_I and REPLICATE_B present)
+    @IR(failOn = IRNode.SUB_VB,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.SUB_I, ">= 1",
+                   IRNode.REPLICATE_B, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static byte byte_sub_overflow() {
+        return ByteVector.broadcast(BSP, B_MAX)
+                .sub(ByteVector.broadcast(BSP, B_NEG_ONE))
+                .lane(0);  // the difference exceeds the byte range; the byte-typed lane value is returned
+    }
+
+    @Run(test = "byte_sub_overflow")  // the kernel result must equal the int difference B_MAX - B_NEG_ONE narrowed to byte
+    static void run_byte_sub_overflow() {
+        Verify.checkEQ(byte_sub_overflow(),
+                       (byte) (B_MAX - B_NEG_ONE));
+    }
+
+    @Test  // B_MIN - B_ONE through broadcasts: same IR expectations as byte_sub (no SUB_VB; SUB_I and REPLICATE_B present)
+    @IR(failOn = IRNode.SUB_VB,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.SUB_I, ">= 1",
+                   IRNode.REPLICATE_B, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static byte byte_sub_underflow() {
+        return ByteVector.broadcast(BSP, B_MIN)
+                .sub(ByteVector.broadcast(BSP, B_ONE))
+                .lane(0);  // the difference falls below the byte range; the byte-typed lane value is returned
+    }
+
+    @Run(test = "byte_sub_underflow")  // the kernel result must equal the int difference B_MIN - B_ONE narrowed to byte
+    static void run_byte_sub_underflow() {
+        Verify.checkEQ(byte_sub_underflow(),
+                       (byte) (B_MIN - B_ONE));
+    }
+
+    @Test  // mul of two byte broadcasts: the rule rejects IRNode.MUL_VB and requires an int IRNode.MUL_I and IRNode.REPLICATE_B
+    @IR(failOn = IRNode.MUL_VB,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MUL_I, ">= 1",
+                   IRNode.REPLICATE_B, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static byte byte_mul(byte ba, byte bb) {
+        return ByteVector.broadcast(BSP, ba)
+                .mul(ByteVector.broadcast(BSP, bb))
+                .lane(0);  // both operands are broadcasts, so every lane is identical; lane 0 is returned
+    }
+
+    @Run(test = "byte_mul")  // drives byte_mul with two narrowed R.nextInt() values; reference is the int product cast to byte
+    static void run_byte_mul() {
+        final byte lhs = (byte) R.nextInt();
+        final byte rhs = (byte) R.nextInt();
+        final byte expected = (byte) (lhs * rhs);
+        Verify.checkEQ(byte_mul(lhs, rhs), expected);
+    }
+
+    @Test  // AND of two byte broadcasts: the rule rejects IRNode.AND_VB and requires an int IRNode.AND_I and IRNode.REPLICATE_B
+    @IR(failOn = IRNode.AND_VB,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.AND_I, ">= 1", IRNode.REPLICATE_B, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static byte byte_and(byte ba, byte bb) {
+        return ByteVector.broadcast(BSP, ba)
+                .and(ByteVector.broadcast(BSP, bb))
+                .lane(0);  // both operands are broadcasts, so every lane is identical; lane 0 is returned
+    }
+
+    @Run(test = "byte_and")  // drives byte_and with two narrowed R.nextInt() values; reference is scalar & cast to byte
+    static void run_byte_and() {
+        final byte lhs = (byte) R.nextInt();
+        final byte rhs = (byte) R.nextInt();
+        final byte expected = (byte) (lhs & rhs);
+        Verify.checkEQ(byte_and(lhs, rhs), expected);
+    }
+
+    @Test  // OR of two byte broadcasts: the rule rejects IRNode.OR_VB and requires an int IRNode.OR_I and IRNode.REPLICATE_B
+    @IR(failOn = IRNode.OR_VB,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.OR_I, ">= 1", IRNode.REPLICATE_B, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static byte byte_or(byte ba, byte bb) {
+        return ByteVector.broadcast(BSP, ba)
+                .or(ByteVector.broadcast(BSP, bb))
+                .lane(0);  // both operands are broadcasts, so every lane is identical; lane 0 is returned
+    }
+
+    @Run(test = "byte_or")  // drives byte_or with two narrowed R.nextInt() values; reference is scalar | cast to byte
+    static void run_byte_or() {
+        final byte lhs = (byte) R.nextInt();
+        final byte rhs = (byte) R.nextInt();
+        final byte expected = (byte) (lhs | rhs);
+        Verify.checkEQ(byte_or(lhs, rhs), expected);
+    }
+
+    @Test  // XOR of two byte broadcasts: the rule rejects IRNode.XOR_VB and requires an int IRNode.XOR_I and IRNode.REPLICATE_B
+    @IR(failOn = IRNode.XOR_VB,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.XOR_I, ">= 1", IRNode.REPLICATE_B, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static byte byte_xor(byte ba, byte bb) {
+        return ByteVector.broadcast(BSP, ba)
+                .lanewise(VectorOperators.XOR, ByteVector.broadcast(BSP, bb))  // XOR goes through the generic lanewise entry point
+                .lane(0);  // both operands are broadcasts, so every lane is identical; lane 0 is returned
+    }
+
+    @Run(test = "byte_xor")  // drives byte_xor with two narrowed R.nextInt() values; reference is scalar ^ cast to byte
+    static void run_byte_xor() {
+        final byte lhs = (byte) R.nextInt();
+        final byte rhs = (byte) R.nextInt();
+        final byte expected = (byte) (lhs ^ rhs);
+        Verify.checkEQ(byte_xor(lhs, rhs), expected);
+    }
+
+    @Test  // min of two byte broadcasts: the rule rejects IRNode.MIN_VB and requires an int IRNode.MIN_I and IRNode.REPLICATE_B
+    @IR(failOn = IRNode.MIN_VB,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MIN_I, ">= 1", IRNode.REPLICATE_B, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static byte byte_min(byte ba, byte bb) {
+        return ByteVector.broadcast(BSP, ba)
+                .min(ByteVector.broadcast(BSP, bb))
+                .lane(0);  // both operands are broadcasts, so every lane is identical; lane 0 is returned
+    }
+
+    @Run(test = "byte_min")  // drives byte_min with two narrowed R.nextInt() values; reference is Math.min cast to byte
+    static void run_byte_min() {
+        final byte lhs = (byte) R.nextInt();
+        final byte rhs = (byte) R.nextInt();
+        final byte expected = (byte) Math.min(lhs, rhs);
+        Verify.checkEQ(byte_min(lhs, rhs), expected);
+    }
+
+    @Test  // max of two byte broadcasts: the rule rejects IRNode.MAX_VB and requires an int IRNode.MAX_I and IRNode.REPLICATE_B
+    @IR(failOn = IRNode.MAX_VB,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MAX_I, ">= 1", IRNode.REPLICATE_B, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static byte byte_max(byte ba, byte bb) {
+        return ByteVector.broadcast(BSP, ba)
+                .max(ByteVector.broadcast(BSP, bb))
+                .lane(0);  // both operands are broadcasts, so every lane is identical; lane 0 is returned
+    }
+
+    @Run(test = "byte_max")  // drives byte_max with two narrowed R.nextInt() values; reference is Math.max cast to byte
+    static void run_byte_max() {
+        final byte lhs = (byte) R.nextInt();
+        final byte rhs = (byte) R.nextInt();
+        final byte expected = (byte) Math.max(lhs, rhs);
+        Verify.checkEQ(byte_max(lhs, rhs), expected);
+    }
+
+    /* =======================
+     * SHORT
+     * ======================= */
+
+    static final VectorSpecies<Short> SSP = ShortVector.SPECIES_PREFERRED;  // preferred short species, used by every short kernel below
+    static short S_MAX = Short.MAX_VALUE, S_MIN = Short.MIN_VALUE;  // range limits fed to the short overflow/underflow kernels
+    static short S_ONE = (short) 1, S_NEG_ONE = (short) -1;  // NOTE(review): presumably non-final so the kernels are not constant-folded - confirm
+
+    @Test  // add of two short broadcasts: the rule rejects IRNode.ADD_VS and requires an int IRNode.ADD_I and IRNode.REPLICATE_S
+    @IR(failOn = IRNode.ADD_VS,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.ADD_I, ">= 1",
+                   IRNode.REPLICATE_S, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static short short_add(short sa, short sb) {
+        return ShortVector.broadcast(SSP, sa)
+                .add(ShortVector.broadcast(SSP, sb))
+                .lane(0);  // both operands are broadcasts, so every lane is identical; lane 0 is returned
+    }
+
+    @Run(test = "short_add")  // drives short_add with two narrowed R.nextInt() values; reference is the int sum cast to short
+    static void run_short_add() {
+        final short lhs = (short) R.nextInt();
+        final short rhs = (short) R.nextInt();
+        final short expected = (short) (lhs + rhs);
+        Verify.checkEQ(short_add(lhs, rhs), expected);
+    }
+
+    @Test  // sub of two short broadcasts: the rule rejects IRNode.SUB_VS and requires an int IRNode.SUB_I and IRNode.REPLICATE_S
+    @IR(failOn = IRNode.SUB_VS,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.SUB_I, ">= 1",
+                   IRNode.REPLICATE_S, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static short short_sub(short sa, short sb) {
+        return ShortVector.broadcast(SSP, sa)
+                .sub(ShortVector.broadcast(SSP, sb))
+                .lane(0);  // both operands are broadcasts, so every lane is identical; lane 0 is returned
+    }
+
+    @Run(test = "short_sub")  // drives short_sub with two narrowed R.nextInt() values; reference is the int difference cast to short
+    static void run_short_sub() {
+        final short minuend = (short) R.nextInt();
+        final short subtrahend = (short) R.nextInt();
+        final short expected = (short) (minuend - subtrahend);
+        Verify.checkEQ(short_sub(minuend, subtrahend), expected);
+    }
+
+    @Test  // S_MAX + S_ONE through broadcasts: same IR expectations as short_add (no ADD_VS; ADD_I and REPLICATE_S present)
+    @IR(failOn = IRNode.ADD_VS,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.ADD_I, ">= 1",
+                   IRNode.REPLICATE_S, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static short short_add_overflow() {
+        return ShortVector.broadcast(SSP, S_MAX)
+                .add(ShortVector.broadcast(SSP, S_ONE))
+                .lane(0);  // the sum exceeds the short range; the short-typed lane value is returned
+    }
+
+    @Run(test = "short_add_overflow")  // the kernel result must equal the int sum S_MAX + S_ONE narrowed to short
+    static void run_short_add_overflow() {
+        Verify.checkEQ(short_add_overflow(),
+                       (short) (S_MAX + S_ONE));
+    }
+
+    @Test  // S_MIN + S_NEG_ONE through broadcasts: same IR expectations as short_add (no ADD_VS; ADD_I and REPLICATE_S present)
+    @IR(failOn = IRNode.ADD_VS,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.ADD_I, ">= 1",
+                   IRNode.REPLICATE_S, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static short short_add_underflow() {
+        return ShortVector.broadcast(SSP, S_MIN)
+                .add(ShortVector.broadcast(SSP, S_NEG_ONE))
+                .lane(0);  // the sum falls below the short range; the short-typed lane value is returned
+    }
+
+    @Run(test = "short_add_underflow")  // the kernel result must equal the int sum S_MIN + S_NEG_ONE narrowed to short
+    static void run_short_add_underflow() {
+        Verify.checkEQ(short_add_underflow(),
+                       (short) (S_MIN + S_NEG_ONE));
+    }
+
+    @Test  // S_MAX - S_NEG_ONE through broadcasts: same IR expectations as short_sub (no SUB_VS; SUB_I and REPLICATE_S present)
+    @IR(failOn = IRNode.SUB_VS,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.SUB_I, ">= 1",
+                   IRNode.REPLICATE_S, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static short short_sub_overflow() {
+        return ShortVector.broadcast(SSP, S_MAX)
+                .sub(ShortVector.broadcast(SSP, S_NEG_ONE))
+                .lane(0);  // the difference exceeds the short range; the short-typed lane value is returned
+    }
+
+    @Run(test = "short_sub_overflow")  // the kernel result must equal the int difference S_MAX - S_NEG_ONE narrowed to short
+    static void run_short_sub_overflow() {
+        Verify.checkEQ(short_sub_overflow(),
+                       (short) (S_MAX - S_NEG_ONE));
+    }
+
+    @Test  // S_MIN - S_ONE through broadcasts: same IR expectations as short_sub (no SUB_VS; SUB_I and REPLICATE_S present)
+    @IR(failOn = IRNode.SUB_VS,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.SUB_I, ">= 1",
+                   IRNode.REPLICATE_S, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static short short_sub_underflow() {
+        return ShortVector.broadcast(SSP, S_MIN)
+                .sub(ShortVector.broadcast(SSP, S_ONE))
+                .lane(0);  // the difference falls below the short range; the short-typed lane value is returned
+    }
+
+    @Run(test = "short_sub_underflow")  // the kernel result must equal the int difference S_MIN - S_ONE narrowed to short
+    static void run_short_sub_underflow() {
+        Verify.checkEQ(short_sub_underflow(),
+                       (short) (S_MIN - S_ONE));
+    }
+
+    @Test  // mul of two short broadcasts: the rule rejects IRNode.MUL_VS and requires an int IRNode.MUL_I and IRNode.REPLICATE_S
+    @IR(failOn = IRNode.MUL_VS,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MUL_I, ">= 1",
+                   IRNode.REPLICATE_S, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static short short_mul(short sa, short sb) {
+        return ShortVector.broadcast(SSP, sa)
+                .mul(ShortVector.broadcast(SSP, sb))
+                .lane(0);  // both operands are broadcasts, so every lane is identical; lane 0 is returned
+    }
+
+    @Run(test = "short_mul")  // drives short_mul with two narrowed R.nextInt() values; reference is the int product cast to short
+    static void run_short_mul() {
+        final short lhs = (short) R.nextInt();
+        final short rhs = (short) R.nextInt();
+        final short expected = (short) (lhs * rhs);
+        Verify.checkEQ(short_mul(lhs, rhs), expected);
+    }
+
+    @Test  // AND of two short broadcasts: the rule rejects IRNode.AND_VS and requires an int IRNode.AND_I and IRNode.REPLICATE_S
+    @IR(failOn = IRNode.AND_VS,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.AND_I, ">= 1", IRNode.REPLICATE_S, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static short short_and(short sa, short sb) {
+        return ShortVector.broadcast(SSP, sa)
+                .and(ShortVector.broadcast(SSP, sb))
+                .lane(0);  // both operands are broadcasts, so every lane is identical; lane 0 is returned
+    }
+
+    @Run(test = "short_and")  // drives short_and with two narrowed R.nextInt() values; reference is scalar & cast to short
+    static void run_short_and() {
+        final short lhs = (short) R.nextInt();
+        final short rhs = (short) R.nextInt();
+        final short expected = (short) (lhs & rhs);
+        Verify.checkEQ(short_and(lhs, rhs), expected);
+    }
+
+    @Test  // OR of two short broadcasts: the rule rejects IRNode.OR_VS and requires an int IRNode.OR_I and IRNode.REPLICATE_S
+    @IR(failOn = IRNode.OR_VS,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.OR_I, ">= 1", IRNode.REPLICATE_S, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static short short_or(short sa, short sb) {
+        return ShortVector.broadcast(SSP, sa)
+                .or(ShortVector.broadcast(SSP, sb))
+                .lane(0);  // both operands are broadcasts, so every lane is identical; lane 0 is returned
+    }
+
+    @Run(test = "short_or")  // drives short_or with two narrowed R.nextInt() values; reference is scalar | cast to short
+    static void run_short_or() {
+        final short lhs = (short) R.nextInt();
+        final short rhs = (short) R.nextInt();
+        final short expected = (short) (lhs | rhs);
+        Verify.checkEQ(short_or(lhs, rhs), expected);
+    }
+
+    @Test  // XOR of two short broadcasts: the rule rejects IRNode.XOR_VS and requires an int IRNode.XOR_I and IRNode.REPLICATE_S
+    @IR(failOn = IRNode.XOR_VS,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.XOR_I, ">= 1", IRNode.REPLICATE_S, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static short short_xor(short sa, short sb) {
+        return ShortVector.broadcast(SSP, sa)
+                .lanewise(VectorOperators.XOR, ShortVector.broadcast(SSP, sb))  // XOR goes through the generic lanewise entry point
+                .lane(0);  // both operands are broadcasts, so every lane is identical; lane 0 is returned
+    }
+
+    @Run(test = "short_xor")  // drives short_xor with two narrowed R.nextInt() values; reference is scalar ^ cast to short
+    static void run_short_xor() {
+        final short lhs = (short) R.nextInt();
+        final short rhs = (short) R.nextInt();
+        final short expected = (short) (lhs ^ rhs);
+        Verify.checkEQ(short_xor(lhs, rhs), expected);
+    }
+
+    @Test  // min of two short broadcasts: the rule rejects IRNode.MIN_VS and requires an int IRNode.MIN_I and IRNode.REPLICATE_S
+    @IR(failOn = IRNode.MIN_VS,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MIN_I, ">= 1", IRNode.REPLICATE_S, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static short short_min(short sa, short sb) {
+        return ShortVector.broadcast(SSP, sa)
+                .min(ShortVector.broadcast(SSP, sb))
+                .lane(0);  // both operands are broadcasts, so every lane is identical; lane 0 is returned
+    }
+
+    @Run(test = "short_min")  // drives short_min with two narrowed R.nextInt() values; reference is Math.min cast to short
+    static void run_short_min() {
+        final short lhs = (short) R.nextInt();
+        final short rhs = (short) R.nextInt();
+        final short expected = (short) Math.min(lhs, rhs);
+        Verify.checkEQ(short_min(lhs, rhs), expected);
+    }
+
+    @Test  // max of two short broadcasts: the rule rejects IRNode.MAX_VS and requires an int IRNode.MAX_I and IRNode.REPLICATE_S
+    @IR(failOn = IRNode.MAX_VS,
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MAX_I, ">= 1", IRNode.REPLICATE_S, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    static short short_max(short sa, short sb) {
+        return ShortVector.broadcast(SSP, sa)
+                .max(ShortVector.broadcast(SSP, sb))
+                .lane(0);  // both operands are broadcasts, so every lane is identical; lane 0 is returned
+    }
+
+    @Run(test = "short_max")  // drives short_max with two narrowed R.nextInt() values; reference is Math.max cast to short
+    static void run_short_max() {
+        final short lhs = (short) R.nextInt();
+        final short rhs = (short) R.nextInt();
+        final short expected = (short) Math.max(lhs, rhs);
+        Verify.checkEQ(short_max(lhs, rhs), expected);
+    }
+
+}
diff --git a/test/hotspot/jtreg/compiler/vectorapi/TestVectorReassociations.java b/test/hotspot/jtreg/compiler/vectorapi/TestVectorReassociations.java
new file mode 100644
index 00000000000..c6a11627215
--- /dev/null
+++ b/test/hotspot/jtreg/compiler/vectorapi/TestVectorReassociations.java
@@ -0,0 +1,605 @@
+/*
+ * Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ * @test
+ * @bug 8358521
+ * @summary Test reassociation of broadcasted inputs across vector operations
+ * @modules jdk.incubator.vector
+ * @library /test/lib /
+ * @run driver compiler.vectorapi.TestVectorReassociations
+ */
+
+package compiler.vectorapi;
+
+import compiler.lib.ir_framework.*;
+import jdk.incubator.vector.*;
+import java.util.stream.IntStream;
+
+/**
+ * Tests for the reassociation transform:
+ *   VectorOp(broadcast(a), VectorOp(broadcast(b), array))
+ *     => VectorOp(broadcast(ScalarOp(a, b)), array)
+ */
+public class TestVectorReassociations {
+
+    public static void main(String[] args) {  // jtreg entry point; args is not used
+        TestFramework.runWithFlags("--add-modules=jdk.incubator.vector");  // hands the incubator-module flag to the IR test framework run
+    }
+
+    /* =======================
+     * INT
+     * ======================= */
+
+    static final VectorSpecies<Integer> ISP = IntVector.SPECIES_PREFERRED;  // preferred int species, used by every int kernel below
+    static int[] intIn  = IntStream.range(0, ISP.length()).toArray();  // one vector of input holding 0, 1, 2, ...
+    static int[] intOut = new int[ISP.length()];  // one vector of output, written by the kernels
+    static int ia = 17, ib = 9;  // the two scalars that the kernels broadcast
+
+    // --- INT ADD ---
+
+    // Pattern 1: bcast(a) ADD (bcast(b) ADD array) - both broadcasts are left-hand operands.
+    @Test  // the rule requires exactly one IRNode.ADD_VI plus at least one scalar IRNode.ADD_I and one IRNode.REPLICATE_I
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.ADD_VI, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_I, ">= 1",
+                   IRNode.REPLICATE_I, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_int_add_reassociation_pattern1() {
+        IntVector.broadcast(ISP, ia)
+                 .lanewise(VectorOperators.ADD,
+                           IntVector.broadcast(ISP, ib)
+                                    .lanewise(VectorOperators.ADD,
+                                              IntVector.fromArray(ISP, intIn, 0)))
+                 .intoArray(intOut, 0);  // NOTE(review): intOut is only written here - confirm the values are validated elsewhere
+    }
+
+    // Pattern 2: bcast(a) ADD (array ADD bcast(b)) - outer broadcast on the left, inner broadcast on the right.
+    @Test  // the rule requires exactly one IRNode.ADD_VI plus at least one scalar IRNode.ADD_I and one IRNode.REPLICATE_I
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.ADD_VI, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_I, ">= 1",
+                   IRNode.REPLICATE_I, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_int_add_reassociation_pattern2() {
+        IntVector.broadcast(ISP, ia)
+                 .lanewise(VectorOperators.ADD,
+                           IntVector.fromArray(ISP, intIn, 0)
+                                    .lanewise(VectorOperators.ADD,
+                                              IntVector.broadcast(ISP, ib)))
+                 .intoArray(intOut, 0);  // NOTE(review): intOut is only written here - confirm the values are validated elsewhere
+    }
+
+    // Pattern 3: (bcast(a) ADD array) ADD bcast(b) - inner broadcast on the left, outer broadcast on the right.
+    @Test  // the rule requires exactly one IRNode.ADD_VI plus at least one scalar IRNode.ADD_I and one IRNode.REPLICATE_I
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.ADD_VI, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_I, ">= 1",
+                   IRNode.REPLICATE_I, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_int_add_reassociation_pattern3() {
+        IntVector.broadcast(ISP, ia)
+                 .lanewise(VectorOperators.ADD,
+                           IntVector.fromArray(ISP, intIn, 0))
+                 .lanewise(VectorOperators.ADD,
+                           IntVector.broadcast(ISP, ib))
+                 .intoArray(intOut, 0);  // NOTE(review): intOut is only written here - confirm the values are validated elsewhere
+    }
+
+    // Pattern 4: (array ADD bcast(a)) ADD bcast(b) - both broadcasts are right-hand operands.
+    @Test  // the rule requires exactly one IRNode.ADD_VI plus at least one scalar IRNode.ADD_I and one IRNode.REPLICATE_I
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.ADD_VI, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_I, ">= 1",
+                   IRNode.REPLICATE_I, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_int_add_reassociation_pattern4() {
+        IntVector.fromArray(ISP, intIn, 0)
+                 .lanewise(VectorOperators.ADD,
+                           IntVector.broadcast(ISP, ia))
+                 .lanewise(VectorOperators.ADD,
+                           IntVector.broadcast(ISP, ib))
+                 .intoArray(intOut, 0);  // NOTE(review): intOut is only written here - confirm the values are validated elsewhere
+    }
+
+    // --- INT MUL ---
+
+    // Pattern 1: bcast(a) MUL (bcast(b) MUL array) - both broadcasts are left-hand operands.
+    @Test  // the rule requires exactly one IRNode.MUL_VI plus at least one scalar IRNode.MUL_I and one IRNode.REPLICATE_I
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MUL_VI, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_I, ">= 1",
+                   IRNode.REPLICATE_I, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_int_mul_reassociation_pattern1() {
+        IntVector.broadcast(ISP, ia)
+                 .lanewise(VectorOperators.MUL,
+                           IntVector.broadcast(ISP, ib)
+                                    .lanewise(VectorOperators.MUL,
+                                              IntVector.fromArray(ISP, intIn, 0)))
+                 .intoArray(intOut, 0);  // NOTE(review): intOut is only written here - confirm the values are validated elsewhere
+    }
+
+    // Pattern 2: bcast(a) MUL (array MUL bcast(b)) - outer broadcast on the left, inner broadcast on the right.
+    @Test  // the rule requires exactly one IRNode.MUL_VI plus at least one scalar IRNode.MUL_I and one IRNode.REPLICATE_I
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MUL_VI, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_I, ">= 1",
+                   IRNode.REPLICATE_I, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_int_mul_reassociation_pattern2() {
+        IntVector.broadcast(ISP, ia)
+                 .lanewise(VectorOperators.MUL,
+                           IntVector.fromArray(ISP, intIn, 0)
+                                    .lanewise(VectorOperators.MUL,
+                                              IntVector.broadcast(ISP, ib)))
+                 .intoArray(intOut, 0);  // NOTE(review): intOut is only written here - confirm the values are validated elsewhere
+    }
+
+    // Pattern 3: (bcast(a) MUL array) MUL bcast(b) - inner broadcast on the left, outer broadcast on the right.
+    @Test  // the rule requires exactly one IRNode.MUL_VI plus at least one scalar IRNode.MUL_I and one IRNode.REPLICATE_I
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MUL_VI, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_I, ">= 1",
+                   IRNode.REPLICATE_I, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_int_mul_reassociation_pattern3() {
+        IntVector.broadcast(ISP, ia)
+                 .lanewise(VectorOperators.MUL,
+                           IntVector.fromArray(ISP, intIn, 0))
+                 .lanewise(VectorOperators.MUL,
+                           IntVector.broadcast(ISP, ib))
+                 .intoArray(intOut, 0);  // NOTE(review): intOut is only written here - confirm the values are validated elsewhere
+    }
+
+    // Pattern 4: (array MUL bcast(a)) MUL bcast(b) - both broadcasts are right-hand operands.
+    @Test  // the rule requires exactly one IRNode.MUL_VI plus at least one scalar IRNode.MUL_I and one IRNode.REPLICATE_I
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MUL_VI, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_I, ">= 1",
+                   IRNode.REPLICATE_I, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_int_mul_reassociation_pattern4() {
+        IntVector.fromArray(ISP, intIn, 0)
+                 .lanewise(VectorOperators.MUL,
+                           IntVector.broadcast(ISP, ia))
+                 .lanewise(VectorOperators.MUL,
+                           IntVector.broadcast(ISP, ib))
+                 .intoArray(intOut, 0);  // NOTE(review): intOut is only written here - confirm the values are validated elsewhere
+    }
+
+    /* =======================
+     * LONG
+     * ======================= */
+
+    static final VectorSpecies<Long> LSP = LongVector.SPECIES_PREFERRED;  // preferred long species, used by every long kernel below
+    static long[] longIn;   // input vector contents; allocated and filled in the static initializer
+    static long[] longOut;  // output buffer written by the kernels; allocated in the static initializer
+    static long la = 17L, lb = 9L;  // the two scalars that the kernels broadcast
+
+    static {
+        longOut = new long[LSP.length()];    // both buffers hold exactly one vector of LSP.length() lanes
+        longIn = new long[longOut.length];
+        for (int lane = 0; lane < longIn.length; ++lane) {
+            longIn[lane] = lane;             // input is 0, 1, 2, ... (int widened to long)
+        }
+    }
+
+    // --- LONG ADD ---
+
+    // Pattern 1: bcast(a) ADD (bcast(b) ADD array) - both broadcasts are left-hand operands.
+    @Test  // the rule requires exactly one IRNode.ADD_VL plus at least one scalar IRNode.ADD_L and one IRNode.REPLICATE_L
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.ADD_VL, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_L, ">= 1",
+                   IRNode.REPLICATE_L, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_long_add_reassociation_pattern1() {
+        LongVector.broadcast(LSP, la)
+                  .lanewise(VectorOperators.ADD,
+                            LongVector.broadcast(LSP, lb)
+                                     .lanewise(VectorOperators.ADD,
+                                               LongVector.fromArray(LSP, longIn, 0)))
+                  .intoArray(longOut, 0);  // NOTE(review): longOut is only written here - confirm the values are validated elsewhere
+    }
+
+    // Pattern 2: bcast(a) ADD (array ADD bcast(b)) - outer broadcast on the left, inner broadcast on the right.
+    @Test  // the rule requires exactly one IRNode.ADD_VL plus at least one scalar IRNode.ADD_L and one IRNode.REPLICATE_L
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.ADD_VL, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_L, ">= 1",
+                   IRNode.REPLICATE_L, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_long_add_reassociation_pattern2() {
+        LongVector.broadcast(LSP, la)
+                  .lanewise(VectorOperators.ADD,
+                            LongVector.fromArray(LSP, longIn, 0)
+                                     .lanewise(VectorOperators.ADD,
+                                               LongVector.broadcast(LSP, lb)))
+                  .intoArray(longOut, 0);  // NOTE(review): longOut is only written here - confirm the values are validated elsewhere
+    }
+
+    // Pattern 3: (bcast(a) ADD array) ADD bcast(b) - inner broadcast on the left, outer broadcast on the right.
+    @Test  // the rule requires exactly one IRNode.ADD_VL plus at least one scalar IRNode.ADD_L and one IRNode.REPLICATE_L
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.ADD_VL, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_L, ">= 1",
+                   IRNode.REPLICATE_L, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_long_add_reassociation_pattern3() {
+        LongVector.broadcast(LSP, la)
+                  .lanewise(VectorOperators.ADD,
+                            LongVector.fromArray(LSP, longIn, 0))
+                  .lanewise(VectorOperators.ADD,
+                            LongVector.broadcast(LSP, lb))
+                  .intoArray(longOut, 0);  // NOTE(review): longOut is only written here - confirm the values are validated elsewhere
+    }
+
+    // (array ADD bcast(a)) ADD bcast(b): nested op is the receiver, and its own receiver is the array load. @IR counts: ADD_VL 1, ADD_L >= 1, REPLICATE_L >= 1.
+    @Test
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.ADD_VL, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_L, ">= 1",
+                   IRNode.REPLICATE_L, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_long_add_reassociation_pattern4() {
+        LongVector.fromArray(LSP, longIn, 0)
+                  .lanewise(VectorOperators.ADD,
+                            LongVector.broadcast(LSP, la))
+                  .lanewise(VectorOperators.ADD,
+                            LongVector.broadcast(LSP, lb))
+                  .intoArray(longOut, 0);
+    }
+
+    // --- LONG MUL ---
+
+    // bcast(a) MUL (bcast(b) MUL array): nested op is the argument, its receiver is a broadcast. @IR counts: MUL_VL 1, MUL_L >= 1, REPLICATE_L >= 1.
+    @Test
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MUL_VL, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_L, ">= 1",
+                   IRNode.REPLICATE_L, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_long_mul_reassociation_pattern1() {
+        LongVector.broadcast(LSP, la)
+                  .lanewise(VectorOperators.MUL,
+                            LongVector.broadcast(LSP, lb)
+                                     .lanewise(VectorOperators.MUL,
+                                               LongVector.fromArray(LSP, longIn, 0)))
+                  .intoArray(longOut, 0);
+    }
+
+    // bcast(a) MUL (array MUL bcast(b)): nested op is the argument, its receiver is the array load. @IR counts: MUL_VL 1, MUL_L >= 1, REPLICATE_L >= 1.
+    @Test
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MUL_VL, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_L, ">= 1",
+                   IRNode.REPLICATE_L, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_long_mul_reassociation_pattern2() {
+        LongVector.broadcast(LSP, la)
+                  .lanewise(VectorOperators.MUL,
+                            LongVector.fromArray(LSP, longIn, 0)
+                                     .lanewise(VectorOperators.MUL,
+                                               LongVector.broadcast(LSP, lb)))
+                  .intoArray(longOut, 0);
+    }
+
+    // (bcast(a) MUL array) MUL bcast(b): nested op is the receiver, and its own receiver is a broadcast. @IR counts: MUL_VL 1, MUL_L >= 1, REPLICATE_L >= 1.
+    @Test
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MUL_VL, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_L, ">= 1",
+                   IRNode.REPLICATE_L, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_long_mul_reassociation_pattern3() {
+        LongVector.broadcast(LSP, la)
+                  .lanewise(VectorOperators.MUL,
+                            LongVector.fromArray(LSP, longIn, 0))
+                  .lanewise(VectorOperators.MUL,
+                            LongVector.broadcast(LSP, lb))
+                  .intoArray(longOut, 0);
+    }
+
+    // (array MUL bcast(a)) MUL bcast(b): nested op is the receiver, and its own receiver is the array load. @IR counts: MUL_VL 1, MUL_L >= 1, REPLICATE_L >= 1.
+    @Test
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MUL_VL, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_L, ">= 1",
+                   IRNode.REPLICATE_L, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_long_mul_reassociation_pattern4() {
+        LongVector.fromArray(LSP, longIn, 0)
+                  .lanewise(VectorOperators.MUL,
+                            LongVector.broadcast(LSP, la))
+                  .lanewise(VectorOperators.MUL,
+                            LongVector.broadcast(LSP, lb))
+                  .intoArray(longOut, 0);
+    }
+
+    /* =======================
+     * SHORT
+     * ======================= */
+
+    static final VectorSpecies<Short> SSP = ShortVector.SPECIES_PREFERRED;
+    // Input and output each hold exactly one preferred-species vector.
+    static short[] shortIn = new short[SSP.length()];
+    static short[] shortOut = new short[SSP.length()];
+    static short sa = 17, sb = 9;  // scalars the tests below broadcast
+
+    static {
+        // Seed the input with its own indices: 0, 1, 2, ...
+        for (int lane = 0; lane < shortIn.length; lane++) {
+            shortIn[lane] = (short) lane;
+        }
+    }
+
+    // --- SHORT ADD ---
+
+    // bcast(a) ADD (bcast(b) ADD array): nested op is the argument, its receiver is a broadcast. @IR counts: ADD_VS 1, scalar ADD_I >= 1, REPLICATE_S >= 1.
+    @Test
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.ADD_VS, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_I, ">= 1",
+                   IRNode.REPLICATE_S, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_short_add_reassociation_pattern1() {
+        ShortVector.broadcast(SSP, sa)
+                   .lanewise(VectorOperators.ADD,
+                             ShortVector.broadcast(SSP, sb)
+                                      .lanewise(VectorOperators.ADD,
+                                                ShortVector.fromArray(SSP, shortIn, 0)))
+                   .intoArray(shortOut, 0);
+    }
+
+    // bcast(a) ADD (array ADD bcast(b)): nested op is the argument, its receiver is the array load. @IR counts: ADD_VS 1, scalar ADD_I >= 1, REPLICATE_S >= 1.
+    @Test
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.ADD_VS, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_I, ">= 1",
+                   IRNode.REPLICATE_S, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_short_add_reassociation_pattern2() {
+        ShortVector.broadcast(SSP, sa)
+                   .lanewise(VectorOperators.ADD,
+                             ShortVector.fromArray(SSP, shortIn, 0)
+                                      .lanewise(VectorOperators.ADD,
+                                                ShortVector.broadcast(SSP, sb)))
+                   .intoArray(shortOut, 0);
+    }
+
+    // (bcast(a) ADD array) ADD bcast(b): nested op is the receiver, and its own receiver is a broadcast. @IR counts: ADD_VS 1, scalar ADD_I >= 1, REPLICATE_S >= 1.
+    @Test
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.ADD_VS, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_I, ">= 1",
+                   IRNode.REPLICATE_S, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_short_add_reassociation_pattern3() {
+        ShortVector.broadcast(SSP, sa)
+                   .lanewise(VectorOperators.ADD,
+                             ShortVector.fromArray(SSP, shortIn, 0))
+                   .lanewise(VectorOperators.ADD,
+                             ShortVector.broadcast(SSP, sb))
+                   .intoArray(shortOut, 0);
+    }
+
+    // (array ADD bcast(a)) ADD bcast(b): nested op is the receiver, and its own receiver is the array load. @IR counts: ADD_VS 1, scalar ADD_I >= 1, REPLICATE_S >= 1.
+    @Test
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.ADD_VS, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_I, ">= 1",
+                   IRNode.REPLICATE_S, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_short_add_reassociation_pattern4() {
+        ShortVector.fromArray(SSP, shortIn, 0)
+                   .lanewise(VectorOperators.ADD,
+                             ShortVector.broadcast(SSP, sa))
+                   .lanewise(VectorOperators.ADD,
+                             ShortVector.broadcast(SSP, sb))
+                   .intoArray(shortOut, 0);
+    }
+
+    // --- SHORT MUL ---
+
+    // bcast(a) MUL (bcast(b) MUL array): nested op is the argument, its receiver is a broadcast. @IR counts: MUL_VS 1, scalar MUL_I >= 1, REPLICATE_S >= 1.
+    @Test
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MUL_VS, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_I, ">= 1",
+                   IRNode.REPLICATE_S, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_short_mul_reassociation_pattern1() {
+        ShortVector.broadcast(SSP, sa)
+                   .lanewise(VectorOperators.MUL,
+                             ShortVector.broadcast(SSP, sb)
+                                      .lanewise(VectorOperators.MUL,
+                                                ShortVector.fromArray(SSP, shortIn, 0)))
+                   .intoArray(shortOut, 0);
+    }
+
+    // bcast(a) MUL (array MUL bcast(b)): nested op is the argument, its receiver is the array load. @IR counts: MUL_VS 1, scalar MUL_I >= 1, REPLICATE_S >= 1.
+    @Test
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MUL_VS, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_I, ">= 1",
+                   IRNode.REPLICATE_S, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_short_mul_reassociation_pattern2() {
+        ShortVector.broadcast(SSP, sa)
+                   .lanewise(VectorOperators.MUL,
+                             ShortVector.fromArray(SSP, shortIn, 0)
+                                      .lanewise(VectorOperators.MUL,
+                                                ShortVector.broadcast(SSP, sb)))
+                   .intoArray(shortOut, 0);
+    }
+
+    // (bcast(a) MUL array) MUL bcast(b): nested op is the receiver, and its own receiver is a broadcast. @IR counts: MUL_VS 1, scalar MUL_I >= 1, REPLICATE_S >= 1.
+    @Test
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MUL_VS, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_I, ">= 1",
+                   IRNode.REPLICATE_S, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_short_mul_reassociation_pattern3() {
+        ShortVector.broadcast(SSP, sa)
+                   .lanewise(VectorOperators.MUL,
+                             ShortVector.fromArray(SSP, shortIn, 0))
+                   .lanewise(VectorOperators.MUL,
+                             ShortVector.broadcast(SSP, sb))
+                   .intoArray(shortOut, 0);
+    }
+
+    // (array MUL bcast(a)) MUL bcast(b): nested op is the receiver, and its own receiver is the array load. @IR counts: MUL_VS 1, scalar MUL_I >= 1, REPLICATE_S >= 1.
+    @Test
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MUL_VS, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_I, ">= 1",
+                   IRNode.REPLICATE_S, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_short_mul_reassociation_pattern4() {
+        ShortVector.fromArray(SSP, shortIn, 0)
+                   .lanewise(VectorOperators.MUL,
+                             ShortVector.broadcast(SSP, sa))
+                   .lanewise(VectorOperators.MUL,
+                             ShortVector.broadcast(SSP, sb))
+                   .intoArray(shortOut, 0);
+    }
+
+    /* =======================
+     * BYTE
+     * ======================= */
+
+    static final VectorSpecies<Byte> BSP = ByteVector.SPECIES_PREFERRED;
+    // Input and output each hold exactly one preferred-species vector.
+    static byte[] byteIn = new byte[BSP.length()];
+    static byte[] byteOut = new byte[BSP.length()];
+    static byte ba = 17, bb = 9;  // scalars the tests below broadcast
+
+    static {
+        // Seed the input with its own indices: 0, 1, 2, ...
+        for (int lane = 0; lane < byteIn.length; lane++) {
+            byteIn[lane] = (byte) lane;
+        }
+    }
+
+    // --- BYTE ADD ---
+
+    // bcast(a) ADD (bcast(b) ADD array): nested op is the argument, its receiver is a broadcast. @IR counts: ADD_VB 1, scalar ADD_I >= 1, REPLICATE_B >= 1.
+    @Test
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.ADD_VB, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_I, ">= 1",
+                   IRNode.REPLICATE_B, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_byte_add_reassociation_pattern1() {
+        ByteVector.broadcast(BSP, ba)
+                  .lanewise(VectorOperators.ADD,
+                            ByteVector.broadcast(BSP, bb)
+                                     .lanewise(VectorOperators.ADD,
+                                               ByteVector.fromArray(BSP, byteIn, 0)))
+                  .intoArray(byteOut, 0);
+    }
+
+    // bcast(a) ADD (array ADD bcast(b)): nested op is the argument, its receiver is the array load. @IR counts: ADD_VB 1, scalar ADD_I >= 1, REPLICATE_B >= 1.
+    @Test
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.ADD_VB, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_I, ">= 1",
+                   IRNode.REPLICATE_B, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_byte_add_reassociation_pattern2() {
+        ByteVector.broadcast(BSP, ba)
+                  .lanewise(VectorOperators.ADD,
+                            ByteVector.fromArray(BSP, byteIn, 0)
+                                     .lanewise(VectorOperators.ADD,
+                                               ByteVector.broadcast(BSP, bb)))
+                  .intoArray(byteOut, 0);
+    }
+
+    // (bcast(a) ADD array) ADD bcast(b): nested op is the receiver, and its own receiver is a broadcast. @IR counts: ADD_VB 1, scalar ADD_I >= 1, REPLICATE_B >= 1.
+    @Test
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.ADD_VB, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_I, ">= 1",
+                   IRNode.REPLICATE_B, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_byte_add_reassociation_pattern3() {
+        ByteVector.broadcast(BSP, ba)
+                  .lanewise(VectorOperators.ADD,
+                            ByteVector.fromArray(BSP, byteIn, 0))
+                  .lanewise(VectorOperators.ADD,
+                            ByteVector.broadcast(BSP, bb))
+                  .intoArray(byteOut, 0);
+    }
+
+    // (array ADD bcast(a)) ADD bcast(b): nested op is the receiver, and its own receiver is the array load. @IR counts: ADD_VB 1, scalar ADD_I >= 1, REPLICATE_B >= 1.
+    @Test
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.ADD_VB, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_I, ">= 1",
+                   IRNode.REPLICATE_B, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_byte_add_reassociation_pattern4() {
+        ByteVector.fromArray(BSP, byteIn, 0)
+                  .lanewise(VectorOperators.ADD,
+                            ByteVector.broadcast(BSP, ba))
+                  .lanewise(VectorOperators.ADD,
+                            ByteVector.broadcast(BSP, bb))
+                  .intoArray(byteOut, 0);
+    }
+
+    // --- BYTE MUL ---
+
+    // bcast(a) MUL (bcast(b) MUL array): nested op is the argument, its receiver is a broadcast. @IR counts: MUL_VB 1, scalar MUL_I >= 1, REPLICATE_B >= 1.
+    @Test
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MUL_VB, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_I, ">= 1",
+                   IRNode.REPLICATE_B, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_byte_mul_reassociation_pattern1() {
+        ByteVector.broadcast(BSP, ba)
+                  .lanewise(VectorOperators.MUL,
+                            ByteVector.broadcast(BSP, bb)
+                                     .lanewise(VectorOperators.MUL,
+                                               ByteVector.fromArray(BSP, byteIn, 0)))
+                  .intoArray(byteOut, 0);
+    }
+
+    // bcast(a) MUL (array MUL bcast(b)): nested op is the argument, its receiver is the array load. @IR counts: MUL_VB 1, scalar MUL_I >= 1, REPLICATE_B >= 1.
+    @Test
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MUL_VB, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_I, ">= 1",
+                   IRNode.REPLICATE_B, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_byte_mul_reassociation_pattern2() {
+        ByteVector.broadcast(BSP, ba)
+                  .lanewise(VectorOperators.MUL,
+                            ByteVector.fromArray(BSP, byteIn, 0)
+                                     .lanewise(VectorOperators.MUL,
+                                               ByteVector.broadcast(BSP, bb)))
+                  .intoArray(byteOut, 0);
+    }
+
+    // (bcast(a) MUL array) MUL bcast(b): nested op is the receiver, and its own receiver is a broadcast. @IR counts: MUL_VB 1, scalar MUL_I >= 1, REPLICATE_B >= 1.
+    @Test
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MUL_VB, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_I, ">= 1",
+                   IRNode.REPLICATE_B, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_byte_mul_reassociation_pattern3() {
+        ByteVector.broadcast(BSP, ba)
+                  .lanewise(VectorOperators.MUL,
+                            ByteVector.fromArray(BSP, byteIn, 0))
+                  .lanewise(VectorOperators.MUL,
+                            ByteVector.broadcast(BSP, bb))
+                  .intoArray(byteOut, 0);
+    }
+
+    // (array MUL bcast(a)) MUL bcast(b): nested op is the receiver, and its own receiver is the array load. @IR counts: MUL_VB 1, scalar MUL_I >= 1, REPLICATE_B >= 1.
+    @Test
+    @IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        counts = { IRNode.MUL_VB, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_I, ">= 1",
+                   IRNode.REPLICATE_B, IRNode.VECTOR_SIZE_ANY, ">= 1" })
+    @Warmup(value = 10000)
+    static void test_byte_mul_reassociation_pattern4() {
+        ByteVector.fromArray(BSP, byteIn, 0)
+                  .lanewise(VectorOperators.MUL,
+                            ByteVector.broadcast(BSP, ba))
+                  .lanewise(VectorOperators.MUL,
+                            ByteVector.broadcast(BSP, bb))
+                  .intoArray(byteOut, 0);
+    }
+}
diff --git a/test/micro/org/openjdk/bench/jdk/incubator/vector/VectorReassociateBenchmark.java b/test/micro/org/openjdk/bench/jdk/incubator/vector/VectorReassociateBenchmark.java
new file mode 100644
index 00000000000..cf95b2f7971
--- /dev/null
+++ b/test/micro/org/openjdk/bench/jdk/incubator/vector/VectorReassociateBenchmark.java
@@ -0,0 +1,239 @@
+/*
+ *  Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved.
+ *  DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ *  This code is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License version 2 only, as
+ *  published by the Free Software Foundation.
+ *
+ *  This code is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  version 2 for more details (a copy is included in the LICENSE file that
+ *  accompanied this code).
+ *
+ *  You should have received a copy of the GNU General Public License version
+ *  2 along with this work; if not, write to the Free Software Foundation,
+ *  Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ *  Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ *  or visit www.oracle.com if you need additional information or have any
+ *  questions.
+ *
+ */
+
+package org.openjdk.bench.jdk.incubator.vector;
+
+import java.util.concurrent.TimeUnit;
+import java.util.Random;
+import jdk.incubator.vector.*;
+import org.openjdk.jmh.annotations.*;
+import org.openjdk.jmh.infra.Blackhole;
+
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Thread)
+@Fork(jvmArgs = {"--add-modules=jdk.incubator.vector"})
+public class VectorReassociateBenchmark {
+    @Param({"1024", "2048"})
+    int size;  // length of every array below; also the trip count of the two pushBroadcasts kernels
+
+    int [] intIn1;   // input of the int reassociation benchmarks
+    int [] intOut;   // their output
+
+    long [] longIn1;   // input of the long reassociation benchmarks
+    long [] longOut;   // their output
+
+    short [] shortIn1;   // input of the short reassociation benchmarks
+    short [] shortOut;   // their output
+
+    byte [] byteIn1;   // input of the byte reassociation benchmarks
+    byte [] byteOut;   // their output
+
+    static final VectorSpecies<Float> fspecies = FloatVector.SPECIES_PREFERRED;     // used by pushBroadcastsAcrossVectorKernel1
+    static final VectorSpecies<Double> dspecies = DoubleVector.SPECIES_PREFERRED;   // used by pushBroadcastsAcrossVectorKernel2
+    static final VectorSpecies<Integer> ispecies = IntVector.SPECIES_PREFERRED;     // used by the int benchmarks
+    static final VectorSpecies<Long> lspecies = LongVector.SPECIES_PREFERRED;       // used by the long benchmarks
+    static final VectorSpecies<Short> sspecies = ShortVector.SPECIES_PREFERRED;     // used by the short benchmarks
+    static final VectorSpecies<Byte> bspecies = ByteVector.SPECIES_PREFERRED;       // used by the byte benchmarks
+
+    @Setup(Level.Trial)
+    public void BmSetup() {
+        Random r = new Random(2048);  // fixed seed: the same inputs on every run
+        intIn1 = new int[size];
+        intOut = new int[size];
+
+        longIn1 = new long[size];
+        longOut = new long[size];
+
+        shortIn1 = new short[size];
+        shortOut = new short[size];
+
+        byteIn1 = new byte[size];
+        byteOut = new byte[size];
+        // Randomize every input element, starting at index 0 so that no leading lanes are left at zero.
+        for (int i = 0; i < size; i++) {
+            intIn1[i] = r.nextInt();
+            longIn1[i] = r.nextLong();
+            shortIn1[i] = (short) r.nextInt();
+            byteIn1[i] = (byte) r.nextInt();
+        }
+    }
+
+    @Benchmark
+    public float pushBroadcastsAcrossVectorKernel1() {
+        FloatVector sum = FloatVector.broadcast(fspecies, 0.0f);
+        for (int i = 0; i < size; i++) {
+            FloatVector mulA = FloatVector.broadcast(fspecies, (float) i);
+            FloatVector mulB = FloatVector.broadcast(fspecies, (float) i + 1);
+            FloatVector addend = FloatVector.broadcast(fspecies, (float) i + 2);
+            sum = sum.lanewise(VectorOperators.ADD, mulA.lanewise(VectorOperators.FMA, mulB, addend));  // accumulate an FMA of three broadcasts
+        }
+        return sum.lane(0);  // every operand is a broadcast, so all lanes carry the same value
+    }
+
+    @Benchmark
+    public double pushBroadcastsAcrossVectorKernel2() {
+        DoubleVector res = DoubleVector.broadcast(dspecies, 0.0);  // double literal, matching the element type
+        for (int i = 0; i < size; i++) {
+            DoubleVector vec1 = DoubleVector.broadcast(dspecies, (double) i);
+            res = res.lanewise(VectorOperators.ADD, vec1.lanewise(VectorOperators.SQRT));  // accumulate SQRT of a broadcast
+        }
+        return res.lane(0);  // every operand is a broadcast, so all lanes carry the same value
+    }
+
+    // int: bcast(a) MUL (bcast(b) MUL (bcast(c) MUL array)), with a = i, b = i + 1, c = i + 2
+    @Benchmark
+    public void reassociateIntMulChainedBroadcasts() {
+        for (int i = 0, end = ispecies.loopBound(size); i < end; i += ispecies.length()) {
+            IntVector a = IntVector.broadcast(ispecies, i);
+            IntVector b = IntVector.broadcast(ispecies, i + 1);
+            IntVector c = IntVector.broadcast(ispecies, i + 2);
+            IntVector data = IntVector.fromArray(ispecies, intIn1, i);
+            // Multiply from the innermost pair outwards, one broadcast per step.
+            IntVector inner = c.lanewise(VectorOperators.MUL, data);
+            IntVector middle = b.lanewise(VectorOperators.MUL, inner);
+            a.lanewise(VectorOperators.MUL, middle).intoArray(intOut, i);
+        }
+    }
+
+    // int: (bcast(a) MUL bcast(b)) MUL (bcast(c) MUL array), with a = i, b = i + 1, c = i + 2
+    @Benchmark
+    public void reassociateIntMulBalancedBroadcasts() {
+        for (int i = 0, end = ispecies.loopBound(size); i < end; i += ispecies.length()) {
+            // Left operand: product of two broadcasts, no array data involved.
+            IntVector a = IntVector.broadcast(ispecies, i);
+            IntVector b = IntVector.broadcast(ispecies, i + 1);
+            IntVector scalarPair = a.lanewise(VectorOperators.MUL, b);
+
+            // Right operand: a broadcast multiplied by the loaded input vector.
+            IntVector c = IntVector.broadcast(ispecies, i + 2);
+            IntVector loaded = IntVector.fromArray(ispecies, intIn1, i);
+            IntVector scaled = c.lanewise(VectorOperators.MUL, loaded);
+
+            // Combine both halves and store the result.
+            scalarPair.lanewise(VectorOperators.MUL, scaled).intoArray(intOut, i);
+        }
+    }
+
+    // long: bcast(a) MUL (bcast(b) MUL (bcast(c) MUL array)), with a = i, b = i + 1, c = i + 2
+    @Benchmark
+    public void reassociateLongMulChainedBroadcasts() {
+        for (int i = 0, end = lspecies.loopBound(size); i < end; i += lspecies.length()) {
+            LongVector a = LongVector.broadcast(lspecies, (long) i);
+            LongVector b = LongVector.broadcast(lspecies, (long) (i + 1));
+            LongVector c = LongVector.broadcast(lspecies, (long) (i + 2));
+            LongVector data = LongVector.fromArray(lspecies, longIn1, i);
+            // Multiply from the innermost pair outwards, one broadcast per step.
+            LongVector inner = c.lanewise(VectorOperators.MUL, data);
+            LongVector middle = b.lanewise(VectorOperators.MUL, inner);
+            a.lanewise(VectorOperators.MUL, middle).intoArray(longOut, i);
+        }
+    }
+
+    // long: (bcast(a) MUL bcast(b)) MUL (bcast(c) MUL array), with a = i, b = i + 1, c = i + 2
+    @Benchmark
+    public void reassociateLongMulBalancedBroadcasts() {
+        for (int i = 0, end = lspecies.loopBound(size); i < end; i += lspecies.length()) {
+            // Left operand: product of two broadcasts, no array data involved.
+            LongVector a = LongVector.broadcast(lspecies, (long) i);
+            LongVector b = LongVector.broadcast(lspecies, (long) (i + 1));
+            LongVector scalarPair = a.lanewise(VectorOperators.MUL, b);
+
+            // Right operand: a broadcast multiplied by the loaded input vector.
+            LongVector c = LongVector.broadcast(lspecies, (long) (i + 2));
+            LongVector loaded = LongVector.fromArray(lspecies, longIn1, i);
+            LongVector scaled = c.lanewise(VectorOperators.MUL, loaded);
+
+            // Combine both halves and store the result.
+            scalarPair.lanewise(VectorOperators.MUL, scaled).intoArray(longOut, i);
+        }
+    }
+
+    // short: bcast(a) MUL (bcast(b) MUL (bcast(c) MUL array)), with a = i, b = i + 1, c = i + 2 narrowed to short
+    @Benchmark
+    public void reassociateShortMulChainedBroadcasts() {
+        for (int i = 0, end = sspecies.loopBound(size); i < end; i += sspecies.length()) {
+            ShortVector a = ShortVector.broadcast(sspecies, (short) i);
+            ShortVector b = ShortVector.broadcast(sspecies, (short) (i + 1));
+            ShortVector c = ShortVector.broadcast(sspecies, (short) (i + 2));
+            ShortVector data = ShortVector.fromArray(sspecies, shortIn1, i);
+            // Multiply from the innermost pair outwards, one broadcast per step.
+            ShortVector inner = c.lanewise(VectorOperators.MUL, data);
+            ShortVector middle = b.lanewise(VectorOperators.MUL, inner);
+            a.lanewise(VectorOperators.MUL, middle).intoArray(shortOut, i);
+        }
+    }
+
+    // short: (bcast(a) MUL bcast(b)) MUL (bcast(c) MUL array), with a = i, b = i + 1, c = i + 2 narrowed to short
+    @Benchmark
+    public void reassociateShortMulBalancedBroadcasts() {
+        for (int i = 0, end = sspecies.loopBound(size); i < end; i += sspecies.length()) {
+            // Left operand: product of two broadcasts, no array data involved.
+            ShortVector a = ShortVector.broadcast(sspecies, (short) i);
+            ShortVector b = ShortVector.broadcast(sspecies, (short) (i + 1));
+            ShortVector scalarPair = a.lanewise(VectorOperators.MUL, b);
+
+            // Right operand: a broadcast multiplied by the loaded input vector.
+            ShortVector c = ShortVector.broadcast(sspecies, (short) (i + 2));
+            ShortVector loaded = ShortVector.fromArray(sspecies, shortIn1, i);
+            ShortVector scaled = c.lanewise(VectorOperators.MUL, loaded);
+
+            // Combine both halves and store the result.
+            scalarPair.lanewise(VectorOperators.MUL, scaled).intoArray(shortOut, i);
+        }
+    }
+
+    // byte: bcast(a) MUL (bcast(b) MUL (bcast(c) MUL array)), with a = i, b = i + 1, c = i + 2 narrowed to byte
+    @Benchmark
+    public void reassociateByteMulChainedBroadcasts() {
+        for (int i = 0, end = bspecies.loopBound(size); i < end; i += bspecies.length()) {
+            ByteVector a = ByteVector.broadcast(bspecies, (byte) i);
+            ByteVector b = ByteVector.broadcast(bspecies, (byte) (i + 1));
+            ByteVector c = ByteVector.broadcast(bspecies, (byte) (i + 2));
+            ByteVector data = ByteVector.fromArray(bspecies, byteIn1, i);
+            // Multiply from the innermost pair outwards, one broadcast per step.
+            ByteVector inner = c.lanewise(VectorOperators.MUL, data);
+            ByteVector middle = b.lanewise(VectorOperators.MUL, inner);
+            a.lanewise(VectorOperators.MUL, middle).intoArray(byteOut, i);
+        }
+    }
+
+    // byte: (bcast(a) MUL bcast(b)) MUL (bcast(c) MUL array), with a = i, b = i + 1, c = i + 2 narrowed to byte
+    @Benchmark
+    public void reassociateByteMulBalancedBroadcasts() {
+        for (int i = 0, end = bspecies.loopBound(size); i < end; i += bspecies.length()) {
+            // Left operand: product of two broadcasts, no array data involved.
+            ByteVector a = ByteVector.broadcast(bspecies, (byte) i);
+            ByteVector b = ByteVector.broadcast(bspecies, (byte) (i + 1));
+            ByteVector scalarPair = a.lanewise(VectorOperators.MUL, b);
+
+            // Right operand: a broadcast multiplied by the loaded input vector.
+            ByteVector c = ByteVector.broadcast(bspecies, (byte) (i + 2));
+            ByteVector loaded = ByteVector.fromArray(bspecies, byteIn1, i);
+            ByteVector scaled = c.lanewise(VectorOperators.MUL, loaded);
+
+            // Combine both halves and store the result.
+            scalarPair.lanewise(VectorOperators.MUL, scaled).intoArray(byteOut, i);
+        }
+    }
+}