8358521: Optimize vector operations by reassociating broadcasted inputs

Reviewed-by: epeter, vlivanov, xgong
This commit is contained in:
Jatin Bhateja 2026-05-12 06:18:37 +00:00
parent 776bb729e8
commit 7ff7efd59d
7 changed files with 2343 additions and 22 deletions


@@ -520,7 +520,12 @@ class SqrtDNode : public Node {
public:
SqrtDNode(Compile* C, Node *c, Node *in1) : Node(c, in1) {
init_flags(Flag_is_expensive);
C->add_expensive_node(this);
// Only treat the node as expensive if a control input is set, because it
// might be created from a SqrtVDNode in VectorNode::push_through_replicate,
// which has no control input.
if (c != nullptr) {
C->add_expensive_node(this);
}
}
virtual int Opcode() const;
const Type *bottom_type() const { return Type::DOUBLE; }
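For context, here is a minimal Java sketch (class and method names are illustrative) of a source shape that can now create a SqrtD without a control input: push_through_replicate rewrites the square root of a broadcast into a broadcast of a scalar square root.

import jdk.incubator.vector.*;

class SqrtBroadcastSketch {
    static final VectorSpecies<Double> SP = DoubleVector.SPECIES_PREFERRED;

    // SqrtVD(Replicate(x)) may be rewritten to Replicate(SqrtD(x)); the new
    // SqrtD node has no control input, so it is not registered as expensive.
    static DoubleVector sqrtOfBroadcast(double x) {
        return DoubleVector.broadcast(SP, x).lanewise(VectorOperators.SQRT);
    }
}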


@@ -22,10 +22,12 @@
*/
#include "memory/allocation.inline.hpp"
#include "opto/addnode.hpp"
#include "opto/c2_globals.hpp"
#include "opto/compile.hpp"
#include "opto/connode.hpp"
#include "opto/convertnode.hpp"
#include "opto/divnode.hpp"
#include "opto/mulnode.hpp"
#include "opto/subnode.hpp"
#include "opto/vectornode.hpp"
@@ -290,7 +292,146 @@ int VectorNode::opcode(int sopc, BasicType bt) {
assert(!VectorNode::is_convert_opcode(sopc),
"Convert node %s should be processed by VectorCastNode::opcode()",
NodeClassNames[sopc]);
return 0; // Unimplemented
return 0; // not handled
}
}
// Return the scalar opcode for the specified vector opcode and basic type.
// Returns 0 if not handled.
int VectorNode::scalar_opcode(int vopc, BasicType bt) {
switch (vopc) {
case Op_AddVB:
case Op_AddVS:
case Op_AddVI:
return Op_AddI;
case Op_AddVL:
return Op_AddL;
case Op_AddVF:
return Op_AddF;
case Op_AddVD:
return Op_AddD;
case Op_SubVB:
case Op_SubVS:
case Op_SubVI:
return Op_SubI;
case Op_SubVL:
return Op_SubL;
case Op_SubVF:
return Op_SubF;
case Op_SubVD:
return Op_SubD;
case Op_MulVB:
case Op_MulVS:
case Op_MulVI:
return Op_MulI;
case Op_MulVL:
return Op_MulL;
case Op_MulVF:
return Op_MulF;
case Op_MulVD:
return Op_MulD;
case Op_DivVF:
return Op_DivF;
case Op_DivVD:
return Op_DivD;
case Op_AndV:
switch (bt) {
case T_BOOLEAN:
case T_CHAR:
case T_BYTE:
case T_SHORT:
case T_INT:
return Op_AndI;
case T_LONG:
return Op_AndL;
default:
return 0;
}
case Op_OrV:
switch (bt) {
case T_BOOLEAN:
case T_CHAR:
case T_BYTE:
case T_SHORT:
case T_INT:
return Op_OrI;
case T_LONG:
return Op_OrL;
default:
return 0;
}
case Op_XorV:
switch (bt) {
case T_BOOLEAN:
case T_CHAR:
case T_BYTE:
case T_SHORT:
case T_INT:
return Op_XorI;
case T_LONG:
return Op_XorL;
default:
return 0;
}
case Op_MinV:
switch (bt) {
case T_BOOLEAN:
case T_CHAR:
// unsigned, not supported for Min
return 0;
case T_BYTE:
case T_SHORT:
case T_INT:
return Op_MinI;
case T_LONG:
return Op_MinL;
case T_FLOAT:
return Op_MinF;
case T_DOUBLE:
return Op_MinD;
default:
return 0;
}
case Op_MaxV:
switch (bt) {
case T_BOOLEAN:
case T_CHAR:
// unsigned, not supported for Max
return 0;
case T_BYTE:
case T_SHORT:
case T_INT:
return Op_MaxI;
case T_LONG:
return Op_MaxL;
case T_FLOAT:
return Op_MaxF;
case T_DOUBLE:
return Op_MaxD;
default:
return 0;
}
case Op_SqrtVD:
return Op_SqrtD;
case Op_SqrtVF:
return Op_SqrtF;
case Op_FmaVF:
return Op_FmaF;
case Op_FmaVD:
return Op_FmaD;
default:
return 0; // not handled
}
}
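As a hedged illustration of this mapping (the rewrite happens inside C2, not in Java source): an AndV over two broadcast long vectors corresponds to Op_AndL, so the whole operation can be strength-reduced to one scalar AND plus one broadcast.

import jdk.incubator.vector.*;

class ScalarOpcodeSketch {
    static final VectorSpecies<Long> SP = LongVector.SPECIES_PREFERRED;

    // AndV(Replicate(a), Replicate(b)) with element type T_LONG maps to
    // Op_AndL, so C2 may emit Replicate(a & b) instead of a vector AND.
    static LongVector andOfBroadcasts(long a, long b) {
        return LongVector.broadcast(SP, a)
                         .lanewise(VectorOperators.AND, LongVector.broadcast(SP, b));
    }
}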
@@ -984,17 +1125,9 @@ static Node* ideal_partial_operations(PhaseGVN* phase, Node* node, const TypeVec
}
}
bool VectorNode::should_swap_inputs_to_help_global_value_numbering() {
// Predicated vector operations are sensitive to the ordering of their inputs.
// When the mask corresponding to a vector lane is false, the result of the
// operation is the corresponding lane of its first operand, i.e.
// RES = VEC1.lanewise(OPER, VEC2, MASK) is semantically equivalent to
// RES = BLEND(VEC1, VEC1.lanewise(OPER, VEC2), MASK)
if (is_predicated_vector()) {
return false;
}
switch(Opcode()) {
// Check if the vector operation is commutative (assuming that it is not predicated/masked).
static bool is_commutative_vector_operation(int opcode) {
switch(opcode) {
case Op_AddVB:
case Op_AddVS:
case Op_AddVI:
@@ -1022,18 +1155,228 @@ bool VectorNode::should_swap_inputs_to_help_global_value_numbering() {
case Op_XorVMask:
case Op_SaturatingAddV:
assert(req() == 3, "Must be a binary operation");
// For non-predicated commutative operations, sort the inputs in
// increasing order of node indices.
if (in(1)->_idx > in(2)->_idx) {
return true;
}
// fallthrough
return true;
default:
return false;
}
}
bool VectorNode::should_swap_inputs_to_help_global_value_numbering() {
// Predicated vector operations are sensitive to the ordering of their inputs.
// When the mask corresponding to a vector lane is false, the result of the
// operation is the corresponding lane of its first operand, i.e.
// RES = VEC1.lanewise(OPER, VEC2, MASK) is semantically equivalent to
// RES = BLEND(VEC1, VEC1.lanewise(OPER, VEC2), MASK)
if (is_predicated_vector()) {
return false;
}
if (is_commutative_vector_operation(Opcode())) {
assert(req() == 3, "Must be a binary operation");
// For non-predicated commutative operations, sort the inputs in
// increasing order of node indices.
if (in(1)->_idx > in(2)->_idx) {
return true;
}
}
return false;
}
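A hedged Java sketch of why a canonical operand order helps value numbering (whether the two adds actually commonize depends on the surrounding IR):

import jdk.incubator.vector.*;

class GvnOrderSketch {
    static final VectorSpecies<Integer> SP = IntVector.SPECIES_PREFERRED;

    // AddVI(v, w) and AddVI(w, v) differ only in input order; after sorting
    // inputs by node index, the two nodes hash identically and GVN can keep
    // a single add.
    static IntVector sumBothOrders(IntVector v, IntVector w) {
        IntVector x = v.lanewise(VectorOperators.ADD, w);
        IntVector y = w.lanewise(VectorOperators.ADD, v);
        return x.lanewise(VectorOperators.XOR, y);
    }
}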
// Check whether we can push this vector op through replicate (all inputs are Replicate).
bool VectorNode::can_push_through_replicate(BasicType bt) {
if (scalar_opcode(Opcode(), bt) == 0) {
return false;
}
// Skip predicated vector operations for now; for masked lanes we must
// preserve the destination/first-source vector contents.
if (is_predicated_vector()) {
return false;
}
for (uint i = 1; i < req(); i++) {
if (in(i)->Opcode() != Op_Replicate) {
return false;
}
}
return true;
}
Node* VectorNode::make_scalar(Compile* c, int vopc, BasicType bt, Node* control, Node* in1, Node* in2, Node* in3) {
int sopc = scalar_opcode(vopc, bt);
assert(sopc != 0, "unhandled vector opcode %s", NodeClassNames[vopc]);
assert(opcode(sopc, bt) == vopc, "scalar_opcode and opcode must agree for %s", NodeClassNames[vopc]);
switch (sopc) {
case Op_AddI:
return new AddINode(in1, in2);
case Op_AddL:
return new AddLNode(in1, in2);
case Op_AddF:
return new AddFNode(in1, in2);
case Op_AddD:
return new AddDNode(in1, in2);
case Op_MulI:
return new MulINode(in1, in2);
case Op_MulL:
return new MulLNode(in1, in2);
case Op_MulF:
return new MulFNode(in1, in2);
case Op_MulD:
return new MulDNode(in1, in2);
case Op_AndI:
return new AndINode(in1, in2);
case Op_AndL:
return new AndLNode(in1, in2);
case Op_DivF:
return new DivFNode(control, in1, in2);
case Op_DivD:
return new DivDNode(control, in1, in2);
case Op_OrI:
return new OrINode(in1, in2);
case Op_OrL:
return new OrLNode(in1, in2);
case Op_XorI:
return new XorINode(in1, in2);
case Op_XorL:
return new XorLNode(in1, in2);
case Op_SubI:
return new SubINode(in1, in2);
case Op_SubL:
return new SubLNode(in1, in2);
case Op_SubF:
return new SubFNode(in1, in2);
case Op_SubD:
return new SubDNode(in1, in2);
case Op_MinI:
return new MinINode(in1, in2);
case Op_MinL:
return new MinLNode(c, in1, in2);
case Op_MinF:
return new MinFNode(in1, in2);
case Op_MinD:
return new MinDNode(in1, in2);
case Op_MaxI:
return new MaxINode(in1, in2);
case Op_MaxL:
return new MaxLNode(c, in1, in2);
case Op_MaxF:
return new MaxFNode(in1, in2);
case Op_MaxD:
return new MaxDNode(in1, in2);
case Op_SqrtF:
return new SqrtFNode(c, control, in1);
case Op_SqrtD:
return new SqrtDNode(c, control, in1);
case Op_FmaF:
return new FmaFNode(in1, in2, in3);
case Op_FmaD:
return new FmaDNode(in1, in2, in3);
default:
assert(false, "unexpected scalar opcode");
return nullptr;
}
}
// Rewires and creates a new ideal subgraph with the following connectivity:
// parent(child(cinput1, cinput2), pinput2)
Node* VectorNode::create_reassociated_node(Node* parent, Node* child, Node* cinput1, Node* cinput2,
Node* pinput2, PhaseGVN* phase) {
Node* cloned_child = child->clone();
cloned_child->set_req(1, cinput1);
cloned_child->set_req(2, cinput2);
cloned_child = phase->transform(cloned_child);
Node* cloned_parent = parent->clone();
cloned_parent->set_req(1, cloned_child);
cloned_parent->set_req(2, pinput2);
return cloned_parent;
}
// Try to reassociate commutative vector operations using the following ideal
// transformation. This facilitates strength-reducing a vector operation whose
// inputs are all replicated into a scalar operation.
//
// VectorOp (Replicate INP1) (VectorOp (Replicate INP2) INP3) =>
// VectorOp (VectorOp (Replicate INP1) (Replicate INP2)) INP3
//
Node* VectorNode::reassociate_vector_operation(PhaseGVN* phase) {
// Only reassociate integral vector operations; floating-point reassociation
// is not value-preserving.
if (!is_integral_type(vect_type()->element_basic_type())) {
return nullptr;
}
// Only reassociate commutative vector operations.
if (!is_commutative_vector_operation(Opcode())) {
return nullptr;
}
Node* in1 = in(1);
Node* in2 = in(2);
if (in2->Opcode() == Op_Replicate && in1->Opcode() == Opcode()) {
swap(in1, in2);
}
if (in1->Opcode() != Op_Replicate || in2->Opcode() != Opcode()) {
return nullptr;
}
// Skip predicated vector operations; mask semantics prevent reassociation.
if (is_predicated_vector() || in2->as_Vector()->is_predicated_vector()) {
return nullptr;
}
Node* in2_1 = in2->in(1);
Node* in2_2 = in2->in(2);
if (in2_1->Opcode() == Op_Replicate) {
return create_reassociated_node(this, in2, in1, in2_1, in2_2, phase);
} else if (in2_2->Opcode() == Op_Replicate) {
return create_reassociated_node(this, in2, in1, in2_2, in2_1, phase);
}
return nullptr;
}
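A hedged sketch of a source shape this transform targets, mirroring the IR tests added below (the commented rewrite steps are illustrative):

import jdk.incubator.vector.*;

class ReassociateSketch {
    static final VectorSpecies<Integer> SP = IntVector.SPECIES_PREFERRED;

    // MulVI(Replicate(a), MulVI(Replicate(b), vec))
    //   => MulVI(MulVI(Replicate(a), Replicate(b)), vec)  (this transform)
    //   => MulVI(Replicate(MulI(a, b)), vec)              (push_through_replicate)
    static IntVector scaleTwice(int a, int b, int[] arr) {
        return IntVector.broadcast(SP, a)
                        .lanewise(VectorOperators.MUL,
                                  IntVector.broadcast(SP, b)
                                           .lanewise(VectorOperators.MUL,
                                                     IntVector.fromArray(SP, arr, 0)));
    }
}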
// Convert a vector operation whose inputs are all Replicate nodes into a scalar
// operation using the following ideal transformation.
//
// VectorOp (Replicate INP1, Replicate INP2) =>
// Replicate (ScalarOp INP1, INP2)
//
Node* VectorNode::push_through_replicate(PhaseGVN* phase) {
BasicType bt = vect_type()->element_basic_type();
if (!can_push_through_replicate(bt)) {
return nullptr;
}
assert(req() >= 2 && req() <= 4, "unexpected req() %u for %s", req(), NodeClassNames[Opcode()]);
Node* sinp1 = nullptr;
Node* sinp2 = nullptr;
Node* sinp3 = nullptr;
assert(in(1)->Opcode() == Op_Replicate, "");
sinp1 = in(1)->in(1);
if (req() > 2) {
assert(in(2)->Opcode() == Op_Replicate, "");
sinp2 = in(2)->in(1);
}
if (req() > 3) {
assert(in(3)->Opcode() == Op_Replicate, "");
sinp3 = in(3)->in(1);
}
Node* sop = make_scalar(phase->C, Opcode(), bt, in(0), sinp1, sinp2, sinp3);
if (sop == nullptr) {
return nullptr;
}
sop = phase->transform(sop);
return new ReplicateNode(sop, vect_type());
}
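A hedged sketch of the simplest shape this handles: a vector operation whose inputs are all broadcasts of scalars becomes one scalar operation and one broadcast.

import jdk.incubator.vector.*;

class PushThroughReplicateSketch {
    static final VectorSpecies<Integer> SP = IntVector.SPECIES_PREFERRED;

    // AddVI(Replicate(a), Replicate(b)) => Replicate(AddI(a, b)): one scalar
    // add plus one broadcast replaces two broadcasts and a vector add.
    static IntVector addBroadcasts(int a, int b) {
        return IntVector.broadcast(SP, a)
                        .lanewise(VectorOperators.ADD, IntVector.broadcast(SP, b));
    }
}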
Node* VectorNode::Ideal(PhaseGVN* phase, bool can_reshape) {
Node* n = ideal_partial_operations(phase, this, vect_type());
if (n != nullptr) {
@@ -1044,7 +1387,13 @@ Node* VectorNode::Ideal(PhaseGVN* phase, bool can_reshape) {
if (should_swap_inputs_to_help_global_value_numbering()) {
swap_edges(1, 2);
}
return nullptr;
n = push_through_replicate(phase);
if (n != nullptr) {
return n;
}
return reassociate_vector_operation(phase);
}
// Traverses a chain of VectorMaskCast nodes and returns the first non-VectorMaskCast node.
@@ -2094,7 +2443,7 @@ Node* FmaVNode::Ideal(PhaseGVN* phase, bool can_reshape) {
swap_edges(1, 2);
return this;
}
return nullptr;
return VectorNode::Ideal(phase, can_reshape);
}
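With FmaVNode::Ideal now delegating to VectorNode::Ideal, an FMA whose inputs are all broadcasts can also be strength-reduced; a hedged Java sketch:

import jdk.incubator.vector.*;

class FmaBroadcastSketch {
    static final VectorSpecies<Float> SP = FloatVector.SPECIES_PREFERRED;

    // FmaVF(Replicate(a), Replicate(b), Replicate(c))
    //   => Replicate(FmaF(a, b, c)) via push_through_replicate.
    static FloatVector fmaOfBroadcasts(float a, float b, float c) {
        return FloatVector.broadcast(SP, a)
                          .lanewise(VectorOperators.FMA,
                                    FloatVector.broadcast(SP, b),
                                    FloatVector.broadcast(SP, c));
    }
}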
// Generate other vector nodes to implement the masked/non-masked vector negation.


@@ -146,12 +146,20 @@ class VectorNode : public TypeNode {
static bool is_minmax_opcode(int opc);
bool should_swap_inputs_to_help_global_value_numbering();
Node* reassociate_vector_operation(PhaseGVN* phase);
static Node* create_reassociated_node(Node* parent, Node* child, Node* cinput1, Node* cinput2,
Node* pinput2, PhaseGVN* phase);
static bool is_vshift_cnt_opcode(int opc);
static bool is_rotate_opcode(int opc);
static int opcode(int sopc, BasicType bt); // scalar_opc -> vector_opc
static int scalar_opcode(int vopc, BasicType bt); // vector_opc -> scalar_opc, 0 if not handled
static Node* make_scalar(Compile* c, int vopc, BasicType bt, Node* control, Node* in1, Node* in2, Node* in3);
bool can_push_through_replicate(BasicType bt);
Node* push_through_replicate(PhaseGVN* phase);
static int shift_count_opcode(int opc);


@@ -233,6 +233,11 @@ public class IRNode {
beforeMatchingNameRegex(ADD_P, "AddP");
}
public static final String ADD_D = PREFIX + "ADD_D" + POSTFIX;
static {
beforeMatchingNameRegex(ADD_D, "AddD");
}
public static final String ADD_VD = VECTOR_PREFIX + "ADD_VD" + POSTFIX;
static {
vectorNode(ADD_VD, "AddVD", TYPE_DOUBLE);
@@ -763,11 +768,21 @@ public class IRNode {
vectorNode(DIV_VHF, "DivVHF", TYPE_SHORT);
}
public static final String DIV_F = PREFIX + "DIV_F" + POSTFIX;
static {
beforeMatchingNameRegex(DIV_F, "DivF");
}
public static final String DIV_VF = VECTOR_PREFIX + "DIV_VF" + POSTFIX;
static {
vectorNode(DIV_VF, "DivVF", TYPE_FLOAT);
}
public static final String DIV_D = PREFIX + "DIV_D" + POSTFIX;
static {
beforeMatchingNameRegex(DIV_D, "DivD");
}
public static final String DIV_VD = VECTOR_PREFIX + "DIV_VD" + POSTFIX;
static {
vectorNode(DIV_VD, "DivVD", TYPE_DOUBLE);

File diff suppressed because it is too large.


@@ -0,0 +1,605 @@
/*
* Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
* @test
* @bug 8358521
* @summary Test reassociation of broadcasted inputs across vector operations
* @modules jdk.incubator.vector
* @library /test/lib /
* @run driver compiler.vectorapi.TestVectorReassociations
*/
package compiler.vectorapi;
import compiler.lib.ir_framework.*;
import jdk.incubator.vector.*;
import java.util.stream.IntStream;
/**
* Tests for the reassociation transform:
* VectorOp(broadcast(a), VectorOp(broadcast(b), array))
* => VectorOp(broadcast(ScalarOp(a, b)), array)
*/
public class TestVectorReassociations {
public static void main(String[] args) {
TestFramework.runWithFlags("--add-modules=jdk.incubator.vector");
}
/* =======================
* INT
* ======================= */
static final VectorSpecies<Integer> ISP = IntVector.SPECIES_PREFERRED;
static int[] intIn = IntStream.range(0, IntVector.SPECIES_PREFERRED.length()).toArray();
static int[] intOut = new int[IntVector.SPECIES_PREFERRED.length()];
static int ia = 17, ib = 9;
// --- INT ADD ---
// bcast(a) ADD (bcast(b) ADD array)
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.ADD_VI, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_I, ">= 1",
IRNode.REPLICATE_I, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_int_add_reassociation_pattern1() {
IntVector.broadcast(ISP, ia)
.lanewise(VectorOperators.ADD,
IntVector.broadcast(ISP, ib)
.lanewise(VectorOperators.ADD,
IntVector.fromArray(ISP, intIn, 0)))
.intoArray(intOut, 0);
}
// bcast(a) ADD (array ADD bcast(b))
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.ADD_VI, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_I, ">= 1",
IRNode.REPLICATE_I, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_int_add_reassociation_pattern2() {
IntVector.broadcast(ISP, ia)
.lanewise(VectorOperators.ADD,
IntVector.fromArray(ISP, intIn, 0)
.lanewise(VectorOperators.ADD,
IntVector.broadcast(ISP, ib)))
.intoArray(intOut, 0);
}
// (bcast(a) ADD array) ADD bcast(b)
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.ADD_VI, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_I, ">= 1",
IRNode.REPLICATE_I, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_int_add_reassociation_pattern3() {
IntVector.broadcast(ISP, ia)
.lanewise(VectorOperators.ADD,
IntVector.fromArray(ISP, intIn, 0))
.lanewise(VectorOperators.ADD,
IntVector.broadcast(ISP, ib))
.intoArray(intOut, 0);
}
// (array ADD bcast(a)) ADD bcast(b)
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.ADD_VI, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_I, ">= 1",
IRNode.REPLICATE_I, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_int_add_reassociation_pattern4() {
IntVector.fromArray(ISP, intIn, 0)
.lanewise(VectorOperators.ADD,
IntVector.broadcast(ISP, ia))
.lanewise(VectorOperators.ADD,
IntVector.broadcast(ISP, ib))
.intoArray(intOut, 0);
}
// --- INT MUL ---
// bcast(a) MUL (bcast(b) MUL array)
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.MUL_VI, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_I, ">= 1",
IRNode.REPLICATE_I, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_int_mul_reassociation_pattern1() {
IntVector.broadcast(ISP, ia)
.lanewise(VectorOperators.MUL,
IntVector.broadcast(ISP, ib)
.lanewise(VectorOperators.MUL,
IntVector.fromArray(ISP, intIn, 0)))
.intoArray(intOut, 0);
}
// bcast(a) MUL (array MUL bcast(b))
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.MUL_VI, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_I, ">= 1",
IRNode.REPLICATE_I, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_int_mul_reassociation_pattern2() {
IntVector.broadcast(ISP, ia)
.lanewise(VectorOperators.MUL,
IntVector.fromArray(ISP, intIn, 0)
.lanewise(VectorOperators.MUL,
IntVector.broadcast(ISP, ib)))
.intoArray(intOut, 0);
}
// (bcast(a) MUL array) MUL bcast(b)
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.MUL_VI, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_I, ">= 1",
IRNode.REPLICATE_I, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_int_mul_reassociation_pattern3() {
IntVector.broadcast(ISP, ia)
.lanewise(VectorOperators.MUL,
IntVector.fromArray(ISP, intIn, 0))
.lanewise(VectorOperators.MUL,
IntVector.broadcast(ISP, ib))
.intoArray(intOut, 0);
}
// (array MUL bcast(a)) MUL bcast(b)
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.MUL_VI, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_I, ">= 1",
IRNode.REPLICATE_I, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_int_mul_reassociation_pattern4() {
IntVector.fromArray(ISP, intIn, 0)
.lanewise(VectorOperators.MUL,
IntVector.broadcast(ISP, ia))
.lanewise(VectorOperators.MUL,
IntVector.broadcast(ISP, ib))
.intoArray(intOut, 0);
}
/* =======================
* LONG
* ======================= */
static final VectorSpecies<Long> LSP = LongVector.SPECIES_PREFERRED;
static long[] longIn;
static long[] longOut;
static long la = 17L, lb = 9L;
static {
longIn = new long[LSP.length()];
longOut = new long[LSP.length()];
for (int i = 0; i < LSP.length(); i++) {
longIn[i] = (long) i;
}
}
// --- LONG ADD ---
// bcast(a) ADD (bcast(b) ADD array)
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.ADD_VL, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_L, ">= 1",
IRNode.REPLICATE_L, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_long_add_reassociation_pattern1() {
LongVector.broadcast(LSP, la)
.lanewise(VectorOperators.ADD,
LongVector.broadcast(LSP, lb)
.lanewise(VectorOperators.ADD,
LongVector.fromArray(LSP, longIn, 0)))
.intoArray(longOut, 0);
}
// bcast(a) ADD (array ADD bcast(b))
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.ADD_VL, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_L, ">= 1",
IRNode.REPLICATE_L, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_long_add_reassociation_pattern2() {
LongVector.broadcast(LSP, la)
.lanewise(VectorOperators.ADD,
LongVector.fromArray(LSP, longIn, 0)
.lanewise(VectorOperators.ADD,
LongVector.broadcast(LSP, lb)))
.intoArray(longOut, 0);
}
// (bcast(a) ADD array) ADD bcast(b)
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.ADD_VL, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_L, ">= 1",
IRNode.REPLICATE_L, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_long_add_reassociation_pattern3() {
LongVector.broadcast(LSP, la)
.lanewise(VectorOperators.ADD,
LongVector.fromArray(LSP, longIn, 0))
.lanewise(VectorOperators.ADD,
LongVector.broadcast(LSP, lb))
.intoArray(longOut, 0);
}
// (array ADD bcast(a)) ADD bcast(b)
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.ADD_VL, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_L, ">= 1",
IRNode.REPLICATE_L, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_long_add_reassociation_pattern4() {
LongVector.fromArray(LSP, longIn, 0)
.lanewise(VectorOperators.ADD,
LongVector.broadcast(LSP, la))
.lanewise(VectorOperators.ADD,
LongVector.broadcast(LSP, lb))
.intoArray(longOut, 0);
}
// --- LONG MUL ---
// bcast(a) MUL (bcast(b) MUL array)
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.MUL_VL, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_L, ">= 1",
IRNode.REPLICATE_L, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_long_mul_reassociation_pattern1() {
LongVector.broadcast(LSP, la)
.lanewise(VectorOperators.MUL,
LongVector.broadcast(LSP, lb)
.lanewise(VectorOperators.MUL,
LongVector.fromArray(LSP, longIn, 0)))
.intoArray(longOut, 0);
}
// bcast(a) MUL (array MUL bcast(b))
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.MUL_VL, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_L, ">= 1",
IRNode.REPLICATE_L, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_long_mul_reassociation_pattern2() {
LongVector.broadcast(LSP, la)
.lanewise(VectorOperators.MUL,
LongVector.fromArray(LSP, longIn, 0)
.lanewise(VectorOperators.MUL,
LongVector.broadcast(LSP, lb)))
.intoArray(longOut, 0);
}
// (bcast(a) MUL array) MUL bcast(b)
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.MUL_VL, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_L, ">= 1",
IRNode.REPLICATE_L, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_long_mul_reassociation_pattern3() {
LongVector.broadcast(LSP, la)
.lanewise(VectorOperators.MUL,
LongVector.fromArray(LSP, longIn, 0))
.lanewise(VectorOperators.MUL,
LongVector.broadcast(LSP, lb))
.intoArray(longOut, 0);
}
// (array MUL bcast(a)) MUL bcast(b)
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.MUL_VL, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_L, ">= 1",
IRNode.REPLICATE_L, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_long_mul_reassociation_pattern4() {
LongVector.fromArray(LSP, longIn, 0)
.lanewise(VectorOperators.MUL,
LongVector.broadcast(LSP, la))
.lanewise(VectorOperators.MUL,
LongVector.broadcast(LSP, lb))
.intoArray(longOut, 0);
}
/* =======================
* SHORT
* ======================= */
static final VectorSpecies<Short> SSP = ShortVector.SPECIES_PREFERRED;
static short[] shortIn;
static short[] shortOut;
static short sa = 17, sb = 9;
static {
shortIn = new short[SSP.length()];
shortOut = new short[SSP.length()];
for (int i = 0; i < SSP.length(); i++) {
shortIn[i] = (short) i;
}
}
// --- SHORT ADD ---
// bcast(a) ADD (bcast(b) ADD array)
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.ADD_VS, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_I, ">= 1",
IRNode.REPLICATE_S, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_short_add_reassociation_pattern1() {
ShortVector.broadcast(SSP, sa)
.lanewise(VectorOperators.ADD,
ShortVector.broadcast(SSP, sb)
.lanewise(VectorOperators.ADD,
ShortVector.fromArray(SSP, shortIn, 0)))
.intoArray(shortOut, 0);
}
// bcast(a) ADD (array ADD bcast(b))
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.ADD_VS, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_I, ">= 1",
IRNode.REPLICATE_S, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_short_add_reassociation_pattern2() {
ShortVector.broadcast(SSP, sa)
.lanewise(VectorOperators.ADD,
ShortVector.fromArray(SSP, shortIn, 0)
.lanewise(VectorOperators.ADD,
ShortVector.broadcast(SSP, sb)))
.intoArray(shortOut, 0);
}
// (bcast(a) ADD array) ADD bcast(b)
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.ADD_VS, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_I, ">= 1",
IRNode.REPLICATE_S, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_short_add_reassociation_pattern3() {
ShortVector.broadcast(SSP, sa)
.lanewise(VectorOperators.ADD,
ShortVector.fromArray(SSP, shortIn, 0))
.lanewise(VectorOperators.ADD,
ShortVector.broadcast(SSP, sb))
.intoArray(shortOut, 0);
}
// (array ADD bcast(a)) ADD bcast(b)
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.ADD_VS, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_I, ">= 1",
IRNode.REPLICATE_S, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_short_add_reassociation_pattern4() {
ShortVector.fromArray(SSP, shortIn, 0)
.lanewise(VectorOperators.ADD,
ShortVector.broadcast(SSP, sa))
.lanewise(VectorOperators.ADD,
ShortVector.broadcast(SSP, sb))
.intoArray(shortOut, 0);
}
// --- SHORT MUL ---
// bcast(a) MUL (bcast(b) MUL array)
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.MUL_VS, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_I, ">= 1",
IRNode.REPLICATE_S, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_short_mul_reassociation_pattern1() {
ShortVector.broadcast(SSP, sa)
.lanewise(VectorOperators.MUL,
ShortVector.broadcast(SSP, sb)
.lanewise(VectorOperators.MUL,
ShortVector.fromArray(SSP, shortIn, 0)))
.intoArray(shortOut, 0);
}
// bcast(a) MUL (array MUL bcast(b))
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.MUL_VS, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_I, ">= 1",
IRNode.REPLICATE_S, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_short_mul_reassociation_pattern2() {
ShortVector.broadcast(SSP, sa)
.lanewise(VectorOperators.MUL,
ShortVector.fromArray(SSP, shortIn, 0)
.lanewise(VectorOperators.MUL,
ShortVector.broadcast(SSP, sb)))
.intoArray(shortOut, 0);
}
// (bcast(a) MUL array) MUL bcast(b)
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.MUL_VS, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_I, ">= 1",
IRNode.REPLICATE_S, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_short_mul_reassociation_pattern3() {
ShortVector.broadcast(SSP, sa)
.lanewise(VectorOperators.MUL,
ShortVector.fromArray(SSP, shortIn, 0))
.lanewise(VectorOperators.MUL,
ShortVector.broadcast(SSP, sb))
.intoArray(shortOut, 0);
}
// (array MUL bcast(a)) MUL bcast(b)
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.MUL_VS, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_I, ">= 1",
IRNode.REPLICATE_S, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_short_mul_reassociation_pattern4() {
ShortVector.fromArray(SSP, shortIn, 0)
.lanewise(VectorOperators.MUL,
ShortVector.broadcast(SSP, sa))
.lanewise(VectorOperators.MUL,
ShortVector.broadcast(SSP, sb))
.intoArray(shortOut, 0);
}
/* =======================
* BYTE
* ======================= */
static final VectorSpecies<Byte> BSP = ByteVector.SPECIES_PREFERRED;
static byte[] byteIn;
static byte[] byteOut;
static byte ba = 17, bb = 9;
static {
byteIn = new byte[BSP.length()];
byteOut = new byte[BSP.length()];
for (int i = 0; i < BSP.length(); i++) {
byteIn[i] = (byte) i;
}
}
// --- BYTE ADD ---
// bcast(a) ADD (bcast(b) ADD array)
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.ADD_VB, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_I, ">= 1",
IRNode.REPLICATE_B, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_byte_add_reassociation_pattern1() {
ByteVector.broadcast(BSP, ba)
.lanewise(VectorOperators.ADD,
ByteVector.broadcast(BSP, bb)
.lanewise(VectorOperators.ADD,
ByteVector.fromArray(BSP, byteIn, 0)))
.intoArray(byteOut, 0);
}
// bcast(a) ADD (array ADD bcast(b))
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.ADD_VB, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_I, ">= 1",
IRNode.REPLICATE_B, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_byte_add_reassociation_pattern2() {
ByteVector.broadcast(BSP, ba)
.lanewise(VectorOperators.ADD,
ByteVector.fromArray(BSP, byteIn, 0)
.lanewise(VectorOperators.ADD,
ByteVector.broadcast(BSP, bb)))
.intoArray(byteOut, 0);
}
// (bcast(a) ADD array) ADD bcast(b)
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.ADD_VB, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_I, ">= 1",
IRNode.REPLICATE_B, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_byte_add_reassociation_pattern3() {
ByteVector.broadcast(BSP, ba)
.lanewise(VectorOperators.ADD,
ByteVector.fromArray(BSP, byteIn, 0))
.lanewise(VectorOperators.ADD,
ByteVector.broadcast(BSP, bb))
.intoArray(byteOut, 0);
}
// (array ADD bcast(a)) ADD bcast(b)
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.ADD_VB, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.ADD_I, ">= 1",
IRNode.REPLICATE_B, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_byte_add_reassociation_pattern4() {
ByteVector.fromArray(BSP, byteIn, 0)
.lanewise(VectorOperators.ADD,
ByteVector.broadcast(BSP, ba))
.lanewise(VectorOperators.ADD,
ByteVector.broadcast(BSP, bb))
.intoArray(byteOut, 0);
}
// --- BYTE MUL ---
// bcast(a) MUL (bcast(b) MUL array)
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.MUL_VB, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_I, ">= 1",
IRNode.REPLICATE_B, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_byte_mul_reassociation_pattern1() {
ByteVector.broadcast(BSP, ba)
.lanewise(VectorOperators.MUL,
ByteVector.broadcast(BSP, bb)
.lanewise(VectorOperators.MUL,
ByteVector.fromArray(BSP, byteIn, 0)))
.intoArray(byteOut, 0);
}
// bcast(a) MUL (array MUL bcast(b))
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.MUL_VB, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_I, ">= 1",
IRNode.REPLICATE_B, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_byte_mul_reassociation_pattern2() {
ByteVector.broadcast(BSP, ba)
.lanewise(VectorOperators.MUL,
ByteVector.fromArray(BSP, byteIn, 0)
.lanewise(VectorOperators.MUL,
ByteVector.broadcast(BSP, bb)))
.intoArray(byteOut, 0);
}
// (bcast(a) MUL array) MUL bcast(b)
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.MUL_VB, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_I, ">= 1",
IRNode.REPLICATE_B, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_byte_mul_reassociation_pattern3() {
ByteVector.broadcast(BSP, ba)
.lanewise(VectorOperators.MUL,
ByteVector.fromArray(BSP, byteIn, 0))
.lanewise(VectorOperators.MUL,
ByteVector.broadcast(BSP, bb))
.intoArray(byteOut, 0);
}
// (array MUL bcast(a)) MUL bcast(b)
@Test
@IR(applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
counts = { IRNode.MUL_VB, IRNode.VECTOR_SIZE_ANY, " 1 ", IRNode.MUL_I, ">= 1",
IRNode.REPLICATE_B, IRNode.VECTOR_SIZE_ANY, ">= 1" })
@Warmup(value = 10000)
static void test_byte_mul_reassociation_pattern4() {
ByteVector.fromArray(BSP, byteIn, 0)
.lanewise(VectorOperators.MUL,
ByteVector.broadcast(BSP, ba))
.lanewise(VectorOperators.MUL,
ByteVector.broadcast(BSP, bb))
.intoArray(byteOut, 0);
}
}


@@ -0,0 +1,239 @@
/*
* Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
package org.openjdk.bench.jdk.incubator.vector;
import java.util.concurrent.TimeUnit;
import java.util.Random;
import jdk.incubator.vector.*;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.infra.Blackhole;
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Thread)
@Fork(jvmArgs = {"--add-modules=jdk.incubator.vector"})
public class VectorReassociateBenchmark {
@Param({"1024", "2048"})
int size;
int [] intIn1;
int [] intOut;
long [] longIn1;
long [] longOut;
short [] shortIn1;
short [] shortOut;
byte [] byteIn1;
byte [] byteOut;
static final VectorSpecies<Float> fspecies = FloatVector.SPECIES_PREFERRED;
static final VectorSpecies<Double> dspecies = DoubleVector.SPECIES_PREFERRED;
static final VectorSpecies<Integer> ispecies = IntVector.SPECIES_PREFERRED;
static final VectorSpecies<Long> lspecies = LongVector.SPECIES_PREFERRED;
static final VectorSpecies<Short> sspecies = ShortVector.SPECIES_PREFERRED;
static final VectorSpecies<Byte> bspecies = ByteVector.SPECIES_PREFERRED;
@Setup(Level.Trial)
public void BmSetup() {
Random r = new Random(2048);
intIn1 = new int[size];
intOut = new int[size];
longIn1 = new long[size];
longOut = new long[size];
shortIn1 = new short[size];
shortOut = new short[size];
byteIn1 = new byte[size];
byteOut = new byte[size];
for (int i = 4; i < size; i++) {
intIn1[i] = r.nextInt();
longIn1[i] = r.nextLong();
shortIn1[i] = (short) r.nextInt();
byteIn1[i] = (byte) r.nextInt();
}
}
@Benchmark
public float pushBroadcastsAcrossVectorKernel1() {
FloatVector res = FloatVector.broadcast(fspecies, 0.0f);
for (int i = 0; i < size; i++) {
FloatVector vec1 = FloatVector.broadcast(fspecies, (float)i);
FloatVector vec2 = FloatVector.broadcast(fspecies, (float)i + 1);
FloatVector vec3 = FloatVector.broadcast(fspecies, (float)i + 2);
res = res.lanewise(VectorOperators.ADD, vec1.lanewise(VectorOperators.FMA, vec2, vec3));
}
return res.lane(0);
}
@Benchmark
public double pushBroadcastsAcrossVectorKernel2() {
DoubleVector res = DoubleVector.broadcast(dspecies, 0.0);
for (int i = 0; i < size; i++) {
DoubleVector vec1 = DoubleVector.broadcast(dspecies, (double)i);
res = res.lanewise(VectorOperators.ADD, vec1.lanewise(VectorOperators.SQRT));
}
return res.lane(0);
}
// int: bcast(a) MUL (bcast(b) MUL (bcast(c) MUL array))
@Benchmark
public void reassociateIntMulChainedBroadcasts() {
for (int i = 0; i < ispecies.loopBound(size); i += ispecies.length()) {
IntVector.broadcast(ispecies, i)
.lanewise(VectorOperators.MUL,
IntVector.broadcast(ispecies, i + 1)
.lanewise(VectorOperators.MUL,
IntVector.broadcast(ispecies, i + 2)
.lanewise(VectorOperators.MUL,
IntVector.fromArray(ispecies, intIn1, i))))
.intoArray(intOut, i);
}
}
// int: (bcast(a) MUL bcast(b)) MUL (bcast(c) MUL array)
@Benchmark
public void reassociateIntMulBalancedBroadcasts() {
for (int i = 0; i < ispecies.loopBound(size); i += ispecies.length()) {
IntVector left =
IntVector.broadcast(ispecies, i)
.lanewise(VectorOperators.MUL,
IntVector.broadcast(ispecies, i + 1));
IntVector right =
IntVector.broadcast(ispecies, i + 2)
.lanewise(VectorOperators.MUL,
IntVector.fromArray(ispecies, intIn1, i));
left.lanewise(VectorOperators.MUL, right)
.intoArray(intOut, i);
}
}
// long: bcast(a) MUL (bcast(b) MUL (bcast(c) MUL array))
@Benchmark
public void reassociateLongMulChainedBroadcasts() {
for (int i = 0; i < lspecies.loopBound(size); i += lspecies.length()) {
LongVector.broadcast(lspecies, (long) i)
.lanewise(VectorOperators.MUL,
LongVector.broadcast(lspecies, (long) (i + 1))
.lanewise(VectorOperators.MUL,
LongVector.broadcast(lspecies, (long) (i + 2))
.lanewise(VectorOperators.MUL,
LongVector.fromArray(lspecies, longIn1, i))))
.intoArray(longOut, i);
}
}
// long: (bcast(a) MUL bcast(b)) MUL (bcast(c) MUL array)
@Benchmark
public void reassociateLongMulBalancedBroadcasts() {
for (int i = 0; i < lspecies.loopBound(size); i += lspecies.length()) {
LongVector left =
LongVector.broadcast(lspecies, (long) i)
.lanewise(VectorOperators.MUL,
LongVector.broadcast(lspecies, (long) (i + 1)));
LongVector right =
LongVector.broadcast(lspecies, (long) (i + 2))
.lanewise(VectorOperators.MUL,
LongVector.fromArray(lspecies, longIn1, i));
left.lanewise(VectorOperators.MUL, right)
.intoArray(longOut, i);
}
}
// short: bcast(a) MUL (bcast(b) MUL (bcast(c) MUL array))
@Benchmark
public void reassociateShortMulChainedBroadcasts() {
for (int i = 0; i < sspecies.loopBound(size); i += sspecies.length()) {
ShortVector.broadcast(sspecies, (short) i)
.lanewise(VectorOperators.MUL,
ShortVector.broadcast(sspecies, (short) (i + 1))
.lanewise(VectorOperators.MUL,
ShortVector.broadcast(sspecies, (short) (i + 2))
.lanewise(VectorOperators.MUL,
ShortVector.fromArray(sspecies, shortIn1, i))))
.intoArray(shortOut, i);
}
}
// short: (bcast(a) MUL bcast(b)) MUL (bcast(c) MUL array)
@Benchmark
public void reassociateShortMulBalancedBroadcasts() {
for (int i = 0; i < sspecies.loopBound(size); i += sspecies.length()) {
ShortVector left =
ShortVector.broadcast(sspecies, (short) i)
.lanewise(VectorOperators.MUL,
ShortVector.broadcast(sspecies, (short) (i + 1)));
ShortVector right =
ShortVector.broadcast(sspecies, (short) (i + 2))
.lanewise(VectorOperators.MUL,
ShortVector.fromArray(sspecies, shortIn1, i));
left.lanewise(VectorOperators.MUL, right)
.intoArray(shortOut, i);
}
}
// byte: bcast(a) MUL (bcast(b) MUL (bcast(c) MUL array))
@Benchmark
public void reassociateByteMulChainedBroadcasts() {
for (int i = 0; i < bspecies.loopBound(size); i += bspecies.length()) {
ByteVector.broadcast(bspecies, (byte) i)
.lanewise(VectorOperators.MUL,
ByteVector.broadcast(bspecies, (byte) (i + 1))
.lanewise(VectorOperators.MUL,
ByteVector.broadcast(bspecies, (byte) (i + 2))
.lanewise(VectorOperators.MUL,
ByteVector.fromArray(bspecies, byteIn1, i))))
.intoArray(byteOut, i);
}
}
// byte: (bcast(a) MUL bcast(b)) MUL (bcast(c) MUL array)
@Benchmark
public void reassociateByteMulBalancedBroadcasts() {
for (int i = 0; i < bspecies.loopBound(size); i += bspecies.length()) {
ByteVector left =
ByteVector.broadcast(bspecies, (byte) i)
.lanewise(VectorOperators.MUL,
ByteVector.broadcast(bspecies, (byte) (i + 1)));
ByteVector right =
ByteVector.broadcast(bspecies, (byte) (i + 2))
.lanewise(VectorOperators.MUL,
ByteVector.fromArray(bspecies, byteIn1, i));
left.lanewise(VectorOperators.MUL, right)
.intoArray(byteOut, i);
}
}
}