8369448: C2 SuperWord: refactor VTransform to do move_unordered_reduction_out_of_loop during VTransform::optimize

Reviewed-by: chagedorn, kvn
2026-01-28 12:09:14 +00:00 · 2025-10-14 08:32:32 +00:00 · 2025-10-14 08:32:32 +00:00 · 4786f8bee5
commit 4786f8bee5
parent a3ee821f38
10 changed files with 402 additions and 341 deletions
--- a/src/hotspot/share/opto/loopnode.cpp
+++ b/src/hotspot/share/opto/loopnode.cpp
@ -5287,16 +5287,6 @@ void PhaseIdealLoop::build_and_optimize() {
    }
  }

-  // Move UnorderedReduction out of counted loop. Can be introduced by AutoVectorization.
-  if (C->has_loops() && !C->major_progress()) {
-    for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
-      IdealLoopTree* lpt = iter.current();
-      if (lpt->is_counted() && lpt->is_innermost()) {
-        move_unordered_reduction_out_of_loop(lpt);
-      }
-    }
-  }
-
  // Keep loop predicates and perform optimizations with them
  // until no more loop optimizations could be done.
  // After that switch predicates off and do more loop optimizations.
--- a/src/hotspot/share/opto/loopnode.hpp
+++ b/src/hotspot/share/opto/loopnode.hpp
@ -1550,9 +1550,6 @@ public:
  IfTrueNode* create_new_if_for_multiversion(IfTrueNode* multiversioning_fast_proj);
  bool try_resume_optimizations_for_delayed_slow_loop(IdealLoopTree* lpt);

-  // Move an unordered Reduction out of loop if possible
-  void move_unordered_reduction_out_of_loop(IdealLoopTree* loop);
-
  // Create a scheduled list of nodes control dependent on ctrl set.
  void scheduled_nodelist( IdealLoopTree *loop, VectorSet& ctrl, Node_List &sched );
  // Has a use in the vector set
--- a/src/hotspot/share/opto/loopopts.cpp
+++ b/src/hotspot/share/opto/loopopts.cpp
@ -4548,211 +4548,6 @@ void PhaseIdealLoop::maybe_multiversion_for_auto_vectorization_runtime_checks(Id
  do_multiversioning(lpt, old_new);
 }

-// Returns true if the Reduction node is unordered.
-static bool is_unordered_reduction(Node* n) {
-  return n->is_Reduction() && !n->as_Reduction()->requires_strict_order();
-}
-
-// Having ReductionNodes in the loop is expensive. They need to recursively
-// fold together the vector values, for every vectorized loop iteration. If
-// we encounter the following pattern, we can vector accumulate the values
-// inside the loop, and only have a single UnorderedReduction after the loop.
-//
-// Note: UnorderedReduction represents a ReductionNode which does not require
-// calculating in strict order.
-//
-// CountedLoop     init
-//          |        |
-//          +------+ | +-----------------------+
-//                 | | |                       |
-//                PhiNode (s)                  |
-//                  |                          |
-//                  |          Vector          |
-//                  |            |             |
-//               UnorderedReduction (first_ur) |
-//                  |                          |
-//                 ...         Vector          |
-//                  |            |             |
-//               UnorderedReduction (last_ur)  |
-//                       |                     |
-//                       +---------------------+
-//
-// We patch the graph to look like this:
-//
-// CountedLoop   identity_vector
-//         |         |
-//         +-------+ | +---------------+
-//                 | | |               |
-//                PhiNode (v)          |
-//                   |                 |
-//                   |         Vector  |
-//                   |           |     |
-//                 VectorAccumulator   |
-//                   |                 |
-//                  ...        Vector  |
-//                   |           |     |
-//      init       VectorAccumulator   |
-//        |          |     |           |
-//     UnorderedReduction  +-----------+
-//
-// We turned the scalar (s) Phi into a vectorized one (v). In the loop, we
-// use vector_accumulators, which do the same reductions, but only element
-// wise. This is a single operation per vector_accumulator, rather than many
-// for a UnorderedReduction. We can then reduce the last vector_accumulator
-// after the loop, and also reduce the init value into it.
-//
-// We can not do this with all reductions. Some reductions do not allow the
-// reordering of operations (for example float addition/multiplication require
-// strict order).
-void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) {
-  assert(!C->major_progress() && loop->is_counted() && loop->is_innermost(), "sanity");
-
-  // Find all Phi nodes with an unordered Reduction on backedge.
-  CountedLoopNode* cl = loop->_head->as_CountedLoop();
-  for (DUIterator_Fast jmax, j = cl->fast_outs(jmax); j < jmax; j++) {
-    Node* phi = cl->fast_out(j);
-    // We have a phi with a single use, and an unordered Reduction on the backedge.
-    if (!phi->is_Phi() || phi->outcnt() != 1 || !is_unordered_reduction(phi->in(2))) {
-      continue;
-    }
-
-    ReductionNode* last_ur = phi->in(2)->as_Reduction();
-    assert(!last_ur->requires_strict_order(), "must be");
-
-    // Determine types
-    const TypeVect* vec_t = last_ur->vect_type();
-    uint vector_length    = vec_t->length();
-    BasicType bt          = vec_t->element_basic_type();
-
-    // Convert opcode from vector-reduction -> scalar -> normal-vector-op
-    const int sopc        = VectorNode::scalar_opcode(last_ur->Opcode(), bt);
-    const int vopc        = VectorNode::opcode(sopc, bt);
-    if (!Matcher::match_rule_supported_vector(vopc, vector_length, bt)) {
-        DEBUG_ONLY( last_ur->dump(); )
-        assert(false, "do not have normal vector op for this reduction");
-        continue; // not implemented -> fails
-    }
-
-    // Traverse up the chain of unordered Reductions, checking that it loops back to
-    // the phi. Check that all unordered Reductions only have a single use, except for
-    // the last (last_ur), which only has phi as a use in the loop, and all other uses
-    // are outside the loop.
-    ReductionNode* current = last_ur;
-    ReductionNode* first_ur = nullptr;
-    while (true) {
-      assert(!current->requires_strict_order(), "sanity");
-
-      // Expect no ctrl and a vector_input from within the loop.
-      Node* ctrl = current->in(0);
-      Node* vector_input = current->in(2);
-      if (ctrl != nullptr || get_ctrl(vector_input) != cl) {
-        DEBUG_ONLY( current->dump(1); )
-        assert(false, "reduction has ctrl or bad vector_input");
-        break; // Chain traversal fails.
-      }
-
-      assert(current->vect_type() != nullptr, "must have vector type");
-      if (current->vect_type() != last_ur->vect_type()) {
-        // Reductions do not have the same vector type (length and element type).
-        break; // Chain traversal fails.
-      }
-
-      // Expect single use of an unordered Reduction, except for last_ur.
-      if (current == last_ur) {
-        // Expect all uses to be outside the loop, except phi.
-        for (DUIterator_Fast kmax, k = current->fast_outs(kmax); k < kmax; k++) {
-          Node* use = current->fast_out(k);
-          if (use != phi && ctrl_or_self(use) == cl) {
-            DEBUG_ONLY( current->dump(-1); )
-            assert(false, "reduction has use inside loop");
-            // Should not be allowed by SuperWord::mark_reductions
-            return; // bail out of optimization
-          }
-        }
-      } else {
-        if (current->outcnt() != 1) {
-          break; // Chain traversal fails.
-        }
-      }
-
-      // Expect another unordered Reduction or phi as the scalar input.
-      Node* scalar_input = current->in(1);
-      if (is_unordered_reduction(scalar_input) &&
-          scalar_input->Opcode() == current->Opcode()) {
-        // Move up the unordered Reduction chain.
-        current = scalar_input->as_Reduction();
-        assert(!current->requires_strict_order(), "must be");
-      } else if (scalar_input == phi) {
-        // Chain terminates at phi.
-        first_ur = current;
-        current = nullptr;
-        break; // Success.
-      } else {
-        // scalar_input is neither phi nor a matching reduction
-        // Can for example be scalar reduction when we have
-        // partial vectorization.
-        break; // Chain traversal fails.
-      }
-    }
-    if (current != nullptr) {
-      // Chain traversal was not successful.
-      continue;
-    }
-    assert(first_ur != nullptr, "must have successfully terminated chain traversal");
-
-    Node* identity_scalar = ReductionNode::make_identity_con_scalar(_igvn, sopc, bt);
-    set_root_as_ctrl(identity_scalar);
-    VectorNode* identity_vector = VectorNode::scalar2vector(identity_scalar, vector_length, bt);
-    register_new_node(identity_vector, C->root());
-    assert(vec_t == identity_vector->vect_type(), "matching vector type");
-    VectorNode::trace_new_vector(identity_vector, "Unordered Reduction");
-
-    // Turn the scalar phi into a vector phi.
-    _igvn.rehash_node_delayed(phi);
-    Node* init = phi->in(1); // Remember init before replacing it.
-    phi->set_req_X(1, identity_vector, &_igvn);
-    phi->as_Type()->set_type(vec_t);
-    _igvn.set_type(phi, vec_t);
-
-    // Traverse down the chain of unordered Reductions, and replace them with vector_accumulators.
-    current = first_ur;
-    while (true) {
-      // Create vector_accumulator to replace current.
-      Node* last_vector_accumulator = current->in(1);
-      Node* vector_input            = current->in(2);
-      VectorNode* vector_accumulator = VectorNode::make(vopc, last_vector_accumulator, vector_input, vec_t);
-      register_new_node(vector_accumulator, cl);
-      _igvn.replace_node(current, vector_accumulator);
-      VectorNode::trace_new_vector(vector_accumulator, "Unordered Reduction");
-      if (current == last_ur) {
-        break;
-      }
-      current = vector_accumulator->unique_out()->as_Reduction();
-      assert(!current->requires_strict_order(), "must be");
-    }
-
-    // Create post-loop reduction.
-    Node* last_accumulator = phi->in(2);
-    Node* post_loop_reduction = ReductionNode::make(sopc, nullptr, init, last_accumulator, bt);
-
-    // Take over uses of last_accumulator that are not in the loop.
-    for (DUIterator i = last_accumulator->outs(); last_accumulator->has_out(i); i++) {
-      Node* use = last_accumulator->out(i);
-      if (use != phi && use != post_loop_reduction) {
-        assert(ctrl_or_self(use) != cl, "use must be outside loop");
-        use->replace_edge(last_accumulator, post_loop_reduction,  &_igvn);
-        --i;
-      }
-    }
-    register_new_node(post_loop_reduction, get_late_ctrl(post_loop_reduction, cl));
-    VectorNode::trace_new_vector(post_loop_reduction, "Unordered Reduction");
-
-    assert(last_accumulator->outcnt() == 2, "last_accumulator has 2 uses: phi and post_loop_reduction");
-    assert(post_loop_reduction->outcnt() > 0, "should have taken over all non loop uses of last_accumulator");
-    assert(phi->outcnt() == 1, "accumulator is the only use of phi");
-  }
-}
-
 void DataNodeGraph::clone_data_nodes(Node* new_ctrl) {
  for (uint i = 0; i < _data_nodes.size(); i++) {
    clone(_data_nodes[i], new_ctrl);
--- a/src/hotspot/share/opto/superword.cpp
+++ b/src/hotspot/share/opto/superword.cpp
@ -1606,7 +1606,7 @@ bool SuperWord::implemented(const Node_List* pack, const uint size) const {
      // 3 instructions (1 shuffle and two reduction ops).
      // However, this optimization assumes that these reductions stay in the loop
      // which may not be true any more in most cases after the introduction of:
-      // PhaseIdealLoop::move_unordered_reduction_out_of_loop
+      // See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop
      // Hence, this heuristic has room for improvement.
      bool is_two_element_int_or_long_reduction = (size == 2) &&
                                                  (arith_type->basic_type() == T_INT ||
@ -1782,7 +1782,7 @@ bool SuperWord::profitable(const Node_List* p) const {
      // This heuristic is a bit simplistic, and assumes that the reduction
      // vector stays in the loop. But in some cases, we can move the
      // reduction out of the loop, replacing it with a single vector op.
-      // See: PhaseIdealLoop::move_unordered_reduction_out_of_loop
+      // See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop
      // Hence, this heuristic has room for improvement.
 #ifndef PRODUCT
        if (is_trace_superword_rejections()) {
@ -1947,6 +1947,8 @@ bool SuperWord::do_vtransform() const {
    SuperWordVTransformBuilder builder(_packset, vtransform);
  }

+  vtransform.optimize();
+
  if (!vtransform.schedule()) { return false; }
  if (vtransform.has_store_to_load_forwarding_failure()) { return false; }

--- a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp
+++ b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp
@ -45,10 +45,11 @@
  flags(SW_PACKSET,                 "Trace SuperWord packset at different stages") \
  flags(SW_INFO,                    "Trace SuperWord info (equivalent to TraceSuperWord)") \
  flags(SW_VERBOSE,                 "Trace SuperWord verbose (all SW tags enabled)") \
+  flags(VTRANSFORM,                 "Trace VTransform Graph") \
+  flags(OPTIMIZATION,               "Trace VTransform::optimize") \
  flags(ALIGN_VECTOR,               "Trace AlignVector") \
  flags(SPECULATIVE_ALIASING_ANALYSIS, "Trace Speculative Aliasing Analysis") \
  flags(SPECULATIVE_RUNTIME_CHECKS, "Trace VTransform::apply_speculative_runtime_checks") \
-  flags(VTRANSFORM,                 "Trace VTransform Graph") \
  flags(ALL,                        "Trace everything (very verbose)")

 #define table_entry(name, description) name,
--- a/src/hotspot/share/opto/vectorization.hpp
+++ b/src/hotspot/share/opto/vectorization.hpp
@ -205,6 +205,10 @@ public:
    return _vtrace.is_trace(TraceAutoVectorizationTag::POINTERS);
  }

+  bool is_trace_optimization() const {
+    return _vtrace.is_trace(TraceAutoVectorizationTag::OPTIMIZATION);
+  }
+
  bool is_trace_speculative_runtime_checks() const {
    return _vtrace.is_trace(TraceAutoVectorizationTag::SPECULATIVE_RUNTIME_CHECKS);
  }
--- a/src/hotspot/share/opto/vectornode.cpp
+++ b/src/hotspot/share/opto/vectornode.cpp
@ -292,121 +292,6 @@ int VectorNode::opcode(int sopc, BasicType bt) {
  }
 }

-// Return the scalar opcode for the specified vector opcode
-// and basic type.
-int VectorNode::scalar_opcode(int sopc, BasicType bt) {
-  switch (sopc) {
-    case Op_AddReductionVI:
-    case Op_AddVI:
-      return Op_AddI;
-    case Op_AddReductionVL:
-    case Op_AddVL:
-      return Op_AddL;
-    case Op_MulReductionVI:
-    case Op_MulVI:
-      return Op_MulI;
-    case Op_MulReductionVL:
-    case Op_MulVL:
-      return Op_MulL;
-    case Op_AndReductionV:
-    case Op_AndV:
-      switch (bt) {
-        case T_BOOLEAN:
-        case T_CHAR:
-        case T_BYTE:
-        case T_SHORT:
-        case T_INT:
-          return Op_AndI;
-        case T_LONG:
-          return Op_AndL;
-        default:
-          assert(false, "basic type not handled");
-          return 0;
-      }
-    case Op_OrReductionV:
-    case Op_OrV:
-      switch (bt) {
-        case T_BOOLEAN:
-        case T_CHAR:
-        case T_BYTE:
-        case T_SHORT:
-        case T_INT:
-          return Op_OrI;
-        case T_LONG:
-          return Op_OrL;
-        default:
-          assert(false, "basic type not handled");
-          return 0;
-      }
-    case Op_XorReductionV:
-    case Op_XorV:
-      switch (bt) {
-        case T_BOOLEAN:
-        case T_CHAR:
-        case T_BYTE:
-        case T_SHORT:
-        case T_INT:
-          return Op_XorI;
-        case T_LONG:
-          return Op_XorL;
-        default:
-          assert(false, "basic type not handled");
-          return 0;
-      }
-    case Op_MinReductionV:
-    case Op_MinV:
-      switch (bt) {
-        case T_BOOLEAN:
-        case T_CHAR:
-          assert(false, "boolean and char are signed, not implemented for Min");
-          return 0;
-        case T_BYTE:
-        case T_SHORT:
-        case T_INT:
-          return Op_MinI;
-        case T_LONG:
-          return Op_MinL;
-        case T_FLOAT:
-          return Op_MinF;
-        case T_DOUBLE:
-          return Op_MinD;
-        default:
-          assert(false, "basic type not handled");
-          return 0;
-      }
-    case Op_MaxReductionV:
-    case Op_MaxV:
-      switch (bt) {
-        case T_BOOLEAN:
-        case T_CHAR:
-          assert(false, "boolean and char are signed, not implemented for Max");
-          return 0;
-        case T_BYTE:
-        case T_SHORT:
-        case T_INT:
-          return Op_MaxI;
-        case T_LONG:
-          return Op_MaxL;
-        case T_FLOAT:
-          return Op_MaxF;
-        case T_DOUBLE:
-          return Op_MaxD;
-        default:
-          assert(false, "basic type not handled");
-          return 0;
-      }
-    case Op_MinVHF:
-      return Op_MinHF;
-    case Op_MaxVHF:
-      return Op_MaxHF;
-    default:
-      assert(false,
-             "Vector node %s is not handled in VectorNode::scalar_opcode",
-             NodeClassNames[sopc]);
-      return 0; // Unimplemented
-  }
-}
-
 // Limits on vector size (number of elements) for auto-vectorization.
 bool VectorNode::vector_size_supported_auto_vectorization(const BasicType bt, int size) {
  return Matcher::max_vector_size_auto_vectorization(bt) >= size &&
@ -1727,6 +1612,34 @@ bool ReductionNode::implemented(int opc, uint vlen, BasicType bt) {
  return false;
 }

+bool ReductionNode::auto_vectorization_requires_strict_order(int vopc) {
+  switch (vopc) {
+    // Floating-point addition and multiplication are not associative.
+    // Auto-vectorization must therefore not reorder these reductions,
+    // and they have to be computed in strict order.
+    case Op_AddReductionVF:
+    case Op_AddReductionVD:
+    case Op_MulReductionVF:
+    case Op_MulReductionVD:
+      return true;
+    // All operations below are associative. Their reductions may be
+    // reordered, so no strict order is required for them.
+    case Op_AddReductionVI:
+    case Op_AddReductionVL:
+    case Op_MulReductionVI:
+    case Op_MulReductionVL:
+    case Op_MinReductionV:
+    case Op_MaxReductionV:
+    case Op_AndReductionV:
+    case Op_OrReductionV:
+    case Op_XorReductionV:
+      return false;
+    default:
+      assert(false, "not handled: %s", NodeClassNames[vopc]);
+      return true;
+  }
+}
+
 MacroLogicVNode* MacroLogicVNode::make(PhaseGVN& gvn, Node* in1, Node* in2, Node* in3,
                                       Node* mask, uint truth_table, const TypeVect* vt) {
  assert(truth_table <= 0xFF, "invalid");
--- a/src/hotspot/share/opto/vectornode.hpp
+++ b/src/hotspot/share/opto/vectornode.hpp
@ -95,7 +95,6 @@ class VectorNode : public TypeNode {
  static bool is_rotate_opcode(int opc);

  static int opcode(int sopc, BasicType bt);         // scalar_opc -> vector_opc
-  static int scalar_opcode(int vopc, BasicType bt);  // vector_opc -> scalar_opc

  static int shift_count_opcode(int opc);

@ -283,6 +282,8 @@ class ReductionNode : public Node {
    return false;
  }

+  static bool auto_vectorization_requires_strict_order(int vopc);
+
 #ifndef PRODUCT
  void dump_spec(outputStream* st) const {
    if (requires_strict_order()) {
--- a/src/hotspot/share/opto/vtransform.cpp
+++ b/src/hotspot/share/opto/vtransform.cpp
@ -23,6 +23,7 @@

 #include "opto/castnode.hpp"
 #include "opto/convertnode.hpp"
+#include "opto/rootnode.hpp"
 #include "opto/vectorization.hpp"
 #include "opto/vectornode.hpp"
 #include "opto/vtransform.hpp"
@ -32,6 +33,45 @@ void VTransformGraph::add_vtnode(VTransformNode* vtnode) {
  _vtnodes.push(vtnode);
 }

+#define TRACE_OPTIMIZE(code)                          \
+  NOT_PRODUCT(                                        \
+    if (vtransform.vloop().is_trace_optimization()) { \
+      code                                            \
+    }                                                 \
+  )
+
+// This is similar to IGVN optimization. But we are a bit lazy, and don't care about
+// notification / worklist, since the list of nodes is rather small, and we don't
+// expect optimizations that trickle over the whole graph.
+void VTransformGraph::optimize(VTransform& vtransform) {
+  TRACE_OPTIMIZE( tty->print_cr("\nVTransformGraph::optimize"); )
+
+  // Sweep over all nodes again and again, until a sweep changes nothing.
+  DEBUG_ONLY(int pass_count = 0;)
+  bool made_progress;
+  do {
+    made_progress = false;
+    assert(++pass_count < 10, "ensure we do not have endless loops");
+    for (int idx = 0; idx < _vtnodes.length(); idx++) {
+      VTransformNode* node = _vtnodes.at(idx);
+      if (!node->is_alive()) { continue; }
+      if (node->optimize(_vloop_analyzer, vtransform)) { made_progress = true; }
+
+      // A node that still has a use stays alive.
+      if (node->out_strong_edges() != 0) { continue; }
+      // Memory phi uses are not modeled, so the phi only appears to be unused.
+      if (node->isa_LoopPhi() != nullptr) { continue; }
+      // Likewise, the memory uses of some stores may not be modeled.
+      if (node->is_load_or_store_in_loop()) { continue; }
+      // An Outer node with strong inputs is a use after the loop.
+      if (node->isa_Outer() != nullptr && node->has_strong_in_edge()) { continue; }
+      // Everything else without a use is dead.
+      node->mark_dead();
+      made_progress = true;
+    }
+  } while (made_progress);
+}
+
 // Compute a linearization of the graph. We do this with a reverse-post-order of a DFS.
 // This only works if the graph is a directed acyclic graph (DAG). The C2 graph, and
 // the VLoopDependencyGraph are both DAGs, but after introduction of vectors/packs, the
@ -59,10 +99,11 @@ bool VTransformGraph::schedule() {
  VectorSet post_visited;

  collect_nodes_without_strong_in_edges(stack);
+  const int num_alive_nodes = count_alive_vtnodes();

  // We create a reverse-post-visit order. This gives us a linearization, if there are
  // no cycles. Then, we simply reverse the order, and we have a schedule.
-  int rpo_idx = _vtnodes.length() - 1;
+  int rpo_idx = num_alive_nodes - 1;
  while (!stack.is_empty()) {
    VTransformNode* vtn = stack.top();
    if (!pre_visited.test_set(vtn->_idx)) {
@ -79,6 +120,9 @@ bool VTransformGraph::schedule() {
      for (uint i = 0; i < vtn->out_strong_edges(); i++) {
        VTransformNode* use = vtn->out_strong_edge(i);

+        // Skip dead nodes
+        if (!use->is_alive()) { continue; }
+
        // Skip LoopPhi backedge.
        if ((use->isa_LoopPhi() != nullptr || use->isa_CountedLoop() != nullptr) && use->in_req(2) == vtn) { continue; }

@ -121,6 +165,7 @@ bool VTransformGraph::schedule() {
 void VTransformGraph::collect_nodes_without_strong_in_edges(GrowableArray<VTransformNode*>& stack) const {
  for (int i = 0; i < _vtnodes.length(); i++) {
    VTransformNode* vtn = _vtnodes.at(i);
+    if (!vtn->is_alive()) { continue; }
    if (!vtn->has_strong_in_edge()) {
      stack.push(vtn);
    }
@ -132,6 +177,15 @@ void VTransformGraph::collect_nodes_without_strong_in_edges(GrowableArray<VTrans
  }
 }

+// Count the vtnodes for which is_alive() holds.
+int VTransformGraph::count_alive_vtnodes() const {
+  int num_alive = 0;
+  for (int idx = 0; idx < _vtnodes.length(); idx++) {
+    if (_vtnodes.at(idx)->is_alive()) { num_alive++; }
+  }
+  return num_alive;
+}
+
 #ifndef PRODUCT
 void VTransformGraph::trace_schedule_cycle(const GrowableArray<VTransformNode*>& stack,
                                           const VectorSet& pre_visited,
@ -801,6 +855,13 @@ VTransformApplyResult VTransformLoopPhiNode::apply(VTransformApplyState& apply_s
  phase->igvn().replace_input_of(_node, 0, in0);
  phase->igvn().replace_input_of(_node, 1, in1);
  // Note: the backedge is hooked up later.
+
+  // The Phi's inputs may have been modified, and with them its type,
+  // e.g. from scalar to vector.
+  const Type* t = in1->bottom_type();
+  _node->as_Type()->set_type(t);
+  phase->igvn().set_type(_node, t);
+
  return VTransformApplyResult::make_scalar(_node);
 }

@ -939,6 +1000,242 @@ VTransformApplyResult VTransformBoolVectorNode::apply(VTransformApplyState& appl
  return VTransformApplyResult::make_vector(vn);
 }

+bool VTransformReductionVectorNode::optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) {
+  return optimize_move_non_strict_order_reductions_out_of_loop(vloop_analyzer, vtransform);
+}
+
+int VTransformReductionVectorNode::vector_reduction_opcode() const {
+  return ReductionNode::opcode(scalar_opcode(), element_basic_type());
+}
+
+// True if auto vectorization must compute this reduction in strict order.
+bool VTransformReductionVectorNode::requires_strict_order() const {
+  return ReductionNode::auto_vectorization_requires_strict_order(vector_reduction_opcode());
+}
+
+// Having ReductionNodes in the loop is expensive. They need to recursively
+// fold together the vector values, for every vectorized loop iteration. If
+// we encounter the following pattern, we can vector accumulate the values
+// inside the loop, and only have a single UnorderedReduction after the loop.
+//
+// Note: UnorderedReduction represents a ReductionNode which does not require
+// calculating in strict order.
+//
+// CountedLoop     init
+//          |        |
+//          +------+ | +------------------------+
+//                 | | |                        |
+//                PhiNode (s)                   |
+//                  |                           |
+//                  |          Vector           |
+//                  |            |              |
+//               UnorderedReduction (first_red) |
+//                  |                           |
+//                 ...         Vector           |
+//                  |            |              |
+//               UnorderedReduction (last_red)  |
+//                       |                      |
+//                       +----------------------+
+//
+// We patch the graph to look like this:
+//
+// CountedLoop   identity_vector
+//         |         |
+//         +-------+ | +---------------+
+//                 | | |               |
+//                PhiNode (v)          |
+//                   |                 |
+//                   |         Vector  |
+//                   |           |     |
+//                 VectorAccumulator   |
+//                   |                 |
+//                  ...        Vector  |
+//                   |           |     |
+//      init       VectorAccumulator   |
+//        |          |     |           |
+//     UnorderedReduction  +-----------+
+//
+// We turned the scalar (s) Phi into a vectorized one (v). In the loop, we
+// use vector_accumulators, which do the same reductions, but only element
+// wise. This is a single operation per vector_accumulator, rather than many
+// for a UnorderedReduction. We can then reduce the last vector_accumulator
+// after the loop, and also reduce the init value into it.
+//
+// We can not do this with all reductions. Some reductions do not allow the
+// reordering of operations (for example float addition/multiplication require
+// strict order).
+//
+// Note: we must perform this optimization already during auto vectorization,
+//       before we evaluate the cost-model. Without this optimization, we may
+//       still have expensive reduction nodes in the loop which can make
+//       vectorization unprofitable. Only with the optimization does vectorization
+//       become profitable, since the expensive reduction node is moved
+//       outside the loop, and instead cheaper element-wise vector accumulations
+//       are performed inside the loop.
+bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop_preconditions(VTransform& vtransform) {
+  // Checks only, the graph is not modified. Returns true iff the chain of
+  // reductions starting at this node can be moved out of the loop.
+  // This node must be fed by a loop phi that has a single use.
+  VTransformLoopPhiNode* phi = in_req(1)->isa_LoopPhi();
+  if (phi == nullptr) { return false; }
+  if (phi->out_strong_edges() != 1) {
+    TRACE_OPTIMIZE(
+      tty->print("  Cannot move out of loop, phi has multiple uses:");
+      print();
+      tty->print("  phi: ");
+      phi->print();
+    )
+    return false;
+  }
+
+  if (requires_strict_order()) {
+    TRACE_OPTIMIZE(
+      tty->print("  Cannot move out of loop, strict order required: ");
+      print();
+    )
+    return false;
+  }
+
+  const int sopc     = scalar_opcode();
+  const uint vlen    = vector_length();
+  const BasicType bt = element_basic_type();
+  const int ropc     = vector_reduction_opcode();
+  const int vopc     = VectorNode::opcode(sopc, bt);
+  if (!Matcher::match_rule_supported_vector(vopc, vlen, bt)) {
+    DEBUG_ONLY( this->print(); )
+    assert(false, "do not have normal vector op for this reduction");
+    return false; // not implemented
+  }
+
+  // Traverse up the chain of non strict order reductions, checking that it loops
+  // back to the phi. Check that all non strict order reductions only have a single
+  // use, except for the last (last_red), which only has phi as a use in the loop,
+  // and all other uses are outside the loop.
+  // The walk starts at the backedge input of the phi, which need not be a reduction.
+  VTransformReductionVectorNode* last_red    = phi->in_req(2)->isa_ReductionVector();
+  VTransformReductionVectorNode* current_red = last_red;
+  while (true) {
+    if (current_red == nullptr ||
+        current_red->vector_reduction_opcode() != ropc ||
+        current_red->element_basic_type() != bt ||
+        current_red->vector_length() != vlen) {
+      TRACE_OPTIMIZE(
+        tty->print("  Cannot move out of loop, other reduction node does not match:");
+        print();
+        tty->print("  other: ");
+        if (current_red != nullptr) { current_red->print(); } else { tty->print_cr("nullptr"); }
+      )
+      return false; // not compatible
+    }
+
+    VTransformVectorNode* vector_input = current_red->in_req(2)->isa_Vector();
+    if (vector_input == nullptr) {
+      assert(false, "reduction has a bad vector input");
+      return false;
+    }
+
+    // Expect single use of the non strict order reduction. Except for the last_red.
+    if (current_red == last_red) {
+      // All uses must be outside loop body, except for the phi.
+      for (uint i = 0; i < current_red->out_strong_edges(); i++) {
+        VTransformNode* use = current_red->out_strong_edge(i);
+        if (use->isa_LoopPhi() == nullptr &&
+            use->isa_Outer() == nullptr) {
+          // Should not be allowed by SuperWord::mark_reductions
+          assert(false, "reduction has use inside loop");
+          return false;
+        }
+      }
+    } else {
+      if (current_red->out_strong_edges() != 1) {
+        TRACE_OPTIMIZE(
+          tty->print("  Cannot move out of loop, other reduction node has use outside loop:");
+          print();
+          tty->print("  other: ");
+          current_red->print();
+        )
+        return false; // Only single use allowed
+      }
+    }
+
+    // If the scalar input is a phi, we passed all checks.
+    VTransformNode* scalar_input = current_red->in_req(1);
+    if (scalar_input == phi) {
+      break;
+    }
+
+    // We expect another non strict reduction, verify it in the next iteration.
+    current_red = scalar_input->isa_ReductionVector();
+  }
+  return true; // success
+}
+
+bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) {
+  // Bail out without touching the graph unless every precondition holds.
+  const bool can_move = optimize_move_non_strict_order_reductions_out_of_loop_preconditions(vtransform);
+  if (!can_move) { return false; }
+
+  // From here on we only edit the vtransform graph.
+  TRACE_OPTIMIZE(
+    tty->print_cr("VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop");
+  )
+
+  const int scalar_opc      = scalar_opcode();
+  const uint num_lanes      = vector_length();
+  const BasicType elem_bt   = element_basic_type();
+  const int elementwise_opc = VectorNode::opcode(scalar_opc, elem_bt);
+  PhaseIdealLoop* loop_phase = vloop_analyzer.vloop().phase();
+
+  // Replicate the scalar identity constant of the operation into a vector.
+  Node* identity_con = ReductionNode::make_identity_con_scalar(loop_phase->igvn(), scalar_opc, elem_bt);
+  loop_phase->set_root_as_ctrl(identity_con);
+  VTransformNode* identity_scalar = new (vtransform.arena()) VTransformOuterNode(vtransform, identity_con);
+
+  VTransformNode* identity_vector = new (vtransform.arena()) VTransformReplicateNode(vtransform, num_lanes, elem_bt);
+  identity_vector->init_req(1, identity_scalar);
+
+  // The loop phi now carries a vector: it starts from the identity vector.
+  // Remember the old scalar init value, it is reduced in after the loop.
+  VTransformLoopPhiNode* phi = in_req(1)->isa_LoopPhi();
+  VTransformNode* scalar_init = phi->in_req(1);
+  phi->set_req(1, identity_vector);
+
+  // Walk down the chain, starting at this node, and create one element-wise
+  // vector accumulator for every reduction, up to and including the last one.
+  VTransformReductionVectorNode* const last_red = phi->in_req(2)->isa_ReductionVector();
+  VTransformNode* accumulator = phi;
+  for (VTransformReductionVectorNode* red = this; ; red = red->unique_out_strong_edge()->isa_ReductionVector()) {
+    VTransformNode* vector_input = red->in_req(2);
+    VTransformVectorNode* elementwise = new (vtransform.arena()) VTransformElementWiseVectorNode(vtransform, 3, red->properties(), elementwise_opc);
+    elementwise->init_req(1, accumulator);
+    elementwise->init_req(2, vector_input);
+    TRACE_OPTIMIZE(
+      tty->print("  replace    ");
+      red->print();
+      tty->print("  with       ");
+      elementwise->print();
+    )
+    accumulator = elementwise;
+    if (red == last_red) { break; }
+  }
+
+  // The last accumulator becomes the new backedge value of the phi.
+  phi->set_req(2, accumulator);
+
+  // last_red keeps its uses outside the loop and becomes the post-loop
+  // reduction: it combines the old init value with the last accumulator.
+  last_red->set_req(1, scalar_init);
+  last_red->set_req(2, accumulator);
+
+  TRACE_OPTIMIZE(
+    tty->print("  phi        ");
+    phi->print();
+    tty->print("  after loop ");
+    last_red->print();
+  )
+  return true; // success
+}
+
 VTransformApplyResult VTransformReductionVectorNode::apply(VTransformApplyState& apply_state) const {
  Node* init = apply_state.transformed_node(in_req(1));
  Node* vec  = apply_state.transformed_node(in_req(2));
@ -1041,7 +1338,7 @@ void VTransformNode::print() const {
      print_node_idx(_in.at(i));
    }
  }
-  tty->print(") [");
+  tty->print(") %s[", _is_alive ? "" : "dead ");
  for (uint i = 0; i < _out_end_strong_edges; i++) {
    print_node_idx(_out.at(i));
  }
--- a/src/hotspot/share/opto/vtransform.hpp
+++ b/src/hotspot/share/opto/vtransform.hpp
@ -41,7 +41,11 @@
 // - Construction:
 //   - From SuperWord PackSet, with the SuperWordVTransformBuilder.
 //
-// - Future Plans: optimize, if-conversion, etc.
+// - Optimize:
+//   - Move non-strict order reductions out of the loop. This means we have
+//     only element-wise operations inside the loop, rather than the much
+//     more expensive lane-crossing reductions. We need to do this before
+//     assessing profitability with the cost-model.
 //
 // - Schedule:
 //   - Compute linearization of the VTransformGraph, into an order that respects
@ -62,12 +66,12 @@
 //
 // Future Plans with VTransform:
 // - Cost model: estimate if vectorization is profitable.
-// - Optimizations: moving unordered reductions out of the loop, whih decreases cost.
 // - Pack/Unpack/Shuffle: introduce additional nodes not present in the scalar loop.
 //                        This is difficult to do with the SuperWord packset approach.
 // - If-conversion: convert predicated nodes into CFG.

 typedef int VTransformNodeIDX;
+class VTransform;
 class VTransformNode;
 class VTransformMemopScalarNode;
 class VTransformDataScalarNode;
@ -183,6 +187,7 @@ public:
  const GrowableArray<VTransformNode*>& vtnodes() const { return _vtnodes; }
  const GrowableArray<VTransformNode*>& get_schedule() const { return _schedule; }

+  void optimize(VTransform& vtransform);
  bool schedule();
  bool has_store_to_load_forwarding_failure(const VLoopAnalyzer& vloop_analyzer) const;
  void apply_vectorization_for_each_vtnode(uint& max_vector_length, uint& max_vector_width) const;
@ -194,6 +199,7 @@ private:
  bool in_bb(const Node* n)   const { return _vloop.in_bb(n); }

  void collect_nodes_without_strong_in_edges(GrowableArray<VTransformNode*>& stack) const;
+  int count_alive_vtnodes() const;

 #ifndef PRODUCT
  void print_vtnodes() const;
@ -239,10 +245,12 @@ public:
    _aw_for_main_loop_alignment(aw_for_main_loop_alignment) {}

  const VLoopAnalyzer& vloop_analyzer() const { return _vloop_analyzer; }
+  const VLoop& vloop() const { return _vloop; }
  Arena* arena() { return &_arena; }
  DEBUG_ONLY( bool has_graph() const { return !_graph.is_empty(); } )
  VTransformGraph& graph() { return _graph; }

+  void optimize() { return _graph.optimize(*this); } // Delegates to the graph's optimize(), passing this VTransform along.
  bool schedule() { return _graph.schedule(); }
  bool has_store_to_load_forwarding_failure() const { return _graph.has_store_to_load_forwarding_failure(_vloop_analyzer); }
  void apply();
@ -372,6 +380,8 @@ public:
  const VTransformNodeIDX _idx;

 private:
+  bool _is_alive;
+
  // We split _in into 3 sections:
  // - data edges (req):     _in[0                           .. _req-1]
  // - strong memory edges:  _in[_req                        .. _in_end_strong_memory_edges-1]
@ -389,6 +399,7 @@ private:
 public:
  VTransformNode(VTransform& vtransform, const uint req) :
    _idx(vtransform.graph().new_idx()),
+    _is_alive(true),
    _req(req),
    _in_end_strong_memory_edges(req),
    _in(vtransform.arena(),  req, req, nullptr),
@ -405,6 +416,14 @@ public:
    n->add_out_strong_edge(this);
  }

+  void set_req(uint i, VTransformNode* n) { // Replace req (data) input i with n; n may be nullptr to clear the slot.
+    assert(i < _req, "must be a req");
+    VTransformNode* old = _in.at(i);
+    if (old != nullptr) { old->del_out_strong_edge(this); } // Unregister this node from the old input's strong out-edges.
+    _in.at_put(i, n);
+    if (n != nullptr) { n->add_out_strong_edge(this); } // Register this node as a strong use of the new input.
+  }
+
  void swap_req(uint i, uint j) {
    assert(i < _req, "must be a req");
    assert(j < _req, "must be a req");
@ -452,6 +471,23 @@ private:
    _out.push(n);
  }

+  void del_out_strong_edge(VTransformNode* n) { // Remove one strong out-edge to n; strong edges stay at the front of _out, but edge order is not preserved.
+    int i = _out.find(n);
+    assert(0 <= i && i < (int)_out_end_strong_edges, "must be in strong edges");
+
+    // Replace n with the last strong edge.
+    VTransformNode* last_strong = _out.at(_out_end_strong_edges - 1);
+    _out.at_put(i, last_strong);
+
+    if (_out_end_strong_edges < (uint)_out.length()) { // There are edges stored after the strong section.
+      // Now replace where last_strong was with the last weak edge.
+      VTransformNode* last_weak = _out.top();
+      _out.at_put(_out_end_strong_edges - 1, last_weak);
+    }
+    _out.pop(); // The last slot now holds a duplicate (or n itself), so drop it.
+    _out_end_strong_edges--; // The strong section shrank by one.
+  }
+
 public:
  uint req() const { return _req; }
  uint out_strong_edges() const { return _out_end_strong_edges; }
@ -479,6 +515,21 @@ public:
    return false;
  }

+  VTransformNode* unique_out_strong_edge() const { // Returns the single strong out-edge (use) of this node.
+    assert(out_strong_edges() == 1, "must be unique"); // Precondition: exactly one strong out-edge exists.
+    return _out.at(0);
+  }
+
+  bool is_alive() const { return _is_alive; } // Set to true by the constructor; cleared by mark_dead().
+
+  void mark_dead() { // Flag this node as dead and detach it from all of its req inputs.
+    _is_alive = false;
+    // Remove all req inputs. Only slots [0, req()) are cleared here.
+    for (uint i = 0; i < req(); i++) {
+      set_req(i, nullptr); // Also removes this node from the old input's strong out-edges.
+    }
+  }
+
  virtual VTransformMemopScalarNode* isa_MemopScalar() { return nullptr; }
  virtual VTransformLoopPhiNode* isa_LoopPhi() { return nullptr; }
  virtual VTransformCountedLoopNode* isa_CountedLoop() { return nullptr; }
@ -496,6 +547,8 @@ public:
  virtual bool is_load_or_store_in_loop() const { return false; }
  virtual const VPointer& vpointer() const { ShouldNotReachHere(); }

+  virtual bool optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) { return false; } // Default: no optimization applied, returns false. Overridden by VTransformReductionVectorNode.
+
  virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const = 0;
  virtual void apply_backedge(VTransformApplyState& apply_state) const {};
  void apply_vtn_inputs_to_node(Node* n, VTransformApplyState& apply_state) const;
@ -701,6 +754,7 @@ public:
  NOT_PRODUCT(virtual void print_spec() const override;)

 protected:
+  const VTransformVectorNodeProperties& properties() const { return _properties; }
  Node* approximate_origin()     const { return _properties.approximate_origin(); }
  int scalar_opcode()            const { return _properties.scalar_opcode(); }
  uint vector_length()           const { return _properties.vector_length(); }
@ -780,8 +834,15 @@ public:
  VTransformReductionVectorNode(VTransform& vtransform, const VTransformVectorNodeProperties properties) :
    VTransformVectorNode(vtransform, 3, properties) {}
  virtual VTransformReductionVectorNode* isa_ReductionVector() override { return this; }
+  virtual bool optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) override;
  virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
  NOT_PRODUCT(virtual const char* name() const override { return "ReductionVector"; };)
+
+private:
+  int vector_reduction_opcode() const;
+  bool requires_strict_order() const;
+  bool optimize_move_non_strict_order_reductions_out_of_loop_preconditions(VTransform& vtransform);
+  bool optimize_move_non_strict_order_reductions_out_of_loop(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform);
 };

 class VTransformMemVectorNode : public VTransformVectorNode {