From 4786f8bee5c79c1bcf652758a25360b4d308ce1c Mon Sep 17 00:00:00 2001
From: Emanuel Peter <epeter@openjdk.org>
Date: Tue, 14 Oct 2025 08:32:32 +0000
Subject: [PATCH] 8369448: C2 SuperWord: refactor VTransform to do
 move_unordered_reduction_out_of_loop during VTransform::optimize

Reviewed-by: chagedorn, kvn
---
 src/hotspot/share/opto/loopnode.cpp           |  10 -
 src/hotspot/share/opto/loopnode.hpp           |   3 -
 src/hotspot/share/opto/loopopts.cpp           | 205 ------------
 src/hotspot/share/opto/superword.cpp          |   6 +-
 .../share/opto/traceAutoVectorizationTag.hpp  |   3 +-
 src/hotspot/share/opto/vectorization.hpp      |   4 +
 src/hotspot/share/opto/vectornode.cpp         | 143 ++-------
 src/hotspot/share/opto/vectornode.hpp         |   3 +-
 src/hotspot/share/opto/vtransform.cpp         | 301 +++++++++++++++++-
 src/hotspot/share/opto/vtransform.hpp         |  65 +++-
 10 files changed, 402 insertions(+), 341 deletions(-)

diff --git a/src/hotspot/share/opto/loopnode.cpp b/src/hotspot/share/opto/loopnode.cpp
index a3e3be66583..4cb1862cbb9 100644
--- a/src/hotspot/share/opto/loopnode.cpp
+++ b/src/hotspot/share/opto/loopnode.cpp
@@ -5287,16 +5287,6 @@ void PhaseIdealLoop::build_and_optimize() {
     }
   }
 
-  // Move UnorderedReduction out of counted loop. Can be introduced by AutoVectorization.
-  if (C->has_loops() && !C->major_progress()) {
-    for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
-      IdealLoopTree* lpt = iter.current();
-      if (lpt->is_counted() && lpt->is_innermost()) {
-        move_unordered_reduction_out_of_loop(lpt);
-      }
-    }
-  }
-
   // Keep loop predicates and perform optimizations with them
   // until no more loop optimizations could be done.
   // After that switch predicates off and do more loop optimizations.
diff --git a/src/hotspot/share/opto/loopnode.hpp b/src/hotspot/share/opto/loopnode.hpp
index 2645df86d96..1101de81595 100644
--- a/src/hotspot/share/opto/loopnode.hpp
+++ b/src/hotspot/share/opto/loopnode.hpp
@@ -1550,9 +1550,6 @@ public:
   IfTrueNode* create_new_if_for_multiversion(IfTrueNode* multiversioning_fast_proj);
   bool try_resume_optimizations_for_delayed_slow_loop(IdealLoopTree* lpt);
 
-  // Move an unordered Reduction out of loop if possible
-  void move_unordered_reduction_out_of_loop(IdealLoopTree* loop);
-
   // Create a scheduled list of nodes control dependent on ctrl set.
   void scheduled_nodelist( IdealLoopTree *loop, VectorSet& ctrl, Node_List &sched );
   // Has a use in the vector set
diff --git a/src/hotspot/share/opto/loopopts.cpp b/src/hotspot/share/opto/loopopts.cpp
index a9baac394a2..ae7b318ece4 100644
--- a/src/hotspot/share/opto/loopopts.cpp
+++ b/src/hotspot/share/opto/loopopts.cpp
@@ -4548,211 +4548,6 @@ void PhaseIdealLoop::maybe_multiversion_for_auto_vectorization_runtime_checks(Id
   do_multiversioning(lpt, old_new);
 }
 
-// Returns true if the Reduction node is unordered.
-static bool is_unordered_reduction(Node* n) {
-  return n->is_Reduction() && !n->as_Reduction()->requires_strict_order();
-}
-
-// Having ReductionNodes in the loop is expensive. They need to recursively
-// fold together the vector values, for every vectorized loop iteration. If
-// we encounter the following pattern, we can vector accumulate the values
-// inside the loop, and only have a single UnorderedReduction after the loop.
-//
-// Note: UnorderedReduction represents a ReductionNode which does not require
-// calculating in strict order.
-//
-// CountedLoop     init
-//          |        |
-//          +------+ | +-----------------------+
-//                 | | |                       |
-//                PhiNode (s)                  |
-//                  |                          |
-//                  |          Vector          |
-//                  |            |             |
-//               UnorderedReduction (first_ur) |
-//                  |                          |
-//                 ...         Vector          |
-//                  |            |             |
-//               UnorderedReduction (last_ur)  |
-//                       |                     |
-//                       +---------------------+
-//
-// We patch the graph to look like this:
-//
-// CountedLoop   identity_vector
-//         |         |
-//         +-------+ | +---------------+
-//                 | | |               |
-//                PhiNode (v)          |
-//                   |                 |
-//                   |         Vector  |
-//                   |           |     |
-//                 VectorAccumulator   |
-//                   |                 |
-//                  ...        Vector  |
-//                   |           |     |
-//      init       VectorAccumulator   |
-//        |          |     |           |
-//     UnorderedReduction  +-----------+
-//
-// We turned the scalar (s) Phi into a vectorized one (v). In the loop, we
-// use vector_accumulators, which do the same reductions, but only element
-// wise. This is a single operation per vector_accumulator, rather than many
-// for a UnorderedReduction. We can then reduce the last vector_accumulator
-// after the loop, and also reduce the init value into it.
-//
-// We can not do this with all reductions. Some reductions do not allow the
-// reordering of operations (for example float addition/multiplication require
-// strict order).
-void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) {
-  assert(!C->major_progress() && loop->is_counted() && loop->is_innermost(), "sanity");
-
-  // Find all Phi nodes with an unordered Reduction on backedge.
-  CountedLoopNode* cl = loop->_head->as_CountedLoop();
-  for (DUIterator_Fast jmax, j = cl->fast_outs(jmax); j < jmax; j++) {
-    Node* phi = cl->fast_out(j);
-    // We have a phi with a single use, and an unordered Reduction on the backedge.
-    if (!phi->is_Phi() || phi->outcnt() != 1 || !is_unordered_reduction(phi->in(2))) {
-      continue;
-    }
-
-    ReductionNode* last_ur = phi->in(2)->as_Reduction();
-    assert(!last_ur->requires_strict_order(), "must be");
-
-    // Determine types
-    const TypeVect* vec_t = last_ur->vect_type();
-    uint vector_length    = vec_t->length();
-    BasicType bt          = vec_t->element_basic_type();
-
-    // Convert opcode from vector-reduction -> scalar -> normal-vector-op
-    const int sopc        = VectorNode::scalar_opcode(last_ur->Opcode(), bt);
-    const int vopc        = VectorNode::opcode(sopc, bt);
-    if (!Matcher::match_rule_supported_vector(vopc, vector_length, bt)) {
-        DEBUG_ONLY( last_ur->dump(); )
-        assert(false, "do not have normal vector op for this reduction");
-        continue; // not implemented -> fails
-    }
-
-    // Traverse up the chain of unordered Reductions, checking that it loops back to
-    // the phi. Check that all unordered Reductions only have a single use, except for
-    // the last (last_ur), which only has phi as a use in the loop, and all other uses
-    // are outside the loop.
-    ReductionNode* current = last_ur;
-    ReductionNode* first_ur = nullptr;
-    while (true) {
-      assert(!current->requires_strict_order(), "sanity");
-
-      // Expect no ctrl and a vector_input from within the loop.
-      Node* ctrl = current->in(0);
-      Node* vector_input = current->in(2);
-      if (ctrl != nullptr || get_ctrl(vector_input) != cl) {
-        DEBUG_ONLY( current->dump(1); )
-        assert(false, "reduction has ctrl or bad vector_input");
-        break; // Chain traversal fails.
-      }
-
-      assert(current->vect_type() != nullptr, "must have vector type");
-      if (current->vect_type() != last_ur->vect_type()) {
-        // Reductions do not have the same vector type (length and element type).
-        break; // Chain traversal fails.
-      }
-
-      // Expect single use of an unordered Reduction, except for last_ur.
-      if (current == last_ur) {
-        // Expect all uses to be outside the loop, except phi.
-        for (DUIterator_Fast kmax, k = current->fast_outs(kmax); k < kmax; k++) {
-          Node* use = current->fast_out(k);
-          if (use != phi && ctrl_or_self(use) == cl) {
-            DEBUG_ONLY( current->dump(-1); )
-            assert(false, "reduction has use inside loop");
-            // Should not be allowed by SuperWord::mark_reductions
-            return; // bail out of optimization
-          }
-        }
-      } else {
-        if (current->outcnt() != 1) {
-          break; // Chain traversal fails.
-        }
-      }
-
-      // Expect another unordered Reduction or phi as the scalar input.
-      Node* scalar_input = current->in(1);
-      if (is_unordered_reduction(scalar_input) &&
-          scalar_input->Opcode() == current->Opcode()) {
-        // Move up the unordered Reduction chain.
-        current = scalar_input->as_Reduction();
-        assert(!current->requires_strict_order(), "must be");
-      } else if (scalar_input == phi) {
-        // Chain terminates at phi.
-        first_ur = current;
-        current = nullptr;
-        break; // Success.
-      } else {
-        // scalar_input is neither phi nor a matching reduction
-        // Can for example be scalar reduction when we have
-        // partial vectorization.
-        break; // Chain traversal fails.
-      }
-    }
-    if (current != nullptr) {
-      // Chain traversal was not successful.
-      continue;
-    }
-    assert(first_ur != nullptr, "must have successfully terminated chain traversal");
-
-    Node* identity_scalar = ReductionNode::make_identity_con_scalar(_igvn, sopc, bt);
-    set_root_as_ctrl(identity_scalar);
-    VectorNode* identity_vector = VectorNode::scalar2vector(identity_scalar, vector_length, bt);
-    register_new_node(identity_vector, C->root());
-    assert(vec_t == identity_vector->vect_type(), "matching vector type");
-    VectorNode::trace_new_vector(identity_vector, "Unordered Reduction");
-
-    // Turn the scalar phi into a vector phi.
-    _igvn.rehash_node_delayed(phi);
-    Node* init = phi->in(1); // Remember init before replacing it.
-    phi->set_req_X(1, identity_vector, &_igvn);
-    phi->as_Type()->set_type(vec_t);
-    _igvn.set_type(phi, vec_t);
-
-    // Traverse down the chain of unordered Reductions, and replace them with vector_accumulators.
-    current = first_ur;
-    while (true) {
-      // Create vector_accumulator to replace current.
-      Node* last_vector_accumulator = current->in(1);
-      Node* vector_input            = current->in(2);
-      VectorNode* vector_accumulator = VectorNode::make(vopc, last_vector_accumulator, vector_input, vec_t);
-      register_new_node(vector_accumulator, cl);
-      _igvn.replace_node(current, vector_accumulator);
-      VectorNode::trace_new_vector(vector_accumulator, "Unordered Reduction");
-      if (current == last_ur) {
-        break;
-      }
-      current = vector_accumulator->unique_out()->as_Reduction();
-      assert(!current->requires_strict_order(), "must be");
-    }
-
-    // Create post-loop reduction.
-    Node* last_accumulator = phi->in(2);
-    Node* post_loop_reduction = ReductionNode::make(sopc, nullptr, init, last_accumulator, bt);
-
-    // Take over uses of last_accumulator that are not in the loop.
-    for (DUIterator i = last_accumulator->outs(); last_accumulator->has_out(i); i++) {
-      Node* use = last_accumulator->out(i);
-      if (use != phi && use != post_loop_reduction) {
-        assert(ctrl_or_self(use) != cl, "use must be outside loop");
-        use->replace_edge(last_accumulator, post_loop_reduction,  &_igvn);
-        --i;
-      }
-    }
-    register_new_node(post_loop_reduction, get_late_ctrl(post_loop_reduction, cl));
-    VectorNode::trace_new_vector(post_loop_reduction, "Unordered Reduction");
-
-    assert(last_accumulator->outcnt() == 2, "last_accumulator has 2 uses: phi and post_loop_reduction");
-    assert(post_loop_reduction->outcnt() > 0, "should have taken over all non loop uses of last_accumulator");
-    assert(phi->outcnt() == 1, "accumulator is the only use of phi");
-  }
-}
-
 void DataNodeGraph::clone_data_nodes(Node* new_ctrl) {
   for (uint i = 0; i < _data_nodes.size(); i++) {
     clone(_data_nodes[i], new_ctrl);
diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp
index 41a4339e4c9..c0f005048ec 100644
--- a/src/hotspot/share/opto/superword.cpp
+++ b/src/hotspot/share/opto/superword.cpp
@@ -1606,7 +1606,7 @@ bool SuperWord::implemented(const Node_List* pack, const uint size) const {
       // 3 instructions (1 shuffle and two reduction ops).
       // However, this optimization assumes that these reductions stay in the loop
       // which may not be true any more in most cases after the introduction of:
-      // PhaseIdealLoop::move_unordered_reduction_out_of_loop
+      // See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop
       // Hence, this heuristic has room for improvement.
       bool is_two_element_int_or_long_reduction = (size == 2) &&
                                                   (arith_type->basic_type() == T_INT ||
@@ -1782,7 +1782,7 @@ bool SuperWord::profitable(const Node_List* p) const {
       // This heuristic is a bit simplistic, and assumes that the reduction
       // vector stays in the loop. But in some cases, we can move the
       // reduction out of the loop, replacing it with a single vector op.
-      // See: PhaseIdealLoop::move_unordered_reduction_out_of_loop
+      // See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop
       // Hence, this heuristic has room for improvement.
 #ifndef PRODUCT
         if (is_trace_superword_rejections()) {
@@ -1947,6 +1947,8 @@ bool SuperWord::do_vtransform() const {
     SuperWordVTransformBuilder builder(_packset, vtransform);
   }
 
+  vtransform.optimize();
+
   if (!vtransform.schedule()) { return false; }
   if (vtransform.has_store_to_load_forwarding_failure()) { return false; }
 
diff --git a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp
index 6713ed6cac6..d996173aeb4 100644
--- a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp
+++ b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp
@@ -45,10 +45,11 @@
   flags(SW_PACKSET,                 "Trace SuperWord packset at different stages") \
   flags(SW_INFO,                    "Trace SuperWord info (equivalent to TraceSuperWord)") \
   flags(SW_VERBOSE,                 "Trace SuperWord verbose (all SW tags enabled)") \
+  flags(VTRANSFORM,                 "Trace VTransform Graph") \
+  flags(OPTIMIZATION,               "Trace VTransform::optimize") \
   flags(ALIGN_VECTOR,               "Trace AlignVector") \
   flags(SPECULATIVE_ALIASING_ANALYSIS, "Trace Speculative Aliasing Analysis") \
   flags(SPECULATIVE_RUNTIME_CHECKS, "Trace VTransform::apply_speculative_runtime_checks") \
-  flags(VTRANSFORM,                 "Trace VTransform Graph") \
   flags(ALL,                        "Trace everything (very verbose)")
 
 #define table_entry(name, description) name,
diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp
index e006589cce9..b1be52d531a 100644
--- a/src/hotspot/share/opto/vectorization.hpp
+++ b/src/hotspot/share/opto/vectorization.hpp
@@ -205,6 +205,10 @@ public:
     return _vtrace.is_trace(TraceAutoVectorizationTag::POINTERS);
   }
 
+  bool is_trace_optimization() const { // Is the OPTIMIZATION trace tag ("Trace VTransform::optimize") set?
+    return _vtrace.is_trace(TraceAutoVectorizationTag::OPTIMIZATION);
+  }
+
   bool is_trace_speculative_runtime_checks() const {
     return _vtrace.is_trace(TraceAutoVectorizationTag::SPECULATIVE_RUNTIME_CHECKS);
   }
diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp
index d656bf3127b..6ae8bbe8aa0 100644
--- a/src/hotspot/share/opto/vectornode.cpp
+++ b/src/hotspot/share/opto/vectornode.cpp
@@ -292,121 +292,6 @@ int VectorNode::opcode(int sopc, BasicType bt) {
   }
 }
 
-// Return the scalar opcode for the specified vector opcode
-// and basic type.
-int VectorNode::scalar_opcode(int sopc, BasicType bt) {
-  switch (sopc) {
-    case Op_AddReductionVI:
-    case Op_AddVI:
-      return Op_AddI;
-    case Op_AddReductionVL:
-    case Op_AddVL:
-      return Op_AddL;
-    case Op_MulReductionVI:
-    case Op_MulVI:
-      return Op_MulI;
-    case Op_MulReductionVL:
-    case Op_MulVL:
-      return Op_MulL;
-    case Op_AndReductionV:
-    case Op_AndV:
-      switch (bt) {
-        case T_BOOLEAN:
-        case T_CHAR:
-        case T_BYTE:
-        case T_SHORT:
-        case T_INT:
-          return Op_AndI;
-        case T_LONG:
-          return Op_AndL;
-        default:
-          assert(false, "basic type not handled");
-          return 0;
-      }
-    case Op_OrReductionV:
-    case Op_OrV:
-      switch (bt) {
-        case T_BOOLEAN:
-        case T_CHAR:
-        case T_BYTE:
-        case T_SHORT:
-        case T_INT:
-          return Op_OrI;
-        case T_LONG:
-          return Op_OrL;
-        default:
-          assert(false, "basic type not handled");
-          return 0;
-      }
-    case Op_XorReductionV:
-    case Op_XorV:
-      switch (bt) {
-        case T_BOOLEAN:
-        case T_CHAR:
-        case T_BYTE:
-        case T_SHORT:
-        case T_INT:
-          return Op_XorI;
-        case T_LONG:
-          return Op_XorL;
-        default:
-          assert(false, "basic type not handled");
-          return 0;
-      }
-    case Op_MinReductionV:
-    case Op_MinV:
-      switch (bt) {
-        case T_BOOLEAN:
-        case T_CHAR:
-          assert(false, "boolean and char are signed, not implemented for Min");
-          return 0;
-        case T_BYTE:
-        case T_SHORT:
-        case T_INT:
-          return Op_MinI;
-        case T_LONG:
-          return Op_MinL;
-        case T_FLOAT:
-          return Op_MinF;
-        case T_DOUBLE:
-          return Op_MinD;
-        default:
-          assert(false, "basic type not handled");
-          return 0;
-      }
-    case Op_MaxReductionV:
-    case Op_MaxV:
-      switch (bt) {
-        case T_BOOLEAN:
-        case T_CHAR:
-          assert(false, "boolean and char are signed, not implemented for Max");
-          return 0;
-        case T_BYTE:
-        case T_SHORT:
-        case T_INT:
-          return Op_MaxI;
-        case T_LONG:
-          return Op_MaxL;
-        case T_FLOAT:
-          return Op_MaxF;
-        case T_DOUBLE:
-          return Op_MaxD;
-        default:
-          assert(false, "basic type not handled");
-          return 0;
-      }
-    case Op_MinVHF:
-      return Op_MinHF;
-    case Op_MaxVHF:
-      return Op_MaxHF;
-    default:
-      assert(false,
-             "Vector node %s is not handled in VectorNode::scalar_opcode",
-             NodeClassNames[sopc]);
-      return 0; // Unimplemented
-  }
-}
-
 // Limits on vector size (number of elements) for auto-vectorization.
 bool VectorNode::vector_size_supported_auto_vectorization(const BasicType bt, int size) {
   return Matcher::max_vector_size_auto_vectorization(bt) >= size &&
@@ -1727,6 +1612,34 @@ bool ReductionNode::implemented(int opc, uint vlen, BasicType bt) {
   return false;
 }
 
+bool ReductionNode::auto_vectorization_requires_strict_order(int vopc) {
+  switch (vopc) {
+    case Op_AddReductionVF:
+    case Op_AddReductionVD:
+    case Op_MulReductionVF:
+    case Op_MulReductionVD:
+      // Floating-point add and mul are not associative: reordering the
+      // operations could change the result, so auto-vectorization must
+      // keep these reductions in strict order.
+      return true;
+    case Op_AddReductionVI:
+    case Op_AddReductionVL:
+    case Op_MulReductionVI:
+    case Op_MulReductionVL:
+    case Op_AndReductionV:
+    case Op_OrReductionV:
+    case Op_XorReductionV:
+    case Op_MinReductionV:
+    case Op_MaxReductionV:
+      // The underlying operations are associative, hence the reduction
+      // may be reordered and does not need strict order.
+      return false;
+    default:
+      assert(false, "not handled: %s", NodeClassNames[vopc]);
+      return true;
+  }
+}
+
 MacroLogicVNode* MacroLogicVNode::make(PhaseGVN& gvn, Node* in1, Node* in2, Node* in3,
                                        Node* mask, uint truth_table, const TypeVect* vt) {
   assert(truth_table <= 0xFF, "invalid");
diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp
index 53778b61d0e..427aeff53fc 100644
--- a/src/hotspot/share/opto/vectornode.hpp
+++ b/src/hotspot/share/opto/vectornode.hpp
@@ -95,7 +95,6 @@ class VectorNode : public TypeNode {
   static bool is_rotate_opcode(int opc);
 
   static int opcode(int sopc, BasicType bt);         // scalar_opc -> vector_opc
-  static int scalar_opcode(int vopc, BasicType bt);  // vector_opc -> scalar_opc
 
   static int shift_count_opcode(int opc);
 
@@ -283,6 +282,8 @@ class ReductionNode : public Node {
     return false;
   }
 
+  static bool auto_vectorization_requires_strict_order(int vopc);
+
 #ifndef PRODUCT
   void dump_spec(outputStream* st) const {
     if (requires_strict_order()) {
diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp
index 27c541c2732..46e8f43cb65 100644
--- a/src/hotspot/share/opto/vtransform.cpp
+++ b/src/hotspot/share/opto/vtransform.cpp
@@ -23,6 +23,7 @@
 
 #include "opto/castnode.hpp"
 #include "opto/convertnode.hpp"
+#include "opto/rootnode.hpp"
 #include "opto/vectorization.hpp"
 #include "opto/vectornode.hpp"
 #include "opto/vtransform.hpp"
@@ -32,6 +33,45 @@ void VTransformGraph::add_vtnode(VTransformNode* vtnode) {
   _vtnodes.push(vtnode);
 }
 
+#define TRACE_OPTIMIZE(code)                          \
+  NOT_PRODUCT(                                        \
+    if (vtransform.vloop().is_trace_optimization()) { \
+      code                                            \
+    }                                                 \
+  )
+
+// This is similar to IGVN optimization. But we are a bit lazy, and don't care about
+// notification / worklist, since the list of nodes is rather small, and we don't
+// expect optimizations that trickle over the whole graph.
+void VTransformGraph::optimize(VTransform& vtransform) {
+  TRACE_OPTIMIZE( tty->print_cr("\nVTransformGraph::optimize"); )
+
+  DEBUG_ONLY(int pass_count = 0;)
+  bool progress;
+  do {
+    progress = false;
+    assert(++pass_count < 10, "ensure we do not have endless loops");
+    for (int i = 0; i < _vtnodes.length(); i++) {
+      VTransformNode* vtn = _vtnodes.at(i);
+      if (!vtn->is_alive()) { continue; }
+      if (vtn->optimize(_vloop_analyzer, vtransform)) { progress = true; }
+
+      // A node that still has a strong use stays alive.
+      if (vtn->out_strong_edges() != 0) { continue; }
+      // A node without uses is dead, with these exceptions:
+      // 1. Memory phi uses are not modeled, so the phi only appears unused and must be kept alive.
+      // 2. Similarly, some stores may not have their memory uses modeled, but must be kept alive.
+      // 3. Outer node with strong inputs: it is a use after the loop that we must keep alive.
+      const bool must_keep = vtn->isa_LoopPhi() != nullptr ||
+                             vtn->is_load_or_store_in_loop() ||
+                             (vtn->isa_Outer() != nullptr && vtn->has_strong_in_edge());
+      if (must_keep) { continue; }
+      vtn->mark_dead();
+      progress = true;
+    }
+  } while (progress);
+}
+
 // Compute a linearization of the graph. We do this with a reverse-post-order of a DFS.
 // This only works if the graph is a directed acyclic graph (DAG). The C2 graph, and
 // the VLoopDependencyGraph are both DAGs, but after introduction of vectors/packs, the
@@ -59,10 +99,11 @@ bool VTransformGraph::schedule() {
   VectorSet post_visited;
 
   collect_nodes_without_strong_in_edges(stack);
+  const int num_alive_nodes = count_alive_vtnodes();
 
   // We create a reverse-post-visit order. This gives us a linearization, if there are
   // no cycles. Then, we simply reverse the order, and we have a schedule.
-  int rpo_idx = _vtnodes.length() - 1;
+  int rpo_idx = num_alive_nodes - 1;
   while (!stack.is_empty()) {
     VTransformNode* vtn = stack.top();
     if (!pre_visited.test_set(vtn->_idx)) {
@@ -79,6 +120,9 @@ bool VTransformGraph::schedule() {
       for (uint i = 0; i < vtn->out_strong_edges(); i++) {
         VTransformNode* use = vtn->out_strong_edge(i);
 
+        // Skip dead nodes
+        if (!use->is_alive()) { continue; }
+
         // Skip LoopPhi backedge.
         if ((use->isa_LoopPhi() != nullptr || use->isa_CountedLoop() != nullptr) && use->in_req(2) == vtn) { continue; }
 
@@ -121,6 +165,7 @@ bool VTransformGraph::schedule() {
 void VTransformGraph::collect_nodes_without_strong_in_edges(GrowableArray<VTransformNode*>& stack) const {
   for (int i = 0; i < _vtnodes.length(); i++) {
     VTransformNode* vtn = _vtnodes.at(i);
+    if (!vtn->is_alive()) { continue; }
     if (!vtn->has_strong_in_edge()) {
       stack.push(vtn);
     }
@@ -132,6 +177,15 @@ void VTransformGraph::collect_nodes_without_strong_in_edges(GrowableArray<VTrans
   }
 }
 
+int VTransformGraph::count_alive_vtnodes() const {
+  // Tally every vtnode that still reports itself as alive.
+  int num_alive = 0;
+  for (int idx = 0; idx < _vtnodes.length(); idx++) {
+    if (_vtnodes.at(idx)->is_alive()) { num_alive++; }
+  }
+  return num_alive;
+}
+
 #ifndef PRODUCT
 void VTransformGraph::trace_schedule_cycle(const GrowableArray<VTransformNode*>& stack,
                                            const VectorSet& pre_visited,
@@ -801,6 +855,13 @@ VTransformApplyResult VTransformLoopPhiNode::apply(VTransformApplyState& apply_s
   phase->igvn().replace_input_of(_node, 0, in0);
   phase->igvn().replace_input_of(_node, 1, in1);
   // Note: the backedge is hooked up later.
+
+  // The Phi's inputs may have been modified, and with them its type,
+  // e.g. from scalar to vector.
+  const Type* t = in1->bottom_type();
+  _node->as_Type()->set_type(t);
+  phase->igvn().set_type(_node, t);
+
   return VTransformApplyResult::make_scalar(_node);
 }
 
@@ -939,6 +1000,242 @@ VTransformApplyResult VTransformBoolVectorNode::apply(VTransformApplyState& appl
   return VTransformApplyResult::make_vector(vn);
 }
 
+bool VTransformReductionVectorNode::optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) {
+  return optimize_move_non_strict_order_reductions_out_of_loop(vloop_analyzer, vtransform); // Forward the result of the move-out-of-loop attempt.
+}
+
+int VTransformReductionVectorNode::vector_reduction_opcode() const {
+  return ReductionNode::opcode(scalar_opcode(), element_basic_type()); // Looked up from the scalar opcode and the element type.
+}
+
+bool VTransformReductionVectorNode::requires_strict_order() const {
+  // Decided purely by the vector reduction opcode of this node.
+  return ReductionNode::auto_vectorization_requires_strict_order(vector_reduction_opcode());
+}
+
+// Having ReductionNodes in the loop is expensive. They need to recursively
+// fold together the vector values, for every vectorized loop iteration. If
+// we encounter the following pattern, we can vector accumulate the values
+// inside the loop, and only have a single UnorderedReduction after the loop.
+//
+// Note: UnorderedReduction represents a ReductionNode which does not require
+// calculating in strict order.
+//
+// CountedLoop     init
+//          |        |
+//          +------+ | +------------------------+
+//                 | | |                        |
+//                PhiNode (s)                   |
+//                  |                           |
+//                  |          Vector           |
+//                  |            |              |
+//               UnorderedReduction (first_red) |
+//                  |                           |
+//                 ...         Vector           |
+//                  |            |              |
+//               UnorderedReduction (last_red)  |
+//                       |                      |
+//                       +----------------------+
+//
+// We patch the graph to look like this:
+//
+// CountedLoop   identity_vector
+//         |         |
+//         +-------+ | +---------------+
+//                 | | |               |
+//                PhiNode (v)          |
+//                   |                 |
+//                   |         Vector  |
+//                   |           |     |
+//                 VectorAccumulator   |
+//                   |                 |
+//                  ...        Vector  |
+//                   |           |     |
+//      init       VectorAccumulator   |
+//        |          |     |           |
+//     UnorderedReduction  +-----------+
+//
+// We turned the scalar (s) Phi into a vectorized one (v). In the loop, we
+// use vector_accumulators, which do the same reductions, but only element
+// wise. This is a single operation per vector_accumulator, rather than many
+// for a UnorderedReduction. We can then reduce the last vector_accumulator
+// after the loop, and also reduce the init value into it.
+//
+// We can not do this with all reductions. Some reductions do not allow the
+// reordering of operations (for example float addition/multiplication require
+// strict order).
+//
+// Note: we must perform this optimization already during auto vectorization,
+//       before we evaluate the cost-model. Without this optimization, we may
+//       still have expensive reduction nodes in the loop which can make
+//       vectorization unprofitable. Only with the optimization does vectorization
+//       become profitable, since the expensive reduction node is moved
+//       outside the loop, and instead cheaper element-wise vector accumulations
+//       are performed inside the loop.
+bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop_preconditions(VTransform& vtransform) {
+  // Pure check, does not edit the graph: returns true iff the reduction chain
+  //   phi -> this -> ... -> last_red -> phi (backedge)
+  // satisfies all conditions for moving the reduction out of the loop.
+  VTransformLoopPhiNode* phi = in_req(1)->isa_LoopPhi();
+  if (phi == nullptr) { return false; } // Scalar input is not a loop phi.
+  if (phi->out_strong_edges() != 1) {
+    TRACE_OPTIMIZE(
+      tty->print("  Cannot move out of loop, phi has multiple uses:");
+      print();
+      tty->print("  phi: ");
+      phi->print();
+    )
+    return false;
+  }
+
+  if (requires_strict_order()) {
+    TRACE_OPTIMIZE(
+      tty->print("  Cannot move out of loop, strict order required: ");
+      print();
+    )
+    return false;
+  }
+
+  const int sopc     = scalar_opcode();
+  const uint vlen    = vector_length();
+  const BasicType bt = element_basic_type();
+  const int ropc     = vector_reduction_opcode();
+  const int vopc     = VectorNode::opcode(sopc, bt);
+  if (!Matcher::match_rule_supported_vector(vopc, vlen, bt)) {
+    DEBUG_ONLY( this->print(); )
+    assert(false, "do not have normal vector op for this reduction");
+    return false; // not implemented
+  }
+
+  // Traverse up the chain of non strict order reductions, checking that it loops
+  // back to the phi. Check that all non strict order reductions only have a single
+  // use, except for the last (last_red), which only has phi as a use in the loop,
+  // and all other uses are outside the loop.
+  // The chain starts at this node: it is the single use of the phi.
+  VTransformReductionVectorNode* last_red    = phi->in_req(2)->isa_ReductionVector();
+  VTransformReductionVectorNode* current_red = last_red;
+  while (true) {
+    if (current_red == nullptr ||
+        current_red->vector_reduction_opcode() != ropc ||
+        current_red->element_basic_type() != bt ||
+        current_red->vector_length() != vlen) {
+      TRACE_OPTIMIZE(
+        tty->print("  Cannot move out of loop, other reduction node does not match:");
+        print();
+        tty->print("  other: ");
+        if (current_red != nullptr) { current_red->print(); } else { tty->print_cr("nullptr"); }
+      )
+      return false; // not compatible
+    }
+
+    VTransformVectorNode* vector_input = current_red->in_req(2)->isa_Vector();
+    if (vector_input == nullptr) {
+      assert(false, "reduction has a bad vector input");
+      return false;
+    }
+
+    // Expect single use of the non strict order reduction. Except for the last_red.
+    if (current_red == last_red) {
+      // All uses must be outside loop body, except for the phi.
+      for (uint i = 0; i < current_red->out_strong_edges(); i++) {
+        VTransformNode* use = current_red->out_strong_edge(i);
+        if (use->isa_LoopPhi() == nullptr &&
+            use->isa_Outer() == nullptr) {
+          // Should not be allowed by SuperWord::mark_reductions
+          assert(false, "reduction has use inside loop");
+          return false;
+        }
+      }
+    } else {
+      if (current_red->out_strong_edges() != 1) {
+        TRACE_OPTIMIZE(
+          tty->print("  Cannot move out of loop, other reduction node has use outside loop:");
+          print();
+          tty->print("  other: ");
+          current_red->print();
+        )
+        return false; // Only single use allowed
+      }
+    }
+
+    // If the scalar input is a phi, we passed all checks.
+    VTransformNode* scalar_input = current_red->in_req(1);
+    if (scalar_input == phi) {
+      break;
+    }
+
+    // We expect another non strict reduction, verify it in the next iteration.
+    current_red = scalar_input->isa_ReductionVector();
+  }
+  return true; // success
+}
+
+bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) {
+  if (!optimize_move_non_strict_order_reductions_out_of_loop_preconditions(vtransform)) {
+    return false; // preconditions not met
+  }
+  // Returns true after rewriting the reduction chain (this .. last_red) on the loop phi as described below.
+  // All checks were successful. Edit the vtransform graph now.
+  TRACE_OPTIMIZE(
+    tty->print_cr("VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop");
+  )
+
+  const int sopc     = scalar_opcode();
+  const uint vlen    = vector_length();
+  const BasicType bt = element_basic_type();
+  const int vopc     = VectorNode::opcode(sopc, bt); // element-wise vector opcode for the scalar op
+  PhaseIdealLoop* phase = vloop_analyzer.vloop().phase();
+
+  // Create a vector of identity values: a scalar identity constant wrapped as an outer node, then replicated.
+  Node* identity = ReductionNode::make_identity_con_scalar(phase->igvn(), sopc, bt);
+  phase->set_root_as_ctrl(identity);
+  VTransformNode* vtn_identity = new (vtransform.arena()) VTransformOuterNode(vtransform, identity);
+
+  VTransformNode* vtn_identity_vector = new (vtransform.arena()) VTransformReplicateNode(vtransform, vlen, bt);
+  vtn_identity_vector->init_req(1, vtn_identity);
+
+  // Turn the scalar phi into a vector phi. NOTE(review): assumes the preconditions verified in_req(1) is a LoopPhi.
+  VTransformLoopPhiNode* phi = in_req(1)->isa_LoopPhi();
+  VTransformNode* init = phi->in_req(1); // remember the original init value, it is re-attached to last_red below
+  phi->set_req(1, vtn_identity_vector);
+
+  // Traverse down the chain of reductions, and replace them with vector_accumulators.
+  VTransformReductionVectorNode* first_red   = this;
+  VTransformReductionVectorNode* last_red    = phi->in_req(2)->isa_ReductionVector();
+  VTransformReductionVectorNode* current_red = first_red;
+  VTransformNode* current_vector_accumulator = phi; // the first accumulator takes the phi as its input 1
+  while (true) {
+    VTransformNode* vector_input = current_red->in_req(2);
+    VTransformVectorNode* vector_accumulator = new (vtransform.arena()) VTransformElementWiseVectorNode(vtransform, 3, current_red->properties(), vopc);
+    vector_accumulator->init_req(1, current_vector_accumulator);
+    vector_accumulator->init_req(2, vector_input);
+    TRACE_OPTIMIZE(
+      tty->print("  replace    ");
+      current_red->print();
+      tty->print("  with       ");
+      vector_accumulator->print();
+    )
+    current_vector_accumulator = vector_accumulator;
+    if (current_red == last_red) { break; }
+    current_red = current_red->unique_out_strong_edge()->isa_ReductionVector(); // step to the next reduction in the chain
+  }
+
+  // Feed vector accumulator into the backedge.
+  phi->set_req(2, current_vector_accumulator);
+
+  // Create post-loop reduction. last_red keeps all uses outside the loop.
+  last_red->set_req(1, init); // scalar input: the original init value of the phi
+  last_red->set_req(2, current_vector_accumulator); // vector input: the last accumulator
+
+  TRACE_OPTIMIZE(
+    tty->print("  phi        ");
+    phi->print();
+    tty->print("  after loop ");
+    last_red->print();
+  )
+  return true; // success
+}
+
 VTransformApplyResult VTransformReductionVectorNode::apply(VTransformApplyState& apply_state) const {
   Node* init = apply_state.transformed_node(in_req(1));
   Node* vec  = apply_state.transformed_node(in_req(2));
@@ -1041,7 +1338,7 @@ void VTransformNode::print() const {
       print_node_idx(_in.at(i));
     }
   }
-  tty->print(") [");
+  tty->print(") %s[", _is_alive ? "" : "dead ");
   for (uint i = 0; i < _out_end_strong_edges; i++) {
     print_node_idx(_out.at(i));
   }
diff --git a/src/hotspot/share/opto/vtransform.hpp b/src/hotspot/share/opto/vtransform.hpp
index a004962eea7..7ad7b432e9b 100644
--- a/src/hotspot/share/opto/vtransform.hpp
+++ b/src/hotspot/share/opto/vtransform.hpp
@@ -41,7 +41,11 @@
 // - Construction:
 //   - From SuperWord PackSet, with the SuperWordVTransformBuilder.
 //
-// - Future Plans: optimize, if-conversion, etc.
+// - Optimize:
+//   - Move non-strict order reductions out of the loop. This means we have
+//     only element-wise operations inside the loop, rather than the much
+//     more expensive lane-crossing reductions. We need to do this before
+//     assessing profitability with the cost-model.
 //
 // - Schedule:
 //   - Compute linearization of the VTransformGraph, into an order that respects
@@ -62,12 +66,12 @@
 //
 // Future Plans with VTransform:
 // - Cost model: estimate if vectorization is profitable.
-// - Optimizations: moving unordered reductions out of the loop, whih decreases cost.
 // - Pack/Unpack/Shuffle: introduce additional nodes not present in the scalar loop.
 //                        This is difficult to do with the SuperWord packset approach.
 // - If-conversion: convert predicated nodes into CFG.
 
 typedef int VTransformNodeIDX;
+class VTransform;
 class VTransformNode;
 class VTransformMemopScalarNode;
 class VTransformDataScalarNode;
@@ -183,6 +187,7 @@ public:
   const GrowableArray<VTransformNode*>& vtnodes() const { return _vtnodes; }
   const GrowableArray<VTransformNode*>& get_schedule() const { return _schedule; }
 
+  void optimize(VTransform& vtransform);
   bool schedule();
   bool has_store_to_load_forwarding_failure(const VLoopAnalyzer& vloop_analyzer) const;
   void apply_vectorization_for_each_vtnode(uint& max_vector_length, uint& max_vector_width) const;
@@ -194,6 +199,7 @@ private:
   bool in_bb(const Node* n)   const { return _vloop.in_bb(n); }
 
   void collect_nodes_without_strong_in_edges(GrowableArray<VTransformNode*>& stack) const;
+  int count_alive_vtnodes() const;
 
 #ifndef PRODUCT
   void print_vtnodes() const;
@@ -239,10 +245,12 @@ public:
     _aw_for_main_loop_alignment(aw_for_main_loop_alignment) {}
 
   const VLoopAnalyzer& vloop_analyzer() const { return _vloop_analyzer; }
+  const VLoop& vloop() const { return _vloop; }
   Arena* arena() { return &_arena; }
   DEBUG_ONLY( bool has_graph() const { return !_graph.is_empty(); } )
   VTransformGraph& graph() { return _graph; }
 
+  void optimize() { return _graph.optimize(*this); }
   bool schedule() { return _graph.schedule(); }
   bool has_store_to_load_forwarding_failure() const { return _graph.has_store_to_load_forwarding_failure(_vloop_analyzer); }
   void apply();
@@ -372,6 +380,8 @@ public:
   const VTransformNodeIDX _idx;
 
 private:
+  bool _is_alive;
+
   // We split _in into 3 sections:
   // - data edges (req):     _in[0                           .. _req-1]
   // - strong memory edges:  _in[_req                        .. _in_end_strong_memory_edges-1]
@@ -389,6 +399,7 @@ private:
 public:
   VTransformNode(VTransform& vtransform, const uint req) :
     _idx(vtransform.graph().new_idx()),
+    _is_alive(true),
     _req(req),
     _in_end_strong_memory_edges(req),
     _in(vtransform.arena(),  req, req, nullptr),
@@ -405,6 +416,14 @@ public:
     n->add_out_strong_edge(this);
   }
 
+  void set_req(uint i, VTransformNode* n) { // Replace req input i with n (n may be nullptr).
+    assert(i < _req, "must be a req");
+    VTransformNode* old = _in.at(i);
+    if (old != nullptr) { old->del_out_strong_edge(this); } // unregister this as a strong use of the old input
+    _in.at_put(i, n);
+    if (n != nullptr) { n->add_out_strong_edge(this); } // register this as a strong use of the new input
+  }
+
   void swap_req(uint i, uint j) {
     assert(i < _req, "must be a req");
     assert(j < _req, "must be a req");
@@ -452,6 +471,23 @@ private:
     _out.push(n);
   }
 
+  void del_out_strong_edge(VTransformNode* n) { // Remove one strong out-edge to n; edge order is not preserved.
+    int i = _out.find(n);
+    assert(0 <= i && i < (int)_out_end_strong_edges, "must be in strong edges");
+
+    // Replace n with the last strong edge.
+    VTransformNode* last_strong = _out.at(_out_end_strong_edges - 1);
+    _out.at_put(i, last_strong);
+
+    if (_out_end_strong_edges < (uint)_out.length()) { // there are weak edges stored after the strong ones
+      // Now replace where last_strong was with the last weak edge.
+      VTransformNode* last_weak = _out.top();
+      _out.at_put(_out_end_strong_edges - 1, last_weak);
+    }
+    _out.pop(); // the last slot is now a duplicate, drop it
+    _out_end_strong_edges--; // the strong section shrinks by one
+  }
+
 public:
   uint req() const { return _req; }
   uint out_strong_edges() const { return _out_end_strong_edges; }
@@ -479,6 +515,21 @@ public:
     return false;
   }
 
+  VTransformNode* unique_out_strong_edge() const { // Returns the single strong use; asserts there is exactly one.
+    assert(out_strong_edges() == 1, "must be unique");
+    return _out.at(0);
+  }
+
+  bool is_alive() const { return _is_alive; } // true until mark_dead() is called
+
+  void mark_dead() { // Flag this node as dead and disconnect it from all of its req inputs.
+    _is_alive = false;
+    // Remove all inputs, set_req also deletes the matching strong out-edge on each non-null input.
+    for (uint i = 0; i < req(); i++) {
+      set_req(i, nullptr);
+    }
+  }
+
   virtual VTransformMemopScalarNode* isa_MemopScalar() { return nullptr; }
   virtual VTransformLoopPhiNode* isa_LoopPhi() { return nullptr; }
   virtual VTransformCountedLoopNode* isa_CountedLoop() { return nullptr; }
@@ -496,6 +547,8 @@ public:
   virtual bool is_load_or_store_in_loop() const { return false; }
   virtual const VPointer& vpointer() const { ShouldNotReachHere(); }
 
+  virtual bool optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) { return false; } // default: does nothing
+
   virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const = 0;
   virtual void apply_backedge(VTransformApplyState& apply_state) const {};
   void apply_vtn_inputs_to_node(Node* n, VTransformApplyState& apply_state) const;
@@ -701,6 +754,7 @@ public:
   NOT_PRODUCT(virtual void print_spec() const override;)
 
 protected:
+  const VTransformVectorNodeProperties& properties() const { return _properties; }
   Node* approximate_origin()     const { return _properties.approximate_origin(); }
   int scalar_opcode()            const { return _properties.scalar_opcode(); }
   uint vector_length()           const { return _properties.vector_length(); }
@@ -780,8 +834,15 @@ public:
   VTransformReductionVectorNode(VTransform& vtransform, const VTransformVectorNodeProperties properties) :
     VTransformVectorNode(vtransform, 3, properties) {}
   virtual VTransformReductionVectorNode* isa_ReductionVector() override { return this; }
+  virtual bool optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) override;
   virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
   NOT_PRODUCT(virtual const char* name() const override { return "ReductionVector"; };)
+
+private:
+  int vector_reduction_opcode() const;
+  bool requires_strict_order() const;
+  bool optimize_move_non_strict_order_reductions_out_of_loop_preconditions(VTransform& vtransform);
+  bool optimize_move_non_strict_order_reductions_out_of_loop(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform);
 };
 
 class VTransformMemVectorNode : public VTransformVectorNode {