8340093: C2 SuperWord: implement cost model

Reviewed-by: kvn, qamai
Emanuel Peter 2025-11-10 15:56:49 +00:00
parent 6e838d6f9a
commit 72989e0fac
13 changed files with 2884 additions and 94 deletions

View File

@ -129,18 +129,24 @@ source %{
bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) {
if (UseSVE == 0) {
// These operations are not profitable to be vectorized on NEON, because no direct
// NEON instructions support them. But the match rule support for them is profitable for
// Vector API intrinsics.
// NEON instructions support them. They require multiple instructions, which is
// more expensive in almost all cases where we would auto-vectorize.
// But the match rule support for them is profitable for Vector API intrinsics.
if ((opcode == Op_VectorCastD2X && (bt == T_INT || bt == T_SHORT)) ||
(opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
(opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
(opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
opcode == Op_MulVL ||
// The implementations of Op_AddReductionVD/F in Neon are for the Vector API only.
// They are not suitable for auto-vectorization because the result would not conform
// to the JLS, Section Evaluation Order.
// Note: we could implement sequential reductions for these reduction operators, but
// this will still almost never lead to speedups, because the sequential
// reductions are latency limited along the reduction chain, and not
// throughput limited. This is unlike unordered reductions (associative op)
// and element-wise ops which are usually throughput limited.
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
opcode == Op_MulVL) {
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF) {
return false;
}
}

View File

@ -119,18 +119,24 @@ source %{
bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) {
if (UseSVE == 0) {
// These operations are not profitable to be vectorized on NEON, because no direct
// NEON instructions support them. But the match rule support for them is profitable for
// Vector API intrinsics.
// NEON instructions support them. They require multiple instructions, which is
// more expensive in almost all cases where we would auto-vectorize.
// But the match rule support for them is profitable for Vector API intrinsics.
if ((opcode == Op_VectorCastD2X && (bt == T_INT || bt == T_SHORT)) ||
(opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
(opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
(opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
opcode == Op_MulVL ||
// The implementations of Op_AddReductionVD/F in Neon are for the Vector API only.
// They are not suitable for auto-vectorization because the result would not conform
// to the JLS, Section Evaluation Order.
// Note: we could implement sequential reductions for these reduction operators, but
// this will still almost never lead to speedups, because the sequential
// reductions are latency limited along the reduction chain, and not
// throughput limited. This is unlike unordered reductions (associative op)
// and element-wise ops which are usually throughput limited.
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
opcode == Op_MulVL) {
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF) {
return false;
}
}

View File

@ -42,9 +42,7 @@ SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer) :
),
_vpointer_for_main_loop_alignment(nullptr),
_aw_for_main_loop_alignment(0),
_do_vector_loop(phase()->C->do_vector_loop()), // whether to do vectorization/simd style
_num_work_vecs(0), // amount of vector work we have
_num_reductions(0) // amount of reduction work we have
_do_vector_loop(phase()->C->do_vector_loop()) // whether to do vectorization/simd style
{
}
@ -1567,18 +1565,6 @@ void SuperWord::filter_packs_for_implemented() {
// Remove packs that are not profitable.
void SuperWord::filter_packs_for_profitable() {
// Count the number of reductions vs other vector ops, for the
// reduction profitability heuristic.
for (int i = 0; i < _packset.length(); i++) {
Node_List* pack = _packset.at(i);
Node* n = pack->at(0);
if (is_marked_reduction(n)) {
_num_reductions++;
} else {
_num_work_vecs++;
}
}
// Remove packs that are not profitable
auto filter = [&](const Node_List* pack) {
return profitable(pack);
@ -1595,31 +1581,7 @@ bool SuperWord::implemented(const Node_List* pack, const uint size) const {
if (p0 != nullptr) {
int opc = p0->Opcode();
if (is_marked_reduction(p0)) {
const Type *arith_type = p0->bottom_type();
// This heuristic predicts that 2-element reductions for INT/LONG are not
// profitable. This heuristic was added in JDK-8078563. The argument
// was that reductions are not just a single instruction, but multiple, and
// hence it is not directly clear that they are profitable. If we only have
// two elements per vector, then the performance gains from non-reduction
// vectors are at most going from 2 scalar instructions to 1 vector instruction.
// But a 2-element reduction vector goes from 2 scalar instructions to
// 3 instructions (1 shuffle and two reduction ops).
// However, this optimization assumes that these reductions stay in the loop
// which may not be true any more in most cases after the introduction of:
// See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop
// Hence, this heuristic has room for improvement.
bool is_two_element_int_or_long_reduction = (size == 2) &&
(arith_type->basic_type() == T_INT ||
arith_type->basic_type() == T_LONG);
if (is_two_element_int_or_long_reduction && AutoVectorizationOverrideProfitability != 2) {
#ifndef PRODUCT
if (is_trace_superword_rejections()) {
tty->print_cr("\nPerformance heuristic: 2-element INT/LONG reduction not profitable.");
tty->print_cr(" Can override with AutoVectorizationOverrideProfitability=2");
}
#endif
return false;
}
const Type* arith_type = p0->bottom_type();
retValue = ReductionNode::implemented(opc, size, arith_type->basic_type());
} else if (VectorNode::is_convert_opcode(opc)) {
retValue = VectorCastNode::implemented(opc, size, velt_basic_type(p0->in(1)), velt_basic_type(p0));
@ -1791,26 +1753,6 @@ bool SuperWord::profitable(const Node_List* p) const {
// The second input has to be the vector we wanted to reduce,
// but it was not packed.
return false;
} else if (_num_work_vecs == _num_reductions && AutoVectorizationOverrideProfitability != 2) {
// This heuristic predicts that the reduction is not profitable.
// Reduction vectors can be expensive, because they require multiple
// operations to fold all the lanes together. Hence, vectorizing the
// reduction is not profitable on its own. Hence, we need a lot of
// other "work vectors" that deliver performance improvements to
// balance out the performance loss due to reductions.
// This heuristic is a bit simplistic, and assumes that the reduction
// vector stays in the loop. But in some cases, we can move the
// reduction out of the loop, replacing it with a single vector op.
// See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop
// Hence, this heuristic has room for improvement.
#ifndef PRODUCT
if (is_trace_superword_rejections()) {
tty->print_cr("\nPerformance heuristic: not enough vectors in the loop to make");
tty->print_cr(" reduction profitable.");
tty->print_cr(" Can override with AutoVectorizationOverrideProfitability=2");
}
#endif
return false;
} else if (second_pk->size() != p->size()) {
return false;
}
@ -1969,19 +1911,53 @@ bool SuperWord::do_vtransform() const {
vtransform.optimize();
if (!vtransform.schedule()) { return false; }
if (vtransform.has_store_to_load_forwarding_failure()) { return false; }
if (!vtransform.is_profitable()) { return false; }
vtransform.apply();
return true;
}
// Check the cost model and other heuristics.
// Can be overridden with AutoVectorizationOverrideProfitability.
bool VTransform::is_profitable() const {
assert(_graph.is_scheduled(), "must already be scheduled");
if (AutoVectorizationOverrideProfitability == 0) {
#ifndef PRODUCT
if (is_trace_superword_any()) {
if (_trace._info) {
tty->print_cr("\nForced bailout of vectorization (AutoVectorizationOverrideProfitability=0).");
}
#endif
return false;
}
vtransform.apply();
return true;
if (AutoVectorizationOverrideProfitability == 2) {
#ifndef PRODUCT
if (_trace._info) {
tty->print_cr("\nForced vectorization, ignoring profitability (AutoVectorizationOverrideProfitability=2).");
}
#endif
return true;
}
// Note: currently we only do throughput-based cost-modeling. In the future, we could
// also implement latency-based cost-modeling and take store-to-load-forwarding
// failures into account as the latency between the load and store. This would
// allow a more precise tradeoff between the forwarding failure penalty and
// the vectorization gains.
if (has_store_to_load_forwarding_failure()) { return false; }
// Cost-model
float scalar_cost = _vloop_analyzer.cost_for_scalar_loop();
float vector_cost = cost_for_vector_loop();
#ifndef PRODUCT
if (_trace._info) {
tty->print_cr("\nVTransform: scalar_cost = %.2f vs vector_cost = %.2f",
scalar_cost, vector_cost);
}
#endif
return vector_cost < scalar_cost;
}
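
For context: the override flag checked above can be set on the command line. A hedged usage sketch (this assumes AutoVectorizationOverrideProfitability is a diagnostic flag, so the unlock option is needed; MyBenchmark is a placeholder class):
# 0: always bail out of vectorization
# 2: force vectorization, ignoring profitability
# any other value: fall through to the cost-model comparison above
java -XX:+UnlockDiagnosticVMOptions -XX:AutoVectorizationOverrideProfitability=2 MyBenchmark
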
// Apply the vectorization, i.e. we irreversibly edit the C2 graph. At this point, all

View File

@ -549,8 +549,6 @@ class SuperWord : public ResourceObj {
private:
bool _do_vector_loop; // whether to do vectorization/simd style
int _num_work_vecs; // Number of non memory vector operations
int _num_reductions; // Number of reduction expressions applied
// Accessors
Arena* arena() { return &_arena; }

View File

@ -38,7 +38,7 @@
flags(MEMORY_SLICES, "Trace VLoopMemorySlices") \
flags(BODY, "Trace VLoopBody") \
flags(TYPES, "Trace VLoopTypes") \
flags(POINTERS, "Trace VLoopPointers") \
flags(POINTERS, "Trace VLoopVPointers") \
flags(DEPENDENCY_GRAPH, "Trace VLoopDependencyGraph") \
flags(SW_ADJACENT_MEMOPS, "Trace SuperWord::find_adjacent_memop_pairs") \
flags(SW_REJECTIONS, "Trace SuperWord rejections (non vectorizations)") \
@ -47,6 +47,8 @@
flags(SW_VERBOSE, "Trace SuperWord verbose (all SW tags enabled)") \
flags(VTRANSFORM, "Trace VTransform Graph") \
flags(OPTIMIZATION, "Trace VTransform::optimize") \
flags(COST, "Trace cost of VLoop (scalar) and VTransform (vector)") \
flags(COST_VERBOSE, "Trace like COST, but more verbose") \
flags(ALIGN_VECTOR, "Trace AlignVector") \
flags(SPECULATIVE_ALIASING_ANALYSIS, "Trace Speculative Aliasing Analysis") \
flags(SPECULATIVE_RUNTIME_CHECKS, "Trace VTransform::apply_speculative_runtime_checks") \
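
The two new tags can be exercised like the existing ones; a hedged example, assuming the usual TraceAutoVectorization CompileCommand syntax with a wildcard method pattern (MyBenchmark is a placeholder):
java -XX:CompileCommand=TraceAutoVectorization,*::*,COST MyBenchmark
java -XX:CompileCommand=TraceAutoVectorization,*::*,COST_VERBOSE MyBenchmark
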

View File

@ -287,7 +287,7 @@ void VLoopVPointers::compute_and_cache_vpointers() {
int pointers_idx = 0;
_body.for_each_mem([&] (MemNode* const mem, int bb_idx) {
// Placement new: construct directly into the array.
::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop);
::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop, _pointer_expression_nodes);
_bb_idx_to_vpointer.at_put(bb_idx, pointers_idx);
pointers_idx++;
});
@ -541,6 +541,108 @@ void VLoopDependencyGraph::PredsIterator::next() {
}
}
// Cost-model heuristic for nodes that do not contribute to computational
// cost inside the loop.
bool VLoopAnalyzer::has_zero_cost(Node* n) const {
// Outside body?
if (!_vloop.in_bb(n)) { return true; }
// Internal nodes of pointer expressions are most likely folded into
// the load / store and have no additional cost.
if (vpointers().is_in_pointer_expression(n)) { return true; }
// Not all AddP nodes can be detected in VPointer parsing, so
// we filter them out here.
// We don't want to explicitly model the cost of control flow,
// since we have the same CFG structure before and after
// vectorization: a loop head and a loop exit, with a backedge.
if (n->is_AddP() || // Pointer expression
n->is_CFG() || // CFG
n->is_Phi() || // CFG
n->is_Cmp() || // CFG
n->is_Bool()) { // CFG
return true;
}
// All other nodes have a non-zero cost.
return false;
}
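
To illustrate the classification, consider a hypothetical loop body for a[i] += 1 (the exact node set depends on the actual graph shape):
// LoadI / StoreI -> counted: memops carry real work
// AddI of the loaded value -> counted: real computation
// AddP and index arithmetic inside the VPointer expression -> zero cost: folded into addressing
// Phi / CmpI / Bool (loop control) -> zero cost: same CFG before and after vectorization
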
// Compute the cost over all operations in the (scalar) loop.
float VLoopAnalyzer::cost_for_scalar_loop() const {
#ifndef PRODUCT
if (_vloop.is_trace_cost()) {
tty->print_cr("\nVLoopAnalyzer::cost_for_scalar_loop:");
}
#endif
float sum = 0;
for (int j = 0; j < body().body().length(); j++) {
Node* n = body().body().at(j);
if (!has_zero_cost(n)) {
float c = cost_for_scalar_node(n->Opcode());
sum += c;
#ifndef PRODUCT
if (_vloop.is_trace_cost_verbose()) {
tty->print_cr(" -> cost = %.2f for %d %s", c, n->_idx, n->Name());
}
#endif
}
}
#ifndef PRODUCT
if (_vloop.is_trace_cost()) {
tty->print_cr(" total_cost = %.2f", sum);
}
#endif
return sum;
}
// For now, we use unit cost. We might refine that in the future.
// If needed, we could also use platform-specific costs, if the
// default here is not accurate enough.
float VLoopAnalyzer::cost_for_scalar_node(int opcode) const {
float c = 1;
#ifndef PRODUCT
if (_vloop.is_trace_cost()) {
tty->print_cr(" cost = %.2f opc=%s", c, NodeClassNames[opcode]);
}
#endif
return c;
}
// For now, we use unit cost. We might refine that in the future.
// If needed, we could also use platform-specific costs, if the
// default here is not accurate enough.
float VLoopAnalyzer::cost_for_vector_node(int opcode, int vlen, BasicType bt) const {
float c = 1;
#ifndef PRODUCT
if (_vloop.is_trace_cost()) {
tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s",
c, NodeClassNames[opcode], vlen, type2name(bt));
}
#endif
return c;
}
// For now, we use unit cost, i.e. we count the number of backend instructions
// that the vtnode will use. We might refine that in the future.
// If needed, we could also use platform-specific costs, if the
// default here is not accurate enough.
float VLoopAnalyzer::cost_for_vector_reduction_node(int opcode, int vlen, BasicType bt, bool requires_strict_order) const {
// Each reduction is composed of multiple instructions, each estimated with a unit cost.
// Both variants pay a shuffle and a reduce per step:
//   strict order (linear):   vlen steps
//   non-strict (recursive):  log2(vlen) steps
float c = requires_strict_order ? 2 * vlen : 2 * exact_log2(vlen);
#ifndef PRODUCT
if (_vloop.is_trace_cost()) {
tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s requires_strict_order=%s",
c, NodeClassNames[opcode], vlen, type2name(bt),
requires_strict_order ? "true" : "false");
}
#endif
return c;
}
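
A quick worked instance of the formula above, using the unit costs assumed here:
// Example: vlen = 8
//   strict order (linear):   c = 2 * 8             = 16.0
//   non-strict (recursive):  c = 2 * exact_log2(8) = 2 * 3 = 6.0
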
// Computing aliasing runtime check using init and last of main-loop
// -----------------------------------------------------------------
//

View File

@ -209,6 +209,14 @@ public:
return _vtrace.is_trace(TraceAutoVectorizationTag::OPTIMIZATION);
}
bool is_trace_cost() const {
return _vtrace.is_trace(TraceAutoVectorizationTag::COST);
}
bool is_trace_cost_verbose() const {
return _vtrace.is_trace(TraceAutoVectorizationTag::COST_VERBOSE);
}
bool is_trace_speculative_runtime_checks() const {
return _vtrace.is_trace(TraceAutoVectorizationTag::SPECULATIVE_RUNTIME_CHECKS);
}
@ -584,6 +592,32 @@ private:
const Type* container_type(Node* n) const;
};
// Mark all nodes from the loop that are part of any VPointer expression.
class PointerExpressionNodes : public MemPointerParserCallback {
private:
const VLoop& _vloop;
const VLoopBody& _body;
VectorSet _in_pointer_expression;
public:
PointerExpressionNodes(Arena* arena,
const VLoop& vloop,
const VLoopBody& body) :
_vloop(vloop),
_body(body),
_in_pointer_expression(arena) {}
virtual void callback(Node* n) override {
if (!_vloop.in_bb(n)) { return; }
_in_pointer_expression.set(_body.bb_idx(n));
}
bool contains(const Node* n) const {
if (!_vloop.in_bb(n)) { return false; }
return _in_pointer_expression.test(_body.bb_idx(n));
}
};
// Submodule of VLoopAnalyzer.
// We compute and cache the VPointer for every load and store.
class VLoopVPointers : public StackObj {
@ -599,6 +633,9 @@ private:
// Map bb_idx -> index in _vpointers. -1 if not mapped.
GrowableArray<int> _bb_idx_to_vpointer;
// Mark all nodes that are part of any pointer expression.
PointerExpressionNodes _pointer_expression_nodes;
public:
VLoopVPointers(Arena* arena,
const VLoop& vloop,
@ -610,13 +647,18 @@ public:
_bb_idx_to_vpointer(arena,
vloop.estimated_body_length(),
vloop.estimated_body_length(),
-1) {}
-1),
_pointer_expression_nodes(arena, _vloop, _body) {}
NONCOPYABLE(VLoopVPointers);
void compute_vpointers();
const VPointer& vpointer(const MemNode* mem) const;
NOT_PRODUCT( void print() const; )
bool is_in_pointer_expression(const Node* n) const {
return _pointer_expression_nodes.contains(n);
}
private:
void count_vpointers();
void allocate_vpointers_array();
@ -810,6 +852,15 @@ public:
const VLoopVPointers& vpointers() const { return _vpointers; }
const VLoopDependencyGraph& dependency_graph() const { return _dependency_graph; }
// Compute the cost of the (scalar) body.
float cost_for_scalar_loop() const;
bool has_zero_cost(Node* n) const;
// Cost-modeling with tracing.
float cost_for_scalar_node(int opcode) const;
float cost_for_vector_node(int opcode, int vlen, BasicType bt) const;
float cost_for_vector_reduction_node(int opcode, int vlen, BasicType bt, bool requires_strict_order) const;
private:
bool setup_submodules();
VStatus setup_submodules_helper();

View File

@ -186,6 +186,99 @@ int VTransformGraph::count_alive_vtnodes() const {
return count;
}
// Find all nodes that are in the loop, in a 2-phase process:
// - First, we find all nodes that are not before the loop:
// - loop-phis
// - loads and stores that are in the loop
// - and all their transitive uses.
// - Second, we find all nodes that are not after the loop:
// - backedges
// - loads and stores that are in the loop
// - and all their transitive defs.
//
// in_loop: vtn->_idx -> bool
void VTransformGraph::mark_vtnodes_in_loop(VectorSet& in_loop) const {
assert(is_scheduled(), "must already be scheduled");
// Phase 1: find all nodes that are not before the loop.
VectorSet is_not_before_loop;
for (int i = 0; i < _schedule.length(); i++) {
VTransformNode* vtn = _schedule.at(i);
// Is vtn a loop-phi?
if (vtn->isa_LoopPhi() != nullptr ||
vtn->is_load_or_store_in_loop()) {
is_not_before_loop.set(vtn->_idx);
continue;
}
// Or one of its transitive uses?
for (uint j = 0; j < vtn->req(); j++) {
VTransformNode* def = vtn->in_req(j);
if (def != nullptr && is_not_before_loop.test(def->_idx)) {
is_not_before_loop.set(vtn->_idx);
break;
}
}
}
// Phase 2: find all nodes that are not after the loop.
for (int i = _schedule.length()-1; i >= 0; i--) {
VTransformNode* vtn = _schedule.at(i);
if (!is_not_before_loop.test(vtn->_idx)) { continue; }
// Is load or store?
if (vtn->is_load_or_store_in_loop()) {
in_loop.set(vtn->_idx);
continue;
}
for (uint j = 0; j < vtn->out_strong_edges(); j++) {
VTransformNode* use = vtn->out_strong_edge(j);
// Or is vtn a backedge or one of its transitive defs?
if (in_loop.test(use->_idx) ||
use->isa_LoopPhi() != nullptr) {
in_loop.set(vtn->_idx);
break;
}
}
}
}
float VTransformGraph::cost_for_vector_loop() const {
assert(is_scheduled(), "must already be scheduled");
#ifndef PRODUCT
if (_vloop.is_trace_cost()) {
tty->print_cr("\nVTransformGraph::cost_for_vector_loop:");
}
#endif
// We only want to count the cost of nodes that are in the loop.
// This is especially important for cases where we were able to move
// some nodes outside the loop during VTransform::optimize, e.g.:
// VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop
ResourceMark rm;
VectorSet in_loop; // vtn->_idx -> bool
mark_vtnodes_in_loop(in_loop);
float sum = 0;
for (int i = 0; i < _schedule.length(); i++) {
VTransformNode* vtn = _schedule.at(i);
if (!in_loop.test(vtn->_idx)) { continue; }
float c = vtn->cost(_vloop_analyzer);
sum += c;
#ifndef PRODUCT
if (c != 0 && _vloop.is_trace_cost_verbose()) {
tty->print(" -> cost = %.2f for ", c);
vtn->print();
}
#endif
}
#ifndef PRODUCT
if (_vloop.is_trace_cost()) {
tty->print_cr(" total_cost = %.2f", sum);
}
#endif
return sum;
}
#ifndef PRODUCT
void VTransformGraph::trace_schedule_cycle(const GrowableArray<VTransformNode*>& stack,
const VectorSet& pre_visited,
@ -831,6 +924,12 @@ void VTransformNode::apply_vtn_inputs_to_node(Node* n, VTransformApplyState& app
}
}
float VTransformMemopScalarNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
// This is an identity transform, but loads and stores must be counted.
assert(!vloop_analyzer.has_zero_cost(_node), "memop nodes must be counted");
return vloop_analyzer.cost_for_scalar_node(_node->Opcode());
}
VTransformApplyResult VTransformMemopScalarNode::apply(VTransformApplyState& apply_state) const {
apply_vtn_inputs_to_node(_node, apply_state);
// The memory state has to be applied separately: the vtn does not hold it. This allows reordering.
@ -843,6 +942,16 @@ VTransformApplyResult VTransformMemopScalarNode::apply(VTransformApplyState& app
return VTransformApplyResult::make_scalar(_node);
}
float VTransformDataScalarNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
// Since this is an identity transform, we may have nodes here that
// VLoopAnalyzer::cost_for_scalar_loop also does not count for the scalar loop.
if (vloop_analyzer.has_zero_cost(_node)) {
return 0;
} else {
return vloop_analyzer.cost_for_scalar_node(_node->Opcode());
}
}
VTransformApplyResult VTransformDataScalarNode::apply(VTransformApplyState& apply_state) const {
apply_vtn_inputs_to_node(_node, apply_state);
return VTransformApplyResult::make_scalar(_node);
@ -895,6 +1004,10 @@ VTransformApplyResult VTransformOuterNode::apply(VTransformApplyState& apply_sta
return VTransformApplyResult::make_scalar(_node);
}
float VTransformReplicateNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
return vloop_analyzer.cost_for_vector_node(Op_Replicate, _vlen, _element_type);
}
VTransformApplyResult VTransformReplicateNode::apply(VTransformApplyState& apply_state) const {
Node* val = apply_state.transformed_node(in_req(1));
VectorNode* vn = VectorNode::scalar2vector(val, _vlen, _element_type);
@ -902,6 +1015,10 @@ VTransformApplyResult VTransformReplicateNode::apply(VTransformApplyState& apply
return VTransformApplyResult::make_vector(vn);
}
float VTransformConvI2LNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
return vloop_analyzer.cost_for_scalar_node(Op_ConvI2L);
}
VTransformApplyResult VTransformConvI2LNode::apply(VTransformApplyState& apply_state) const {
Node* val = apply_state.transformed_node(in_req(1));
Node* n = new ConvI2LNode(val);
@ -909,6 +1026,12 @@ VTransformApplyResult VTransformConvI2LNode::apply(VTransformApplyState& apply_s
return VTransformApplyResult::make_scalar(n);
}
float VTransformShiftCountNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
int shift_count_opc = VectorNode::shift_count_opcode(_shift_opcode);
return vloop_analyzer.cost_for_scalar_node(Op_AndI) +
vloop_analyzer.cost_for_vector_node(shift_count_opc, _vlen, _element_bt);
}
VTransformApplyResult VTransformShiftCountNode::apply(VTransformApplyState& apply_state) const {
PhaseIdealLoop* phase = apply_state.phase();
Node* shift_count_in = apply_state.transformed_node(in_req(1));
@ -924,6 +1047,9 @@ VTransformApplyResult VTransformShiftCountNode::apply(VTransformApplyState& appl
return VTransformApplyResult::make_vector(vn);
}
float VTransformPopulateIndexNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
return vloop_analyzer.cost_for_vector_node(Op_PopulateIndex, _vlen, _element_bt);
}
VTransformApplyResult VTransformPopulateIndexNode::apply(VTransformApplyState& apply_state) const {
PhaseIdealLoop* phase = apply_state.phase();
@ -936,6 +1062,10 @@ VTransformApplyResult VTransformPopulateIndexNode::apply(VTransformApplyState& a
return VTransformApplyResult::make_vector(vn);
}
float VTransformElementWiseVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
return vloop_analyzer.cost_for_vector_node(_vector_opcode, vector_length(), element_basic_type());
}
VTransformApplyResult VTransformElementWiseVectorNode::apply(VTransformApplyState& apply_state) const {
assert(2 <= req() && req() <= 4, "Must have 1-3 inputs");
const TypeVect* vt = TypeVect::make(element_basic_type(), vector_length());
@ -954,6 +1084,12 @@ VTransformApplyResult VTransformElementWiseVectorNode::apply(VTransformApplyStat
return VTransformApplyResult::make_vector(vn);
}
float VTransformElementWiseLongOpWithCastToIntVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
int vopc = VectorNode::opcode(scalar_opcode(), element_basic_type());
return vloop_analyzer.cost_for_vector_node(vopc, vector_length(), element_basic_type()) +
vloop_analyzer.cost_for_vector_node(Op_VectorCastL2X, vector_length(), T_INT);
}
VTransformApplyResult VTransformElementWiseLongOpWithCastToIntVectorNode::apply(VTransformApplyState& apply_state) const {
uint vlen = vector_length();
int sopc = scalar_opcode();
@ -969,6 +1105,10 @@ VTransformApplyResult VTransformElementWiseLongOpWithCastToIntVectorNode::apply(
return VTransformApplyResult::make_vector(vn);
}
float VTransformReinterpretVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
return vloop_analyzer.cost_for_vector_node(Op_VectorReinterpret, vector_length(), element_basic_type());
}
VTransformApplyResult VTransformReinterpretVectorNode::apply(VTransformApplyState& apply_state) const {
const TypeVect* dst_vt = TypeVect::make(element_basic_type(), vector_length());
const TypeVect* src_vt = TypeVect::make(_src_bt, vector_length());
@ -981,6 +1121,11 @@ VTransformApplyResult VTransformReinterpretVectorNode::apply(VTransformApplyStat
return VTransformApplyResult::make_vector(vn);
}
float VTransformBoolVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
assert(scalar_opcode() == Op_Bool, "");
return vloop_analyzer.cost_for_vector_node(Op_VectorMaskCmp, vector_length(), element_basic_type());
}
VTransformApplyResult VTransformBoolVectorNode::apply(VTransformApplyState& apply_state) const {
const TypeVect* vt = TypeVect::make(element_basic_type(), vector_length());
assert(scalar_opcode() == Op_Bool, "");
@ -1101,10 +1246,10 @@ bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_ou
const BasicType bt = element_basic_type();
const int ropc = vector_reduction_opcode();
const int vopc = VectorNode::opcode(sopc, bt);
if (!Matcher::match_rule_supported_vector(vopc, vlen, bt)) {
DEBUG_ONLY( this->print(); )
assert(false, "do not have normal vector op for this reduction");
return false; // not implemented
if (!Matcher::match_rule_supported_auto_vectorization(vopc, vlen, bt)) {
// The element-wise vector operation needed for the vector accumulator
// is not implemented / supported.
return false;
}
// Traverse up the chain of non strict order reductions, checking that it loops
@ -1236,6 +1381,14 @@ bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_ou
return true; // success
}
float VTransformReductionVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
uint vlen = vector_length();
BasicType bt = element_basic_type();
int vopc = vector_reduction_opcode();
bool requires_strict_order = ReductionNode::auto_vectorization_requires_strict_order(vopc);
return vloop_analyzer.cost_for_vector_reduction_node(vopc, vlen, bt, requires_strict_order);
}
VTransformApplyResult VTransformReductionVectorNode::apply(VTransformApplyState& apply_state) const {
Node* init = apply_state.transformed_node(in_req(1));
Node* vec = apply_state.transformed_node(in_req(2));
@ -1245,6 +1398,12 @@ VTransformApplyResult VTransformReductionVectorNode::apply(VTransformApplyState&
return VTransformApplyResult::make_vector(vn, vn->vect_type());
}
float VTransformLoadVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
uint vlen = vector_length();
BasicType bt = element_basic_type();
return vloop_analyzer.cost_for_vector_node(Op_LoadVector, vlen, bt);
}
VTransformApplyResult VTransformLoadVectorNode::apply(VTransformApplyState& apply_state) const {
int sopc = scalar_opcode();
uint vlen = vector_length();
@ -1274,6 +1433,12 @@ VTransformApplyResult VTransformLoadVectorNode::apply(VTransformApplyState& appl
return VTransformApplyResult::make_vector(vn, vn->vect_type());
}
float VTransformStoreVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
uint vlen = vector_length();
BasicType bt = element_basic_type();
return vloop_analyzer.cost_for_vector_node(Op_StoreVector, vlen, bt);
}
VTransformApplyResult VTransformStoreVectorNode::apply(VTransformApplyState& apply_state) const {
int sopc = scalar_opcode();
uint vlen = vector_length();

View File

@ -51,6 +51,10 @@
// - Compute linearization of the VTransformGraph, into an order that respects
// all edges in the graph (bailout if cycle detected).
//
// - Cost-Model:
// - We use a cost model as a heuristic to determine if vectorization is profitable.
// We compute the cost of the loop with and without vectorization and compare.
//
// - Apply:
// - Changes to the C2 IR are only made once the "apply" method is called.
// - Align the main loop, by adjusting pre loop limit.
@ -190,6 +194,7 @@ public:
void optimize(VTransform& vtransform);
bool schedule();
bool has_store_to_load_forwarding_failure(const VLoopAnalyzer& vloop_analyzer) const;
float cost_for_vector_loop() const;
void apply_vectorization_for_each_vtnode(uint& max_vector_length, uint& max_vector_width) const;
private:
@ -200,6 +205,7 @@ private:
void collect_nodes_without_strong_in_edges(GrowableArray<VTransformNode*>& stack) const;
int count_alive_vtnodes() const;
void mark_vtnodes_in_loop(VectorSet& in_loop) const;
#ifndef PRODUCT
void print_vtnodes() const;
@ -252,6 +258,8 @@ public:
void optimize() { return _graph.optimize(*this); }
bool schedule() { return _graph.schedule(); }
bool is_profitable() const;
float cost_for_vector_loop() const { return _graph.cost_for_vector_loop(); }
bool has_store_to_load_forwarding_failure() const { return _graph.has_store_to_load_forwarding_failure(_vloop_analyzer); }
void apply();
@ -549,6 +557,8 @@ public:
virtual bool optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) { return false; }
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const = 0;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const = 0;
virtual void apply_backedge(VTransformApplyState& apply_state) const {};
void apply_vtn_inputs_to_node(Node* n, VTransformApplyState& apply_state) const;
@ -579,6 +589,7 @@ public:
virtual bool is_load_or_store_in_loop() const override { return true; }
virtual const VPointer& vpointer() const override { return _vpointer; }
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "MemopScalar"; };)
NOT_PRODUCT(virtual void print_spec() const override;)
@ -595,6 +606,7 @@ public:
assert(!_node->is_Mem() && !_node->is_Phi() && !_node->is_CFG(), "must be data node: %s", _node->Name());
}
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "DataScalar"; };)
NOT_PRODUCT(virtual void print_spec() const override;)
@ -612,6 +624,7 @@ public:
}
virtual VTransformLoopPhiNode* isa_LoopPhi() override { return this; }
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override { return 0; }
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
virtual void apply_backedge(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "LoopPhi"; };)
@ -629,6 +642,7 @@ public:
assert(_node->is_CFG(), "must be CFG node: %s", _node->Name());
}
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override { return 0; }
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "CFG"; };)
NOT_PRODUCT(virtual void print_spec() const override;)
@ -655,6 +669,7 @@ public:
VTransformNode(vtransform, n->req()), _node(n) {}
virtual VTransformOuterNode* isa_Outer() override { return this; }
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override { ShouldNotReachHere(); }
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "Outer"; };)
NOT_PRODUCT(virtual void print_spec() const override;)
@ -668,6 +683,7 @@ private:
public:
VTransformReplicateNode(VTransform& vtransform, int vlen, BasicType element_type) :
VTransformNode(vtransform, 2), _vlen(vlen), _element_type(element_type) {}
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "Replicate"; };)
NOT_PRODUCT(virtual void print_spec() const override;)
@ -677,6 +693,7 @@ public:
class VTransformConvI2LNode : public VTransformNode {
public:
VTransformConvI2LNode(VTransform& vtransform) : VTransformNode(vtransform, 2) {}
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "ConvI2L"; };)
};
@ -691,6 +708,7 @@ private:
public:
VTransformShiftCountNode(VTransform& vtransform, int vlen, BasicType element_bt, juint mask, int shift_opcode) :
VTransformNode(vtransform, 2), _vlen(vlen), _element_bt(element_bt), _mask(mask), _shift_opcode(shift_opcode) {}
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "ShiftCount"; };)
NOT_PRODUCT(virtual void print_spec() const override;)
@ -704,6 +722,7 @@ private:
public:
VTransformPopulateIndexNode(VTransform& vtransform, int vlen, const BasicType element_bt) :
VTransformNode(vtransform, 2), _vlen(vlen), _element_bt(element_bt) {}
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "PopulateIndex"; };)
NOT_PRODUCT(virtual void print_spec() const override;)
@ -769,6 +788,7 @@ public:
VTransformElementWiseVectorNode(VTransform& vtransform, uint req, const VTransformVectorNodeProperties properties, const int vector_opcode) :
VTransformVectorNode(vtransform, req, properties), _vector_opcode(vector_opcode) {}
virtual VTransformElementWiseVectorNode* isa_ElementWiseVector() override { return this; }
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "ElementWiseVector"; };)
NOT_PRODUCT(virtual void print_spec() const override;)
@ -781,6 +801,7 @@ class VTransformElementWiseLongOpWithCastToIntVectorNode : public VTransformVect
public:
VTransformElementWiseLongOpWithCastToIntVectorNode(VTransform& vtransform, const VTransformVectorNodeProperties properties) :
VTransformVectorNode(vtransform, 2, properties) {}
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "ElementWiseLongOpWithCastToIntVector"; };)
};
@ -791,6 +812,7 @@ private:
public:
VTransformReinterpretVectorNode(VTransform& vtransform, const VTransformVectorNodeProperties properties, const BasicType src_bt) :
VTransformVectorNode(vtransform, 2, properties), _src_bt(src_bt) {}
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "ReinterpretVector"; };)
NOT_PRODUCT(virtual void print_spec() const override;)
@ -811,6 +833,7 @@ public:
VTransformCmpVectorNode(VTransform& vtransform, const VTransformVectorNodeProperties properties) :
VTransformVectorNode(vtransform, 3, properties) {}
virtual VTransformCmpVectorNode* isa_CmpVector() override { return this; }
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override { return 0; }
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override { return VTransformApplyResult::make_empty(); }
NOT_PRODUCT(virtual const char* name() const override { return "CmpVector"; };)
};
@ -823,6 +846,7 @@ public:
VTransformVectorNode(vtransform, 2, properties), _test(test) {}
VTransformBoolTest test() const { return _test; }
virtual VTransformBoolVectorNode* isa_BoolVector() override { return this; }
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "BoolVector"; };)
NOT_PRODUCT(virtual void print_spec() const override;)
@ -835,6 +859,7 @@ public:
VTransformVectorNode(vtransform, 3, properties) {}
virtual VTransformReductionVectorNode* isa_ReductionVector() override { return this; }
virtual bool optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) override;
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "ReductionVector"; };)
@ -877,6 +902,7 @@ public:
LoadNode::ControlDependency control_dependency() const;
virtual VTransformLoadVectorNode* isa_LoadVector() override { return this; }
virtual bool is_load_in_loop() const override { return true; }
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "LoadVector"; };)
};
@ -888,6 +914,7 @@ public:
VTransformMemVectorNode(vtransform, 4, properties, vpointer, adr_type) {}
virtual VTransformStoreVectorNode* isa_StoreVector() override { return this; }
virtual bool is_load_in_loop() const override { return false; }
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "StoreVector"; };)
};

View File

@ -410,12 +410,12 @@ public class TestIntVect {
}
// Not vectorized: simple addition not profitable, see JDK-8307516. NOTE:
// This check does not document the _desired_ behavior of the system but
// the current behavior (no vectorization)
@Test
@IR(counts = { IRNode.LOAD_VECTOR_I, "= 0",
IRNode.STORE_VECTOR, "= 0" })
@IR(counts = { IRNode.LOAD_VECTOR_I, "> 0",
IRNode.ADD_REDUCTION_VI, "> 0",
IRNode.ADD_VI, "> 0" })
// The reduction is moved outside the loop, and we use an
// element-wise accumulator inside the loop.
int test_sum(int[] a1) {
int sum = 0;
for (int i = 0; i < a1.length; i+=1) {

View File

@ -115,17 +115,18 @@ public class TestAutoVectorizationOverrideProfitability {
@Test
@Warmup(10)
@IR(applyIfCPUFeatureOr = {"avx", "true"},
applyIf = {"AutoVectorizationOverrideProfitability", "= 2"},
applyIf = {"AutoVectorizationOverrideProfitability", "> 0"},
counts = {IRNode.ADD_REDUCTION_VI, "> 0", IRNode.ADD_VI, "> 0"})
@IR(applyIfCPUFeatureOr = {"avx", "true"},
applyIf = {"AutoVectorizationOverrideProfitability", "< 2"},
applyIf = {"AutoVectorizationOverrideProfitability", "= 0"},
counts = {IRNode.ADD_REDUCTION_VI, "= 0", IRNode.ADD_VI, "= 0"})
// Current heuristics say that this simple int reduction is not profitable.
// But it would actually be profitable, since we are able to move the
// reduction out of the loop (we can reorder the reduction). When moving
// the reduction out of the loop, we instead accumulate with a simple
// ADD_VI inside the loop.
// See: JDK-8307516 JDK-8345044
// We are able to vectorize the reduction. On its own, that would
// not reduce the cost sufficiently in all cases, because vectorized
// reduction nodes are expensive. But since integer addition is associative,
// we can move the reduction vector out of the loop. Instead, we accumulate
// with a simple ADD_VI inside the loop, which is very cheap. After the
// loop, we only need to use the vectorized reduction once, to collapse
// the partial sums contained in the lanes (see the sketch below).
private static int simpleIntReduction() {
int sum = 0;
for (int i = 0; i < aI.length; i++) {

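To make the comment above concrete, here is a minimal scalar C++ model of the loop shape after the transform; the function name, the 4-lane VLEN, and the inner lane loops (each standing in for one vector instruction) are illustrative assumptions, not the code C2 emits:

#include <array>

// Scalar model: the main loop only does element-wise adds (one ADD_VI per
// iteration), and the expensive reduction runs exactly once after the loop
// to collapse the per-lane partial sums.
int sum_after_transform(const int* a, int n) {
  constexpr int VLEN = 4;               // assumed vector length
  std::array<int, VLEN> acc{};          // vector accumulator, all lanes zero
  int i = 0;
  for (; i + VLEN <= n; i += VLEN) {
    for (int l = 0; l < VLEN; l++) {    // models a single ADD_VI
      acc[l] += a[i + l];
    }
  }
  int sum = 0;
  for (int l = 0; l < VLEN; l++) {      // models the one ADD_REDUCTION_VI
    sum += acc[l];
  }
  for (; i < n; i++) {                  // scalar tail (post loop)
    sum += a[i];
  }
  return sum;
}
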
File diff suppressed because it is too large.

View File

@ -28,6 +28,10 @@ import org.openjdk.jmh.infra.*;
import java.util.concurrent.TimeUnit;
import java.util.Random;
/**
* Note: there is a corresponding IR test:
* test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java
*/
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@State(Scope.Thread)