From e8eff4d25b984d503a4daa5d291b52a8d1e2f186 Mon Sep 17 00:00:00 2001
From: Emanuel Peter <epeter@openjdk.org>
Date: Mon, 26 May 2025 18:31:19 +0000
Subject: [PATCH] 8357530: C2 SuperWord: Diagnostic flag
 AutoVectorizationOverrideProfitability

Reviewed-by: thartmann, kvn
---
 src/hotspot/share/opto/c2_globals.hpp         |  12 ++
 src/hotspot/share/opto/superword.cpp          |  65 ++++++-
 ...utoVectorizationOverrideProfitability.java | 164 ++++++++++++++++++
 3 files changed, 233 insertions(+), 8 deletions(-)
 create mode 100644 test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOverrideProfitability.java

diff --git a/src/hotspot/share/opto/c2_globals.hpp b/src/hotspot/share/opto/c2_globals.hpp
index 0bb842907a3..fd55f2fd666 100644
--- a/src/hotspot/share/opto/c2_globals.hpp
+++ b/src/hotspot/share/opto/c2_globals.hpp
@@ -375,6 +375,18 @@
           "2 = Prefer alignment with vector load.")                         \
           range(0, 2)                                                       \
                                                                             \
+  product(uint, AutoVectorizationOverrideProfitability, 1, DIAGNOSTIC,      \
+          "Override the auto vectorization profitability heuristics. "      \
+          "0 = Run auto vectorizer, but abort just before applying"         \
+          "    vectorization, as though it was not profitable. "            \
+          "1 = Run auto vectorizer with the default profitability"          \
+          "    heuristics. This is the default, and hopefully"              \
+          "    delivers the best performance. "                             \
+          "2 = Run auto vectorizer, and vectorize even if the"              \
+          "    profitability heuristics predict that vectorization"         \
+          "    is not profitable.")                                         \
+          range(0, 2)                                                       \
+                                                                            \
   product(bool, UseCMoveUnconditionally, false,                             \
           "Use CMove (scalar and vector) ignoring profitability test.")     \
                                                                             \
diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp
index b663bed3938..47735f6bbba 100644
--- a/src/hotspot/share/opto/superword.cpp
+++ b/src/hotspot/share/opto/superword.cpp
@@ -1605,12 +1605,31 @@ bool SuperWord::implemented(const Node_List* pack, const uint size) const {
     int opc = p0->Opcode();
     if (is_marked_reduction(p0)) {
       const Type *arith_type = p0->bottom_type();
-      // Length 2 reductions of INT/LONG do not offer performance benefits
-      if (((arith_type->basic_type() == T_INT) || (arith_type->basic_type() == T_LONG)) && (size == 2)) {
-        retValue = false;
-      } else {
-        retValue = ReductionNode::implemented(opc, size, arith_type->basic_type());
+      // This heuristic predicts that 2-element reductions for INT/LONG are not
+      // profitable. This heuristic was added in JDK-8078563. The argument
+      // was that reductions are not just a single instruction, but multiple, and
+      // hence it is not directly clear that they are profitable. If we only have
+      // two elements per vector, then the performance gains from non-reduction
+      // vectors are at most going from 2 scalar instructions to 1 vector instruction.
+      // But a 2-element reduction vector goes from 2 scalar instructions to
+      // 3 instructions (1 shuffle and two reduction ops).
+      // However, this heuristic assumes that these reductions stay in the loop,
+      // which may not be true any more in most cases after the introduction of:
+      // PhaseIdealLoop::move_unordered_reduction_out_of_loop
+      // Hence, this heuristic has room for improvement.
+      bool is_two_element_int_or_long_reduction = (size == 2) &&
+                                                  (arith_type->basic_type() == T_INT ||
+                                                   arith_type->basic_type() == T_LONG);
+      if (is_two_element_int_or_long_reduction && AutoVectorizationOverrideProfitability != 2) {
+#ifndef PRODUCT
+        if (is_trace_superword_rejections()) {
+          tty->print_cr("\nPerformance heuristic: 2-element INT/LONG reduction not profitable.");
+          tty->print_cr("  Can override with AutoVectorizationOverrideProfitability=2");
+        }
+#endif
+        return false;
       }
+      retValue = ReductionNode::implemented(opc, size, arith_type->basic_type());
     } else if (VectorNode::is_convert_opcode(opc)) {
       retValue = VectorCastNode::implemented(opc, size, velt_basic_type(p0->in(1)), velt_basic_type(p0));
     } else if (VectorNode::is_minmax_opcode(opc) && is_subword_type(velt_basic_type(p0))) {
@@ -1756,9 +1775,29 @@ bool SuperWord::profitable(const Node_List* p) const {
   if (is_marked_reduction(p0)) {
     Node* second_in = p0->in(2);
     Node_List* second_pk = get_pack(second_in);
-    if ((second_pk == nullptr) || (_num_work_vecs == _num_reductions)) {
-      // No parent pack or not enough work
-      // to cover reduction expansion overhead
+    if (second_pk == nullptr) {
+      // The second input has to be the vector we wanted to reduce,
+      // but it was not packed.
+      return false;
+    } else if (_num_work_vecs == _num_reductions && AutoVectorizationOverrideProfitability != 2) {
+      // This heuristic predicts that the reduction is not profitable.
+      // Reduction vectors can be expensive, because they require multiple
+      // operations to fold all the lanes together. Hence, vectorizing the
+      // reduction is not profitable on its own. Therefore, we need a lot of
+      // other "work vectors" that deliver performance improvements to
+      // balance out the performance loss due to reductions.
+      // This heuristic is a bit simplistic, and assumes that the reduction
+      // vector stays in the loop. But in some cases, we can move the
+      // reduction out of the loop, replacing it with a single vector op.
+      // See: PhaseIdealLoop::move_unordered_reduction_out_of_loop
+      // Hence, this heuristic has room for improvement.
+#ifndef PRODUCT
+        if (is_trace_superword_rejections()) {
+          tty->print_cr("\nPerformance heuristic: not enough vectors in the loop to make");
+          tty->print_cr("  reduction profitable.");
+          tty->print_cr("  Can override with AutoVectorizationOverrideProfitability=2");
+        }
+#endif
       return false;
     } else if (second_pk->size() != p->size()) {
       return false;
@@ -1914,6 +1953,16 @@ bool SuperWord::schedule_and_apply() const {
 
   if (!vtransform.schedule()) { return false; }
   if (vtransform.has_store_to_load_forwarding_failure()) { return false; }
+
+  if (AutoVectorizationOverrideProfitability == 0) {
+#ifndef PRODUCT
+    if (is_trace_superword_any()) {
+      tty->print_cr("\nForced bailout of vectorization (AutoVectorizationOverrideProfitability=0).");
+    }
+#endif
+    return false;
+  }
+
   vtransform.apply();
   return true;
 }
diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOverrideProfitability.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOverrideProfitability.java
new file mode 100644
index 00000000000..10ad19d03a7
--- /dev/null
+++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOverrideProfitability.java
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+ * @test
+ * @bug 8357530
+ * @summary Test the effect of AutoVectorizationOverrideProfitability.
+ * @library /test/lib /
+ * @run driver compiler.loopopts.superword.TestAutoVectorizationOverrideProfitability
+ */
+
+package compiler.loopopts.superword;
+
+import compiler.lib.ir_framework.*;
+import compiler.lib.verify.*;
+import compiler.lib.generators.Generator;
+import static compiler.lib.generators.Generators.G;
+
+public class TestAutoVectorizationOverrideProfitability {
+    public static final Generator<Integer> GEN_I = G.ints();
+    public static final Generator<Float>   GEN_F = G.floats();
+
+    public static int[] aI = new int[10_000];
+    public static int[] rI = new int[10_000];
+    public static float[] aF = new float[10_000];
+    public static float[] rF = new float[10_000];
+
+    static {
+        G.fill(GEN_I, aI);
+        G.fill(GEN_F, aF);
+    }
+
+    public static void main(String[] args) throws Exception {
+        // Do not vectorize, even if profitable.
+        TestFramework.runWithFlags("-XX:+UnlockDiagnosticVMOptions", "-XX:AutoVectorizationOverrideProfitability=0");
+
+        // Normal run, i.e. with normal heuristic. In some cases this vectorizes, in some not.
+        // By default, we have AutoVectorizationOverrideProfitability=1
+        TestFramework.run();
+
+        // Vectorize even if not profitable.
+        TestFramework.runWithFlags("-XX:+UnlockDiagnosticVMOptions", "-XX:AutoVectorizationOverrideProfitability=2");
+    }
+
+    public static final float GOLD_SIMPLE_FLOAT_REDUCTION = simpleFloatReduction();
+
+    @Test
+    @Warmup(10)
+    @IR(applyIfCPUFeatureOr = {"avx", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 2"},
+        counts = {IRNode.ADD_REDUCTION_VF, "> 0"})
+    @IR(applyIfCPUFeatureOr = {"avx", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "< 2"},
+        counts = {IRNode.ADD_REDUCTION_VF, "= 0"})
+    // The simple float reduction is not profitable. We need to sequentially
+    // add up the values, and so we cannot move the reduction out of the loop.
+    private static float simpleFloatReduction() {
+        float sum = 0;
+        for (int i = 0; i < aF.length; i++) {
+            sum += aF[i];
+        }
+        return sum;
+    }
+
+    @Check(test="simpleFloatReduction")
+    public static void checkSimpleFloatReduction(float result) {
+        Verify.checkEQ(GOLD_SIMPLE_FLOAT_REDUCTION, result);
+    }
+
+    static { simpleFloatCopy(); }
+    public static final float[] GOLD_SIMPLE_FLOAT_COPY = rF.clone();
+
+    @Test
+    @Warmup(10)
+    @IR(applyIfCPUFeatureOr = {"avx", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "> 0"},
+        counts = {IRNode.LOAD_VECTOR_F, "> 0"})
+    @IR(applyIfCPUFeatureOr = {"avx", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 0"},
+        counts = {IRNode.LOAD_VECTOR_F, "= 0"})
+    // The simple float copy is always profitable.
+    private static void simpleFloatCopy() {
+        for (int i = 0; i < aF.length; i++) {
+            rF[i] = aF[i];
+        }
+    }
+
+    @Check(test="simpleFloatCopy")
+    public static void checkSimpleFloatCopy() {
+        Verify.checkEQ(GOLD_SIMPLE_FLOAT_COPY, rF);
+    }
+
+    public static final int GOLD_SIMPLE_INT_REDUCTION = simpleIntReduction();
+
+    @Test
+    @Warmup(10)
+    @IR(applyIfCPUFeatureOr = {"avx", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 2"},
+        counts = {IRNode.ADD_REDUCTION_VI, "> 0", IRNode.ADD_VI, "> 0"})
+    @IR(applyIfCPUFeatureOr = {"avx", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "< 2"},
+        counts = {IRNode.ADD_REDUCTION_VI, "= 0", IRNode.ADD_VI, "= 0"})
+    // Current heuristics say that this simple int reduction is not profitable.
+    // But it would actually be profitable, since we are able to move the
+    // reduction out of the loop (we can reorder the reduction). When moving
+    // the reduction out of the loop, we instead accumulate with a simple
+    // ADD_VI inside the loop.
+    // See: JDK-8307516 JDK-8345044
+    private static int simpleIntReduction() {
+        int sum = 0;
+        for (int i = 0; i < aI.length; i++) {
+            sum += aI[i];
+        }
+        return sum;
+    }
+
+    @Check(test="simpleIntReduction")
+    public static void checkSimpleIntReduction(int result) {
+        Verify.checkEQ(GOLD_SIMPLE_INT_REDUCTION, result);
+    }
+
+    static { simpleIntCopy(); }
+    public static final int[] GOLD_SIMPLE_INT_COPY = rI.clone();
+
+    @Test
+    @Warmup(10)
+    @IR(applyIfCPUFeatureOr = {"avx", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "> 0"},
+        counts = {IRNode.LOAD_VECTOR_I, "> 0"})
+    @IR(applyIfCPUFeatureOr = {"avx", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 0"},
+        counts = {IRNode.LOAD_VECTOR_I, "= 0"})
+    // The simple int copy is always profitable.
+    private static void simpleIntCopy() {
+        for (int i = 0; i < aI.length; i++) {
+            rI[i] = aI[i];
+        }
+    }
+
+    @Check(test="simpleIntCopy")
+    public static void checkSimpleIntCopy() {
+        Verify.checkEQ(GOLD_SIMPLE_INT_COPY, rI);
+    }
+}