From e8eff4d25b984d503a4daa5d291b52a8d1e2f186 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 26 May 2025 18:31:19 +0000 Subject: [PATCH] 8357530: C2 SuperWord: Diagnostic flag AutoVectorizationOverrideProfitability Reviewed-by: thartmann, kvn --- src/hotspot/share/opto/c2_globals.hpp | 12 ++ src/hotspot/share/opto/superword.cpp | 65 ++++++- ...utoVectorizationOverrideProfitability.java | 164 ++++++++++++++++++ 3 files changed, 233 insertions(+), 8 deletions(-) create mode 100644 test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOverrideProfitability.java diff --git a/src/hotspot/share/opto/c2_globals.hpp b/src/hotspot/share/opto/c2_globals.hpp index 0bb842907a3..fd55f2fd666 100644 --- a/src/hotspot/share/opto/c2_globals.hpp +++ b/src/hotspot/share/opto/c2_globals.hpp @@ -375,6 +375,18 @@ "2 = Prefer alignment with vector load.") \ range(0, 2) \ \ + product(uint, AutoVectorizationOverrideProfitability, 1, DIAGNOSTIC, \ + "Override the auto vectorization profitability heuristics." \ + " 0 = Run auto vectorizer, but abort just before applying" \ + " vectorization, as though it was not profitable." \ + " 1 = Run auto vectorizer with the default profitability" \ + " heuristics. This is the default, and hopefully" \ + " delivers the best performance." 
\ + " 2 = Run auto vectorizer, and vectorize even if the" \ + " profitability heuristics predict that vectorization" \ + " is not profitable.") \ + range(0, 2) \ + \ product(bool, UseCMoveUnconditionally, false, \ "Use CMove (scalar and vector) ignoring profitability test.") \ \ diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index b663bed3938..47735f6bbba 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -1605,12 +1605,31 @@ bool SuperWord::implemented(const Node_List* pack, const uint size) const { int opc = p0->Opcode(); if (is_marked_reduction(p0)) { const Type *arith_type = p0->bottom_type(); - // Length 2 reductions of INT/LONG do not offer performance benefits - if (((arith_type->basic_type() == T_INT) || (arith_type->basic_type() == T_LONG)) && (size == 2)) { - retValue = false; - } else { - retValue = ReductionNode::implemented(opc, size, arith_type->basic_type()); + // This heuristic predicts that 2-element reductions for INT/LONG are not + // profitable. This heuristic was added in JDK-8078563. The argument + // was that reductions are not just a single instruction, but multiple, and + // hence it is not directly clear that they are profitable. If we only have + // two elements per vector, then the performance gains from non-reduction + // vectors are at most going from 2 scalar instructions to 1 vector instruction. + // But a 2-element reduction vector goes from 2 scalar instructions to + // 3 instructions (1 shuffle and two reduction ops). + // However, this optimization assumes that these reductions stay in the loop + // which may not be true any more in most cases after the introduction of: + // PhaseIdealLoop::move_unordered_reduction_out_of_loop + // Hence, this heuristic has room for improvement. 
+ bool is_two_element_int_or_long_reduction = (size == 2) && + (arith_type->basic_type() == T_INT || + arith_type->basic_type() == T_LONG); + if (is_two_element_int_or_long_reduction && AutoVectorizationOverrideProfitability != 2) { +#ifndef PRODUCT + if (is_trace_superword_rejections()) { + tty->print_cr("\nPerformance heuristic: 2-element INT/LONG reduction not profitable."); + tty->print_cr(" Can override with AutoVectorizationOverrideProfitability=2"); + } +#endif + return false; } + retValue = ReductionNode::implemented(opc, size, arith_type->basic_type()); } else if (VectorNode::is_convert_opcode(opc)) { retValue = VectorCastNode::implemented(opc, size, velt_basic_type(p0->in(1)), velt_basic_type(p0)); } else if (VectorNode::is_minmax_opcode(opc) && is_subword_type(velt_basic_type(p0))) { @@ -1756,9 +1775,29 @@ bool SuperWord::profitable(const Node_List* p) const { if (is_marked_reduction(p0)) { Node* second_in = p0->in(2); Node_List* second_pk = get_pack(second_in); - if ((second_pk == nullptr) || (_num_work_vecs == _num_reductions)) { - // No parent pack or not enough work - // to cover reduction expansion overhead + if (second_pk == nullptr) { + // The second input has to be the vector we wanted to reduce, + // but it was not packed. + return false; + } else if (_num_work_vecs == _num_reductions && AutoVectorizationOverrideProfitability != 2) { + // This heuristic predicts that the reduction is not profitable. + // Reduction vectors can be expensive, because they require multiple + // operations to fold all the lanes together. Hence, vectorizing the + // reduction is not profitable on its own. Hence, we need a lot of + // other "work vectors" that deliver performance improvements to + // balance out the performance loss due to reductions. + // This heuristic is a bit simplistic, and assumes that the reduction + // vector stays in the loop. But in some cases, we can move the + // reduction out of the loop, replacing it with a single vector op. 
+ // See: PhaseIdealLoop::move_unordered_reduction_out_of_loop + // Hence, this heuristic has room for improvement. +#ifndef PRODUCT + if (is_trace_superword_rejections()) { + tty->print_cr("\nPerformance heuristic: not enough vectors in the loop to make"); + tty->print_cr(" reduction profitable."); + tty->print_cr(" Can override with AutoVectorizationOverrideProfitability=2"); + } +#endif return false; } else if (second_pk->size() != p->size()) { return false; @@ -1914,6 +1953,16 @@ bool SuperWord::schedule_and_apply() const { if (!vtransform.schedule()) { return false; } if (vtransform.has_store_to_load_forwarding_failure()) { return false; } + + if (AutoVectorizationOverrideProfitability == 0) { +#ifndef PRODUCT + if (is_trace_superword_any()) { + tty->print_cr("\nForced bailout of vectorization (AutoVectorizationOverrideProfitability=0)."); + } +#endif + return false; + } + vtransform.apply(); return true; } diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOverrideProfitability.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOverrideProfitability.java new file mode 100644 index 00000000000..10ad19d03a7 --- /dev/null +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOverrideProfitability.java @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/** + * @test + * @bug 8357530 + * @summary Test the effect of AutoVectorizationOverrideProfitability. + * @library /test/lib / + * @run driver compiler.loopopts.superword.TestAutoVectorizationOverrideProfitability + */ + +package compiler.loopopts.superword; + +import compiler.lib.ir_framework.*; +import compiler.lib.verify.*; +import compiler.lib.generators.Generator; +import static compiler.lib.generators.Generators.G; + +public class TestAutoVectorizationOverrideProfitability { + public static final Generator<Integer> GEN_I = G.ints(); + public static final Generator<Float> GEN_F = G.floats(); + + public static int[] aI = new int[10_000]; + public static int[] rI = new int[10_000]; + public static float[] aF = new float[10_000]; + public static float[] rF = new float[10_000]; + + static { + G.fill(GEN_I, aI); + G.fill(GEN_F, aF); + } + + public static void main(String[] args) throws Exception { + // Do not vectorize, even if profitable. + TestFramework.runWithFlags("-XX:+UnlockDiagnosticVMOptions", "-XX:AutoVectorizationOverrideProfitability=0"); + + // Normal run, i.e. with normal heuristic. In some cases this vectorizes, in some not. + // By default, we have AutoVectorizationOverrideProfitability=1 + TestFramework.run(); + + // Vectorize even if not profitable. 
+ TestFramework.runWithFlags("-XX:+UnlockDiagnosticVMOptions", "-XX:AutoVectorizationOverrideProfitability=2"); + } + + public static final float GOLD_SIMPLE_FLOAT_REDUCTION = simpleFloatReduction(); + + @Test + @Warmup(10) + @IR(applyIfCPUFeatureOr = {"avx", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}, + counts = {IRNode.ADD_REDUCTION_VF, "> 0"}) + @IR(applyIfCPUFeatureOr = {"avx", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}, + counts = {IRNode.ADD_REDUCTION_VF, "= 0"}) + // The simple float reduction is not profitable. We need to sequentially + // add up the values, and so we cannot move the reduction out of the loop. + private static float simpleFloatReduction() { + float sum = 0; + for (int i = 0; i < aF.length; i++) { + sum += aF[i]; + } + return sum; + } + + @Check(test="simpleFloatReduction") + public static void checkSimpleFloatReduction(float result) { + Verify.checkEQ(GOLD_SIMPLE_FLOAT_REDUCTION, result); + } + + static { simpleFloatCopy(); } + public static final float[] GOLD_SIMPLE_FLOAT_COPY = rF.clone(); + + @Test + @Warmup(10) + @IR(applyIfCPUFeatureOr = {"avx", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}, + counts = {IRNode.LOAD_VECTOR_F, "> 0"}) + @IR(applyIfCPUFeatureOr = {"avx", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}, + counts = {IRNode.LOAD_VECTOR_F, "= 0"}) + // The simple float copy is always profitable. 
+ private static void simpleFloatCopy() { + for (int i = 0; i < aF.length; i++) { + rF[i] = aF[i]; + } + } + + @Check(test="simpleFloatCopy") + public static void checkSimpleFloatCopy() { + Verify.checkEQ(GOLD_SIMPLE_FLOAT_COPY, rF); + } + + public static final int GOLD_SIMPLE_INT_REDUCTION = simpleIntReduction(); + + @Test + @Warmup(10) + @IR(applyIfCPUFeatureOr = {"avx", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}, + counts = {IRNode.ADD_REDUCTION_VI, "> 0", IRNode.ADD_VI, "> 0"}) + @IR(applyIfCPUFeatureOr = {"avx", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}, + counts = {IRNode.ADD_REDUCTION_VI, "= 0", IRNode.ADD_VI, "= 0"}) + // Current heuristics say that this simple int reduction is not profitable. + // But it would actually be profitable, since we are able to move the + // reduction out of the loop (we can reorder the reduction). When moving + // the reduction out of the loop, we instead accumulate with a simple + // ADD_VI inside the loop. + // See: JDK-8307516 JDK-8345044 + private static int simpleIntReduction() { + int sum = 0; + for (int i = 0; i < aI.length; i++) { + sum += aI[i]; + } + return sum; + } + + @Check(test="simpleIntReduction") + public static void checkSimpleIntReduction(int result) { + Verify.checkEQ(GOLD_SIMPLE_INT_REDUCTION, result); + } + + static { simpleIntCopy(); } + public static final int[] GOLD_SIMPLE_INT_COPY = rI.clone(); + + @Test + @Warmup(10) + @IR(applyIfCPUFeatureOr = {"avx", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}, + counts = {IRNode.LOAD_VECTOR_I, "> 0"}) + @IR(applyIfCPUFeatureOr = {"avx", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}, + counts = {IRNode.LOAD_VECTOR_I, "= 0"}) + // The simple int copy is always profitable. 
+ private static void simpleIntCopy() { + for (int i = 0; i < aI.length; i++) { + rI[i] = aI[i]; + } + } + + @Check(test="simpleIntCopy") + public static void checkSimpleIntCopy() { + Verify.checkEQ(GOLD_SIMPLE_INT_COPY, rI); + } +}