8357530: C2 SuperWord: Diagnostic flag AutoVectorizationOverrideProfitability

Reviewed-by: thartmann, kvn
This commit is contained in:
Emanuel Peter 2025-05-26 18:31:19 +00:00
parent a300c35655
commit e8eff4d25b
3 changed files with 233 additions and 8 deletions

View File

@ -375,6 +375,18 @@
"2 = Prefer alignment with vector load.") \
range(0, 2) \
\
/* The help text is one string built from adjacent literals, so each */ \
/* option description must carry its own separating whitespace.      */ \
product(uint, AutoVectorizationOverrideProfitability, 1, DIAGNOSTIC, \
"Override the auto vectorization profitability heuristics. " \
"0 = Run auto vectorizer, but abort just before applying" \
" vectorization, as though it was not profitable. " \
"1 = Run auto vectorizer with the default profitability" \
" heuristics. This is the default, and hopefully" \
" delivers the best performance. " \
"2 = Run auto vectorizer, and vectorize even if the" \
" profitability heuristics predict that vectorization" \
" is not profitable.") \
range(0, 2) \
\
product(bool, UseCMoveUnconditionally, false, \
"Use CMove (scalar and vector) ignoring profitability test.") \
\

View File

@ -1605,12 +1605,31 @@ bool SuperWord::implemented(const Node_List* pack, const uint size) const {
int opc = p0->Opcode();
if (is_marked_reduction(p0)) {
const Type *arith_type = p0->bottom_type();
// Length 2 reductions of INT/LONG do not offer performance benefits
if (((arith_type->basic_type() == T_INT) || (arith_type->basic_type() == T_LONG)) && (size == 2)) {
retValue = false;
} else {
retValue = ReductionNode::implemented(opc, size, arith_type->basic_type());
// This heuristic predicts that 2-element reductions for INT/LONG are not
// profitable. This heuristic was added in JDK-8078563. The argument
// was that reductions are not just a single instruction, but multiple, and
// hence it is not directly clear that they are profitable. If we only have
// two elements per vector, then the performance gains from non-reduction
// vectors are at most going from 2 scalar instructions to 1 vector instruction.
// But a 2-element reduction vector goes from 2 scalar instructions to
// 3 instructions (1 shuffle and two reduction ops).
// However, this heuristic assumes that these reductions stay in the loop,
// which is no longer true in many cases since the introduction of:
// PhaseIdealLoop::move_unordered_reduction_out_of_loop
// Hence, this heuristic has room for improvement.
bool is_two_element_int_or_long_reduction = (size == 2) &&
(arith_type->basic_type() == T_INT ||
arith_type->basic_type() == T_LONG);
if (is_two_element_int_or_long_reduction && AutoVectorizationOverrideProfitability != 2) {
#ifndef PRODUCT
if (is_trace_superword_rejections()) {
tty->print_cr("\nPerformance heuristic: 2-element INT/LONG reduction not profitable.");
tty->print_cr(" Can override with AutoVectorizationOverrideProfitability=2");
}
#endif
return false;
}
retValue = ReductionNode::implemented(opc, size, arith_type->basic_type());
} else if (VectorNode::is_convert_opcode(opc)) {
retValue = VectorCastNode::implemented(opc, size, velt_basic_type(p0->in(1)), velt_basic_type(p0));
} else if (VectorNode::is_minmax_opcode(opc) && is_subword_type(velt_basic_type(p0))) {
@ -1756,9 +1775,29 @@ bool SuperWord::profitable(const Node_List* p) const {
if (is_marked_reduction(p0)) {
Node* second_in = p0->in(2);
Node_List* second_pk = get_pack(second_in);
if ((second_pk == nullptr) || (_num_work_vecs == _num_reductions)) {
// No parent pack or not enough work
// to cover reduction expansion overhead
if (second_pk == nullptr) {
// The second input has to be the vector we wanted to reduce,
// but it was not packed.
return false;
} else if (_num_work_vecs == _num_reductions && AutoVectorizationOverrideProfitability != 2) {
// This heuristic predicts that the reduction is not profitable.
// Reduction vectors can be expensive, because they require multiple
// operations to fold all the lanes together. Hence, vectorizing the
// reduction is not profitable on its own. Hence, we need a lot of
// other "work vectors" that deliver performance improvements to
// balance out the performance loss due to reductions.
// This heuristic is a bit simplistic, and assumes that the reduction
// vector stays in the loop. But in some cases, we can move the
// reduction out of the loop, replacing it with a single vector op.
// See: PhaseIdealLoop::move_unordered_reduction_out_of_loop
// Hence, this heuristic has room for improvement.
#ifndef PRODUCT
if (is_trace_superword_rejections()) {
tty->print_cr("\nPerformance heuristic: not enough vectors in the loop to make");
tty->print_cr(" reduction profitable.");
tty->print_cr(" Can override with AutoVectorizationOverrideProfitability=2");
}
#endif
return false;
} else if (second_pk->size() != p->size()) {
return false;
@ -1914,6 +1953,16 @@ bool SuperWord::schedule_and_apply() const {
if (!vtransform.schedule()) { return false; }
if (vtransform.has_store_to_load_forwarding_failure()) { return false; }
if (AutoVectorizationOverrideProfitability == 0) {
#ifndef PRODUCT
if (is_trace_superword_any()) {
tty->print_cr("\nForced bailout of vectorization (AutoVectorizationOverrideProfitability=0).");
}
#endif
return false;
}
vtransform.apply();
return true;
}

View File

@ -0,0 +1,164 @@
/*
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
* @test
* @bug 8357530
* @summary Test the effect of AutoVectorizationOverrideProfitability.
* @library /test/lib /
* @run driver compiler.loopopts.superword.TestAutoVectorizationOverrideProfitability
*/
package compiler.loopopts.superword;
import compiler.lib.ir_framework.*;
import compiler.lib.verify.*;
import compiler.lib.generators.Generator;
import static compiler.lib.generators.Generators.G;
/**
* IR-framework test for the diagnostic flag AutoVectorizationOverrideProfitability.
* The same four kernels (float/int reduction, float/int copy) are run with the
* flag set to 0, left at its default, and set to 2. Each kernel has a @Check
* method that compares its result against a gold value, and @IR rules that
* state which vector nodes are expected for which flag value.
*
* NOTE: the textual order of the static fields and static initializer blocks
* in this class matters. Java executes them top to bottom, so the input arrays
* must be filled before any GOLD_* value is derived from them.
*/
public class TestAutoVectorizationOverrideProfitability {
// Value generators (compiler.lib.generators) used to fill the input arrays.
public static final Generator<Integer> GEN_I = G.ints();
public static final Generator<Float> GEN_F = G.floats();
// aI / aF are the kernel inputs, rI / rF are the outputs of the copy kernels.
public static int[] aI = new int[10_000];
public static int[] rI = new int[10_000];
public static float[] aF = new float[10_000];
public static float[] rF = new float[10_000];
// Fill the inputs once. This has to run before the GOLD_* initializers below.
static {
G.fill(GEN_I, aI);
G.fill(GEN_F, aF);
}
/**
* Runs the whole test class three times, once per flag setting.
* The flag is declared DIAGNOSTIC, which is why the explicit settings
* are combined with -XX:+UnlockDiagnosticVMOptions.
*/
public static void main(String[] args) throws Exception {
// Do not vectorize, even if profitable.
TestFramework.runWithFlags("-XX:+UnlockDiagnosticVMOptions", "-XX:AutoVectorizationOverrideProfitability=0");
// Normal run, i.e. with normal heuristic. In some cases this vectorizes, in some not.
// By default, we have AutoVectorizationOverrideProfitability=1
TestFramework.run();
// Vectorize even if not profitable.
TestFramework.runWithFlags("-XX:+UnlockDiagnosticVMOptions", "-XX:AutoVectorizationOverrideProfitability=2");
}
// Reference result, computed once during class initialization by calling the
// kernel directly on the already-filled aF.
public static final float GOLD_SIMPLE_FLOAT_REDUCTION = simpleFloatReduction();
// IR expectations (only applied when the "avx" CPU feature is reported):
//   flag = 2  -> at least one ADD_REDUCTION_VF node
//   flag < 2  -> no ADD_REDUCTION_VF node
@Test
@Warmup(10)
@IR(applyIfCPUFeatureOr = {"avx", "true"},
applyIf = {"AutoVectorizationOverrideProfitability", "= 2"},
counts = {IRNode.ADD_REDUCTION_VF, "> 0"})
@IR(applyIfCPUFeatureOr = {"avx", "true"},
applyIf = {"AutoVectorizationOverrideProfitability", "< 2"},
counts = {IRNode.ADD_REDUCTION_VF, "= 0"})
// The simple float reduction is not profitable. We need to sequentially
// add up the values, and so we cannot move the reduction out of the loop.
private static float simpleFloatReduction() {
float sum = 0;
for (int i = 0; i < aF.length; i++) {
sum += aF[i];
}
return sum;
}
// Compares the value returned by simpleFloatReduction against the gold value.
@Check(test="simpleFloatReduction")
public static void checkSimpleFloatReduction(float result) {
Verify.checkEQ(GOLD_SIMPLE_FLOAT_REDUCTION, result);
}
// Run the copy kernel once so that rF holds the expected contents, then
// snapshot rF as the gold value. Both steps rely on static initialization order.
static { simpleFloatCopy(); }
public static final float[] GOLD_SIMPLE_FLOAT_COPY = rF.clone();
// IR expectations (only applied when the "avx" CPU feature is reported):
//   flag > 0  -> at least one LOAD_VECTOR_F node
//   flag = 0  -> no LOAD_VECTOR_F node
@Test
@Warmup(10)
@IR(applyIfCPUFeatureOr = {"avx", "true"},
applyIf = {"AutoVectorizationOverrideProfitability", "> 0"},
counts = {IRNode.LOAD_VECTOR_F, "> 0"})
@IR(applyIfCPUFeatureOr = {"avx", "true"},
applyIf = {"AutoVectorizationOverrideProfitability", "= 0"},
counts = {IRNode.LOAD_VECTOR_F, "= 0"})
// The simple float copy is always profitable.
private static void simpleFloatCopy() {
for (int i = 0; i < aF.length; i++) {
rF[i] = aF[i];
}
}
// Compares the current contents of rF against the snapshot taken at class init.
@Check(test="simpleFloatCopy")
public static void checkSimpleFloatCopy() {
Verify.checkEQ(GOLD_SIMPLE_FLOAT_COPY, rF);
}
// Reference result, computed once during class initialization by calling the
// kernel directly on the already-filled aI.
public static final int GOLD_SIMPLE_INT_REDUCTION = simpleIntReduction();
// IR expectations (only applied when the "avx" CPU feature is reported):
//   flag = 2  -> at least one ADD_REDUCTION_VI and at least one ADD_VI node
//   flag < 2  -> neither ADD_REDUCTION_VI nor ADD_VI nodes
@Test
@Warmup(10)
@IR(applyIfCPUFeatureOr = {"avx", "true"},
applyIf = {"AutoVectorizationOverrideProfitability", "= 2"},
counts = {IRNode.ADD_REDUCTION_VI, "> 0", IRNode.ADD_VI, "> 0"})
@IR(applyIfCPUFeatureOr = {"avx", "true"},
applyIf = {"AutoVectorizationOverrideProfitability", "< 2"},
counts = {IRNode.ADD_REDUCTION_VI, "= 0", IRNode.ADD_VI, "= 0"})
// Current heuristics say that this simple int reduction is not profitable.
// But it would actually be profitable, since we are able to move the
// reduction out of the loop (we can reorder the reduction). When moving
// the reduction out of the loop, we instead accumulate with a simple
// ADD_VI inside the loop.
// See: JDK-8307516 JDK-8345044
private static int simpleIntReduction() {
int sum = 0;
for (int i = 0; i < aI.length; i++) {
sum += aI[i];
}
return sum;
}
// Compares the value returned by simpleIntReduction against the gold value.
@Check(test="simpleIntReduction")
public static void checkSimpleIntReduction(int result) {
Verify.checkEQ(GOLD_SIMPLE_INT_REDUCTION, result);
}
// Run the copy kernel once so that rI holds the expected contents, then
// snapshot rI as the gold value. Both steps rely on static initialization order.
static { simpleIntCopy(); }
public static final int[] GOLD_SIMPLE_INT_COPY = rI.clone();
// IR expectations (only applied when the "avx" CPU feature is reported):
//   flag > 0  -> at least one LOAD_VECTOR_I node
//   flag = 0  -> no LOAD_VECTOR_I node
@Test
@Warmup(10)
@IR(applyIfCPUFeatureOr = {"avx", "true"},
applyIf = {"AutoVectorizationOverrideProfitability", "> 0"},
counts = {IRNode.LOAD_VECTOR_I, "> 0"})
@IR(applyIfCPUFeatureOr = {"avx", "true"},
applyIf = {"AutoVectorizationOverrideProfitability", "= 0"},
counts = {IRNode.LOAD_VECTOR_I, "= 0"})
// The simple int copy is always profitable.
private static void simpleIntCopy() {
for (int i = 0; i < aI.length; i++) {
rI[i] = aI[i];
}
}
// Compares the current contents of rI against the snapshot taken at class init.
@Check(test="simpleIntCopy")
public static void checkSimpleIntCopy() {
Verify.checkEQ(GOLD_SIMPLE_INT_COPY, rI);
}
}