mirror of
https://github.com/openjdk/jdk.git
synced 2026-03-14 18:03:44 +00:00
8357530: C2 SuperWord: Diagnostic flag AutoVectorizationOverrideProfitability
Reviewed-by: thartmann, kvn
This commit is contained in:
parent
a300c35655
commit
e8eff4d25b
@ -375,6 +375,18 @@
|
||||
"2 = Prefer alignment with vector load.") \
|
||||
range(0, 2) \
|
||||
\
|
||||
/* Diagnostic override of the auto vectorizer's profitability decision.    */ \
/* Every literal below ends in a space so that the concatenated help text  */ \
/* keeps its sentences and the "0 = / 1 = / 2 =" items separated.          */ \
product(uint, AutoVectorizationOverrideProfitability, 1, DIAGNOSTIC,        \
        "Override the auto vectorization profitability heuristics. "        \
        "0 = Run auto vectorizer, but abort just before applying "          \
        "vectorization, as though it was not profitable. "                  \
        "1 = Run auto vectorizer with the default profitability "           \
        "heuristics. This is the default, and hopefully "                   \
        "delivers the best performance. "                                   \
        "2 = Run auto vectorizer, and vectorize even if the "               \
        "profitability heuristics predict that vectorization "              \
        "is not profitable.")                                               \
        range(0, 2)                                                         \
|
||||
\
|
||||
product(bool, UseCMoveUnconditionally, false, \
|
||||
"Use CMove (scalar and vector) ignoring profitability test.") \
|
||||
\
|
||||
|
||||
@ -1605,12 +1605,31 @@ bool SuperWord::implemented(const Node_List* pack, const uint size) const {
|
||||
int opc = p0->Opcode();
|
||||
if (is_marked_reduction(p0)) {
|
||||
const Type *arith_type = p0->bottom_type();
|
||||
// Length 2 reductions of INT/LONG do not offer performance benefits
|
||||
if (((arith_type->basic_type() == T_INT) || (arith_type->basic_type() == T_LONG)) && (size == 2)) {
|
||||
retValue = false;
|
||||
} else {
|
||||
retValue = ReductionNode::implemented(opc, size, arith_type->basic_type());
|
||||
// This heuristic predicts that 2-element reductions for INT/LONG are not
|
||||
// profitable. This heuristic was added in JDK-8078563. The argument
|
||||
// was that reductions are not just a single instruction, but multiple, and
|
||||
// hence it is not directly clear that they are profitable. If we only have
|
||||
// two elements per vector, then the performance gains from non-reduction
|
||||
// vectors are at most going from 2 scalar instructions to 1 vector instruction.
|
||||
// But a 2-element reduction vector goes from 2 scalar instructions to
|
||||
// 3 instructions (1 shuffle and two reduction ops).
|
||||
// However, this optimization assumes that these reductions stay in the loop
|
||||
// which may not be true any more in most cases after the introduction of:
|
||||
// PhaseIdealLoop::move_unordered_reduction_out_of_loop
|
||||
// Hence, this heuristic has room for improvement.
|
||||
bool is_two_element_int_or_long_reduction = (size == 2) &&
|
||||
(arith_type->basic_type() == T_INT ||
|
||||
arith_type->basic_type() == T_LONG);
|
||||
if (is_two_element_int_or_long_reduction && AutoVectorizationOverrideProfitability != 2) {
|
||||
#ifndef PRODUCT
|
||||
if (is_trace_superword_rejections()) {
|
||||
tty->print_cr("\nPerformance heuristic: 2-element INT/LONG reduction not profitable.");
|
||||
tty->print_cr(" Can override with AutoVectorizationOverrideProfitability=2");
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
retValue = ReductionNode::implemented(opc, size, arith_type->basic_type());
|
||||
} else if (VectorNode::is_convert_opcode(opc)) {
|
||||
retValue = VectorCastNode::implemented(opc, size, velt_basic_type(p0->in(1)), velt_basic_type(p0));
|
||||
} else if (VectorNode::is_minmax_opcode(opc) && is_subword_type(velt_basic_type(p0))) {
|
||||
@ -1756,9 +1775,29 @@ bool SuperWord::profitable(const Node_List* p) const {
|
||||
if (is_marked_reduction(p0)) {
|
||||
Node* second_in = p0->in(2);
|
||||
Node_List* second_pk = get_pack(second_in);
|
||||
if ((second_pk == nullptr) || (_num_work_vecs == _num_reductions)) {
|
||||
// No parent pack or not enough work
|
||||
// to cover reduction expansion overhead
|
||||
if (second_pk == nullptr) {
|
||||
// The second input has to be the vector we wanted to reduce,
|
||||
// but it was not packed.
|
||||
return false;
|
||||
} else if (_num_work_vecs == _num_reductions && AutoVectorizationOverrideProfitability != 2) {
|
||||
// This heuristic predicts that the reduction is not profitable.
|
||||
// Reduction vectors can be expensive, because they require multiple
|
||||
// operations to fold all the lanes together. Hence, vectorizing the
|
||||
// reduction is not profitable on its own. Hence, we need a lot of
|
||||
// other "work vectors" that deliver performance improvements to
|
||||
// balance out the performance loss due to reductions.
|
||||
// This heuristic is a bit simplistic, and assumes that the reduction
|
||||
// vector stays in the loop. But in some cases, we can move the
|
||||
// reduction out of the loop, replacing it with a single vector op.
|
||||
// See: PhaseIdealLoop::move_unordered_reduction_out_of_loop
|
||||
// Hence, this heuristic has room for improvement.
|
||||
#ifndef PRODUCT
|
||||
if (is_trace_superword_rejections()) {
|
||||
tty->print_cr("\nPerformance heuristic: not enough vectors in the loop to make");
|
||||
tty->print_cr(" reduction profitable.");
|
||||
tty->print_cr(" Can override with AutoVectorizationOverrideProfitability=2");
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
} else if (second_pk->size() != p->size()) {
|
||||
return false;
|
||||
@ -1914,6 +1953,16 @@ bool SuperWord::schedule_and_apply() const {
|
||||
|
||||
if (!vtransform.schedule()) { return false; }
|
||||
if (vtransform.has_store_to_load_forwarding_failure()) { return false; }
|
||||
|
||||
if (AutoVectorizationOverrideProfitability == 0) {
|
||||
#ifndef PRODUCT
|
||||
if (is_trace_superword_any()) {
|
||||
tty->print_cr("\nForced bailout of vectorization (AutoVectorizationOverrideProfitability=0).");
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
vtransform.apply();
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -0,0 +1,164 @@
|
||||
/*
|
||||
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @test
|
||||
* @bug 8357530
|
||||
* @summary Test the effect of AutoVectorizationOverrideProfitability.
|
||||
* @library /test/lib /
|
||||
* @run driver compiler.loopopts.superword.TestAutoVectorizationOverrideProfitability
|
||||
*/
|
||||
|
||||
package compiler.loopopts.superword;
|
||||
|
||||
import compiler.lib.ir_framework.*;
|
||||
import compiler.lib.verify.*;
|
||||
import compiler.lib.generators.Generator;
|
||||
import static compiler.lib.generators.Generators.G;
|
||||
|
||||
public class TestAutoVectorizationOverrideProfitability {
|
||||
// Value generators used to populate the int and float input arrays below.
public static final Generator<Integer> GEN_I = G.ints();
|
||||
public static final Generator<Float> GEN_F = G.floats();
|
||||
|
||||
// aI / aF are the inputs read by the tests; rI / rF are the outputs written
// by the copy tests. All four arrays have 10_000 elements.
public static int[] aI = new int[10_000];
|
||||
public static int[] rI = new int[10_000];
|
||||
public static float[] aF = new float[10_000];
|
||||
public static float[] rF = new float[10_000];
|
||||
|
||||
// Fill the inputs once during class initialization. This block is placed
// before the GOLD_* fields further down, which are computed from aI / aF.
// NOTE(review): G.fill is a library helper not shown here; presumably it
// fills the array with values drawn from the given generator - confirm.
static {
|
||||
G.fill(GEN_I, aI);
|
||||
G.fill(GEN_F, aF);
|
||||
}
|
||||
|
||||
/**
 * Runs the IR tests of this class three times: once with
 * AutoVectorizationOverrideProfitability=0, once with default flags, and
 * once with AutoVectorizationOverrideProfitability=2.
 *
 * The flag is declared DIAGNOSTIC, so the two runs that set it explicitly
 * also pass -XX:+UnlockDiagnosticVMOptions.
 */
public static void main(String[] args) throws Exception {
|
||||
// Do not vectorize, even if profitable.
|
||||
TestFramework.runWithFlags("-XX:+UnlockDiagnosticVMOptions", "-XX:AutoVectorizationOverrideProfitability=0");
|
||||
|
||||
// Normal run, i.e. with normal heuristic. In some cases this vectorizes, in some not.
|
||||
// By default, we have AutoVectorizationOverrideProfitability=1
|
||||
TestFramework.run();
|
||||
|
||||
// Vectorize even if not profitable.
|
||||
TestFramework.runWithFlags("-XX:+UnlockDiagnosticVMOptions", "-XX:AutoVectorizationOverrideProfitability=2");
|
||||
}
|
||||
|
||||
// Reference result for simpleFloatReduction, computed once during class
// initialization by calling the test method directly. The check method
// compares every later result against this value.
public static final float GOLD_SIMPLE_FLOAT_REDUCTION = simpleFloatReduction();
|
||||
|
||||
/**
 * Sums all elements of aF in index order and returns the float sum.
 *
 * IR expectations (rules only apply on CPUs with the avx feature):
 *  - AutoVectorizationOverrideProfitability = 2: at least one
 *    ADD_REDUCTION_VF node, i.e. the reduction was vectorized.
 *  - AutoVectorizationOverrideProfitability < 2: no ADD_REDUCTION_VF node,
 *    i.e. the reduction was left scalar.
 */
@Test
|
||||
@Warmup(10)
|
||||
@IR(applyIfCPUFeatureOr = {"avx", "true"},
|
||||
applyIf = {"AutoVectorizationOverrideProfitability", "= 2"},
|
||||
counts = {IRNode.ADD_REDUCTION_VF, "> 0"})
|
||||
@IR(applyIfCPUFeatureOr = {"avx", "true"},
|
||||
applyIf = {"AutoVectorizationOverrideProfitability", "< 2"},
|
||||
counts = {IRNode.ADD_REDUCTION_VF, "= 0"})
|
||||
// The simple float reduction is not profitable. We need to sequentially
|
||||
// add up the values, and so we cannot move the reduction out of the loop.
|
||||
private static float simpleFloatReduction() {
|
||||
float sum = 0;
|
||||
|
||||
// Keep this loop as a plain counted index loop: the IR rules above depend
// on what the compiler does with exactly this loop shape.
for (int i = 0; i < aF.length; i++) {
|
||||
sum += aF[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
/**
 * Check method attached to simpleFloatReduction: compares the given result
 * against GOLD_SIMPLE_FLOAT_REDUCTION via Verify.checkEQ.
 *
 * NOTE(review): result is presumably the return value of the test method,
 * handed over by the IR framework - confirm against the @Check documentation.
 */
@Check(test="simpleFloatReduction")
|
||||
public static void checkSimpleFloatReduction(float result) {
|
||||
Verify.checkEQ(GOLD_SIMPLE_FLOAT_REDUCTION, result);
|
||||
}
|
||||
|
||||
// Order matters: static initializers run in textual order, so the copy test
// is executed first (writing rF), and only then is rF cloned as the reference.
static { simpleFloatCopy(); }
|
||||
// Snapshot of rF after one run of simpleFloatCopy; used by the check method.
public static final float[] GOLD_SIMPLE_FLOAT_COPY = rF.clone();
|
||||
|
||||
/**
 * Copies aF into rF element by element.
 *
 * IR expectations (rules only apply on CPUs with the avx feature):
 *  - AutoVectorizationOverrideProfitability > 0: at least one LOAD_VECTOR_F
 *    node, i.e. the copy loop was vectorized.
 *  - AutoVectorizationOverrideProfitability = 0: no LOAD_VECTOR_F node,
 *    i.e. vectorization was suppressed.
 */
@Test
|
||||
@Warmup(10)
|
||||
@IR(applyIfCPUFeatureOr = {"avx", "true"},
|
||||
applyIf = {"AutoVectorizationOverrideProfitability", "> 0"},
|
||||
counts = {IRNode.LOAD_VECTOR_F, "> 0"})
|
||||
@IR(applyIfCPUFeatureOr = {"avx", "true"},
|
||||
applyIf = {"AutoVectorizationOverrideProfitability", "= 0"},
|
||||
counts = {IRNode.LOAD_VECTOR_F, "= 0"})
|
||||
// The simple float copy is always profitable.
|
||||
private static void simpleFloatCopy() {
|
||||
// The loop bound is aF.length; rF is allocated with the same length.
for (int i = 0; i < aF.length; i++) {
|
||||
rF[i] = aF[i];
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Check method attached to simpleFloatCopy: compares the current contents
 * of rF against GOLD_SIMPLE_FLOAT_COPY via Verify.checkEQ.
 */
@Check(test="simpleFloatCopy")
|
||||
public static void checkSimpleFloatCopy() {
|
||||
Verify.checkEQ(GOLD_SIMPLE_FLOAT_COPY, rF);
|
||||
}
|
||||
|
||||
// Reference result for simpleIntReduction, computed once during class
// initialization by calling the test method directly. The check method
// compares every later result against this value.
public static final int GOLD_SIMPLE_INT_REDUCTION = simpleIntReduction();
|
||||
|
||||
/**
 * Sums all elements of aI and returns the int sum.
 *
 * IR expectations (rules only apply on CPUs with the avx feature):
 *  - AutoVectorizationOverrideProfitability = 2: at least one
 *    ADD_REDUCTION_VI node and at least one ADD_VI node.
 *  - AutoVectorizationOverrideProfitability < 2: neither ADD_REDUCTION_VI
 *    nor ADD_VI nodes, i.e. the loop was left scalar.
 */
@Test
|
||||
@Warmup(10)
|
||||
@IR(applyIfCPUFeatureOr = {"avx", "true"},
|
||||
applyIf = {"AutoVectorizationOverrideProfitability", "= 2"},
|
||||
counts = {IRNode.ADD_REDUCTION_VI, "> 0", IRNode.ADD_VI, "> 0"})
|
||||
@IR(applyIfCPUFeatureOr = {"avx", "true"},
|
||||
applyIf = {"AutoVectorizationOverrideProfitability", "< 2"},
|
||||
counts = {IRNode.ADD_REDUCTION_VI, "= 0", IRNode.ADD_VI, "= 0"})
|
||||
// Current heuristics say that this simple int reduction is not profitable.
|
||||
// But it would actually be profitable, since we are able to move the
|
||||
// reduction out of the loop (we can reorder the reduction). When moving
|
||||
// the reduction out of the loop, we instead accumulate with a simple
|
||||
// ADD_VI inside the loop.
|
||||
// See: JDK-8307516 JDK-8345044
|
||||
private static int simpleIntReduction() {
|
||||
int sum = 0;
|
||||
|
||||
// Keep this loop as a plain counted index loop: the IR rules above depend
// on what the compiler does with exactly this loop shape.
for (int i = 0; i < aI.length; i++) {
|
||||
sum += aI[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
/**
 * Check method attached to simpleIntReduction: compares the given result
 * against GOLD_SIMPLE_INT_REDUCTION via Verify.checkEQ.
 *
 * NOTE(review): result is presumably the return value of the test method,
 * handed over by the IR framework - confirm against the @Check documentation.
 */
@Check(test="simpleIntReduction")
|
||||
public static void checkSimpleIntReduction(int result) {
|
||||
Verify.checkEQ(GOLD_SIMPLE_INT_REDUCTION, result);
|
||||
}
|
||||
|
||||
// Order matters: static initializers run in textual order, so the copy test
// is executed first (writing rI), and only then is rI cloned as the reference.
static { simpleIntCopy(); }
|
||||
// Snapshot of rI after one run of simpleIntCopy; used by the check method.
public static final int[] GOLD_SIMPLE_INT_COPY = rI.clone();
|
||||
|
||||
/**
 * Copies aI into rI element by element.
 *
 * IR expectations (rules only apply on CPUs with the avx feature):
 *  - AutoVectorizationOverrideProfitability > 0: at least one LOAD_VECTOR_I
 *    node, i.e. the copy loop was vectorized.
 *  - AutoVectorizationOverrideProfitability = 0: no LOAD_VECTOR_I node,
 *    i.e. vectorization was suppressed.
 */
@Test
|
||||
@Warmup(10)
|
||||
@IR(applyIfCPUFeatureOr = {"avx", "true"},
|
||||
applyIf = {"AutoVectorizationOverrideProfitability", "> 0"},
|
||||
counts = {IRNode.LOAD_VECTOR_I, "> 0"})
|
||||
@IR(applyIfCPUFeatureOr = {"avx", "true"},
|
||||
applyIf = {"AutoVectorizationOverrideProfitability", "= 0"},
|
||||
counts = {IRNode.LOAD_VECTOR_I, "= 0"})
|
||||
// The simple int copy is always profitable.
|
||||
private static void simpleIntCopy() {
|
||||
// The loop bound is aI.length; rI is allocated with the same length.
for (int i = 0; i < aI.length; i++) {
|
||||
rI[i] = aI[i];
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Check method attached to simpleIntCopy: compares the current contents
 * of rI against GOLD_SIMPLE_INT_COPY via Verify.checkEQ.
 */
@Check(test="simpleIntCopy")
|
||||
public static void checkSimpleIntCopy() {
|
||||
Verify.checkEQ(GOLD_SIMPLE_INT_COPY, rI);
|
||||
}
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user