8340093: C2 SuperWord: implement cost model

Reviewed-by: kvn, qamai
Emanuel Peter 2025-11-10 15:56:49 +00:00
parent 6e838d6f9a
commit 72989e0fac
13 changed files with 2884 additions and 94 deletions

View File

@ -129,18 +129,24 @@ source %{
bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) {
if (UseSVE == 0) {
// These operations are not profitable to be vectorized on NEON, because no direct
// NEON instructions support them. But the match rule support for them is profitable for
// Vector API intrinsics.
// NEON instructions support them. They require multiple instructions, which is
// more expensive in almost all cases where we would auto-vectorize.
// But the match rule support for them is profitable for Vector API intrinsics.
if ((opcode == Op_VectorCastD2X && (bt == T_INT || bt == T_SHORT)) ||
(opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
(opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
(opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
opcode == Op_MulVL ||
// The implementations of Op_AddReductionVD/F in Neon are for the Vector API only.
// They are not suitable for auto-vectorization because the result would not conform
// to the JLS, Section Evaluation Order.
// Note: we could implement sequential reductions for these reduction operators, but
// this will still almost never lead to speedups, because the sequential
// reductions are latency limited along the reduction chain, and not
// throughput limited. This is unlike unordered reductions (associative op)
// and element-wise ops which are usually throughput limited.
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
opcode == Op_MulVL) {
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF) {
return false;
}
}

View File

@ -119,18 +119,24 @@ source %{
bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) {
if (UseSVE == 0) {
// These operations are not profitable to be vectorized on NEON, because no direct
// NEON instructions support them. But the match rule support for them is profitable for
// Vector API intrinsics.
// NEON instructions support them. They require multiple instructions, which is
// more expensive in almost all cases where we would auto-vectorize.
// But the match rule support for them is profitable for Vector API intrinsics.
if ((opcode == Op_VectorCastD2X && (bt == T_INT || bt == T_SHORT)) ||
(opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
(opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
(opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
opcode == Op_MulVL ||
// The implementations of Op_AddReductionVD/F in Neon are for the Vector API only.
// They are not suitable for auto-vectorization because the result would not conform
// to the JLS, Section Evaluation Order.
// Note: we could implement sequential reductions for these reduction operators, but
// this will still almost never lead to speedups, because the sequential
// reductions are latency limited along the reduction chain, and not
// throughput limited. This is unlike unordered reductions (associative op)
// and element-wise ops which are usually throughput limited.
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
opcode == Op_MulVL) {
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF) {
return false;
}
}

View File

@ -42,9 +42,7 @@ SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer) :
),
_vpointer_for_main_loop_alignment(nullptr),
_aw_for_main_loop_alignment(0),
_do_vector_loop(phase()->C->do_vector_loop()), // whether to do vectorization/simd style
_num_work_vecs(0), // amount of vector work we have
_num_reductions(0) // amount of reduction work we have
_do_vector_loop(phase()->C->do_vector_loop()) // whether to do vectorization/simd style
{
}
@ -1567,18 +1565,6 @@ void SuperWord::filter_packs_for_implemented() {
// Remove packs that are not profitable.
void SuperWord::filter_packs_for_profitable() {
// Count the number of reductions vs other vector ops, for the
// reduction profitability heuristic.
for (int i = 0; i < _packset.length(); i++) {
Node_List* pack = _packset.at(i);
Node* n = pack->at(0);
if (is_marked_reduction(n)) {
_num_reductions++;
} else {
_num_work_vecs++;
}
}
// Remove packs that are not profitable
auto filter = [&](const Node_List* pack) {
return profitable(pack);
@ -1595,31 +1581,7 @@ bool SuperWord::implemented(const Node_List* pack, const uint size) const {
if (p0 != nullptr) {
int opc = p0->Opcode();
if (is_marked_reduction(p0)) {
const Type *arith_type = p0->bottom_type();
// This heuristic predicts that 2-element reductions for INT/LONG are not
// profitable. This heuristic was added in JDK-8078563. The argument
// was that reductions are not just a single instruction, but multiple, and
// hence it is not directly clear that they are profitable. If we only have
// two elements per vector, then the performance gains from non-reduction
// vectors are at most going from 2 scalar instructions to 1 vector instruction.
// But a 2-element reduction vector goes from 2 scalar instructions to
// 3 instructions (1 shuffle and two reduction ops).
// However, this optimization assumes that these reductions stay in the loop
// which may not be true any more in most cases after the introduction of:
// See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop
// Hence, this heuristic has room for improvement.
bool is_two_element_int_or_long_reduction = (size == 2) &&
(arith_type->basic_type() == T_INT ||
arith_type->basic_type() == T_LONG);
if (is_two_element_int_or_long_reduction && AutoVectorizationOverrideProfitability != 2) {
#ifndef PRODUCT
if (is_trace_superword_rejections()) {
tty->print_cr("\nPerformance heuristic: 2-element INT/LONG reduction not profitable.");
tty->print_cr(" Can override with AutoVectorizationOverrideProfitability=2");
}
#endif
return false;
}
const Type* arith_type = p0->bottom_type();
retValue = ReductionNode::implemented(opc, size, arith_type->basic_type());
} else if (VectorNode::is_convert_opcode(opc)) {
retValue = VectorCastNode::implemented(opc, size, velt_basic_type(p0->in(1)), velt_basic_type(p0));
@ -1791,26 +1753,6 @@ bool SuperWord::profitable(const Node_List* p) const {
// The second input has to be the vector we wanted to reduce,
// but it was not packed.
return false;
} else if (_num_work_vecs == _num_reductions && AutoVectorizationOverrideProfitability != 2) {
// This heuristic predicts that the reduction is not profitable.
// Reduction vectors can be expensive, because they require multiple
// operations to fold all the lanes together. Hence, vectorizing the
// reduction is not profitable on its own. Hence, we need a lot of
// other "work vectors" that deliver performance improvements to
// balance out the performance loss due to reductions.
// This heuristic is a bit simplistic, and assumes that the reduction
// vector stays in the loop. But in some cases, we can move the
// reduction out of the loop, replacing it with a single vector op.
// See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop
// Hence, this heuristic has room for improvement.
#ifndef PRODUCT
if (is_trace_superword_rejections()) {
tty->print_cr("\nPerformance heuristic: not enough vectors in the loop to make");
tty->print_cr(" reduction profitable.");
tty->print_cr(" Can override with AutoVectorizationOverrideProfitability=2");
}
#endif
return false;
} else if (second_pk->size() != p->size()) {
return false;
}
@ -1969,19 +1911,53 @@ bool SuperWord::do_vtransform() const {
vtransform.optimize();
if (!vtransform.schedule()) { return false; }
if (vtransform.has_store_to_load_forwarding_failure()) { return false; }
if (!vtransform.is_profitable()) { return false; }
vtransform.apply();
return true;
}
// Check the cost model and other heuristics.
// Can be overridden with AutoVectorizationOverrideProfitability.
bool VTransform::is_profitable() const {
assert(_graph.is_scheduled(), "must already be scheduled");
if (AutoVectorizationOverrideProfitability == 0) {
#ifndef PRODUCT
if (is_trace_superword_any()) {
if (_trace._info) {
tty->print_cr("\nForced bailout of vectorization (AutoVectorizationOverrideProfitability=0).");
}
#endif
return false;
}
vtransform.apply();
return true;
if (AutoVectorizationOverrideProfitability == 2) {
#ifndef PRODUCT
if (_trace._info) {
tty->print_cr("\nForced vectorization, ignoring profitability (AutoVectorizationOverrideProfitability=2).");
}
#endif
return true;
}
// Note: currently we only do throughput-based cost-modeling. In the future, we could
// also implement latency-based cost-modeling and take store-to-load-forwarding
// failures into account as the latency between the load and store. This would
// allow a more precise tradeoff between the forwarding failure penalty and
// the vectorization gains.
if (has_store_to_load_forwarding_failure()) { return false; }
// Cost-model
float scalar_cost = _vloop_analyzer.cost_for_scalar_loop();
float vector_cost = cost_for_vector_loop();
#ifndef PRODUCT
if (_trace._info) {
tty->print_cr("\nVTransform: scalar_cost = %.2f vs vector_cost = %.2f",
scalar_cost, vector_cost);
}
#endif
return vector_cost < scalar_cost;
}
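
For context: the override flag checked above can be set on the command line. A hedged usage sketch (this assumes AutoVectorizationOverrideProfitability is a diagnostic flag, so the unlock option is needed; MyBenchmark is a placeholder class):
# 0: always bail out of vectorization
# 2: force vectorization, ignoring profitability
# any other value: fall through to the cost-model comparison above
java -XX:+UnlockDiagnosticVMOptions -XX:AutoVectorizationOverrideProfitability=2 MyBenchmark
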
// Apply the vectorization, i.e. we irreversibly edit the C2 graph. At this point, all

View File

@ -549,8 +549,6 @@ class SuperWord : public ResourceObj {
private:
bool _do_vector_loop; // whether to do vectorization/simd style
int _num_work_vecs; // Number of non memory vector operations
int _num_reductions; // Number of reduction expressions applied
// Accessors
Arena* arena() { return &_arena; }

View File

@ -38,7 +38,7 @@
flags(MEMORY_SLICES, "Trace VLoopMemorySlices") \
flags(BODY, "Trace VLoopBody") \
flags(TYPES, "Trace VLoopTypes") \
flags(POINTERS, "Trace VLoopPointers") \
flags(POINTERS, "Trace VLoopVPointers") \
flags(DEPENDENCY_GRAPH, "Trace VLoopDependencyGraph") \
flags(SW_ADJACENT_MEMOPS, "Trace SuperWord::find_adjacent_memop_pairs") \
flags(SW_REJECTIONS, "Trace SuperWord rejections (non vectorizations)") \
@ -47,6 +47,8 @@
flags(SW_VERBOSE, "Trace SuperWord verbose (all SW tags enabled)") \
flags(VTRANSFORM, "Trace VTransform Graph") \
flags(OPTIMIZATION, "Trace VTransform::optimize") \
flags(COST, "Trace cost of VLoop (scalar) and VTransform (vector)") \
flags(COST_VERBOSE, "Trace like COST, but more verbose") \
flags(ALIGN_VECTOR, "Trace AlignVector") \
flags(SPECULATIVE_ALIASING_ANALYSIS, "Trace Speculative Aliasing Analysis") \
flags(SPECULATIVE_RUNTIME_CHECKS, "Trace VTransform::apply_speculative_runtime_checks") \
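
The two new tags can be exercised like the existing ones; a hedged example, assuming the usual TraceAutoVectorization CompileCommand syntax with a wildcard method pattern (MyBenchmark is a placeholder):
java -XX:CompileCommand=TraceAutoVectorization,*::*,COST MyBenchmark
java -XX:CompileCommand=TraceAutoVectorization,*::*,COST_VERBOSE MyBenchmark
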

View File

@ -287,7 +287,7 @@ void VLoopVPointers::compute_and_cache_vpointers() {
int pointers_idx = 0;
_body.for_each_mem([&] (MemNode* const mem, int bb_idx) {
// Placement new: construct directly into the array.
::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop);
::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop, _pointer_expression_nodes);
_bb_idx_to_vpointer.at_put(bb_idx, pointers_idx);
pointers_idx++;
});
@ -541,6 +541,108 @@ void VLoopDependencyGraph::PredsIterator::next() {
}
}
// Cost-model heuristic for nodes that do not contribute to computational
// cost inside the loop.
bool VLoopAnalyzer::has_zero_cost(Node* n) const {
// Outside body?
if (!_vloop.in_bb(n)) { return true; }
// Internal nodes of pointer expressions are most likely folded into
// the load / store and have no additional cost.
if (vpointers().is_in_pointer_expression(n)) { return true; }
// Not all AddP nodes can be detected in VPointer parsing, so
// we filter them out here.
// We don't want to explicitly model the cost of control flow,
// since we have the same CFG structure before and after
// vectorization: a loop head and a loop exit, with a backedge.
if (n->is_AddP() || // Pointer expression
n->is_CFG() || // CFG
n->is_Phi() || // CFG
n->is_Cmp() || // CFG
n->is_Bool()) { // CFG
return true;
}
// All other nodes have a non-zero cost.
return false;
}
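
To illustrate the classification, consider a hypothetical loop body for a[i] += 1 (the exact node set depends on the actual graph shape):
// LoadI / StoreI -> counted: memops carry real work
// AddI of the loaded value -> counted: real computation
// AddP and index arithmetic inside the VPointer expression -> zero cost: folded into addressing
// Phi / CmpI / Bool (loop control) -> zero cost: same CFG before and after vectorization
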
// Compute the cost over all operations in the (scalar) loop.
float VLoopAnalyzer::cost_for_scalar_loop() const {
#ifndef PRODUCT
if (_vloop.is_trace_cost()) {
tty->print_cr("\nVLoopAnalyzer::cost_for_scalar_loop:");
}
#endif
float sum = 0;
for (int j = 0; j < body().body().length(); j++) {
Node* n = body().body().at(j);
if (!has_zero_cost(n)) {
float c = cost_for_scalar_node(n->Opcode());
sum += c;
#ifndef PRODUCT
if (_vloop.is_trace_cost_verbose()) {
tty->print_cr(" -> cost = %.2f for %d %s", c, n->_idx, n->Name());
}
#endif
}
}
#ifndef PRODUCT
if (_vloop.is_trace_cost()) {
tty->print_cr(" total_cost = %.2f", sum);
}
#endif
return sum;
}
// For now, we use unit cost. We might refine that in the future.
// If needed, we could also use platform-specific costs, if the
// default here is not accurate enough.
float VLoopAnalyzer::cost_for_scalar_node(int opcode) const {
float c = 1;
#ifndef PRODUCT
if (_vloop.is_trace_cost()) {
tty->print_cr(" cost = %.2f opc=%s", c, NodeClassNames[opcode]);
}
#endif
return c;
}
// For now, we use unit cost. We might refine that in the future.
// If needed, we could also use platform-specific costs, if the
// default here is not accurate enough.
float VLoopAnalyzer::cost_for_vector_node(int opcode, int vlen, BasicType bt) const {
float c = 1;
#ifndef PRODUCT
if (_vloop.is_trace_cost()) {
tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s",
c, NodeClassNames[opcode], vlen, type2name(bt));
}
#endif
return c;
}
// For now, we use unit cost, i.e. we count the number of backend instructions
// that the vtnode will use. We might refine that in the future.
// If needed, we could also use platform-specific costs, if the
// default here is not accurate enough.
float VLoopAnalyzer::cost_for_vector_reduction_node(int opcode, int vlen, BasicType bt, bool requires_strict_order) const {
// Each reduction is composed of multiple instructions, each estimated with a unit cost.
// Both variants pay a shuffle and a reduce per step:
//   strict order (linear):   vlen steps
//   non-strict (recursive):  log2(vlen) steps
float c = requires_strict_order ? 2 * vlen : 2 * exact_log2(vlen);
#ifndef PRODUCT
if (_vloop.is_trace_cost()) {
tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s requires_strict_order=%s",
c, NodeClassNames[opcode], vlen, type2name(bt),
requires_strict_order ? "true" : "false");
}
#endif
return c;
}
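
A quick worked instance of the formula above, using the unit costs assumed here:
// Example: vlen = 8
//   strict order (linear):   c = 2 * 8             = 16.0
//   non-strict (recursive):  c = 2 * exact_log2(8) = 2 * 3 = 6.0
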
// Computing aliasing runtime check using init and last of main-loop
// -----------------------------------------------------------------
//

View File

@ -209,6 +209,14 @@ public:
return _vtrace.is_trace(TraceAutoVectorizationTag::OPTIMIZATION);
}
bool is_trace_cost() const {
return _vtrace.is_trace(TraceAutoVectorizationTag::COST);
}
bool is_trace_cost_verbose() const {
return _vtrace.is_trace(TraceAutoVectorizationTag::COST_VERBOSE);
}
bool is_trace_speculative_runtime_checks() const {
return _vtrace.is_trace(TraceAutoVectorizationTag::SPECULATIVE_RUNTIME_CHECKS);
}
@ -584,6 +592,32 @@ private:
const Type* container_type(Node* n) const;
};
// Mark all nodes from the loop that are part of any VPointer expression.
class PointerExpressionNodes : public MemPointerParserCallback {
private:
const VLoop& _vloop;
const VLoopBody& _body;
VectorSet _in_pointer_expression;
public:
PointerExpressionNodes(Arena* arena,
const VLoop& vloop,
const VLoopBody& body) :
_vloop(vloop),
_body(body),
_in_pointer_expression(arena) {}
virtual void callback(Node* n) override {
if (!_vloop.in_bb(n)) { return; }
_in_pointer_expression.set(_body.bb_idx(n));
}
bool contains(const Node* n) const {
if (!_vloop.in_bb(n)) { return false; }
return _in_pointer_expression.test(_body.bb_idx(n));
}
};
// Submodule of VLoopAnalyzer.
// We compute and cache the VPointer for every load and store.
class VLoopVPointers : public StackObj {
@ -599,6 +633,9 @@ private:
// Map bb_idx -> index in _vpointers. -1 if not mapped.
GrowableArray<int> _bb_idx_to_vpointer;
// Mark all nodes that are part of any pointer expression.
PointerExpressionNodes _pointer_expression_nodes;
public:
VLoopVPointers(Arena* arena,
const VLoop& vloop,
@ -610,13 +647,18 @@ public:
_bb_idx_to_vpointer(arena,
vloop.estimated_body_length(),
vloop.estimated_body_length(),
-1) {}
-1),
_pointer_expression_nodes(arena, _vloop, _body) {}
NONCOPYABLE(VLoopVPointers);
void compute_vpointers();
const VPointer& vpointer(const MemNode* mem) const;
NOT_PRODUCT( void print() const; )
bool is_in_pointer_expression(const Node* n) const {
return _pointer_expression_nodes.contains(n);
}
private:
void count_vpointers();
void allocate_vpointers_array();
@ -810,6 +852,15 @@ public:
const VLoopVPointers& vpointers() const { return _vpointers; }
const VLoopDependencyGraph& dependency_graph() const { return _dependency_graph; }
// Compute the cost of the (scalar) body.
float cost_for_scalar_loop() const;
bool has_zero_cost(Node* n) const;
// Cost-modeling with tracing.
float cost_for_scalar_node(int opcode) const;
float cost_for_vector_node(int opcode, int vlen, BasicType bt) const;
float cost_for_vector_reduction_node(int opcode, int vlen, BasicType bt, bool requires_strict_order) const;
private:
bool setup_submodules();
VStatus setup_submodules_helper();

View File

@ -186,6 +186,99 @@ int VTransformGraph::count_alive_vtnodes() const {
return count;
}
// Find all nodes that are in the loop, in a 2-phase process:
// - First, we find all nodes that are not before the loop:
// - loop-phis
// - loads and stores that are in the loop
// - and all their transitive uses.
// - Second, we find all nodes that are not after the loop:
// - backedges
// - loads and stores that are in the loop
// - and all their transitive defs.
//
// in_loop: vtn->_idx -> bool
void VTransformGraph::mark_vtnodes_in_loop(VectorSet& in_loop) const {
assert(is_scheduled(), "must already be scheduled");
// Phase 1: find all nodes that are not before the loop.
VectorSet is_not_before_loop;
for (int i = 0; i < _schedule.length(); i++) {
VTransformNode* vtn = _schedule.at(i);
// Is vtn a loop-phi?
if (vtn->isa_LoopPhi() != nullptr ||
vtn->is_load_or_store_in_loop()) {
is_not_before_loop.set(vtn->_idx);
continue;
}
// Or one of its transitive uses?
for (uint j = 0; j < vtn->req(); j++) {
VTransformNode* def = vtn->in_req(j);
if (def != nullptr && is_not_before_loop.test(def->_idx)) {
is_not_before_loop.set(vtn->_idx);
break;
}
}
}
// Phase 2: find all nodes that are not after the loop.
for (int i = _schedule.length()-1; i >= 0; i--) {
VTransformNode* vtn = _schedule.at(i);
if (!is_not_before_loop.test(vtn->_idx)) { continue; }
// Is load or store?
if (vtn->is_load_or_store_in_loop()) {
in_loop.set(vtn->_idx);
continue;
}
for (uint j = 0; j < vtn->out_strong_edges(); j++) {
VTransformNode* use = vtn->out_strong_edge(j);
// Or is vtn a backedge or one of its transitive defs?
if (in_loop.test(use->_idx) ||
use->isa_LoopPhi() != nullptr) {
in_loop.set(vtn->_idx);
break;
}
}
}
}
float VTransformGraph::cost_for_vector_loop() const {
assert(is_scheduled(), "must already be scheduled");
#ifndef PRODUCT
if (_vloop.is_trace_cost()) {
tty->print_cr("\nVTransformGraph::cost_for_vector_loop:");
}
#endif
// We only want to count the cost of nodes that are in the loop.
// This is especially important for cases where we were able to move
// some nodes outside the loop during VTransform::optimize, e.g.:
// VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop
ResourceMark rm;
VectorSet in_loop; // vtn->_idx -> bool
mark_vtnodes_in_loop(in_loop);
float sum = 0;
for (int i = 0; i < _schedule.length(); i++) {
VTransformNode* vtn = _schedule.at(i);
if (!in_loop.test(vtn->_idx)) { continue; }
float c = vtn->cost(_vloop_analyzer);
sum += c;
#ifndef PRODUCT
if (c != 0 && _vloop.is_trace_cost_verbose()) {
tty->print(" -> cost = %.2f for ", c);
vtn->print();
}
#endif
}
#ifndef PRODUCT
if (_vloop.is_trace_cost()) {
tty->print_cr(" total_cost = %.2f", sum);
}
#endif
return sum;
}
#ifndef PRODUCT
void VTransformGraph::trace_schedule_cycle(const GrowableArray<VTransformNode*>& stack,
const VectorSet& pre_visited,
@ -831,6 +924,12 @@ void VTransformNode::apply_vtn_inputs_to_node(Node* n, VTransformApplyState& app
}
}
float VTransformMemopScalarNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
// This is an identity transform, but loads and stores must be counted.
assert(!vloop_analyzer.has_zero_cost(_node), "memop nodes must be counted");
return vloop_analyzer.cost_for_scalar_node(_node->Opcode());
}
VTransformApplyResult VTransformMemopScalarNode::apply(VTransformApplyState& apply_state) const {
apply_vtn_inputs_to_node(_node, apply_state);
// The memory state has to be applied separately: the vtn does not hold it. This allows reordering.
@ -843,6 +942,16 @@ VTransformApplyResult VTransformMemopScalarNode::apply(VTransformApplyState& app
return VTransformApplyResult::make_scalar(_node);
}
float VTransformDataScalarNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
// Since this is an identity transform, we may have nodes here that
// VLoopAnalyzer::cost_for_scalar_loop also does not count for the scalar loop.
if (vloop_analyzer.has_zero_cost(_node)) {
return 0;
} else {
return vloop_analyzer.cost_for_scalar_node(_node->Opcode());
}
}
VTransformApplyResult VTransformDataScalarNode::apply(VTransformApplyState& apply_state) const {
apply_vtn_inputs_to_node(_node, apply_state);
return VTransformApplyResult::make_scalar(_node);
@ -895,6 +1004,10 @@ VTransformApplyResult VTransformOuterNode::apply(VTransformApplyState& apply_sta
return VTransformApplyResult::make_scalar(_node);
}
float VTransformReplicateNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
return vloop_analyzer.cost_for_vector_node(Op_Replicate, _vlen, _element_type);
}
VTransformApplyResult VTransformReplicateNode::apply(VTransformApplyState& apply_state) const {
Node* val = apply_state.transformed_node(in_req(1));
VectorNode* vn = VectorNode::scalar2vector(val, _vlen, _element_type);
@ -902,6 +1015,10 @@ VTransformApplyResult VTransformReplicateNode::apply(VTransformApplyState& apply
return VTransformApplyResult::make_vector(vn);
}
float VTransformConvI2LNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
return vloop_analyzer.cost_for_scalar_node(Op_ConvI2L);
}
VTransformApplyResult VTransformConvI2LNode::apply(VTransformApplyState& apply_state) const {
Node* val = apply_state.transformed_node(in_req(1));
Node* n = new ConvI2LNode(val);
@ -909,6 +1026,12 @@ VTransformApplyResult VTransformConvI2LNode::apply(VTransformApplyState& apply_s
return VTransformApplyResult::make_scalar(n);
}
float VTransformShiftCountNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
int shift_count_opc = VectorNode::shift_count_opcode(_shift_opcode);
return vloop_analyzer.cost_for_scalar_node(Op_AndI) +
vloop_analyzer.cost_for_vector_node(shift_count_opc, _vlen, _element_bt);
}
VTransformApplyResult VTransformShiftCountNode::apply(VTransformApplyState& apply_state) const {
PhaseIdealLoop* phase = apply_state.phase();
Node* shift_count_in = apply_state.transformed_node(in_req(1));
@ -924,6 +1047,9 @@ VTransformApplyResult VTransformShiftCountNode::apply(VTransformApplyState& appl
return VTransformApplyResult::make_vector(vn);
}
float VTransformPopulateIndexNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
return vloop_analyzer.cost_for_vector_node(Op_PopulateIndex, _vlen, _element_bt);
}
VTransformApplyResult VTransformPopulateIndexNode::apply(VTransformApplyState& apply_state) const {
PhaseIdealLoop* phase = apply_state.phase();
@ -936,6 +1062,10 @@ VTransformApplyResult VTransformPopulateIndexNode::apply(VTransformApplyState& a
return VTransformApplyResult::make_vector(vn);
}
float VTransformElementWiseVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
return vloop_analyzer.cost_for_vector_node(_vector_opcode, vector_length(), element_basic_type());
}
VTransformApplyResult VTransformElementWiseVectorNode::apply(VTransformApplyState& apply_state) const {
assert(2 <= req() && req() <= 4, "Must have 1-3 inputs");
const TypeVect* vt = TypeVect::make(element_basic_type(), vector_length());
@ -954,6 +1084,12 @@ VTransformApplyResult VTransformElementWiseVectorNode::apply(VTransformApplyStat
return VTransformApplyResult::make_vector(vn);
}
float VTransformElementWiseLongOpWithCastToIntVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
int vopc = VectorNode::opcode(scalar_opcode(), element_basic_type());
return vloop_analyzer.cost_for_vector_node(vopc, vector_length(), element_basic_type()) +
vloop_analyzer.cost_for_vector_node(Op_VectorCastL2X, vector_length(), T_INT);
}
VTransformApplyResult VTransformElementWiseLongOpWithCastToIntVectorNode::apply(VTransformApplyState& apply_state) const {
uint vlen = vector_length();
int sopc = scalar_opcode();
@ -969,6 +1105,10 @@ VTransformApplyResult VTransformElementWiseLongOpWithCastToIntVectorNode::apply(
return VTransformApplyResult::make_vector(vn);
}
float VTransformReinterpretVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
return vloop_analyzer.cost_for_vector_node(Op_VectorReinterpret, vector_length(), element_basic_type());
}
VTransformApplyResult VTransformReinterpretVectorNode::apply(VTransformApplyState& apply_state) const {
const TypeVect* dst_vt = TypeVect::make(element_basic_type(), vector_length());
const TypeVect* src_vt = TypeVect::make(_src_bt, vector_length());
@ -981,6 +1121,11 @@ VTransformApplyResult VTransformReinterpretVectorNode::apply(VTransformApplyStat
return VTransformApplyResult::make_vector(vn);
}
float VTransformBoolVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
assert(scalar_opcode() == Op_Bool, "");
return vloop_analyzer.cost_for_vector_node(Op_VectorMaskCmp, vector_length(), element_basic_type());
}
VTransformApplyResult VTransformBoolVectorNode::apply(VTransformApplyState& apply_state) const {
const TypeVect* vt = TypeVect::make(element_basic_type(), vector_length());
assert(scalar_opcode() == Op_Bool, "");
@ -1101,10 +1246,10 @@ bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_ou
const BasicType bt = element_basic_type();
const int ropc = vector_reduction_opcode();
const int vopc = VectorNode::opcode(sopc, bt);
if (!Matcher::match_rule_supported_vector(vopc, vlen, bt)) {
DEBUG_ONLY( this->print(); )
assert(false, "do not have normal vector op for this reduction");
return false; // not implemented
if (!Matcher::match_rule_supported_auto_vectorization(vopc, vlen, bt)) {
// The element-wise vector operation needed for the vector accumulator
// is not implemented / supported.
return false;
}
// Traverse up the chain of non strict order reductions, checking that it loops
@ -1236,6 +1381,14 @@ bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_ou
return true; // success
}
float VTransformReductionVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
uint vlen = vector_length();
BasicType bt = element_basic_type();
int vopc = vector_reduction_opcode();
bool requires_strict_order = ReductionNode::auto_vectorization_requires_strict_order(vopc);
return vloop_analyzer.cost_for_vector_reduction_node(vopc, vlen, bt, requires_strict_order);
}
VTransformApplyResult VTransformReductionVectorNode::apply(VTransformApplyState& apply_state) const {
Node* init = apply_state.transformed_node(in_req(1));
Node* vec = apply_state.transformed_node(in_req(2));
@ -1245,6 +1398,12 @@ VTransformApplyResult VTransformReductionVectorNode::apply(VTransformApplyState&
return VTransformApplyResult::make_vector(vn, vn->vect_type());
}
float VTransformLoadVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
uint vlen = vector_length();
BasicType bt = element_basic_type();
return vloop_analyzer.cost_for_vector_node(Op_LoadVector, vlen, bt);
}
VTransformApplyResult VTransformLoadVectorNode::apply(VTransformApplyState& apply_state) const {
int sopc = scalar_opcode();
uint vlen = vector_length();
@ -1274,6 +1433,12 @@ VTransformApplyResult VTransformLoadVectorNode::apply(VTransformApplyState& appl
return VTransformApplyResult::make_vector(vn, vn->vect_type());
}
float VTransformStoreVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
uint vlen = vector_length();
BasicType bt = element_basic_type();
return vloop_analyzer.cost_for_vector_node(Op_StoreVector, vlen, bt);
}
VTransformApplyResult VTransformStoreVectorNode::apply(VTransformApplyState& apply_state) const {
int sopc = scalar_opcode();
uint vlen = vector_length();

View File

@ -51,6 +51,10 @@
// - Compute linearization of the VTransformGraph, into an order that respects
// all edges in the graph (bailout if cycle detected).
//
// - Cost-Model:
// - We use a cost model as a heuristic to determine if vectorization is profitable.
// We compute the cost of the loop with and without vectorization and compare.
//
// - Apply:
// - Changes to the C2 IR are only made once the "apply" method is called.
// - Align the main loop, by adjusting pre loop limit.
@ -190,6 +194,7 @@ public:
void optimize(VTransform& vtransform);
bool schedule();
bool has_store_to_load_forwarding_failure(const VLoopAnalyzer& vloop_analyzer) const;
float cost_for_vector_loop() const;
void apply_vectorization_for_each_vtnode(uint& max_vector_length, uint& max_vector_width) const;
private:
@ -200,6 +205,7 @@ private:
void collect_nodes_without_strong_in_edges(GrowableArray<VTransformNode*>& stack) const;
int count_alive_vtnodes() const;
void mark_vtnodes_in_loop(VectorSet& in_loop) const;
#ifndef PRODUCT
void print_vtnodes() const;
@ -252,6 +258,8 @@ public:
void optimize() { return _graph.optimize(*this); }
bool schedule() { return _graph.schedule(); }
bool is_profitable() const;
float cost_for_vector_loop() const { return _graph.cost_for_vector_loop(); }
bool has_store_to_load_forwarding_failure() const { return _graph.has_store_to_load_forwarding_failure(_vloop_analyzer); }
void apply();
@ -549,6 +557,8 @@ public:
virtual bool optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) { return false; }
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const = 0;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const = 0;
virtual void apply_backedge(VTransformApplyState& apply_state) const {};
void apply_vtn_inputs_to_node(Node* n, VTransformApplyState& apply_state) const;
@ -579,6 +589,7 @@ public:
virtual bool is_load_or_store_in_loop() const override { return true; }
virtual const VPointer& vpointer() const override { return _vpointer; }
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "MemopScalar"; };)
NOT_PRODUCT(virtual void print_spec() const override;)
@ -595,6 +606,7 @@ public:
assert(!_node->is_Mem() && !_node->is_Phi() && !_node->is_CFG(), "must be data node: %s", _node->Name());
}
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "DataScalar"; };)
NOT_PRODUCT(virtual void print_spec() const override;)
@ -612,6 +624,7 @@ public:
}
virtual VTransformLoopPhiNode* isa_LoopPhi() override { return this; }
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override { return 0; }
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
virtual void apply_backedge(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "LoopPhi"; };)
@ -629,6 +642,7 @@ public:
assert(_node->is_CFG(), "must be CFG node: %s", _node->Name());
}
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override { return 0; }
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "CFG"; };)
NOT_PRODUCT(virtual void print_spec() const override;)
@ -655,6 +669,7 @@ public:
VTransformNode(vtransform, n->req()), _node(n) {}
virtual VTransformOuterNode* isa_Outer() override { return this; }
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override { ShouldNotReachHere(); }
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "Outer"; };)
NOT_PRODUCT(virtual void print_spec() const override;)
@ -668,6 +683,7 @@ private:
public:
VTransformReplicateNode(VTransform& vtransform, int vlen, BasicType element_type) :
VTransformNode(vtransform, 2), _vlen(vlen), _element_type(element_type) {}
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "Replicate"; };)
NOT_PRODUCT(virtual void print_spec() const override;)
@ -677,6 +693,7 @@ public:
class VTransformConvI2LNode : public VTransformNode {
public:
VTransformConvI2LNode(VTransform& vtransform) : VTransformNode(vtransform, 2) {}
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "ConvI2L"; };)
};
@ -691,6 +708,7 @@ private:
public:
VTransformShiftCountNode(VTransform& vtransform, int vlen, BasicType element_bt, juint mask, int shift_opcode) :
VTransformNode(vtransform, 2), _vlen(vlen), _element_bt(element_bt), _mask(mask), _shift_opcode(shift_opcode) {}
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "ShiftCount"; };)
NOT_PRODUCT(virtual void print_spec() const override;)
@ -704,6 +722,7 @@ private:
public:
VTransformPopulateIndexNode(VTransform& vtransform, int vlen, const BasicType element_bt) :
VTransformNode(vtransform, 2), _vlen(vlen), _element_bt(element_bt) {}
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "PopulateIndex"; };)
NOT_PRODUCT(virtual void print_spec() const override;)
@ -769,6 +788,7 @@ public:
VTransformElementWiseVectorNode(VTransform& vtransform, uint req, const VTransformVectorNodeProperties properties, const int vector_opcode) :
VTransformVectorNode(vtransform, req, properties), _vector_opcode(vector_opcode) {}
virtual VTransformElementWiseVectorNode* isa_ElementWiseVector() override { return this; }
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "ElementWiseVector"; };)
NOT_PRODUCT(virtual void print_spec() const override;)
@ -781,6 +801,7 @@ class VTransformElementWiseLongOpWithCastToIntVectorNode : public VTransformVect
public:
VTransformElementWiseLongOpWithCastToIntVectorNode(VTransform& vtransform, const VTransformVectorNodeProperties properties) :
VTransformVectorNode(vtransform, 2, properties) {}
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "ElementWiseLongOpWithCastToIntVector"; };)
};
@ -791,6 +812,7 @@ private:
public:
VTransformReinterpretVectorNode(VTransform& vtransform, const VTransformVectorNodeProperties properties, const BasicType src_bt) :
VTransformVectorNode(vtransform, 2, properties), _src_bt(src_bt) {}
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "ReinterpretVector"; };)
NOT_PRODUCT(virtual void print_spec() const override;)
@ -811,6 +833,7 @@ public:
VTransformCmpVectorNode(VTransform& vtransform, const VTransformVectorNodeProperties properties) :
VTransformVectorNode(vtransform, 3, properties) {}
virtual VTransformCmpVectorNode* isa_CmpVector() override { return this; }
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override { return 0; }
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override { return VTransformApplyResult::make_empty(); }
NOT_PRODUCT(virtual const char* name() const override { return "CmpVector"; };)
};
@ -823,6 +846,7 @@ public:
VTransformVectorNode(vtransform, 2, properties), _test(test) {}
VTransformBoolTest test() const { return _test; }
virtual VTransformBoolVectorNode* isa_BoolVector() override { return this; }
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "BoolVector"; };)
NOT_PRODUCT(virtual void print_spec() const override;)
@ -835,6 +859,7 @@ public:
VTransformVectorNode(vtransform, 3, properties) {}
virtual VTransformReductionVectorNode* isa_ReductionVector() override { return this; }
virtual bool optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) override;
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "ReductionVector"; };)
@ -877,6 +902,7 @@ public:
LoadNode::ControlDependency control_dependency() const;
virtual VTransformLoadVectorNode* isa_LoadVector() override { return this; }
virtual bool is_load_in_loop() const override { return true; }
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "LoadVector"; };)
};
@ -888,6 +914,7 @@ public:
VTransformMemVectorNode(vtransform, 4, properties, vpointer, adr_type) {}
virtual VTransformStoreVectorNode* isa_StoreVector() override { return this; }
virtual bool is_load_in_loop() const override { return false; }
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "StoreVector"; };)
};

View File

@ -410,12 +410,12 @@ public class TestIntVect {
}
// Not vectorized: simple addition not profitable, see JDK-8307516. NOTE:
// This check does not document the _desired_ behavior of the system but
// the current behavior (no vectorization)
@Test
@IR(counts = { IRNode.LOAD_VECTOR_I, "= 0",
IRNode.STORE_VECTOR, "= 0" })
@IR(counts = { IRNode.LOAD_VECTOR_I, "> 0",
IRNode.ADD_REDUCTION_VI, "> 0",
IRNode.ADD_VI, "> 0" })
// The reduction is moved outside the loop, and we use an
// element-wise accumulator inside the loop.
int test_sum(int[] a1) {
int sum = 0;
for (int i = 0; i < a1.length; i+=1) {

View File

@ -115,17 +115,18 @@ public class TestAutoVectorizationOverrideProfitability {
@Test
@Warmup(10)
@IR(applyIfCPUFeatureOr = {"avx", "true"},
applyIf = {"AutoVectorizationOverrideProfitability", "= 2"},
applyIf = {"AutoVectorizationOverrideProfitability", "> 0"},
counts = {IRNode.ADD_REDUCTION_VI, "> 0", IRNode.ADD_VI, "> 0"})
@IR(applyIfCPUFeatureOr = {"avx", "true"},
applyIf = {"AutoVectorizationOverrideProfitability", "< 2"},
applyIf = {"AutoVectorizationOverrideProfitability", "= 0"},
counts = {IRNode.ADD_REDUCTION_VI, "= 0", IRNode.ADD_VI, "= 0"})
// Current heuristics say that this simple int reduction is not profitable.
// But it would actually be profitable, since we are able to move the
// reduction out of the loop (we can reorder the reduction). When moving
// the reduction out of the loop, we instead accumulate with a simple
// ADD_VI inside the loop.
// See: JDK-8307516 JDK-8345044
// We are able to vectorize the reduction. On its own, that would
// not reduce the cost sufficiently in all cases, because vectorized
// reduction nodes are expensive. But since integer addition is associative,
// we can move the reduction vector out of the loop. Instead, we accumulate
// with a simple ADD_VI inside the loop, which is very cheap. After the
// loop, we only need to use the vectorized reduction once, to collapse
// the partial sums contained in the lanes (see the sketch below).
private static int simpleIntReduction() {
int sum = 0;
for (int i = 0; i < aI.length; i++) {

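To make the comment above concrete, here is a minimal scalar C++ model of the loop shape after the transform; the function name, the 4-lane VLEN, and the inner lane loops (each standing in for one vector instruction) are illustrative assumptions, not the code C2 emits:

#include <array>

// Scalar model: the main loop only does element-wise adds (one ADD_VI per
// iteration), and the expensive reduction runs exactly once after the loop
// to collapse the per-lane partial sums.
int sum_after_transform(const int* a, int n) {
  constexpr int VLEN = 4;               // assumed vector length
  std::array<int, VLEN> acc{};          // vector accumulator, all lanes zero
  int i = 0;
  for (; i + VLEN <= n; i += VLEN) {
    for (int l = 0; l < VLEN; l++) {    // models a single ADD_VI
      acc[l] += a[i + l];
    }
  }
  int sum = 0;
  for (int l = 0; l < VLEN; l++) {      // models the one ADD_REDUCTION_VI
    sum += acc[l];
  }
  for (; i < n; i++) {                  // scalar tail (post loop)
    sum += a[i];
  }
  return sum;
}
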
File diff suppressed because it is too large.

View File

@ -28,6 +28,10 @@ import org.openjdk.jmh.infra.*;
import java.util.concurrent.TimeUnit;
import java.util.Random;
/**
* Note: there is a corresponding IR test:
* test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java
*/
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@State(Scope.Thread)