mirror of
https://github.com/openjdk/jdk.git
synced 2026-02-03 23:18:28 +00:00
8340093: C2 SuperWord: implement cost model
Reviewed-by: kvn, qamai
This commit is contained in:
parent
6e838d6f9a
commit
72989e0fac
@ -129,18 +129,24 @@ source %{
|
||||
bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) {
|
||||
if (UseSVE == 0) {
|
||||
// These operations are not profitable to be vectorized on NEON, because no direct
|
||||
// NEON instructions support them. But the match rule support for them is profitable for
|
||||
// Vector API intrinsics.
|
||||
// NEON instructions support them. They use multiple instructions which is more
|
||||
// expensive in almost all cases where we would auto vectorize.
|
||||
// But the match rule support for them is profitable for Vector API intrinsics.
|
||||
if ((opcode == Op_VectorCastD2X && (bt == T_INT || bt == T_SHORT)) ||
|
||||
(opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
|
||||
(opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
|
||||
(opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
|
||||
opcode == Op_MulVL ||
|
||||
// The implementations of Op_AddReductionVD/F in Neon are for the Vector API only.
|
||||
// They are not suitable for auto-vectorization because the result would not conform
|
||||
// to the JLS, Section Evaluation Order.
|
||||
// Note: we could implement sequential reductions for these reduction operators, but
|
||||
// this will still almost never lead to speedups, because the sequential
|
||||
// reductions are latency limited along the reduction chain, and not
|
||||
// throughput limited. This is unlike unordered reductions (associative op)
|
||||
// and element-wise ops which are usually throughput limited.
|
||||
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
|
||||
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
|
||||
opcode == Op_MulVL) {
|
||||
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@ -119,18 +119,24 @@ source %{
|
||||
bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) {
|
||||
if (UseSVE == 0) {
|
||||
// These operations are not profitable to be vectorized on NEON, because no direct
|
||||
// NEON instructions support them. But the match rule support for them is profitable for
|
||||
// Vector API intrinsics.
|
||||
// NEON instructions support them. They use multiple instructions which is more
|
||||
// expensive in almost all cases where we would auto vectorize.
|
||||
// But the match rule support for them is profitable for Vector API intrinsics.
|
||||
if ((opcode == Op_VectorCastD2X && (bt == T_INT || bt == T_SHORT)) ||
|
||||
(opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
|
||||
(opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
|
||||
(opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
|
||||
opcode == Op_MulVL ||
|
||||
// The implementations of Op_AddReductionVD/F in Neon are for the Vector API only.
|
||||
// They are not suitable for auto-vectorization because the result would not conform
|
||||
// to the JLS, Section Evaluation Order.
|
||||
// Note: we could implement sequential reductions for these reduction operators, but
|
||||
// this will still almost never lead to speedups, because the sequential
|
||||
// reductions are latency limited along the reduction chain, and not
|
||||
// throughput limited. This is unlike unordered reductions (associative op)
|
||||
// and element-wise ops which are usually throughput limited.
|
||||
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
|
||||
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
|
||||
opcode == Op_MulVL) {
|
||||
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@ -42,9 +42,7 @@ SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer) :
|
||||
),
|
||||
_vpointer_for_main_loop_alignment(nullptr),
|
||||
_aw_for_main_loop_alignment(0),
|
||||
_do_vector_loop(phase()->C->do_vector_loop()), // whether to do vectorization/simd style
|
||||
_num_work_vecs(0), // amount of vector work we have
|
||||
_num_reductions(0) // amount of reduction work we have
|
||||
_do_vector_loop(phase()->C->do_vector_loop()) // whether to do vectorization/simd style
|
||||
{
|
||||
}
|
||||
|
||||
@ -1567,18 +1565,6 @@ void SuperWord::filter_packs_for_implemented() {
|
||||
|
||||
// Remove packs that are not profitable.
|
||||
void SuperWord::filter_packs_for_profitable() {
|
||||
// Count the number of reductions vs other vector ops, for the
|
||||
// reduction profitability heuristic.
|
||||
for (int i = 0; i < _packset.length(); i++) {
|
||||
Node_List* pack = _packset.at(i);
|
||||
Node* n = pack->at(0);
|
||||
if (is_marked_reduction(n)) {
|
||||
_num_reductions++;
|
||||
} else {
|
||||
_num_work_vecs++;
|
||||
}
|
||||
}
|
||||
|
||||
// Remove packs that are not profitable
|
||||
auto filter = [&](const Node_List* pack) {
|
||||
return profitable(pack);
|
||||
@ -1595,31 +1581,7 @@ bool SuperWord::implemented(const Node_List* pack, const uint size) const {
|
||||
if (p0 != nullptr) {
|
||||
int opc = p0->Opcode();
|
||||
if (is_marked_reduction(p0)) {
|
||||
const Type *arith_type = p0->bottom_type();
|
||||
// This heuristic predicts that 2-element reductions for INT/LONG are not
|
||||
// profitable. This heuristic was added in JDK-8078563. The argument
|
||||
// was that reductions are not just a single instruction, but multiple, and
|
||||
// hence it is not directly clear that they are profitable. If we only have
|
||||
// two elements per vector, then the performance gains from non-reduction
|
||||
// vectors are at most going from 2 scalar instructions to 1 vector instruction.
|
||||
// But a 2-element reduction vector goes from 2 scalar instructions to
|
||||
// 3 instructions (1 shuffle and two reduction ops).
|
||||
// However, this optimization assumes that these reductions stay in the loop
|
||||
// which may not be true any more in most cases after the introduction of:
|
||||
// See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop
|
||||
// Hence, this heuristic has room for improvement.
|
||||
bool is_two_element_int_or_long_reduction = (size == 2) &&
|
||||
(arith_type->basic_type() == T_INT ||
|
||||
arith_type->basic_type() == T_LONG);
|
||||
if (is_two_element_int_or_long_reduction && AutoVectorizationOverrideProfitability != 2) {
|
||||
#ifndef PRODUCT
|
||||
if (is_trace_superword_rejections()) {
|
||||
tty->print_cr("\nPerformance heuristic: 2-element INT/LONG reduction not profitable.");
|
||||
tty->print_cr(" Can override with AutoVectorizationOverrideProfitability=2");
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
const Type* arith_type = p0->bottom_type();
|
||||
retValue = ReductionNode::implemented(opc, size, arith_type->basic_type());
|
||||
} else if (VectorNode::is_convert_opcode(opc)) {
|
||||
retValue = VectorCastNode::implemented(opc, size, velt_basic_type(p0->in(1)), velt_basic_type(p0));
|
||||
@ -1791,26 +1753,6 @@ bool SuperWord::profitable(const Node_List* p) const {
|
||||
// The second input has to be the vector we wanted to reduce,
|
||||
// but it was not packed.
|
||||
return false;
|
||||
} else if (_num_work_vecs == _num_reductions && AutoVectorizationOverrideProfitability != 2) {
|
||||
// This heuristic predicts that the reduction is not profitable.
|
||||
// Reduction vectors can be expensive, because they require multiple
|
||||
// operations to fold all the lanes together. Hence, vectorizing the
|
||||
// reduction is not profitable on its own. Hence, we need a lot of
|
||||
// other "work vectors" that deliver performance improvements to
|
||||
// balance out the performance loss due to reductions.
|
||||
// This heuristic is a bit simplistic, and assumes that the reduction
|
||||
// vector stays in the loop. But in some cases, we can move the
|
||||
// reduction out of the loop, replacing it with a single vector op.
|
||||
// See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop
|
||||
// Hence, this heuristic has room for improvement.
|
||||
#ifndef PRODUCT
|
||||
if (is_trace_superword_rejections()) {
|
||||
tty->print_cr("\nPerformance heuristic: not enough vectors in the loop to make");
|
||||
tty->print_cr(" reduction profitable.");
|
||||
tty->print_cr(" Can override with AutoVectorizationOverrideProfitability=2");
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
} else if (second_pk->size() != p->size()) {
|
||||
return false;
|
||||
}
|
||||
@ -1969,19 +1911,53 @@ bool SuperWord::do_vtransform() const {
|
||||
vtransform.optimize();
|
||||
|
||||
if (!vtransform.schedule()) { return false; }
|
||||
if (vtransform.has_store_to_load_forwarding_failure()) { return false; }
|
||||
|
||||
if (!vtransform.is_profitable()) { return false; }
|
||||
|
||||
vtransform.apply();
|
||||
return true;
|
||||
}
|
||||
|
||||
// Check Cost-Model, and other heuristics.
|
||||
// Can be overridden with AutoVectorizationOverrideProfitability.
|
||||
bool VTransform::is_profitable() const {
|
||||
assert(_graph.is_scheduled(), "must already be scheduled");
|
||||
|
||||
if (AutoVectorizationOverrideProfitability == 0) {
|
||||
#ifndef PRODUCT
|
||||
if (is_trace_superword_any()) {
|
||||
if (_trace._info) {
|
||||
tty->print_cr("\nForced bailout of vectorization (AutoVectorizationOverrideProfitability=0).");
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
vtransform.apply();
|
||||
return true;
|
||||
if (AutoVectorizationOverrideProfitability == 2) {
|
||||
#ifndef PRODUCT
|
||||
if (_trace._info) {
|
||||
tty->print_cr("\nForced vectorization, ignoring profitability (AutoVectorizationOverrideProfitability=2).");
|
||||
}
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
// Note: currently we only do throughput-based cost-modeling. In the future, we could
|
||||
// also implement latency-based cost-modeling and take store-to-load-forwarding
|
||||
// failures into account as the latency between the load and store. This would
|
||||
// allow a more precise tradeoff between the forwarding failure penalty versus
|
||||
// the vectorization gains.
|
||||
if (has_store_to_load_forwarding_failure()) { return false; }
|
||||
|
||||
// Cost-model
|
||||
float scalar_cost = _vloop_analyzer.cost_for_scalar_loop();
|
||||
float vector_cost = cost_for_vector_loop();
|
||||
#ifndef PRODUCT
|
||||
if (_trace._info) {
|
||||
tty->print_cr("\nVTransform: scalar_cost = %.2f vs vector_cost = %.2f",
|
||||
scalar_cost, vector_cost);
|
||||
}
|
||||
#endif
|
||||
return vector_cost < scalar_cost;
|
||||
}
|
||||
|
||||
// Apply the vectorization, i.e. we irreversibly edit the C2 graph. At this point, all
|
||||
|
||||
@ -549,8 +549,6 @@ class SuperWord : public ResourceObj {
|
||||
|
||||
private:
|
||||
bool _do_vector_loop; // whether to do vectorization/simd style
|
||||
int _num_work_vecs; // Number of non memory vector operations
|
||||
int _num_reductions; // Number of reduction expressions applied
|
||||
|
||||
// Accessors
|
||||
Arena* arena() { return &_arena; }
|
||||
|
||||
@ -38,7 +38,7 @@
|
||||
flags(MEMORY_SLICES, "Trace VLoopMemorySlices") \
|
||||
flags(BODY, "Trace VLoopBody") \
|
||||
flags(TYPES, "Trace VLoopTypes") \
|
||||
flags(POINTERS, "Trace VLoopPointers") \
|
||||
flags(POINTERS, "Trace VLoopVPointers") \
|
||||
flags(DEPENDENCY_GRAPH, "Trace VLoopDependencyGraph") \
|
||||
flags(SW_ADJACENT_MEMOPS, "Trace SuperWord::find_adjacent_memop_pairs") \
|
||||
flags(SW_REJECTIONS, "Trace SuperWord rejections (non vectorizations)") \
|
||||
@ -47,6 +47,8 @@
|
||||
flags(SW_VERBOSE, "Trace SuperWord verbose (all SW tags enabled)") \
|
||||
flags(VTRANSFORM, "Trace VTransform Graph") \
|
||||
flags(OPTIMIZATION, "Trace VTransform::optimize") \
|
||||
flags(COST, "Trace cost of VLoop (scalar) and VTransform (vector)") \
|
||||
flags(COST_VERBOSE, "Trace like COST, but more verbose") \
|
||||
flags(ALIGN_VECTOR, "Trace AlignVector") \
|
||||
flags(SPECULATIVE_ALIASING_ANALYSIS, "Trace Speculative Aliasing Analysis") \
|
||||
flags(SPECULATIVE_RUNTIME_CHECKS, "Trace VTransform::apply_speculative_runtime_checks") \
|
||||
|
||||
@ -287,7 +287,7 @@ void VLoopVPointers::compute_and_cache_vpointers() {
|
||||
int pointers_idx = 0;
|
||||
_body.for_each_mem([&] (MemNode* const mem, int bb_idx) {
|
||||
// Placement new: construct directly into the array.
|
||||
::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop);
|
||||
::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop, _pointer_expression_nodes);
|
||||
_bb_idx_to_vpointer.at_put(bb_idx, pointers_idx);
|
||||
pointers_idx++;
|
||||
});
|
||||
@ -541,6 +541,108 @@ void VLoopDependencyGraph::PredsIterator::next() {
|
||||
}
|
||||
}
|
||||
|
||||
// Cost-model heuristic for nodes that do not contribute to computational
|
||||
// cost inside the loop.
|
||||
bool VLoopAnalyzer::has_zero_cost(Node* n) const {
|
||||
// Outside body?
|
||||
if (!_vloop.in_bb(n)) { return true; }
|
||||
|
||||
// Internal nodes of pointer expressions are most likely folded into
|
||||
// the load / store and have no additional cost.
|
||||
if (vpointers().is_in_pointer_expression(n)) { return true; }
|
||||
|
||||
// Not all AddP nodes can be detected in VPointer parsing, so
|
||||
// we filter them out here.
|
||||
// We don't want to explicitly model the cost of control flow,
|
||||
// since we have the same CFG structure before and after
|
||||
// vectorization: A loop head, a loop exit, with a backedge.
|
||||
if (n->is_AddP() || // Pointer expression
|
||||
n->is_CFG() || // CFG
|
||||
n->is_Phi() || // CFG
|
||||
n->is_Cmp() || // CFG
|
||||
n->is_Bool()) { // CFG
|
||||
return true;
|
||||
}
|
||||
|
||||
// All other nodes have a non-zero cost.
|
||||
return false;
|
||||
}
|
||||
|
||||
// Compute the cost over all operations in the (scalar) loop.
|
||||
float VLoopAnalyzer::cost_for_scalar_loop() const {
|
||||
#ifndef PRODUCT
|
||||
if (_vloop.is_trace_cost()) {
|
||||
tty->print_cr("\nVLoopAnalyzer::cost_for_scalar_loop:");
|
||||
}
|
||||
#endif
|
||||
|
||||
float sum = 0;
|
||||
for (int j = 0; j < body().body().length(); j++) {
|
||||
Node* n = body().body().at(j);
|
||||
if (!has_zero_cost(n)) {
|
||||
float c = cost_for_scalar_node(n->Opcode());
|
||||
sum += c;
|
||||
#ifndef PRODUCT
|
||||
if (_vloop.is_trace_cost_verbose()) {
|
||||
tty->print_cr(" -> cost = %.2f for %d %s", c, n->_idx, n->Name());
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef PRODUCT
|
||||
if (_vloop.is_trace_cost()) {
|
||||
tty->print_cr(" total_cost = %.2f", sum);
|
||||
}
|
||||
#endif
|
||||
return sum;
|
||||
}
|
||||
|
||||
// For now, we use unit cost. We might refine that in the future.
|
||||
// If needed, we could also use platform specific costs, if the
|
||||
// default here is not accurate enough.
|
||||
float VLoopAnalyzer::cost_for_scalar_node(int opcode) const {
|
||||
float c = 1;
|
||||
#ifndef PRODUCT
|
||||
if (_vloop.is_trace_cost()) {
|
||||
tty->print_cr(" cost = %.2f opc=%s", c, NodeClassNames[opcode]);
|
||||
}
|
||||
#endif
|
||||
return c;
|
||||
}
|
||||
|
||||
// For now, we use unit cost. We might refine that in the future.
|
||||
// If needed, we could also use platform specific costs, if the
|
||||
// default here is not accurate enough.
|
||||
float VLoopAnalyzer::cost_for_vector_node(int opcode, int vlen, BasicType bt) const {
|
||||
float c = 1;
|
||||
#ifndef PRODUCT
|
||||
if (_vloop.is_trace_cost()) {
|
||||
tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s",
|
||||
c, NodeClassNames[opcode], vlen, type2name(bt));
|
||||
}
|
||||
#endif
|
||||
return c;
|
||||
}
|
||||
|
||||
// For now, we use unit cost, i.e. we count the number of backend instructions
|
||||
// that the vtnode will use. We might refine that in the future.
|
||||
// If needed, we could also use platform specific costs, if the
|
||||
// default here is not accurate enough.
|
||||
float VLoopAnalyzer::cost_for_vector_reduction_node(int opcode, int vlen, BasicType bt, bool requires_strict_order) const {
|
||||
// Each reduction is composed of multiple instructions, each estimated with a unit cost.
|
||||
// Linear: shuffle and reduce Recursive: shuffle and reduce
|
||||
float c = requires_strict_order ? 2 * vlen : 2 * exact_log2(vlen);
|
||||
#ifndef PRODUCT
|
||||
if (_vloop.is_trace_cost()) {
|
||||
tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s requires_strict_order=%s",
|
||||
c, NodeClassNames[opcode], vlen, type2name(bt),
|
||||
requires_strict_order ? "true" : "false");
|
||||
}
|
||||
#endif
|
||||
return c;
|
||||
}
|
||||
|
||||
// Computing aliasing runtime check using init and last of main-loop
|
||||
// -----------------------------------------------------------------
|
||||
//
|
||||
|
||||
@ -209,6 +209,14 @@ public:
|
||||
return _vtrace.is_trace(TraceAutoVectorizationTag::OPTIMIZATION);
|
||||
}
|
||||
|
||||
bool is_trace_cost() const {
|
||||
return _vtrace.is_trace(TraceAutoVectorizationTag::COST);
|
||||
}
|
||||
|
||||
bool is_trace_cost_verbose() const {
|
||||
return _vtrace.is_trace(TraceAutoVectorizationTag::COST_VERBOSE);
|
||||
}
|
||||
|
||||
bool is_trace_speculative_runtime_checks() const {
|
||||
return _vtrace.is_trace(TraceAutoVectorizationTag::SPECULATIVE_RUNTIME_CHECKS);
|
||||
}
|
||||
@ -584,6 +592,32 @@ private:
|
||||
const Type* container_type(Node* n) const;
|
||||
};
|
||||
|
||||
// Mark all nodes from the loop that are part of any VPointer expression.
|
||||
class PointerExpressionNodes : public MemPointerParserCallback {
|
||||
private:
|
||||
const VLoop& _vloop;
|
||||
const VLoopBody& _body;
|
||||
VectorSet _in_pointer_expression;
|
||||
|
||||
public:
|
||||
PointerExpressionNodes(Arena* arena,
|
||||
const VLoop& vloop,
|
||||
const VLoopBody& body) :
|
||||
_vloop(vloop),
|
||||
_body(body),
|
||||
_in_pointer_expression(arena) {}
|
||||
|
||||
virtual void callback(Node* n) override {
|
||||
if (!_vloop.in_bb(n)) { return; }
|
||||
_in_pointer_expression.set(_body.bb_idx(n));
|
||||
}
|
||||
|
||||
bool contains(const Node* n) const {
|
||||
if (!_vloop.in_bb(n)) { return false; }
|
||||
return _in_pointer_expression.test(_body.bb_idx(n));
|
||||
}
|
||||
};
|
||||
|
||||
// Submodule of VLoopAnalyzer.
|
||||
// We compute and cache the VPointer for every load and store.
|
||||
class VLoopVPointers : public StackObj {
|
||||
@ -599,6 +633,9 @@ private:
|
||||
// Map bb_idx -> index in _vpointers. -1 if not mapped.
|
||||
GrowableArray<int> _bb_idx_to_vpointer;
|
||||
|
||||
// Mark all nodes that are part of any pointers expression.
|
||||
PointerExpressionNodes _pointer_expression_nodes;
|
||||
|
||||
public:
|
||||
VLoopVPointers(Arena* arena,
|
||||
const VLoop& vloop,
|
||||
@ -610,13 +647,18 @@ public:
|
||||
_bb_idx_to_vpointer(arena,
|
||||
vloop.estimated_body_length(),
|
||||
vloop.estimated_body_length(),
|
||||
-1) {}
|
||||
-1),
|
||||
_pointer_expression_nodes(arena, _vloop, _body) {}
|
||||
NONCOPYABLE(VLoopVPointers);
|
||||
|
||||
void compute_vpointers();
|
||||
const VPointer& vpointer(const MemNode* mem) const;
|
||||
NOT_PRODUCT( void print() const; )
|
||||
|
||||
bool is_in_pointer_expression(const Node* n) const {
|
||||
return _pointer_expression_nodes.contains(n);
|
||||
}
|
||||
|
||||
private:
|
||||
void count_vpointers();
|
||||
void allocate_vpointers_array();
|
||||
@ -810,6 +852,15 @@ public:
|
||||
const VLoopVPointers& vpointers() const { return _vpointers; }
|
||||
const VLoopDependencyGraph& dependency_graph() const { return _dependency_graph; }
|
||||
|
||||
// Compute the cost of the (scalar) body.
|
||||
float cost_for_scalar_loop() const;
|
||||
bool has_zero_cost(Node* n) const;
|
||||
|
||||
// Cost-modeling with tracing.
|
||||
float cost_for_scalar_node(int opcode) const;
|
||||
float cost_for_vector_node(int opcode, int vlen, BasicType bt) const;
|
||||
float cost_for_vector_reduction_node(int opcode, int vlen, BasicType bt, bool requires_strict_order) const;
|
||||
|
||||
private:
|
||||
bool setup_submodules();
|
||||
VStatus setup_submodules_helper();
|
||||
|
||||
@ -186,6 +186,99 @@ int VTransformGraph::count_alive_vtnodes() const {
|
||||
return count;
|
||||
}
|
||||
|
||||
// Find all nodes that in the loop, in a 2-phase process:
|
||||
// - First, find all nodes that are not before the loop:
|
||||
// - loop-phis
|
||||
// - loads and stores that are in the loop
|
||||
// - and all their transitive uses.
|
||||
// - Second, we find all nodes that are not after the loop:
|
||||
// - backedges
|
||||
// - loads and stores that are in the loop
|
||||
// - and all their transitive uses.
|
||||
//
|
||||
// in_loop: vtn->_idx -> bool
|
||||
void VTransformGraph::mark_vtnodes_in_loop(VectorSet& in_loop) const {
|
||||
assert(is_scheduled(), "must already be scheduled");
|
||||
|
||||
// Phase 1: find all nodes that are not before the loop.
|
||||
VectorSet is_not_before_loop;
|
||||
for (int i = 0; i < _schedule.length(); i++) {
|
||||
VTransformNode* vtn = _schedule.at(i);
|
||||
// Is vtn a loop-phi?
|
||||
if (vtn->isa_LoopPhi() != nullptr ||
|
||||
vtn->is_load_or_store_in_loop()) {
|
||||
is_not_before_loop.set(vtn->_idx);
|
||||
continue;
|
||||
}
|
||||
// Or one of its transitive uses?
|
||||
for (uint j = 0; j < vtn->req(); j++) {
|
||||
VTransformNode* def = vtn->in_req(j);
|
||||
if (def != nullptr && is_not_before_loop.test(def->_idx)) {
|
||||
is_not_before_loop.set(vtn->_idx);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Phase 2: find all nodes that are not after the loop.
|
||||
for (int i = _schedule.length()-1; i >= 0; i--) {
|
||||
VTransformNode* vtn = _schedule.at(i);
|
||||
if (!is_not_before_loop.test(vtn->_idx)) { continue; }
|
||||
// Is load or store?
|
||||
if (vtn->is_load_or_store_in_loop()) {
|
||||
in_loop.set(vtn->_idx);
|
||||
continue;
|
||||
}
|
||||
for (uint i = 0; i < vtn->out_strong_edges(); i++) {
|
||||
VTransformNode* use = vtn->out_strong_edge(i);
|
||||
// Or is vtn a backedge or one of its transitive defs?
|
||||
if (in_loop.test(use->_idx) ||
|
||||
use->isa_LoopPhi() != nullptr) {
|
||||
in_loop.set(vtn->_idx);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
float VTransformGraph::cost_for_vector_loop() const {
|
||||
assert(is_scheduled(), "must already be scheduled");
|
||||
#ifndef PRODUCT
|
||||
if (_vloop.is_trace_cost()) {
|
||||
tty->print_cr("\nVTransformGraph::cost_for_vector_loop:");
|
||||
}
|
||||
#endif
|
||||
|
||||
// We only want to count the cost of nodes that are in the loop.
|
||||
// This is especially important for cases where we were able to move
|
||||
// some nodes outside the loop during VTransform::optimize, e.g.:
|
||||
// VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop
|
||||
ResourceMark rm;
|
||||
VectorSet in_loop; // vtn->_idx -> bool
|
||||
mark_vtnodes_in_loop(in_loop);
|
||||
|
||||
float sum = 0;
|
||||
for (int i = 0; i < _schedule.length(); i++) {
|
||||
VTransformNode* vtn = _schedule.at(i);
|
||||
if (!in_loop.test(vtn->_idx)) { continue; }
|
||||
float c = vtn->cost(_vloop_analyzer);
|
||||
sum += c;
|
||||
#ifndef PRODUCT
|
||||
if (c != 0 && _vloop.is_trace_cost_verbose()) {
|
||||
tty->print(" -> cost = %.2f for ", c);
|
||||
vtn->print();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifndef PRODUCT
|
||||
if (_vloop.is_trace_cost()) {
|
||||
tty->print_cr(" total_cost = %.2f", sum);
|
||||
}
|
||||
#endif
|
||||
return sum;
|
||||
}
|
||||
|
||||
#ifndef PRODUCT
|
||||
void VTransformGraph::trace_schedule_cycle(const GrowableArray<VTransformNode*>& stack,
|
||||
const VectorSet& pre_visited,
|
||||
@ -831,6 +924,12 @@ void VTransformNode::apply_vtn_inputs_to_node(Node* n, VTransformApplyState& app
|
||||
}
|
||||
}
|
||||
|
||||
float VTransformMemopScalarNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
|
||||
// This is an identity transform, but loads and stores must be counted.
|
||||
assert(!vloop_analyzer.has_zero_cost(_node), "memop nodes must be counted");
|
||||
return vloop_analyzer.cost_for_scalar_node(_node->Opcode());
|
||||
}
|
||||
|
||||
VTransformApplyResult VTransformMemopScalarNode::apply(VTransformApplyState& apply_state) const {
|
||||
apply_vtn_inputs_to_node(_node, apply_state);
|
||||
// The memory state has to be applied separately: the vtn does not hold it. This allows reordering.
|
||||
@ -843,6 +942,16 @@ VTransformApplyResult VTransformMemopScalarNode::apply(VTransformApplyState& app
|
||||
return VTransformApplyResult::make_scalar(_node);
|
||||
}
|
||||
|
||||
float VTransformDataScalarNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
|
||||
// Since this is an identity transform, we may have nodes that also
|
||||
// VLoopAnalyzer::cost does not count for the scalar loop.
|
||||
if (vloop_analyzer.has_zero_cost(_node)) {
|
||||
return 0;
|
||||
} else {
|
||||
return vloop_analyzer.cost_for_scalar_node(_node->Opcode());
|
||||
}
|
||||
}
|
||||
|
||||
VTransformApplyResult VTransformDataScalarNode::apply(VTransformApplyState& apply_state) const {
|
||||
apply_vtn_inputs_to_node(_node, apply_state);
|
||||
return VTransformApplyResult::make_scalar(_node);
|
||||
@ -895,6 +1004,10 @@ VTransformApplyResult VTransformOuterNode::apply(VTransformApplyState& apply_sta
|
||||
return VTransformApplyResult::make_scalar(_node);
|
||||
}
|
||||
|
||||
float VTransformReplicateNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
|
||||
return vloop_analyzer.cost_for_vector_node(Op_Replicate, _vlen, _element_type);
|
||||
}
|
||||
|
||||
VTransformApplyResult VTransformReplicateNode::apply(VTransformApplyState& apply_state) const {
|
||||
Node* val = apply_state.transformed_node(in_req(1));
|
||||
VectorNode* vn = VectorNode::scalar2vector(val, _vlen, _element_type);
|
||||
@ -902,6 +1015,10 @@ VTransformApplyResult VTransformReplicateNode::apply(VTransformApplyState& apply
|
||||
return VTransformApplyResult::make_vector(vn);
|
||||
}
|
||||
|
||||
float VTransformConvI2LNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
|
||||
return vloop_analyzer.cost_for_scalar_node(Op_ConvI2L);
|
||||
}
|
||||
|
||||
VTransformApplyResult VTransformConvI2LNode::apply(VTransformApplyState& apply_state) const {
|
||||
Node* val = apply_state.transformed_node(in_req(1));
|
||||
Node* n = new ConvI2LNode(val);
|
||||
@ -909,6 +1026,12 @@ VTransformApplyResult VTransformConvI2LNode::apply(VTransformApplyState& apply_s
|
||||
return VTransformApplyResult::make_scalar(n);
|
||||
}
|
||||
|
||||
float VTransformShiftCountNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
|
||||
int shift_count_opc = VectorNode::shift_count_opcode(_shift_opcode);
|
||||
return vloop_analyzer.cost_for_scalar_node(Op_AndI) +
|
||||
vloop_analyzer.cost_for_vector_node(shift_count_opc, _vlen, _element_bt);
|
||||
}
|
||||
|
||||
VTransformApplyResult VTransformShiftCountNode::apply(VTransformApplyState& apply_state) const {
|
||||
PhaseIdealLoop* phase = apply_state.phase();
|
||||
Node* shift_count_in = apply_state.transformed_node(in_req(1));
|
||||
@ -924,6 +1047,9 @@ VTransformApplyResult VTransformShiftCountNode::apply(VTransformApplyState& appl
|
||||
return VTransformApplyResult::make_vector(vn);
|
||||
}
|
||||
|
||||
float VTransformPopulateIndexNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
|
||||
return vloop_analyzer.cost_for_vector_node(Op_PopulateIndex, _vlen, _element_bt);
|
||||
}
|
||||
|
||||
VTransformApplyResult VTransformPopulateIndexNode::apply(VTransformApplyState& apply_state) const {
|
||||
PhaseIdealLoop* phase = apply_state.phase();
|
||||
@ -936,6 +1062,10 @@ VTransformApplyResult VTransformPopulateIndexNode::apply(VTransformApplyState& a
|
||||
return VTransformApplyResult::make_vector(vn);
|
||||
}
|
||||
|
||||
float VTransformElementWiseVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
|
||||
return vloop_analyzer.cost_for_vector_node(_vector_opcode, vector_length(), element_basic_type());
|
||||
}
|
||||
|
||||
VTransformApplyResult VTransformElementWiseVectorNode::apply(VTransformApplyState& apply_state) const {
|
||||
assert(2 <= req() && req() <= 4, "Must have 1-3 inputs");
|
||||
const TypeVect* vt = TypeVect::make(element_basic_type(), vector_length());
|
||||
@ -954,6 +1084,12 @@ VTransformApplyResult VTransformElementWiseVectorNode::apply(VTransformApplyStat
|
||||
return VTransformApplyResult::make_vector(vn);
|
||||
}
|
||||
|
||||
float VTransformElementWiseLongOpWithCastToIntVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
|
||||
int vopc = VectorNode::opcode(scalar_opcode(), element_basic_type());
|
||||
return vloop_analyzer.cost_for_vector_node(vopc, vector_length(), element_basic_type()) +
|
||||
vloop_analyzer.cost_for_vector_node(Op_VectorCastL2X, vector_length(), T_INT);
|
||||
}
|
||||
|
||||
VTransformApplyResult VTransformElementWiseLongOpWithCastToIntVectorNode::apply(VTransformApplyState& apply_state) const {
|
||||
uint vlen = vector_length();
|
||||
int sopc = scalar_opcode();
|
||||
@ -969,6 +1105,10 @@ VTransformApplyResult VTransformElementWiseLongOpWithCastToIntVectorNode::apply(
|
||||
return VTransformApplyResult::make_vector(vn);
|
||||
}
|
||||
|
||||
float VTransformReinterpretVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
|
||||
return vloop_analyzer.cost_for_vector_node(Op_VectorReinterpret, vector_length(), element_basic_type());
|
||||
}
|
||||
|
||||
VTransformApplyResult VTransformReinterpretVectorNode::apply(VTransformApplyState& apply_state) const {
|
||||
const TypeVect* dst_vt = TypeVect::make(element_basic_type(), vector_length());
|
||||
const TypeVect* src_vt = TypeVect::make(_src_bt, vector_length());
|
||||
@ -981,6 +1121,11 @@ VTransformApplyResult VTransformReinterpretVectorNode::apply(VTransformApplyStat
|
||||
return VTransformApplyResult::make_vector(vn);
|
||||
}
|
||||
|
||||
float VTransformBoolVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
|
||||
assert(scalar_opcode() == Op_Bool, "");
|
||||
return vloop_analyzer.cost_for_vector_node(Op_VectorMaskCmp, vector_length(), element_basic_type());
|
||||
}
|
||||
|
||||
VTransformApplyResult VTransformBoolVectorNode::apply(VTransformApplyState& apply_state) const {
|
||||
const TypeVect* vt = TypeVect::make(element_basic_type(), vector_length());
|
||||
assert(scalar_opcode() == Op_Bool, "");
|
||||
@ -1101,10 +1246,10 @@ bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_ou
|
||||
const BasicType bt = element_basic_type();
|
||||
const int ropc = vector_reduction_opcode();
|
||||
const int vopc = VectorNode::opcode(sopc, bt);
|
||||
if (!Matcher::match_rule_supported_vector(vopc, vlen, bt)) {
|
||||
DEBUG_ONLY( this->print(); )
|
||||
assert(false, "do not have normal vector op for this reduction");
|
||||
return false; // not implemented
|
||||
if (!Matcher::match_rule_supported_auto_vectorization(vopc, vlen, bt)) {
|
||||
// The element-wise vector operation needed for the vector accumulator
|
||||
// is not implemented / supported.
|
||||
return false;
|
||||
}
|
||||
|
||||
// Traverse up the chain of non strict order reductions, checking that it loops
|
||||
@ -1236,6 +1381,14 @@ bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_ou
|
||||
return true; // success
|
||||
}
|
||||
|
||||
float VTransformReductionVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
|
||||
uint vlen = vector_length();
|
||||
BasicType bt = element_basic_type();
|
||||
int vopc = vector_reduction_opcode();
|
||||
bool requires_strict_order = ReductionNode::auto_vectorization_requires_strict_order(vopc);
|
||||
return vloop_analyzer.cost_for_vector_reduction_node(vopc, vlen, bt, requires_strict_order);
|
||||
}
|
||||
|
||||
VTransformApplyResult VTransformReductionVectorNode::apply(VTransformApplyState& apply_state) const {
|
||||
Node* init = apply_state.transformed_node(in_req(1));
|
||||
Node* vec = apply_state.transformed_node(in_req(2));
|
||||
@ -1245,6 +1398,12 @@ VTransformApplyResult VTransformReductionVectorNode::apply(VTransformApplyState&
|
||||
return VTransformApplyResult::make_vector(vn, vn->vect_type());
|
||||
}
|
||||
|
||||
float VTransformLoadVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
|
||||
uint vlen = vector_length();
|
||||
BasicType bt = element_basic_type();
|
||||
return vloop_analyzer.cost_for_vector_node(Op_LoadVector, vlen, bt);
|
||||
}
|
||||
|
||||
VTransformApplyResult VTransformLoadVectorNode::apply(VTransformApplyState& apply_state) const {
|
||||
int sopc = scalar_opcode();
|
||||
uint vlen = vector_length();
|
||||
@ -1274,6 +1433,12 @@ VTransformApplyResult VTransformLoadVectorNode::apply(VTransformApplyState& appl
|
||||
return VTransformApplyResult::make_vector(vn, vn->vect_type());
|
||||
}
|
||||
|
||||
float VTransformStoreVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const {
|
||||
uint vlen = vector_length();
|
||||
BasicType bt = element_basic_type();
|
||||
return vloop_analyzer.cost_for_vector_node(Op_StoreVector, vlen, bt);
|
||||
}
|
||||
|
||||
VTransformApplyResult VTransformStoreVectorNode::apply(VTransformApplyState& apply_state) const {
|
||||
int sopc = scalar_opcode();
|
||||
uint vlen = vector_length();
|
||||
|
||||
@ -51,6 +51,10 @@
|
||||
// - Compute linearization of the VTransformGraph, into an order that respects
|
||||
// all edges in the graph (bailout if cycle detected).
|
||||
//
|
||||
// - Cost-Model:
|
||||
// - We use a cost-model as a heuristic to determine if vectorization is profitable.
|
||||
// Compute the cost of the loop with and without vectorization.
|
||||
//
|
||||
// - Apply:
|
||||
// - Changes to the C2 IR are only made once the "apply" method is called.
|
||||
// - Align the main loop, by adjusting pre loop limit.
|
||||
@ -190,6 +194,7 @@ public:
|
||||
void optimize(VTransform& vtransform);
|
||||
bool schedule();
|
||||
bool has_store_to_load_forwarding_failure(const VLoopAnalyzer& vloop_analyzer) const;
|
||||
float cost_for_vector_loop() const;
|
||||
void apply_vectorization_for_each_vtnode(uint& max_vector_length, uint& max_vector_width) const;
|
||||
|
||||
private:
|
||||
@ -200,6 +205,7 @@ private:
|
||||
|
||||
void collect_nodes_without_strong_in_edges(GrowableArray<VTransformNode*>& stack) const;
|
||||
int count_alive_vtnodes() const;
|
||||
void mark_vtnodes_in_loop(VectorSet& in_loop) const;
|
||||
|
||||
#ifndef PRODUCT
|
||||
void print_vtnodes() const;
|
||||
@ -252,6 +258,8 @@ public:
|
||||
|
||||
void optimize() { return _graph.optimize(*this); }
|
||||
bool schedule() { return _graph.schedule(); }
|
||||
bool is_profitable() const;
|
||||
// Cost of the vectorized loop; simply forwards to the VTransformGraph.
float cost_for_vector_loop() const { return _graph.cost_for_vector_loop(); }
|
||||
bool has_store_to_load_forwarding_failure() const { return _graph.has_store_to_load_forwarding_failure(_vloop_analyzer); }
|
||||
void apply();
|
||||
|
||||
@ -549,6 +557,8 @@ public:
|
||||
|
||||
virtual bool optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) { return false; }
|
||||
|
||||
// Cost-model hook: every concrete node type must override this and return
// its cost as a float, using the given loop analyzer for any cost queries.
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const = 0;
|
||||
|
||||
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const = 0;
|
||||
virtual void apply_backedge(VTransformApplyState& apply_state) const {};
|
||||
void apply_vtn_inputs_to_node(Node* n, VTransformApplyState& apply_state) const;
|
||||
@ -579,6 +589,7 @@ public:
|
||||
virtual bool is_load_or_store_in_loop() const override { return true; }
|
||||
|
||||
virtual const VPointer& vpointer() const override { return _vpointer; }
|
||||
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
|
||||
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
|
||||
NOT_PRODUCT(virtual const char* name() const override { return "MemopScalar"; };)
|
||||
NOT_PRODUCT(virtual void print_spec() const override;)
|
||||
@ -595,6 +606,7 @@ public:
|
||||
assert(!_node->is_Mem() && !_node->is_Phi() && !_node->is_CFG(), "must be data node: %s", _node->Name());
|
||||
}
|
||||
|
||||
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
|
||||
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
|
||||
NOT_PRODUCT(virtual const char* name() const override { return "DataScalar"; };)
|
||||
NOT_PRODUCT(virtual void print_spec() const override;)
|
||||
@ -612,6 +624,7 @@ public:
|
||||
}
|
||||
|
||||
virtual VTransformLoopPhiNode* isa_LoopPhi() override { return this; }
|
||||
// A loop phi adds nothing to the cost model: the cost is always 0.
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override { return 0; }
|
||||
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
|
||||
virtual void apply_backedge(VTransformApplyState& apply_state) const override;
|
||||
NOT_PRODUCT(virtual const char* name() const override { return "LoopPhi"; };)
|
||||
@ -629,6 +642,7 @@ public:
|
||||
assert(_node->is_CFG(), "must be CFG node: %s", _node->Name());
|
||||
}
|
||||
|
||||
// A CFG node adds nothing to the cost model: the cost is always 0.
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override { return 0; }
|
||||
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
|
||||
NOT_PRODUCT(virtual const char* name() const override { return "CFG"; };)
|
||||
NOT_PRODUCT(virtual void print_spec() const override;)
|
||||
@ -655,6 +669,7 @@ public:
|
||||
VTransformNode(vtransform, n->req()), _node(n) {}
|
||||
|
||||
virtual VTransformOuterNode* isa_Outer() override { return this; }
|
||||
// Must never be called for this node type: any call hits ShouldNotReachHere().
// NOTE(review): presumably the cost computation only visits nodes inside the loop - confirm against the caller.
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override { ShouldNotReachHere(); }
|
||||
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
|
||||
NOT_PRODUCT(virtual const char* name() const override { return "Outer"; };)
|
||||
NOT_PRODUCT(virtual void print_spec() const override;)
|
||||
@ -668,6 +683,7 @@ private:
|
||||
public:
|
||||
VTransformReplicateNode(VTransform& vtransform, int vlen, BasicType element_type) :
|
||||
VTransformNode(vtransform, 2), _vlen(vlen), _element_type(element_type) {}
|
||||
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
|
||||
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
|
||||
NOT_PRODUCT(virtual const char* name() const override { return "Replicate"; };)
|
||||
NOT_PRODUCT(virtual void print_spec() const override;)
|
||||
@ -677,6 +693,7 @@ public:
|
||||
class VTransformConvI2LNode : public VTransformNode {
|
||||
public:
|
||||
VTransformConvI2LNode(VTransform& vtransform) : VTransformNode(vtransform, 2) {}
|
||||
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
|
||||
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
|
||||
NOT_PRODUCT(virtual const char* name() const override { return "ConvI2L"; };)
|
||||
};
|
||||
@ -691,6 +708,7 @@ private:
|
||||
public:
|
||||
VTransformShiftCountNode(VTransform& vtransform, int vlen, BasicType element_bt, juint mask, int shift_opcode) :
|
||||
VTransformNode(vtransform, 2), _vlen(vlen), _element_bt(element_bt), _mask(mask), _shift_opcode(shift_opcode) {}
|
||||
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
|
||||
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
|
||||
NOT_PRODUCT(virtual const char* name() const override { return "ShiftCount"; };)
|
||||
NOT_PRODUCT(virtual void print_spec() const override;)
|
||||
@ -704,6 +722,7 @@ private:
|
||||
public:
|
||||
VTransformPopulateIndexNode(VTransform& vtransform, int vlen, const BasicType element_bt) :
|
||||
VTransformNode(vtransform, 2), _vlen(vlen), _element_bt(element_bt) {}
|
||||
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
|
||||
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
|
||||
NOT_PRODUCT(virtual const char* name() const override { return "PopulateIndex"; };)
|
||||
NOT_PRODUCT(virtual void print_spec() const override;)
|
||||
@ -769,6 +788,7 @@ public:
|
||||
VTransformElementWiseVectorNode(VTransform& vtransform, uint req, const VTransformVectorNodeProperties properties, const int vector_opcode) :
|
||||
VTransformVectorNode(vtransform, req, properties), _vector_opcode(vector_opcode) {}
|
||||
virtual VTransformElementWiseVectorNode* isa_ElementWiseVector() override { return this; }
|
||||
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
|
||||
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
|
||||
NOT_PRODUCT(virtual const char* name() const override { return "ElementWiseVector"; };)
|
||||
NOT_PRODUCT(virtual void print_spec() const override;)
|
||||
@ -781,6 +801,7 @@ class VTransformElementWiseLongOpWithCastToIntVectorNode : public VTransformVect
|
||||
public:
|
||||
VTransformElementWiseLongOpWithCastToIntVectorNode(VTransform& vtransform, const VTransformVectorNodeProperties properties) :
|
||||
VTransformVectorNode(vtransform, 2, properties) {}
|
||||
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
|
||||
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
|
||||
NOT_PRODUCT(virtual const char* name() const override { return "ElementWiseLongOpWithCastToIntVector"; };)
|
||||
};
|
||||
@ -791,6 +812,7 @@ private:
|
||||
public:
|
||||
VTransformReinterpretVectorNode(VTransform& vtransform, const VTransformVectorNodeProperties properties, const BasicType src_bt) :
|
||||
VTransformVectorNode(vtransform, 2, properties), _src_bt(src_bt) {}
|
||||
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
|
||||
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
|
||||
NOT_PRODUCT(virtual const char* name() const override { return "ReinterpretVector"; };)
|
||||
NOT_PRODUCT(virtual void print_spec() const override;)
|
||||
@ -811,6 +833,7 @@ public:
|
||||
VTransformCmpVectorNode(VTransform& vtransform, const VTransformVectorNodeProperties properties) :
|
||||
VTransformVectorNode(vtransform, 3, properties) {}
|
||||
virtual VTransformCmpVectorNode* isa_CmpVector() override { return this; }
|
||||
// The compare node itself is free in the cost model: the cost is always 0.
// NOTE(review): presumably the comparison is paid for by the BoolVector node that consumes it - confirm.
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override { return 0; }
|
||||
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override { return VTransformApplyResult::make_empty(); }
|
||||
NOT_PRODUCT(virtual const char* name() const override { return "CmpVector"; };)
|
||||
};
|
||||
@ -823,6 +846,7 @@ public:
|
||||
VTransformVectorNode(vtransform, 2, properties), _test(test) {}
|
||||
VTransformBoolTest test() const { return _test; }
|
||||
virtual VTransformBoolVectorNode* isa_BoolVector() override { return this; }
|
||||
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
|
||||
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
|
||||
NOT_PRODUCT(virtual const char* name() const override { return "BoolVector"; };)
|
||||
NOT_PRODUCT(virtual void print_spec() const override;)
|
||||
@ -835,6 +859,7 @@ public:
|
||||
VTransformVectorNode(vtransform, 3, properties) {}
|
||||
virtual VTransformReductionVectorNode* isa_ReductionVector() override { return this; }
|
||||
virtual bool optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) override;
|
||||
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
|
||||
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
|
||||
NOT_PRODUCT(virtual const char* name() const override { return "ReductionVector"; };)
|
||||
|
||||
@ -877,6 +902,7 @@ public:
|
||||
LoadNode::ControlDependency control_dependency() const;
|
||||
virtual VTransformLoadVectorNode* isa_LoadVector() override { return this; }
|
||||
virtual bool is_load_in_loop() const override { return true; }
|
||||
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
|
||||
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
|
||||
NOT_PRODUCT(virtual const char* name() const override { return "LoadVector"; };)
|
||||
};
|
||||
@ -888,6 +914,7 @@ public:
|
||||
VTransformMemVectorNode(vtransform, 4, properties, vpointer, adr_type) {}
|
||||
virtual VTransformStoreVectorNode* isa_StoreVector() override { return this; }
|
||||
virtual bool is_load_in_loop() const override { return false; }
|
||||
virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override;
|
||||
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
|
||||
NOT_PRODUCT(virtual const char* name() const override { return "StoreVector"; };)
|
||||
};
|
||||
|
||||
@ -410,12 +410,12 @@ public class TestIntVect {
|
||||
|
||||
}
|
||||
|
||||
// Not vectorized: simple addition not profitable, see JDK-8307516. NOTE:
|
||||
// This check does not document the _desired_ behavior of the system but
|
||||
// the current behavior (no vectorization)
|
||||
@Test
|
||||
@IR(counts = { IRNode.LOAD_VECTOR_I, "= 0",
|
||||
IRNode.STORE_VECTOR, "= 0" })
|
||||
@IR(counts = { IRNode.LOAD_VECTOR_I, "> 0",
|
||||
IRNode.ADD_REDUCTION_VI, "> 0",
|
||||
IRNode.ADD_VI, "> 0" })
|
||||
// The reduction is moved outside the loop, and we use an
|
||||
// element-wise accumulator inside the loop.
|
||||
int test_sum(int[] a1) {
|
||||
int sum = 0;
|
||||
for (int i = 0; i < a1.length; i+=1) {
|
||||
|
||||
@ -115,17 +115,18 @@ public class TestAutoVectorizationOverrideProfitability {
|
||||
@Test
|
||||
@Warmup(10)
|
||||
@IR(applyIfCPUFeatureOr = {"avx", "true"},
|
||||
applyIf = {"AutoVectorizationOverrideProfitability", "= 2"},
|
||||
applyIf = {"AutoVectorizationOverrideProfitability", "> 0"},
|
||||
counts = {IRNode.ADD_REDUCTION_VI, "> 0", IRNode.ADD_VI, "> 0"})
|
||||
@IR(applyIfCPUFeatureOr = {"avx", "true"},
|
||||
applyIf = {"AutoVectorizationOverrideProfitability", "< 2"},
|
||||
applyIf = {"AutoVectorizationOverrideProfitability", "= 0"},
|
||||
counts = {IRNode.ADD_REDUCTION_VI, "= 0", IRNode.ADD_VI, "= 0"})
|
||||
// Current heuristics say that this simple int reduction is not profitable.
|
||||
// But it would actually be profitable, since we are able to move the
|
||||
// reduction out of the loop (we can reorder the reduction). When moving
|
||||
// the reduction out of the loop, we instead accumulate with a simple
|
||||
// ADD_VI inside the loop.
|
||||
// See: JDK-8307516 JDK-8345044
|
||||
// We are able to vectorize the reduction. But on its own, that would
|
||||
// not reduce the cost sufficiently in all cases, because vectorized
|
||||
// reduction nodes are expensive. But since integer addition is associative
|
||||
// we can move the reduction vector out of the loop. Instead, we accumulate
|
||||
// with a simple ADD_VI inside the loop, which is very cheap. After the
|
||||
// loop, we only need to use the vectorized reduction once, to collapse
|
||||
// the partial sums contained in the lanes.
|
||||
private static int simpleIntReduction() {
|
||||
int sum = 0;
|
||||
for (int i = 0; i < aI.length; i++) {
|
||||
|
||||
2452
test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java
Normal file
2452
test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java
Normal file
File diff suppressed because it is too large
Load Diff
@ -28,6 +28,10 @@ import org.openjdk.jmh.infra.*;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.Random;
|
||||
|
||||
/**
|
||||
* Note: there is a corresponding IR test:
|
||||
* test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java
|
||||
*/
|
||||
@BenchmarkMode(Mode.AverageTime)
|
||||
@OutputTimeUnit(TimeUnit.NANOSECONDS)
|
||||
@State(Scope.Thread)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user