8369448: C2 SuperWord: refactor VTransform to do move_unordered_reduction_out_of_loop during VTransform::optimize

Reviewed-by: chagedorn, kvn
This commit is contained in:
Emanuel Peter 2025-10-14 08:32:32 +00:00
parent a3ee821f38
commit 4786f8bee5
10 changed files with 402 additions and 341 deletions

View File

@ -5287,16 +5287,6 @@ void PhaseIdealLoop::build_and_optimize() {
}
}
// Move UnorderedReduction out of counted loop. Can be introduced by AutoVectorization.
if (C->has_loops() && !C->major_progress()) {
for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
IdealLoopTree* lpt = iter.current();
if (lpt->is_counted() && lpt->is_innermost()) {
move_unordered_reduction_out_of_loop(lpt);
}
}
}
// Keep loop predicates and perform optimizations with them
// until no more loop optimizations could be done.
// After that switch predicates off and do more loop optimizations.

View File

@ -1550,9 +1550,6 @@ public:
IfTrueNode* create_new_if_for_multiversion(IfTrueNode* multiversioning_fast_proj);
bool try_resume_optimizations_for_delayed_slow_loop(IdealLoopTree* lpt);
// Move an unordered Reduction out of loop if possible
void move_unordered_reduction_out_of_loop(IdealLoopTree* loop);
// Create a scheduled list of nodes control dependent on ctrl set.
void scheduled_nodelist( IdealLoopTree *loop, VectorSet& ctrl, Node_List &sched );
// Has a use in the vector set

View File

@ -4548,211 +4548,6 @@ void PhaseIdealLoop::maybe_multiversion_for_auto_vectorization_runtime_checks(Id
do_multiversioning(lpt, old_new);
}
// A node counts as an "unordered reduction" if it is a ReductionNode that does
// not insist on folding its lanes in strict order.
static bool is_unordered_reduction(Node* n) {
  if (!n->is_Reduction()) {
    return false;
  }
  return !n->as_Reduction()->requires_strict_order();
}
// Having ReductionNodes in the loop is expensive. They need to recursively
// fold together the vector values, for every vectorized loop iteration. If
// we encounter the following pattern, we can vector accumulate the values
// inside the loop, and only have a single UnorderedReduction after the loop.
//
// Note: UnorderedReduction represents a ReductionNode which does not require
// calculating in strict order.
//
//      CountedLoop    init
//          |            |
//          +------+     | +-----------------------+
//                 |     | |                       |
//                PhiNode (s)                      |
//                  |                              |
//                  |          Vector              |
//                  |            |                 |
//               UnorderedReduction (first_ur)     |
//                  |                              |
//                 ...         Vector              |
//                  |            |                 |
//               UnorderedReduction (last_ur)      |
//                           |                     |
//                           +---------------------+
//
// We patch the graph to look like this:
//
//        CountedLoop    identity_vector
//            |            |
//            +-------+    | +---------------+
//                    |    | |               |
//                   PhiNode (v)             |
//                     |                     |
//                     |         Vector      |
//                     |           |         |
//                   VectorAccumulator       |
//                     |                     |
//                    ...        Vector      |
//                     |           |         |
//           init    VectorAccumulator       |
//             |       |         |           |
//            UnorderedReduction +-----------+
//
// We turned the scalar (s) Phi into a vectorized one (v). In the loop, we
// use vector_accumulators, which do the same reductions, but only element
// wise. This is a single operation per vector_accumulator, rather than many
// for a UnorderedReduction. We can then reduce the last vector_accumulator
// after the loop, and also reduce the init value into it.
//
// We can not do this with all reductions. Some reductions do not allow the
// reordering of operations (for example float addition/multiplication require
// strict order).
// For every Phi on the loop head whose backedge is an unordered Reduction, try to replace
// the chain of in-loop reductions with element-wise vector accumulators, and create a
// single reduction after the loop that folds the original init value and the final
// accumulator. Phis whose chain does not have the expected shape are skipped; if the last
// reduction has an unexpected use inside the loop, the whole function bails out.
void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) {
assert(!C->major_progress() && loop->is_counted() && loop->is_innermost(), "sanity");
// Find all Phi nodes with an unordered Reduction on backedge.
CountedLoopNode* cl = loop->_head->as_CountedLoop();
for (DUIterator_Fast jmax, j = cl->fast_outs(jmax); j < jmax; j++) {
Node* phi = cl->fast_out(j);
// We have a phi with a single use, and an unordered Reduction on the backedge.
if (!phi->is_Phi() || phi->outcnt() != 1 || !is_unordered_reduction(phi->in(2))) {
continue;
}
ReductionNode* last_ur = phi->in(2)->as_Reduction();
assert(!last_ur->requires_strict_order(), "must be");
// Determine types
const TypeVect* vec_t = last_ur->vect_type();
uint vector_length = vec_t->length();
BasicType bt = vec_t->element_basic_type();
// Convert opcode from vector-reduction -> scalar -> normal-vector-op
const int sopc = VectorNode::scalar_opcode(last_ur->Opcode(), bt);
const int vopc = VectorNode::opcode(sopc, bt);
if (!Matcher::match_rule_supported_vector(vopc, vector_length, bt)) {
DEBUG_ONLY( last_ur->dump(); )
assert(false, "do not have normal vector op for this reduction");
continue; // not implemented -> fails
}
// Traverse up the chain of unordered Reductions, checking that it loops back to
// the phi. Check that all unordered Reductions only have a single use, except for
// the last (last_ur), which only has phi as a use in the loop, and all other uses
// are outside the loop.
// Protocol of the loop below: on success, first_ur is set and current is reset to
// nullptr; on failure, current stays non-null and the phi is skipped afterwards.
ReductionNode* current = last_ur;
ReductionNode* first_ur = nullptr;
while (true) {
assert(!current->requires_strict_order(), "sanity");
// Expect no ctrl and a vector_input from within the loop.
Node* ctrl = current->in(0);
Node* vector_input = current->in(2);
if (ctrl != nullptr || get_ctrl(vector_input) != cl) {
DEBUG_ONLY( current->dump(1); )
assert(false, "reduction has ctrl or bad vector_input");
break; // Chain traversal fails.
}
assert(current->vect_type() != nullptr, "must have vector type");
if (current->vect_type() != last_ur->vect_type()) {
// Reductions do not have the same vector type (length and element type).
break; // Chain traversal fails.
}
// Expect single use of an unordered Reduction, except for last_ur.
if (current == last_ur) {
// Expect all uses to be outside the loop, except phi.
for (DUIterator_Fast kmax, k = current->fast_outs(kmax); k < kmax; k++) {
Node* use = current->fast_out(k);
if (use != phi && ctrl_or_self(use) == cl) {
DEBUG_ONLY( current->dump(-1); )
assert(false, "reduction has use inside loop");
// Should not be allowed by SuperWord::mark_reductions
return; // bail out of optimization
}
}
} else {
if (current->outcnt() != 1) {
break; // Chain traversal fails.
}
}
// Expect another unordered Reduction or phi as the scalar input.
Node* scalar_input = current->in(1);
if (is_unordered_reduction(scalar_input) &&
scalar_input->Opcode() == current->Opcode()) {
// Move up the unordered Reduction chain.
current = scalar_input->as_Reduction();
assert(!current->requires_strict_order(), "must be");
} else if (scalar_input == phi) {
// Chain terminates at phi.
first_ur = current;
current = nullptr;
break; // Success.
} else {
// scalar_input is neither phi nor a matching reduction
// Can for example be scalar reduction when we have
// partial vectorization.
break; // Chain traversal fails.
}
}
if (current != nullptr) {
// Chain traversal was not successful.
continue;
}
assert(first_ur != nullptr, "must have successfully terminated chain traversal");
// Build the vector of identity values that replaces init as the phi's entry input.
// Both new nodes get the root as their control.
Node* identity_scalar = ReductionNode::make_identity_con_scalar(_igvn, sopc, bt);
set_root_as_ctrl(identity_scalar);
VectorNode* identity_vector = VectorNode::scalar2vector(identity_scalar, vector_length, bt);
register_new_node(identity_vector, C->root());
assert(vec_t == identity_vector->vect_type(), "matching vector type");
VectorNode::trace_new_vector(identity_vector, "Unordered Reduction");
// Turn the scalar phi into a vector phi.
_igvn.rehash_node_delayed(phi);
Node* init = phi->in(1); // Remember init before replacing it.
phi->set_req_X(1, identity_vector, &_igvn);
phi->as_Type()->set_type(vec_t);
_igvn.set_type(phi, vec_t);
// Traverse down the chain of unordered Reductions, and replace them with vector_accumulators.
current = first_ur;
while (true) {
// Create vector_accumulator to replace current.
Node* last_vector_accumulator = current->in(1);
Node* vector_input = current->in(2);
VectorNode* vector_accumulator = VectorNode::make(vopc, last_vector_accumulator, vector_input, vec_t);
register_new_node(vector_accumulator, cl);
_igvn.replace_node(current, vector_accumulator);
VectorNode::trace_new_vector(vector_accumulator, "Unordered Reduction");
if (current == last_ur) {
break;
}
// The traversal above verified that every non-last reduction has exactly one use,
// which is the next reduction in the chain.
current = vector_accumulator->unique_out()->as_Reduction();
assert(!current->requires_strict_order(), "must be");
}
// Create post-loop reduction.
// It folds the remembered scalar init together with the last vector accumulator.
Node* last_accumulator = phi->in(2);
Node* post_loop_reduction = ReductionNode::make(sopc, nullptr, init, last_accumulator, bt);
// Take over uses of last_accumulator that are not in the loop.
for (DUIterator i = last_accumulator->outs(); last_accumulator->has_out(i); i++) {
Node* use = last_accumulator->out(i);
if (use != phi && use != post_loop_reduction) {
assert(ctrl_or_self(use) != cl, "use must be outside loop");
use->replace_edge(last_accumulator, post_loop_reduction, &_igvn);
// The out-list of last_accumulator just shrank: step back so no use is skipped.
--i;
}
}
register_new_node(post_loop_reduction, get_late_ctrl(post_loop_reduction, cl));
VectorNode::trace_new_vector(post_loop_reduction, "Unordered Reduction");
assert(last_accumulator->outcnt() == 2, "last_accumulator has 2 uses: phi and post_loop_reduction");
assert(post_loop_reduction->outcnt() > 0, "should have taken over all non loop uses of last_accumulator");
assert(phi->outcnt() == 1, "accumulator is the only use of phi");
}
}
void DataNodeGraph::clone_data_nodes(Node* new_ctrl) {
for (uint i = 0; i < _data_nodes.size(); i++) {
clone(_data_nodes[i], new_ctrl);

View File

@ -1606,7 +1606,7 @@ bool SuperWord::implemented(const Node_List* pack, const uint size) const {
// 3 instructions (1 shuffle and two reduction ops).
// However, this optimization assumes that these reductions stay in the loop
// which may not be true any more in most cases after the introduction of:
// PhaseIdealLoop::move_unordered_reduction_out_of_loop
// See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop
// Hence, this heuristic has room for improvement.
bool is_two_element_int_or_long_reduction = (size == 2) &&
(arith_type->basic_type() == T_INT ||
@ -1782,7 +1782,7 @@ bool SuperWord::profitable(const Node_List* p) const {
// This heuristic is a bit simplistic, and assumes that the reduction
// vector stays in the loop. But in some cases, we can move the
// reduction out of the loop, replacing it with a single vector op.
// See: PhaseIdealLoop::move_unordered_reduction_out_of_loop
// See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop
// Hence, this heuristic has room for improvement.
#ifndef PRODUCT
if (is_trace_superword_rejections()) {
@ -1947,6 +1947,8 @@ bool SuperWord::do_vtransform() const {
SuperWordVTransformBuilder builder(_packset, vtransform);
}
vtransform.optimize();
if (!vtransform.schedule()) { return false; }
if (vtransform.has_store_to_load_forwarding_failure()) { return false; }

View File

@ -45,10 +45,11 @@
flags(SW_PACKSET, "Trace SuperWord packset at different stages") \
flags(SW_INFO, "Trace SuperWord info (equivalent to TraceSuperWord)") \
flags(SW_VERBOSE, "Trace SuperWord verbose (all SW tags enabled)") \
flags(VTRANSFORM, "Trace VTransform Graph") \
flags(OPTIMIZATION, "Trace VTransform::optimize") \
flags(ALIGN_VECTOR, "Trace AlignVector") \
flags(SPECULATIVE_ALIASING_ANALYSIS, "Trace Speculative Aliasing Analysis") \
flags(SPECULATIVE_RUNTIME_CHECKS, "Trace VTransform::apply_speculative_runtime_checks") \
flags(VTRANSFORM, "Trace VTransform Graph") \
flags(ALL, "Trace everything (very verbose)")
#define table_entry(name, description) name,

View File

@ -205,6 +205,10 @@ public:
return _vtrace.is_trace(TraceAutoVectorizationTag::POINTERS);
}
// True if tracing of the OPTIMIZATION tag is enabled.
bool is_trace_optimization() const {
  const bool enabled = _vtrace.is_trace(TraceAutoVectorizationTag::OPTIMIZATION);
  return enabled;
}
// True if tracing of the SPECULATIVE_RUNTIME_CHECKS tag is enabled.
bool is_trace_speculative_runtime_checks() const {
  const bool enabled = _vtrace.is_trace(TraceAutoVectorizationTag::SPECULATIVE_RUNTIME_CHECKS);
  return enabled;
}

View File

@ -292,121 +292,6 @@ int VectorNode::opcode(int sopc, BasicType bt) {
}
}
// Return the scalar opcode for the specified vector opcode
// and basic type.
int VectorNode::scalar_opcode(int sopc, BasicType bt) {
switch (sopc) {
case Op_AddReductionVI:
case Op_AddVI:
return Op_AddI;
case Op_AddReductionVL:
case Op_AddVL:
return Op_AddL;
case Op_MulReductionVI:
case Op_MulVI:
return Op_MulI;
case Op_MulReductionVL:
case Op_MulVL:
return Op_MulL;
case Op_AndReductionV:
case Op_AndV:
switch (bt) {
case T_BOOLEAN:
case T_CHAR:
case T_BYTE:
case T_SHORT:
case T_INT:
return Op_AndI;
case T_LONG:
return Op_AndL;
default:
assert(false, "basic type not handled");
return 0;
}
case Op_OrReductionV:
case Op_OrV:
switch (bt) {
case T_BOOLEAN:
case T_CHAR:
case T_BYTE:
case T_SHORT:
case T_INT:
return Op_OrI;
case T_LONG:
return Op_OrL;
default:
assert(false, "basic type not handled");
return 0;
}
case Op_XorReductionV:
case Op_XorV:
switch (bt) {
case T_BOOLEAN:
case T_CHAR:
case T_BYTE:
case T_SHORT:
case T_INT:
return Op_XorI;
case T_LONG:
return Op_XorL;
default:
assert(false, "basic type not handled");
return 0;
}
case Op_MinReductionV:
case Op_MinV:
switch (bt) {
case T_BOOLEAN:
case T_CHAR:
assert(false, "boolean and char are signed, not implemented for Min");
return 0;
case T_BYTE:
case T_SHORT:
case T_INT:
return Op_MinI;
case T_LONG:
return Op_MinL;
case T_FLOAT:
return Op_MinF;
case T_DOUBLE:
return Op_MinD;
default:
assert(false, "basic type not handled");
return 0;
}
case Op_MaxReductionV:
case Op_MaxV:
switch (bt) {
case T_BOOLEAN:
case T_CHAR:
assert(false, "boolean and char are signed, not implemented for Max");
return 0;
case T_BYTE:
case T_SHORT:
case T_INT:
return Op_MaxI;
case T_LONG:
return Op_MaxL;
case T_FLOAT:
return Op_MaxF;
case T_DOUBLE:
return Op_MaxD;
default:
assert(false, "basic type not handled");
return 0;
}
case Op_MinVHF:
return Op_MinHF;
case Op_MaxVHF:
return Op_MaxHF;
default:
assert(false,
"Vector node %s is not handled in VectorNode::scalar_opcode",
NodeClassNames[sopc]);
return 0; // Unimplemented
}
}
// Limits on vector size (number of elements) for auto-vectorization.
bool VectorNode::vector_size_supported_auto_vectorization(const BasicType bt, int size) {
return Matcher::max_vector_size_auto_vectorization(bt) >= size &&
@ -1727,6 +1612,34 @@ bool ReductionNode::implemented(int opc, uint vlen, BasicType bt) {
return false;
}
// Tells auto-vectorization whether the reduction with opcode vopc must be evaluated
// in strict (sequential) order, or whether its operations may be reordered.
// Unknown opcodes assert and conservatively report that strict order is required.
bool ReductionNode::auto_vectorization_requires_strict_order(int vopc) {
  switch (vopc) {
    // Floating-point addition and multiplication are non-associative,
    // so AddReductionVF/D and MulReductionVF/D require strict ordering
    // in auto-vectorization.
    case Op_AddReductionVF:
    case Op_AddReductionVD:
    case Op_MulReductionVF:
    case Op_MulReductionVD:
      return true;

    // These are cases that all have associative operations, which can
    // thus be reordered, allowing non-strict order reductions.
    case Op_AddReductionVI:
    case Op_AddReductionVL:
    case Op_MulReductionVI:
    case Op_MulReductionVL:
    case Op_AndReductionV:
    case Op_OrReductionV:
    case Op_XorReductionV:
    case Op_MinReductionV:
    case Op_MaxReductionV:
      return false;

    default:
      assert(false, "not handled: %s", NodeClassNames[vopc]);
      return true;
  }
}
MacroLogicVNode* MacroLogicVNode::make(PhaseGVN& gvn, Node* in1, Node* in2, Node* in3,
Node* mask, uint truth_table, const TypeVect* vt) {
assert(truth_table <= 0xFF, "invalid");

View File

@ -95,7 +95,6 @@ class VectorNode : public TypeNode {
static bool is_rotate_opcode(int opc);
static int opcode(int sopc, BasicType bt); // scalar_opc -> vector_opc
static int scalar_opcode(int vopc, BasicType bt); // vector_opc -> scalar_opc
static int shift_count_opcode(int opc);
@ -283,6 +282,8 @@ class ReductionNode : public Node {
return false;
}
static bool auto_vectorization_requires_strict_order(int vopc);
#ifndef PRODUCT
void dump_spec(outputStream* st) const {
if (requires_strict_order()) {

View File

@ -23,6 +23,7 @@
#include "opto/castnode.hpp"
#include "opto/convertnode.hpp"
#include "opto/rootnode.hpp"
#include "opto/vectorization.hpp"
#include "opto/vectornode.hpp"
#include "opto/vtransform.hpp"
@ -32,6 +33,45 @@ void VTransformGraph::add_vtnode(VTransformNode* vtnode) {
_vtnodes.push(vtnode);
}
// Runs `code` only if the optimization trace flag (is_trace_optimization) is enabled for
// the loop being transformed. The whole expansion is wrapped in NOT_PRODUCT (presumably
// compiled out of product builds - confirm against the macro definition).
// Note: the expansion refers to a variable named `vtransform`, which must be in scope
// wherever the macro is used.
#define TRACE_OPTIMIZE(code) \
NOT_PRODUCT( \
if (vtransform.vloop().is_trace_optimization()) { \
code \
} \
)
// Fixed-point optimization over all vtnodes, in the spirit of IGVN. We deliberately
// keep it simple: no worklist and no notification. The list of nodes is rather small,
// and we do not expect optimizations that trickle over the whole graph, so we just
// re-scan all nodes until a full pass makes no progress.
void VTransformGraph::optimize(VTransform& vtransform) {
  TRACE_OPTIMIZE( tty->print_cr("\nVTransformGraph::optimize"); )

  DEBUG_ONLY(int pass_count = 0;)
  bool made_progress;
  do {
    made_progress = false;
    assert(++pass_count < 10, "ensure we do not have endless loops");

    for (int idx = 0; idx < _vtnodes.length(); idx++) {
      VTransformNode* node = _vtnodes.at(idx);
      if (!node->is_alive()) { continue; }

      if (node->optimize(_vloop_analyzer, vtransform)) {
        made_progress = true;
      }

      // Nodes that have no use any more are dead.
      if (node->out_strong_edges() != 0) { continue; }

      // There are some exceptions, which stay alive even without a strong use:
      // 1. Memory phi uses are not modeled, so they appear to have no use here,
      //    but must be kept alive.
      if (node->isa_LoopPhi() != nullptr) { continue; }
      // 2. Similarly, some stores may not have their memory uses modeled, but need
      //    to be kept alive.
      if (node->is_load_or_store_in_loop()) { continue; }
      // 3. Outer node with strong inputs: is a use after the loop that we must
      //    keep alive.
      if (node->isa_Outer() != nullptr && node->has_strong_in_edge()) { continue; }

      node->mark_dead();
      made_progress = true;
    }
  } while (made_progress);
}
// Compute a linearization of the graph. We do this with a reverse-post-order of a DFS.
// This only works if the graph is a directed acyclic graph (DAG). The C2 graph, and
// the VLoopDependencyGraph are both DAGs, but after introduction of vectors/packs, the
@ -59,10 +99,11 @@ bool VTransformGraph::schedule() {
VectorSet post_visited;
collect_nodes_without_strong_in_edges(stack);
const int num_alive_nodes = count_alive_vtnodes();
// We create a reverse-post-visit order. This gives us a linearization, if there are
// no cycles. Then, we simply reverse the order, and we have a schedule.
int rpo_idx = _vtnodes.length() - 1;
int rpo_idx = num_alive_nodes - 1;
while (!stack.is_empty()) {
VTransformNode* vtn = stack.top();
if (!pre_visited.test_set(vtn->_idx)) {
@ -79,6 +120,9 @@ bool VTransformGraph::schedule() {
for (uint i = 0; i < vtn->out_strong_edges(); i++) {
VTransformNode* use = vtn->out_strong_edge(i);
// Skip dead nodes
if (!use->is_alive()) { continue; }
// Skip LoopPhi backedge.
if ((use->isa_LoopPhi() != nullptr || use->isa_CountedLoop() != nullptr) && use->in_req(2) == vtn) { continue; }
@ -121,6 +165,7 @@ bool VTransformGraph::schedule() {
void VTransformGraph::collect_nodes_without_strong_in_edges(GrowableArray<VTransformNode*>& stack) const {
for (int i = 0; i < _vtnodes.length(); i++) {
VTransformNode* vtn = _vtnodes.at(i);
if (!vtn->is_alive()) { continue; }
if (!vtn->has_strong_in_edge()) {
stack.push(vtn);
}
@ -132,6 +177,15 @@ void VTransformGraph::collect_nodes_without_strong_in_edges(GrowableArray<VTrans
}
}
// Number of vtnodes in the graph that are still alive (i.e. not marked dead).
int VTransformGraph::count_alive_vtnodes() const {
  int num_alive = 0;
  for (int idx = 0; idx < _vtnodes.length(); idx++) {
    num_alive += _vtnodes.at(idx)->is_alive() ? 1 : 0;
  }
  return num_alive;
}
#ifndef PRODUCT
void VTransformGraph::trace_schedule_cycle(const GrowableArray<VTransformNode*>& stack,
const VectorSet& pre_visited,
@ -801,6 +855,13 @@ VTransformApplyResult VTransformLoopPhiNode::apply(VTransformApplyState& apply_s
phase->igvn().replace_input_of(_node, 0, in0);
phase->igvn().replace_input_of(_node, 1, in1);
// Note: the backedge is hooked up later.
// The Phi's inputs may have been modified, and its type may have changed,
// e.g. from scalar to vector.
const Type* t = in1->bottom_type();
_node->as_Type()->set_type(t);
phase->igvn().set_type(_node, t);
return VTransformApplyResult::make_scalar(_node);
}
@ -939,6 +1000,242 @@ VTransformApplyResult VTransformBoolVectorNode::apply(VTransformApplyState& appl
return VTransformApplyResult::make_vector(vn);
}
// Per-node optimization hook for reductions. The only optimization attempted here is
// moving a chain of non strict order reductions out of the loop. Returns true if the
// graph was changed.
bool VTransformReductionVectorNode::optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) {
  const bool moved_out_of_loop =
      optimize_move_non_strict_order_reductions_out_of_loop(vloop_analyzer, vtransform);
  return moved_out_of_loop;
}
int VTransformReductionVectorNode::vector_reduction_opcode() const {
return ReductionNode::opcode(scalar_opcode(), element_basic_type());
}
// True if auto-vectorization must keep this reduction in strict order, as decided by
// ReductionNode::auto_vectorization_requires_strict_order for our reduction opcode.
bool VTransformReductionVectorNode::requires_strict_order() const {
  return ReductionNode::auto_vectorization_requires_strict_order(vector_reduction_opcode());
}
// Having ReductionNodes in the loop is expensive. They need to recursively
// fold together the vector values, for every vectorized loop iteration. If
// we encounter the following pattern, we can vector accumulate the values
// inside the loop, and only have a single UnorderedReduction after the loop.
//
// Note: UnorderedReduction represents a ReductionNode which does not require
// calculating in strict order.
//
//      CountedLoop    init
//          |            |
//          +------+     | +------------------------+
//                 |     | |                        |
//                PhiNode (s)                       |
//                  |                               |
//                  |          Vector               |
//                  |            |                  |
//               UnorderedReduction (first_red)     |
//                  |                               |
//                 ...         Vector               |
//                  |            |                  |
//               UnorderedReduction (last_red)      |
//                           |                      |
//                           +----------------------+
//
// We patch the graph to look like this:
//
//        CountedLoop    identity_vector
//            |            |
//            +-------+    | +---------------+
//                    |    | |               |
//                   PhiNode (v)             |
//                     |                     |
//                     |         Vector      |
//                     |           |         |
//                   VectorAccumulator       |
//                     |                     |
//                    ...        Vector      |
//                     |           |         |
//           init    VectorAccumulator       |
//             |       |         |           |
//            UnorderedReduction +-----------+
//
// We turned the scalar (s) Phi into a vectorized one (v). In the loop, we
// use vector_accumulators, which do the same reductions, but only element
// wise. This is a single operation per vector_accumulator, rather than many
// for a UnorderedReduction. We can then reduce the last vector_accumulator
// after the loop, and also reduce the init value into it.
//
// We can not do this with all reductions. Some reductions do not allow the
// reordering of operations (for example float addition/multiplication require
// strict order).
//
// Note: we must perform this optimization already during auto vectorization,
// before we evaluate the cost-model. Without this optimization, we may
// still have expensive reduction nodes in the loop which can make
// vectorization unprofitable. Only with the optimization does vectorization
// become profitable, since the expensive reduction node is moved
// outside the loop, and instead cheaper element-wise vector accumulations
// are performed inside the loop.
// Checks, without modifying the graph, whether this reduction is the first one of a
// chain of non strict order reductions that can be moved out of the loop:
//  - our scalar input is a loop phi with a single (strong) use,
//  - the reduction does not require strict order,
//  - the element-wise vector operation is supported by the backend,
//  - walking from the phi's backedge input up the scalar inputs, we only find
//    matching reductions with a vector input, and arrive back at the phi.
// vtransform is only needed for tracing (TRACE_OPTIMIZE).
// Returns true iff all preconditions hold.
bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop_preconditions(VTransform& vtransform) {
  // We have a phi with a single use.
  VTransformLoopPhiNode* phi = in_req(1)->isa_LoopPhi();
  if (phi == nullptr) {
    return false;
  }

  if (phi->out_strong_edges() != 1) {
    TRACE_OPTIMIZE(
      tty->print(" Cannot move out of loop, phi has multiple uses:");
      print();
      tty->print(" phi: ");
      phi->print();
    )
    return false;
  }

  if (requires_strict_order()) {
    TRACE_OPTIMIZE(
      tty->print(" Cannot move out of loop, strict order required: ");
      print();
    )
    return false;
  }

  const int sopc = scalar_opcode();
  const uint vlen = vector_length();
  const BasicType bt = element_basic_type();
  const int ropc = vector_reduction_opcode();
  const int vopc = VectorNode::opcode(sopc, bt);
  if (!Matcher::match_rule_supported_vector(vopc, vlen, bt)) {
    DEBUG_ONLY( this->print(); )
    assert(false, "do not have normal vector op for this reduction");
    return false; // not implemented
  }

  // Traverse up the chain of non strict order reductions, checking that it loops
  // back to the phi. Check that all non strict order reductions only have a single
  // use, except for the last (last_red), which only has phi as a use in the loop,
  // and all other uses are outside the loop.
  VTransformReductionVectorNode* last_red = phi->in_req(2)->isa_ReductionVector();
  VTransformReductionVectorNode* current_red = last_red;
  while (true) {
    if (current_red == nullptr ||
        current_red->vector_reduction_opcode() != ropc ||
        current_red->element_basic_type() != bt ||
        current_red->vector_length() != vlen) {
      TRACE_OPTIMIZE(
        tty->print(" Cannot move out of loop, other reduction node does not match:");
        print();
        tty->print(" other: ");
        // current_red is nullptr if the node on the chain is not a reduction vector
        // at all: we must not dereference it for printing.
        if (current_red != nullptr) {
          current_red->print();
        } else {
          tty->print_cr("nullptr");
        }
      )
      return false; // not compatible
    }

    VTransformVectorNode* vector_input = current_red->in_req(2)->isa_Vector();
    if (vector_input == nullptr) {
      assert(false, "reduction has a bad vector input");
      return false;
    }

    // Expect single use of the non strict order reduction. Except for the last_red.
    if (current_red == last_red) {
      // All uses must be outside loop body, except for the phi.
      for (uint i = 0; i < current_red->out_strong_edges(); i++) {
        VTransformNode* use = current_red->out_strong_edge(i);
        if (use->isa_LoopPhi() == nullptr &&
            use->isa_Outer() == nullptr) {
          // Should not be allowed by SuperWord::mark_reductions
          assert(false, "reduction has use inside loop");
          return false;
        }
      }
    } else {
      if (current_red->out_strong_edges() != 1) {
        TRACE_OPTIMIZE(
          tty->print(" Cannot move out of loop, other reduction node has use outside loop:");
          print();
          tty->print(" other: ");
          current_red->print();
        )
        return false; // Only single use allowed
      }
    }

    // If the scalar input is a phi, we passed all checks.
    VTransformNode* scalar_input = current_red->in_req(1);
    if (scalar_input == phi) {
      break;
    }

    // We expect another non strict reduction, verify it in the next iteration.
    current_red = scalar_input->isa_ReductionVector();
  }
  return true; // success
}
// Edits the vtransform graph so that the reduction chain starting at this node (the
// reduction fed by the loop phi) is replaced by element-wise vector accumulators, and
// only last_red remains as a reduction that consumes the final accumulator.
// Nothing is changed unless all preconditions hold.
// Returns true if the graph was changed, false otherwise.
bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) {
if (!optimize_move_non_strict_order_reductions_out_of_loop_preconditions(vtransform)) {
return false;
}
// All checks were successful. Edit the vtransform graph now.
TRACE_OPTIMIZE(
tty->print_cr("VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop");
)
const int sopc = scalar_opcode();
const uint vlen = vector_length();
const BasicType bt = element_basic_type();
const int vopc = VectorNode::opcode(sopc, bt);
PhaseIdealLoop* phase = vloop_analyzer.vloop().phase();
// Create a vector of identity values.
// The scalar identity constant is a C2 node with the root as control; it is wrapped in
// an Outer vtnode and fed into a Replicate vtnode.
Node* identity = ReductionNode::make_identity_con_scalar(phase->igvn(), sopc, bt);
phase->set_root_as_ctrl(identity);
VTransformNode* vtn_identity = new (vtransform.arena()) VTransformOuterNode(vtransform, identity);
VTransformNode* vtn_identity_vector = new (vtransform.arena()) VTransformReplicateNode(vtransform, vlen, bt);
vtn_identity_vector->init_req(1, vtn_identity);
// Turn the scalar phi into a vector phi.
// Remember the original init value: it becomes the scalar input of the reduction after the loop.
VTransformLoopPhiNode* phi = in_req(1)->isa_LoopPhi();
VTransformNode* init = phi->in_req(1);
phi->set_req(1, vtn_identity_vector);
// Traverse down the chain of reductions, and replace them with vector_accumulators.
VTransformReductionVectorNode* first_red = this;
VTransformReductionVectorNode* last_red = phi->in_req(2)->isa_ReductionVector();
VTransformReductionVectorNode* current_red = first_red;
VTransformNode* current_vector_accumulator = phi;
while (true) {
VTransformNode* vector_input = current_red->in_req(2);
VTransformVectorNode* vector_accumulator = new (vtransform.arena()) VTransformElementWiseVectorNode(vtransform, 3, current_red->properties(), vopc);
vector_accumulator->init_req(1, current_vector_accumulator);
vector_accumulator->init_req(2, vector_input);
TRACE_OPTIMIZE(
tty->print(" replace ");
current_red->print();
tty->print(" with ");
vector_accumulator->print();
)
current_vector_accumulator = vector_accumulator;
if (current_red == last_red) { break; }
// The preconditions verified that every reduction except last_red has exactly one
// strong use, which is the next reduction in the chain.
current_red = current_red->unique_out_strong_edge()->isa_ReductionVector();
}
// Feed vector accumulator into the backedge.
phi->set_req(2, current_vector_accumulator);
// Create post-loop reduction. last_red keeps all uses outside the loop.
// last_red is reused in place: it now reduces the original init with the final accumulator.
last_red->set_req(1, init);
last_red->set_req(2, current_vector_accumulator);
TRACE_OPTIMIZE(
tty->print(" phi ");
phi->print();
tty->print(" after loop ");
last_red->print();
)
return true; // success
}
VTransformApplyResult VTransformReductionVectorNode::apply(VTransformApplyState& apply_state) const {
Node* init = apply_state.transformed_node(in_req(1));
Node* vec = apply_state.transformed_node(in_req(2));
@ -1041,7 +1338,7 @@ void VTransformNode::print() const {
print_node_idx(_in.at(i));
}
}
tty->print(") [");
tty->print(") %s[", _is_alive ? "" : "dead ");
for (uint i = 0; i < _out_end_strong_edges; i++) {
print_node_idx(_out.at(i));
}

View File

@ -41,7 +41,11 @@
// - Construction:
// - From SuperWord PackSet, with the SuperWordVTransformBuilder.
//
// - Future Plans: optimize, if-conversion, etc.
// - Optimize:
// - Move non-strict order reductions out of the loop. This means we have
// only element-wise operations inside the loop, rather than the much
// more expensive lane-crossing reductions. We need to do this before
// assessing profitability with the cost-model.
//
// - Schedule:
// - Compute linearization of the VTransformGraph, into an order that respects
@ -62,12 +66,12 @@
//
// Future Plans with VTransform:
// - Cost model: estimate if vectorization is profitable.
// - Optimizations: moving unordered reductions out of the loop, which decreases cost.
// - Pack/Unpack/Shuffle: introduce additional nodes not present in the scalar loop.
// This is difficult to do with the SuperWord packset approach.
// - If-conversion: convert predicated nodes into CFG.
typedef int VTransformNodeIDX;
class VTransform;
class VTransformNode;
class VTransformMemopScalarNode;
class VTransformDataScalarNode;
@ -183,6 +187,7 @@ public:
const GrowableArray<VTransformNode*>& vtnodes() const { return _vtnodes; }
const GrowableArray<VTransformNode*>& get_schedule() const { return _schedule; }
void optimize(VTransform& vtransform);
bool schedule();
bool has_store_to_load_forwarding_failure(const VLoopAnalyzer& vloop_analyzer) const;
void apply_vectorization_for_each_vtnode(uint& max_vector_length, uint& max_vector_width) const;
@ -194,6 +199,7 @@ private:
bool in_bb(const Node* n) const { return _vloop.in_bb(n); }
void collect_nodes_without_strong_in_edges(GrowableArray<VTransformNode*>& stack) const;
int count_alive_vtnodes() const;
#ifndef PRODUCT
void print_vtnodes() const;
@ -239,10 +245,12 @@ public:
_aw_for_main_loop_alignment(aw_for_main_loop_alignment) {}
const VLoopAnalyzer& vloop_analyzer() const { return _vloop_analyzer; }
const VLoop& vloop() const { return _vloop; }
Arena* arena() { return &_arena; }
DEBUG_ONLY( bool has_graph() const { return !_graph.is_empty(); } )
VTransformGraph& graph() { return _graph; }
// Run the optimizations on the graph, passing this VTransform along.
void optimize() { _graph.optimize(*this); }
bool schedule() { return _graph.schedule(); }
bool has_store_to_load_forwarding_failure() const { return _graph.has_store_to_load_forwarding_failure(_vloop_analyzer); }
void apply();
@ -372,6 +380,8 @@ public:
const VTransformNodeIDX _idx;
private:
bool _is_alive;
// We split _in into 3 sections:
// - data edges (req): _in[0 .. _req-1]
// - strong memory edges: _in[_req .. _in_end_strong_memory_edges-1]
@ -389,6 +399,7 @@ private:
public:
VTransformNode(VTransform& vtransform, const uint req) :
_idx(vtransform.graph().new_idx()),
_is_alive(true),
_req(req),
_in_end_strong_memory_edges(req),
_in(vtransform.arena(), req, req, nullptr),
@ -405,6 +416,14 @@ public:
n->add_out_strong_edge(this);
}
// Replace the i-th req input with n, and keep the strong out-edges of the previous and
// the new input in sync. Both the previous and the new input may be nullptr.
void set_req(uint i, VTransformNode* n) {
  assert(i < _req, "must be a req");
  VTransformNode* const previous = _in.at(i);
  if (previous != nullptr) {
    previous->del_out_strong_edge(this);
  }
  _in.at_put(i, n);
  if (n == nullptr) { return; }
  n->add_out_strong_edge(this);
}
void swap_req(uint i, uint j) {
assert(i < _req, "must be a req");
assert(j < _req, "must be a req");
@ -452,6 +471,23 @@ private:
_out.push(n);
}
// Remove one strong out-edge to n. _out holds the strong edges first, followed by the
// weak edges, and the order within each section is not preserved by this method:
// the hole left by n is filled with the last strong edge, and the slot freed by that
// move is filled with the last weak edge (if there is any).
void del_out_strong_edge(VTransformNode* n) {
  const int pos = _out.find(n);
  assert(0 <= pos && pos < (int)_out_end_strong_edges, "must be in strong edges");

  const int last_strong_pos = _out_end_strong_edges - 1;
  const bool has_weak_edges = _out_end_strong_edges < (uint)_out.length();

  // Move the last strong edge into the slot of n.
  _out.at_put(pos, _out.at(last_strong_pos));
  if (has_weak_edges) {
    // Move the last weak edge into the slot where the last strong edge was.
    _out.at_put(last_strong_pos, _out.top());
  }

  // The last slot is now a duplicate (or n itself): drop it.
  _out.pop();
  _out_end_strong_edges--;
}
public:
uint req() const { return _req; }
uint out_strong_edges() const { return _out_end_strong_edges; }
@ -479,6 +515,21 @@ public:
return false;
}
// Returns the single strong use of this node. Asserts that there is exactly one.
VTransformNode* unique_out_strong_edge() const {
  assert(out_strong_edges() == 1, "must be unique");
  VTransformNode* const only_use = _out.at(0);
  return only_use;
}
// False once mark_dead() was called on this node.
bool is_alive() const {
  return _is_alive;
}
void mark_dead() {
_is_alive = false;
// Remove all inputs
for (uint i = 0; i < req(); i++) {
set_req(i, nullptr);
}
}
virtual VTransformMemopScalarNode* isa_MemopScalar() { return nullptr; }
virtual VTransformLoopPhiNode* isa_LoopPhi() { return nullptr; }
virtual VTransformCountedLoopNode* isa_CountedLoop() { return nullptr; }
@ -496,6 +547,8 @@ public:
virtual bool is_load_or_store_in_loop() const { return false; }
virtual const VPointer& vpointer() const { ShouldNotReachHere(); }
// Optimization hook, called for alive nodes by VTransformGraph::optimize. The return
// value reports whether progress was made. The default does nothing.
virtual bool optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) {
  return false;
}
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const = 0;
virtual void apply_backedge(VTransformApplyState& apply_state) const {};
void apply_vtn_inputs_to_node(Node* n, VTransformApplyState& apply_state) const;
@ -701,6 +754,7 @@ public:
NOT_PRODUCT(virtual void print_spec() const override;)
protected:
const VTransformVectorNodeProperties& properties() const { return _properties; }
Node* approximate_origin() const { return _properties.approximate_origin(); }
int scalar_opcode() const { return _properties.scalar_opcode(); }
uint vector_length() const { return _properties.vector_length(); }
@ -780,8 +834,15 @@ public:
VTransformReductionVectorNode(VTransform& vtransform, const VTransformVectorNodeProperties properties) :
VTransformVectorNode(vtransform, 3, properties) {}
virtual VTransformReductionVectorNode* isa_ReductionVector() override { return this; }
virtual bool optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) override;
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
NOT_PRODUCT(virtual const char* name() const override { return "ReductionVector"; };)
private:
int vector_reduction_opcode() const;
bool requires_strict_order() const;
bool optimize_move_non_strict_order_reductions_out_of_loop_preconditions(VTransform& vtransform);
bool optimize_move_non_strict_order_reductions_out_of_loop(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform);
};
class VTransformMemVectorNode : public VTransformVectorNode {