mirror of
https://github.com/openjdk/jdk.git
synced 2026-01-28 12:09:14 +00:00
8369448: C2 SuperWord: refactor VTransform to do move_unordered_reduction_out_of_loop during VTransform::optimize
Reviewed-by: chagedorn, kvn
This commit is contained in:
parent
a3ee821f38
commit
4786f8bee5
@ -5287,16 +5287,6 @@ void PhaseIdealLoop::build_and_optimize() {
|
||||
}
|
||||
}
|
||||
|
||||
// Move UnorderedReduction out of counted loop. Can be introduced by AutoVectorization.
|
||||
if (C->has_loops() && !C->major_progress()) {
|
||||
for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
|
||||
IdealLoopTree* lpt = iter.current();
|
||||
if (lpt->is_counted() && lpt->is_innermost()) {
|
||||
move_unordered_reduction_out_of_loop(lpt);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Keep loop predicates and perform optimizations with them
|
||||
// until no more loop optimizations could be done.
|
||||
// After that switch predicates off and do more loop optimizations.
|
||||
|
||||
@ -1550,9 +1550,6 @@ public:
|
||||
IfTrueNode* create_new_if_for_multiversion(IfTrueNode* multiversioning_fast_proj);
|
||||
bool try_resume_optimizations_for_delayed_slow_loop(IdealLoopTree* lpt);
|
||||
|
||||
// Move an unordered Reduction out of loop if possible
|
||||
void move_unordered_reduction_out_of_loop(IdealLoopTree* loop);
|
||||
|
||||
// Create a scheduled list of nodes control dependent on ctrl set.
|
||||
void scheduled_nodelist( IdealLoopTree *loop, VectorSet& ctrl, Node_List &sched );
|
||||
// Has a use in the vector set
|
||||
|
||||
@ -4548,211 +4548,6 @@ void PhaseIdealLoop::maybe_multiversion_for_auto_vectorization_runtime_checks(Id
|
||||
do_multiversioning(lpt, old_new);
|
||||
}
|
||||
|
||||
// Returns true if the Reduction node is unordered.
|
||||
static bool is_unordered_reduction(Node* n) {
|
||||
return n->is_Reduction() && !n->as_Reduction()->requires_strict_order();
|
||||
}
|
||||
|
||||
// Having ReductionNodes in the loop is expensive. They need to recursively
|
||||
// fold together the vector values, for every vectorized loop iteration. If
|
||||
// we encounter the following pattern, we can vector accumulate the values
|
||||
// inside the loop, and only have a single UnorderedReduction after the loop.
|
||||
//
|
||||
// Note: UnorderedReduction represents a ReductionNode which does not require
|
||||
// calculating in strict order.
|
||||
//
|
||||
// CountedLoop init
|
||||
// | |
|
||||
// +------+ | +-----------------------+
|
||||
// | | | |
|
||||
// PhiNode (s) |
|
||||
// | |
|
||||
// | Vector |
|
||||
// | | |
|
||||
// UnorderedReduction (first_ur) |
|
||||
// | |
|
||||
// ... Vector |
|
||||
// | | |
|
||||
// UnorderedReduction (last_ur) |
|
||||
// | |
|
||||
// +---------------------+
|
||||
//
|
||||
// We patch the graph to look like this:
|
||||
//
|
||||
// CountedLoop identity_vector
|
||||
// | |
|
||||
// +-------+ | +---------------+
|
||||
// | | | |
|
||||
// PhiNode (v) |
|
||||
// | |
|
||||
// | Vector |
|
||||
// | | |
|
||||
// VectorAccumulator |
|
||||
// | |
|
||||
// ... Vector |
|
||||
// | | |
|
||||
// init VectorAccumulator |
|
||||
// | | | |
|
||||
// UnorderedReduction +-----------+
|
||||
//
|
||||
// We turned the scalar (s) Phi into a vectorized one (v). In the loop, we
|
||||
// use vector_accumulators, which do the same reductions, but only element
|
||||
// wise. This is a single operation per vector_accumulator, rather than many
|
||||
// for a UnorderedReduction. We can then reduce the last vector_accumulator
|
||||
// after the loop, and also reduce the init value into it.
|
||||
//
|
||||
// We can not do this with all reductions. Some reductions do not allow the
|
||||
// reordering of operations (for example float addition/multiplication require
|
||||
// strict order).
|
||||
void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) {
  assert(!C->major_progress() && loop->is_counted() && loop->is_innermost(), "sanity");

  CountedLoopNode* head = loop->_head->as_CountedLoop();

  // Walk all outputs of the loop head and pick out the reduction phis.
  for (DUIterator_Fast pmax, p = head->fast_outs(pmax); p < pmax; p++) {
    Node* phi = head->fast_out(p);

    // Candidate: a phi with exactly one use, whose backedge input is an
    // unordered Reduction.
    if (!phi->is_Phi()) { continue; }
    if (phi->outcnt() != 1) { continue; }
    if (!is_unordered_reduction(phi->in(2))) { continue; }

    ReductionNode* tail_red = phi->in(2)->as_Reduction();
    assert(!tail_red->requires_strict_order(), "must be");

    // Vector shape of the reduction.
    const TypeVect* vt = tail_red->vect_type();
    uint lanes = vt->length();
    BasicType bt = vt->element_basic_type();

    // Opcode mapping: vector-reduction -> scalar -> element-wise vector op.
    const int sopc = VectorNode::scalar_opcode(tail_red->Opcode(), bt);
    const int vopc = VectorNode::opcode(sopc, bt);
    if (!Matcher::match_rule_supported_vector(vopc, lanes, bt)) {
      DEBUG_ONLY( tail_red->dump(); )
      assert(false, "do not have normal vector op for this reduction");
      continue; // no element-wise vector op available, give up on this phi
    }

    // Walk from the last reduction up towards the phi. Every reduction on
    // the way must be input-free on ctrl, take its vector from inside the loop,
    // share the vector type, and (except for the last one) have a single use.
    // The last one may only be used by the phi inside the loop.
    ReductionNode* head_red = nullptr;
    bool chain_closed = false;
    ReductionNode* cursor = tail_red;
    while (true) {
      assert(!cursor->requires_strict_order(), "sanity");

      if (cursor->in(0) != nullptr || get_ctrl(cursor->in(2)) != head) {
        DEBUG_ONLY( cursor->dump(1); )
        assert(false, "reduction has ctrl or bad vector_input");
        break; // chain is not usable
      }

      assert(cursor->vect_type() != nullptr, "must have vector type");
      if (cursor->vect_type() != tail_red->vect_type()) {
        break; // vector length or element type differs along the chain
      }

      if (cursor == tail_red) {
        // Besides the phi, the last reduction may only have uses outside the loop.
        for (DUIterator_Fast kmax, k = cursor->fast_outs(kmax); k < kmax; k++) {
          Node* user = cursor->fast_out(k);
          if (user != phi && ctrl_or_self(user) == head) {
            DEBUG_ONLY( cursor->dump(-1); )
            assert(false, "reduction has use inside loop");
            // SuperWord::mark_reductions is expected to rule this out.
            return; // abandon the whole optimization
          }
        }
      } else if (cursor->outcnt() != 1) {
        break; // inner chain members must have exactly one use
      }

      // The scalar input must be the next matching reduction, or the phi.
      Node* upstream = cursor->in(1);
      if (is_unordered_reduction(upstream) &&
          upstream->Opcode() == cursor->Opcode()) {
        cursor = upstream->as_Reduction();
        assert(!cursor->requires_strict_order(), "must be");
        continue;
      }
      if (upstream == phi) {
        // Reached the phi: the chain forms a closed cycle.
        head_red = cursor;
        chain_closed = true;
      }
      // Otherwise the scalar input is something else, e.g. a scalar
      // reduction left over from partial vectorization.
      break;
    }
    if (!chain_closed) {
      continue; // this phi does not qualify
    }
    assert(head_red != nullptr, "must have successfully terminated chain traversal");

    // Build the vector of identity elements that seeds the vector phi.
    Node* ident_scalar = ReductionNode::make_identity_con_scalar(_igvn, sopc, bt);
    set_root_as_ctrl(ident_scalar);
    VectorNode* ident_vector = VectorNode::scalar2vector(ident_scalar, lanes, bt);
    register_new_node(ident_vector, C->root());
    assert(vt == ident_vector->vect_type(), "matching vector type");
    VectorNode::trace_new_vector(ident_vector, "Unordered Reduction");

    // Retype the phi from scalar to vector; keep the old init value, it is
    // folded in by the post-loop reduction below.
    _igvn.rehash_node_delayed(phi);
    Node* scalar_init = phi->in(1);
    phi->set_req_X(1, ident_vector, &_igvn);
    phi->as_Type()->set_type(vt);
    _igvn.set_type(phi, vt);

    // Walk back down the chain, swapping each reduction for an element-wise
    // vector accumulator.
    ReductionNode* victim = head_red;
    while (true) {
      Node* prev_acc = victim->in(1);
      Node* vec_in = victim->in(2);
      VectorNode* acc = VectorNode::make(vopc, prev_acc, vec_in, vt);
      register_new_node(acc, head);
      _igvn.replace_node(victim, acc);
      VectorNode::trace_new_vector(acc, "Unordered Reduction");
      if (victim == tail_red) {
        break;
      }
      victim = acc->unique_out()->as_Reduction();
      assert(!victim->requires_strict_order(), "must be");
    }

    // One reduction after the loop combines the init value with the final accumulator.
    Node* final_acc = phi->in(2);
    Node* after_loop_red = ReductionNode::make(sopc, nullptr, scalar_init, final_acc, bt);

    // Redirect every use of the final accumulator that lives outside the loop.
    for (DUIterator u = final_acc->outs(); final_acc->has_out(u); u++) {
      Node* user = final_acc->out(u);
      if (user != phi && user != after_loop_red) {
        assert(ctrl_or_self(user) != head, "use must be outside loop");
        user->replace_edge(final_acc, after_loop_red, &_igvn);
        --u; // the out-array shrank, revisit this slot
      }
    }
    register_new_node(after_loop_red, get_late_ctrl(after_loop_red, head));
    VectorNode::trace_new_vector(after_loop_red, "Unordered Reduction");

    assert(final_acc->outcnt() == 2, "last_accumulator has 2 uses: phi and post_loop_reduction");
    assert(after_loop_red->outcnt() > 0, "should have taken over all non loop uses of last_accumulator");
    assert(phi->outcnt() == 1, "accumulator is the only use of phi");
  }
}
|
||||
|
||||
void DataNodeGraph::clone_data_nodes(Node* new_ctrl) {
|
||||
for (uint i = 0; i < _data_nodes.size(); i++) {
|
||||
clone(_data_nodes[i], new_ctrl);
|
||||
|
||||
@ -1606,7 +1606,7 @@ bool SuperWord::implemented(const Node_List* pack, const uint size) const {
|
||||
// 3 instructions (1 shuffle and two reduction ops).
|
||||
// However, this optimization assumes that these reductions stay in the loop
|
||||
// which may not be true any more in most cases after the introduction of:
|
||||
// PhaseIdealLoop::move_unordered_reduction_out_of_loop
|
||||
// See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop
|
||||
// Hence, this heuristic has room for improvement.
|
||||
bool is_two_element_int_or_long_reduction = (size == 2) &&
|
||||
(arith_type->basic_type() == T_INT ||
|
||||
@ -1782,7 +1782,7 @@ bool SuperWord::profitable(const Node_List* p) const {
|
||||
// This heuristic is a bit simplistic, and assumes that the reduction
|
||||
// vector stays in the loop. But in some cases, we can move the
|
||||
// reduction out of the loop, replacing it with a single vector op.
|
||||
// See: PhaseIdealLoop::move_unordered_reduction_out_of_loop
|
||||
// See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop
|
||||
// Hence, this heuristic has room for improvement.
|
||||
#ifndef PRODUCT
|
||||
if (is_trace_superword_rejections()) {
|
||||
@ -1947,6 +1947,8 @@ bool SuperWord::do_vtransform() const {
|
||||
SuperWordVTransformBuilder builder(_packset, vtransform);
|
||||
}
|
||||
|
||||
vtransform.optimize();
|
||||
|
||||
if (!vtransform.schedule()) { return false; }
|
||||
if (vtransform.has_store_to_load_forwarding_failure()) { return false; }
|
||||
|
||||
|
||||
@ -45,10 +45,11 @@
|
||||
flags(SW_PACKSET, "Trace SuperWord packset at different stages") \
|
||||
flags(SW_INFO, "Trace SuperWord info (equivalent to TraceSuperWord)") \
|
||||
flags(SW_VERBOSE, "Trace SuperWord verbose (all SW tags enabled)") \
|
||||
flags(VTRANSFORM, "Trace VTransform Graph") \
|
||||
flags(OPTIMIZATION, "Trace VTransform::optimize") \
|
||||
flags(ALIGN_VECTOR, "Trace AlignVector") \
|
||||
flags(SPECULATIVE_ALIASING_ANALYSIS, "Trace Speculative Aliasing Analysis") \
|
||||
flags(SPECULATIVE_RUNTIME_CHECKS, "Trace VTransform::apply_speculative_runtime_checks") \
|
||||
flags(VTRANSFORM, "Trace VTransform Graph") \
|
||||
flags(ALL, "Trace everything (very verbose)")
|
||||
|
||||
#define table_entry(name, description) name,
|
||||
|
||||
@ -205,6 +205,10 @@ public:
|
||||
return _vtrace.is_trace(TraceAutoVectorizationTag::POINTERS);
|
||||
}
|
||||
|
||||
bool is_trace_optimization() const {
|
||||
return _vtrace.is_trace(TraceAutoVectorizationTag::OPTIMIZATION);
|
||||
}
|
||||
|
||||
bool is_trace_speculative_runtime_checks() const {
|
||||
return _vtrace.is_trace(TraceAutoVectorizationTag::SPECULATIVE_RUNTIME_CHECKS);
|
||||
}
|
||||
|
||||
@ -292,121 +292,6 @@ int VectorNode::opcode(int sopc, BasicType bt) {
|
||||
}
|
||||
}
|
||||
|
||||
// Return the scalar opcode for the specified vector opcode
|
||||
// and basic type.
|
||||
int VectorNode::scalar_opcode(int sopc, BasicType bt) {
|
||||
switch (sopc) {
|
||||
case Op_AddReductionVI:
|
||||
case Op_AddVI:
|
||||
return Op_AddI;
|
||||
case Op_AddReductionVL:
|
||||
case Op_AddVL:
|
||||
return Op_AddL;
|
||||
case Op_MulReductionVI:
|
||||
case Op_MulVI:
|
||||
return Op_MulI;
|
||||
case Op_MulReductionVL:
|
||||
case Op_MulVL:
|
||||
return Op_MulL;
|
||||
case Op_AndReductionV:
|
||||
case Op_AndV:
|
||||
switch (bt) {
|
||||
case T_BOOLEAN:
|
||||
case T_CHAR:
|
||||
case T_BYTE:
|
||||
case T_SHORT:
|
||||
case T_INT:
|
||||
return Op_AndI;
|
||||
case T_LONG:
|
||||
return Op_AndL;
|
||||
default:
|
||||
assert(false, "basic type not handled");
|
||||
return 0;
|
||||
}
|
||||
case Op_OrReductionV:
|
||||
case Op_OrV:
|
||||
switch (bt) {
|
||||
case T_BOOLEAN:
|
||||
case T_CHAR:
|
||||
case T_BYTE:
|
||||
case T_SHORT:
|
||||
case T_INT:
|
||||
return Op_OrI;
|
||||
case T_LONG:
|
||||
return Op_OrL;
|
||||
default:
|
||||
assert(false, "basic type not handled");
|
||||
return 0;
|
||||
}
|
||||
case Op_XorReductionV:
|
||||
case Op_XorV:
|
||||
switch (bt) {
|
||||
case T_BOOLEAN:
|
||||
case T_CHAR:
|
||||
case T_BYTE:
|
||||
case T_SHORT:
|
||||
case T_INT:
|
||||
return Op_XorI;
|
||||
case T_LONG:
|
||||
return Op_XorL;
|
||||
default:
|
||||
assert(false, "basic type not handled");
|
||||
return 0;
|
||||
}
|
||||
case Op_MinReductionV:
|
||||
case Op_MinV:
|
||||
switch (bt) {
|
||||
case T_BOOLEAN:
|
||||
case T_CHAR:
|
||||
assert(false, "boolean and char are signed, not implemented for Min");
|
||||
return 0;
|
||||
case T_BYTE:
|
||||
case T_SHORT:
|
||||
case T_INT:
|
||||
return Op_MinI;
|
||||
case T_LONG:
|
||||
return Op_MinL;
|
||||
case T_FLOAT:
|
||||
return Op_MinF;
|
||||
case T_DOUBLE:
|
||||
return Op_MinD;
|
||||
default:
|
||||
assert(false, "basic type not handled");
|
||||
return 0;
|
||||
}
|
||||
case Op_MaxReductionV:
|
||||
case Op_MaxV:
|
||||
switch (bt) {
|
||||
case T_BOOLEAN:
|
||||
case T_CHAR:
|
||||
assert(false, "boolean and char are signed, not implemented for Max");
|
||||
return 0;
|
||||
case T_BYTE:
|
||||
case T_SHORT:
|
||||
case T_INT:
|
||||
return Op_MaxI;
|
||||
case T_LONG:
|
||||
return Op_MaxL;
|
||||
case T_FLOAT:
|
||||
return Op_MaxF;
|
||||
case T_DOUBLE:
|
||||
return Op_MaxD;
|
||||
default:
|
||||
assert(false, "basic type not handled");
|
||||
return 0;
|
||||
}
|
||||
case Op_MinVHF:
|
||||
return Op_MinHF;
|
||||
case Op_MaxVHF:
|
||||
return Op_MaxHF;
|
||||
default:
|
||||
assert(false,
|
||||
"Vector node %s is not handled in VectorNode::scalar_opcode",
|
||||
NodeClassNames[sopc]);
|
||||
return 0; // Unimplemented
|
||||
}
|
||||
}
|
||||
|
||||
// Limits on vector size (number of elements) for auto-vectorization.
|
||||
bool VectorNode::vector_size_supported_auto_vectorization(const BasicType bt, int size) {
|
||||
return Matcher::max_vector_size_auto_vectorization(bt) >= size &&
|
||||
@ -1727,6 +1612,34 @@ bool ReductionNode::implemented(int opc, uint vlen, BasicType bt) {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool ReductionNode::auto_vectorization_requires_strict_order(int vopc) {
|
||||
switch (vopc) {
|
||||
case Op_AddReductionVI:
|
||||
case Op_AddReductionVL:
|
||||
case Op_MulReductionVI:
|
||||
case Op_MulReductionVL:
|
||||
case Op_MinReductionV:
|
||||
case Op_MaxReductionV:
|
||||
case Op_AndReductionV:
|
||||
case Op_OrReductionV:
|
||||
case Op_XorReductionV:
|
||||
// These are cases that all have associative operations, which can
|
||||
// thus be reordered, allowing non-strict order reductions.
|
||||
return false;
|
||||
case Op_AddReductionVF:
|
||||
case Op_MulReductionVF:
|
||||
case Op_AddReductionVD:
|
||||
case Op_MulReductionVD:
|
||||
// Floating-point addition and multiplication are non-associative,
|
||||
// so AddReductionVF/D and MulReductionVF/D require strict ordering
|
||||
// in auto-vectorization.
|
||||
return true;
|
||||
default:
|
||||
assert(false, "not handled: %s", NodeClassNames[vopc]);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
MacroLogicVNode* MacroLogicVNode::make(PhaseGVN& gvn, Node* in1, Node* in2, Node* in3,
|
||||
Node* mask, uint truth_table, const TypeVect* vt) {
|
||||
assert(truth_table <= 0xFF, "invalid");
|
||||
|
||||
@ -95,7 +95,6 @@ class VectorNode : public TypeNode {
|
||||
static bool is_rotate_opcode(int opc);
|
||||
|
||||
static int opcode(int sopc, BasicType bt); // scalar_opc -> vector_opc
|
||||
static int scalar_opcode(int vopc, BasicType bt); // vector_opc -> scalar_opc
|
||||
|
||||
static int shift_count_opcode(int opc);
|
||||
|
||||
@ -283,6 +282,8 @@ class ReductionNode : public Node {
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool auto_vectorization_requires_strict_order(int vopc);
|
||||
|
||||
#ifndef PRODUCT
|
||||
void dump_spec(outputStream* st) const {
|
||||
if (requires_strict_order()) {
|
||||
|
||||
@ -23,6 +23,7 @@
|
||||
|
||||
#include "opto/castnode.hpp"
|
||||
#include "opto/convertnode.hpp"
|
||||
#include "opto/rootnode.hpp"
|
||||
#include "opto/vectorization.hpp"
|
||||
#include "opto/vectornode.hpp"
|
||||
#include "opto/vtransform.hpp"
|
||||
@ -32,6 +33,45 @@ void VTransformGraph::add_vtnode(VTransformNode* vtnode) {
|
||||
_vtnodes.push(vtnode);
|
||||
}
|
||||
|
||||
// Wraps `code` in NOT_PRODUCT and executes it only when the OPTIMIZATION trace
// tag is enabled. Requires a variable named `vtransform` to be in scope at the
// point of use.
#define TRACE_OPTIMIZE(code)                            \
  NOT_PRODUCT(                                          \
    if (vtransform.vloop().is_trace_optimization()) {   \
      code                                              \
    }                                                   \
  )
|
||||
|
||||
// This is similar to IGVN optimization. But we are a bit lazy, and don't care about
|
||||
// notification / worklist, since the list of nodes is rather small, and we don't
|
||||
// expect optimizations that trickle over the whole graph.
|
||||
void VTransformGraph::optimize(VTransform& vtransform) {
|
||||
TRACE_OPTIMIZE( tty->print_cr("\nVTransformGraph::optimize"); )
|
||||
|
||||
bool progress = true;
|
||||
DEBUG_ONLY(int pass_count = 0;)
|
||||
while (progress) {
|
||||
progress = false;
|
||||
assert(++pass_count < 10, "ensure we do not have endless loops");
|
||||
for (int i = 0; i < _vtnodes.length(); i++) {
|
||||
VTransformNode* vtn = _vtnodes.at(i);
|
||||
if (!vtn->is_alive()) { continue; }
|
||||
progress |= vtn->optimize(_vloop_analyzer, vtransform);
|
||||
|
||||
// Nodes that have no use any more are dead.
|
||||
if (vtn->out_strong_edges() == 0 &&
|
||||
// There are some exceptions:
|
||||
// 1. Memory phi uses are not modeled, so they appear to have no use here, but must be kept alive.
|
||||
// 2. Similarly, some stores may not have their memory uses modeled, but need to be kept alive.
|
||||
// 3. Outer node with strong inputs: is a use after the loop that we must keep alive.
|
||||
!(vtn->isa_LoopPhi() != nullptr ||
|
||||
vtn->is_load_or_store_in_loop() ||
|
||||
(vtn->isa_Outer() != nullptr && vtn->has_strong_in_edge()))) {
|
||||
vtn->mark_dead();
|
||||
progress = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Compute a linearization of the graph. We do this with a reverse-post-order of a DFS.
|
||||
// This only works if the graph is a directed acyclic graph (DAG). The C2 graph, and
|
||||
// the VLoopDependencyGraph are both DAGs, but after introduction of vectors/packs, the
|
||||
@ -59,10 +99,11 @@ bool VTransformGraph::schedule() {
|
||||
VectorSet post_visited;
|
||||
|
||||
collect_nodes_without_strong_in_edges(stack);
|
||||
const int num_alive_nodes = count_alive_vtnodes();
|
||||
|
||||
// We create a reverse-post-visit order. This gives us a linearization, if there are
|
||||
// no cycles. Then, we simply reverse the order, and we have a schedule.
|
||||
int rpo_idx = _vtnodes.length() - 1;
|
||||
int rpo_idx = num_alive_nodes - 1;
|
||||
while (!stack.is_empty()) {
|
||||
VTransformNode* vtn = stack.top();
|
||||
if (!pre_visited.test_set(vtn->_idx)) {
|
||||
@ -79,6 +120,9 @@ bool VTransformGraph::schedule() {
|
||||
for (uint i = 0; i < vtn->out_strong_edges(); i++) {
|
||||
VTransformNode* use = vtn->out_strong_edge(i);
|
||||
|
||||
// Skip dead nodes
|
||||
if (!use->is_alive()) { continue; }
|
||||
|
||||
// Skip LoopPhi backedge.
|
||||
if ((use->isa_LoopPhi() != nullptr || use->isa_CountedLoop() != nullptr) && use->in_req(2) == vtn) { continue; }
|
||||
|
||||
@ -121,6 +165,7 @@ bool VTransformGraph::schedule() {
|
||||
void VTransformGraph::collect_nodes_without_strong_in_edges(GrowableArray<VTransformNode*>& stack) const {
|
||||
for (int i = 0; i < _vtnodes.length(); i++) {
|
||||
VTransformNode* vtn = _vtnodes.at(i);
|
||||
if (!vtn->is_alive()) { continue; }
|
||||
if (!vtn->has_strong_in_edge()) {
|
||||
stack.push(vtn);
|
||||
}
|
||||
@ -132,6 +177,15 @@ void VTransformGraph::collect_nodes_without_strong_in_edges(GrowableArray<VTrans
|
||||
}
|
||||
}
|
||||
|
||||
// Returns how many of the graph's vtnodes are currently alive.
int VTransformGraph::count_alive_vtnodes() const {
  const int total = _vtnodes.length();
  int alive = 0;
  for (int idx = 0; idx < total; idx++) {
    alive += _vtnodes.at(idx)->is_alive() ? 1 : 0;
  }
  return alive;
}
|
||||
|
||||
#ifndef PRODUCT
|
||||
void VTransformGraph::trace_schedule_cycle(const GrowableArray<VTransformNode*>& stack,
|
||||
const VectorSet& pre_visited,
|
||||
@ -801,6 +855,13 @@ VTransformApplyResult VTransformLoopPhiNode::apply(VTransformApplyState& apply_s
|
||||
phase->igvn().replace_input_of(_node, 0, in0);
|
||||
phase->igvn().replace_input_of(_node, 1, in1);
|
||||
// Note: the backedge is hooked up later.
|
||||
|
||||
// The Phi's inputs may have been modified, and its type may have changed,
|
||||
// e.g. from scalar to vector.
|
||||
const Type* t = in1->bottom_type();
|
||||
_node->as_Type()->set_type(t);
|
||||
phase->igvn().set_type(_node, t);
|
||||
|
||||
return VTransformApplyResult::make_scalar(_node);
|
||||
}
|
||||
|
||||
@ -939,6 +1000,242 @@ VTransformApplyResult VTransformBoolVectorNode::apply(VTransformApplyState& appl
|
||||
return VTransformApplyResult::make_vector(vn);
|
||||
}
|
||||
|
||||
bool VTransformReductionVectorNode::optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) {
|
||||
return optimize_move_non_strict_order_reductions_out_of_loop(vloop_analyzer, vtransform);
|
||||
}
|
||||
|
||||
int VTransformReductionVectorNode::vector_reduction_opcode() const {
|
||||
return ReductionNode::opcode(scalar_opcode(), element_basic_type());
|
||||
}
|
||||
|
||||
bool VTransformReductionVectorNode::requires_strict_order() const {
|
||||
int vopc = vector_reduction_opcode();
|
||||
return ReductionNode::auto_vectorization_requires_strict_order(vopc);
|
||||
}
|
||||
|
||||
// Having ReductionNodes in the loop is expensive. They need to recursively
|
||||
// fold together the vector values, for every vectorized loop iteration. If
|
||||
// we encounter the following pattern, we can vector accumulate the values
|
||||
// inside the loop, and only have a single UnorderedReduction after the loop.
|
||||
//
|
||||
// Note: UnorderedReduction represents a ReductionNode which does not require
|
||||
// calculating in strict order.
|
||||
//
|
||||
// CountedLoop init
|
||||
// | |
|
||||
// +------+ | +------------------------+
|
||||
// | | | |
|
||||
// PhiNode (s) |
|
||||
// | |
|
||||
// | Vector |
|
||||
// | | |
|
||||
// UnorderedReduction (first_red) |
|
||||
// | |
|
||||
// ... Vector |
|
||||
// | | |
|
||||
// UnorderedReduction (last_red) |
|
||||
// | |
|
||||
// +----------------------+
|
||||
//
|
||||
// We patch the graph to look like this:
|
||||
//
|
||||
// CountedLoop identity_vector
|
||||
// | |
|
||||
// +-------+ | +---------------+
|
||||
// | | | |
|
||||
// PhiNode (v) |
|
||||
// | |
|
||||
// | Vector |
|
||||
// | | |
|
||||
// VectorAccumulator |
|
||||
// | |
|
||||
// ... Vector |
|
||||
// | | |
|
||||
// init VectorAccumulator |
|
||||
// | | | |
|
||||
// UnorderedReduction +-----------+
|
||||
//
|
||||
// We turned the scalar (s) Phi into a vectorized one (v). In the loop, we
|
||||
// use vector_accumulators, which do the same reductions, but only element
|
||||
// wise. This is a single operation per vector_accumulator, rather than many
|
||||
// for a UnorderedReduction. We can then reduce the last vector_accumulator
|
||||
// after the loop, and also reduce the init value into it.
|
||||
//
|
||||
// We can not do this with all reductions. Some reductions do not allow the
|
||||
// reordering of operations (for example float addition/multiplication require
|
||||
// strict order).
|
||||
//
|
||||
// Note: we must perform this optimization already during auto vectorization,
|
||||
// before we evaluate the cost-model. Without this optimization, we may
|
||||
// still have expensive reduction nodes in the loop which can make
|
||||
// vectorization unprofitable. Only with the optimization does vectorization
|
||||
// become profitable, since the expensive reduction node is moved
|
||||
// outside the loop, and instead cheaper element-wise vector accumulations
|
||||
// are performed inside the loop.
|
||||
// Checks whether the reduction chain starting at this node (the reduction that
// consumes the loop phi) can be moved out of the loop. This function only
// inspects the graph and optionally traces why it refuses; the actual graph
// edit is done by the caller once this returns true.
//
// Returns true only if:
//  - the scalar input of this node is a loop phi with a single use,
//  - the reduction does not require strict order,
//  - the element-wise vector op for the reduction is supported,
//  - walking up from the phi's backedge, every node is a reduction with the
//    same reduction opcode, element type and vector length, has a vector as
//    second input, and the chain ends at the phi,
//  - every reduction but the last has exactly one use, and the last one is
//    only used by loop phis and outer nodes.
bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop_preconditions(VTransform& vtransform) {
  // We have a phi with a single use.
  VTransformLoopPhiNode* phi = in_req(1)->isa_LoopPhi();
  if (phi == nullptr) {
    return false;
  }
  if (phi->out_strong_edges() != 1) {
    TRACE_OPTIMIZE(
      tty->print("  Cannot move out of loop, phi has multiple uses:");
      print();
      tty->print("  phi: ");
      phi->print();
    )
    return false;
  }

  if (requires_strict_order()) {
    TRACE_OPTIMIZE(
      tty->print("  Cannot move out of loop, strict order required: ");
      print();
    )
    return false;
  }

  const int sopc     = scalar_opcode();
  const uint vlen    = vector_length();
  const BasicType bt = element_basic_type();
  const int ropc     = vector_reduction_opcode();
  const int vopc     = VectorNode::opcode(sopc, bt);
  if (!Matcher::match_rule_supported_vector(vopc, vlen, bt)) {
    DEBUG_ONLY( this->print(); )
    assert(false, "do not have normal vector op for this reduction");
    return false; // not implemented
  }

  // Traverse up the chain of non strict order reductions, checking that it loops
  // back to the phi. Check that all non strict order reductions only have a single
  // use, except for the last (last_red), which only has phi as a use in the loop,
  // and all other uses are outside the loop.
  // Note: "this" is the single use of the phi, i.e. the first reduction of the chain.
  VTransformReductionVectorNode* last_red    = phi->in_req(2)->isa_ReductionVector();
  VTransformReductionVectorNode* current_red = last_red;
  while (true) {
    if (current_red == nullptr ||
        current_red->vector_reduction_opcode() != ropc ||
        current_red->element_basic_type() != bt ||
        current_red->vector_length() != vlen) {
      TRACE_OPTIMIZE(
        tty->print("  Cannot move out of loop, other reduction node does not match:");
        print();
        tty->print("  other: ");
        // current_red is nullptr when the node on the chain is not a
        // reduction vector at all; do not dereference it in that case.
        if (current_red != nullptr) {
          current_red->print();
        } else {
          tty->print_cr("nullptr");
        }
      )
      return false; // not compatible
    }

    VTransformVectorNode* vector_input = current_red->in_req(2)->isa_Vector();
    if (vector_input == nullptr) {
      assert(false, "reduction has a bad vector input");
      return false;
    }

    // Expect single use of the non strict order reduction. Except for the last_red.
    if (current_red == last_red) {
      // All uses must be outside loop body, except for the phi.
      for (uint i = 0; i < current_red->out_strong_edges(); i++) {
        VTransformNode* use = current_red->out_strong_edge(i);
        if (use->isa_LoopPhi() == nullptr &&
            use->isa_Outer() == nullptr) {
          // Should not be allowed by SuperWord::mark_reductions
          assert(false, "reduction has use inside loop");
          return false;
        }
      }
    } else {
      if (current_red->out_strong_edges() != 1) {
        TRACE_OPTIMIZE(
          tty->print("  Cannot move out of loop, other reduction node has use outside loop:");
          print();
          tty->print("  other: ");
          current_red->print();
        )
        return false; // Only single use allowed
      }
    }

    // If the scalar input is a phi, we passed all checks.
    VTransformNode* scalar_input = current_red->in_req(1);
    if (scalar_input == phi) {
      break;
    }

    // We expect another non strict reduction, verify it in the next iteration.
    current_red = scalar_input->isa_ReductionVector();
  }
  return true; // success
}
|
||||
|
||||
// Rewrites a chain of reductions that hangs off a loop phi so that the loop body only
// contains element-wise vector operations:
//  - phi input 1 is replaced by a replicated identity constant,
//  - for every reduction in the chain (from this node down to the one feeding phi input 2)
//    a new element-wise vector node is created that accumulates into a vector,
//  - the last accumulator is fed into phi input 2,
//  - the last reduction of the chain is rewired to take the original phi input 1 (init)
//    and the last accumulator as its inputs.
// Returns false if the preconditions helper rejects the chain, true after the graph was edited.
// NOTE(review): the results of isa_LoopPhi() / isa_ReductionVector() are dereferenced without
// null checks below - presumably guaranteed by the preconditions helper; confirm there.
bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) {
|
||||
// Bail out early: up to this point this function has not edited the graph.
if (!optimize_move_non_strict_order_reductions_out_of_loop_preconditions(vtransform)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// All checks were successful. Edit the vtransform graph now.
|
||||
TRACE_OPTIMIZE(
|
||||
tty->print_cr("VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop");
|
||||
)
|
||||
|
||||
const int sopc = scalar_opcode();
|
||||
const uint vlen = vector_length();
|
||||
const BasicType bt = element_basic_type();
|
||||
// vopc is the opcode handed to the element-wise accumulator nodes created in the loop below.
const int vopc = VectorNode::opcode(sopc, bt);
|
||||
PhaseIdealLoop* phase = vloop_analyzer.vloop().phase();
|
||||
|
||||
// Create a vector of identity values.
|
||||
Node* identity = ReductionNode::make_identity_con_scalar(phase->igvn(), sopc, bt);
|
||||
phase->set_root_as_ctrl(identity);
|
||||
// Wrap the scalar identity constant in a vtransform node ...
VTransformNode* vtn_identity = new (vtransform.arena()) VTransformOuterNode(vtransform, identity);
|
||||
|
||||
// ... and replicate it into a vector of vlen elements of type bt.
VTransformNode* vtn_identity_vector = new (vtransform.arena()) VTransformReplicateNode(vtransform, vlen, bt);
|
||||
vtn_identity_vector->init_req(1, vtn_identity);
|
||||
|
||||
// Turn the scalar phi into a vector phi.
|
||||
VTransformLoopPhiNode* phi = in_req(1)->isa_LoopPhi();
|
||||
// Remember the original phi input 1: it is re-attached to last_red at the end.
VTransformNode* init = phi->in_req(1);
|
||||
phi->set_req(1, vtn_identity_vector);
|
||||
|
||||
// Traverse down the chain of reductions, and replace them with vector_accumulators.
|
||||
VTransformReductionVectorNode* first_red = this;
|
||||
// last_red is the reduction currently feeding phi input 2 (the backedge, see below).
VTransformReductionVectorNode* last_red = phi->in_req(2)->isa_ReductionVector();
|
||||
VTransformReductionVectorNode* current_red = first_red;
|
||||
// The accumulator chain starts at the phi and grows by one node per visited reduction.
VTransformNode* current_vector_accumulator = phi;
|
||||
while (true) {
|
||||
VTransformNode* vector_input = current_red->in_req(2);
|
||||
// New node reuses the properties of the reduction it stands in for, but with opcode vopc.
// Input 1 is the running accumulator, input 2 the vector input of the reduction.
VTransformVectorNode* vector_accumulator = new (vtransform.arena()) VTransformElementWiseVectorNode(vtransform, 3, current_red->properties(), vopc);
|
||||
vector_accumulator->init_req(1, current_vector_accumulator);
|
||||
vector_accumulator->init_req(2, vector_input);
|
||||
TRACE_OPTIMIZE(
|
||||
tty->print(" replace ");
|
||||
current_red->print();
|
||||
|
||||
tty->print(" with ");
|
||||
vector_accumulator->print();
|
||||
)
|
||||
current_vector_accumulator = vector_accumulator;
|
||||
// Stop once the reduction that feeds the phi backedge has been handled.
if (current_red == last_red) { break; }
|
||||
// Step to the next reduction: the single strong use of the current one
// (unique_out_strong_edge asserts that there is exactly one).
current_red = current_red->unique_out_strong_edge()->isa_ReductionVector();
|
||||
}
|
||||
|
||||
// Feed vector accumulator into the backedge.
|
||||
phi->set_req(2, current_vector_accumulator);
|
||||
|
||||
// Create post-loop reduction. last_red keeps all uses outside the loop.
|
||||
// set_req replaces inputs that are already set: input 1 becomes the saved init value,
// input 2 the final vector accumulator.
last_red->set_req(1, init);
|
||||
last_red->set_req(2, current_vector_accumulator);
|
||||
|
||||
TRACE_OPTIMIZE(
|
||||
tty->print(" phi ");
|
||||
phi->print();
|
||||
|
||||
tty->print(" after loop ");
|
||||
last_red->print();
|
||||
)
|
||||
return true; // success
|
||||
}
|
||||
|
||||
VTransformApplyResult VTransformReductionVectorNode::apply(VTransformApplyState& apply_state) const {
|
||||
Node* init = apply_state.transformed_node(in_req(1));
|
||||
Node* vec = apply_state.transformed_node(in_req(2));
|
||||
@ -1041,7 +1338,7 @@ void VTransformNode::print() const {
|
||||
print_node_idx(_in.at(i));
|
||||
}
|
||||
}
|
||||
tty->print(") [");
|
||||
tty->print(") %s[", _is_alive ? "" : "dead ");
|
||||
for (uint i = 0; i < _out_end_strong_edges; i++) {
|
||||
print_node_idx(_out.at(i));
|
||||
}
|
||||
|
||||
@ -41,7 +41,11 @@
|
||||
// - Construction:
|
||||
// - From SuperWord PackSet, with the SuperWordVTransformBuilder.
|
||||
//
|
||||
// - Future Plans: optimize, if-conversion, etc.
|
||||
// - Optimize:
|
||||
// - Move non-strict order reductions out of the loop. This means we have
|
||||
// only element-wise operations inside the loop, rather than the much
|
||||
// more expensive lane-crossing reductions. We need to do this before
|
||||
// assessing profitability with the cost-model.
|
||||
//
|
||||
// - Schedule:
|
||||
// - Compute linearization of the VTransformGraph, into an order that respects
|
||||
@ -62,12 +66,12 @@
|
||||
//
|
||||
// Future Plans with VTransform:
|
||||
// - Cost model: estimate if vectorization is profitable.
|
||||
// - Optimizations: moving unordered reductions out of the loop, which decreases cost.
|
||||
// - Pack/Unpack/Shuffle: introduce additional nodes not present in the scalar loop.
|
||||
// This is difficult to do with the SuperWord packset approach.
|
||||
// - If-conversion: convert predicated nodes into CFG.
|
||||
|
||||
typedef int VTransformNodeIDX;
|
||||
class VTransform;
|
||||
class VTransformNode;
|
||||
class VTransformMemopScalarNode;
|
||||
class VTransformDataScalarNode;
|
||||
@ -183,6 +187,7 @@ public:
|
||||
const GrowableArray<VTransformNode*>& vtnodes() const { return _vtnodes; }
|
||||
const GrowableArray<VTransformNode*>& get_schedule() const { return _schedule; }
|
||||
|
||||
void optimize(VTransform& vtransform);
|
||||
bool schedule();
|
||||
bool has_store_to_load_forwarding_failure(const VLoopAnalyzer& vloop_analyzer) const;
|
||||
void apply_vectorization_for_each_vtnode(uint& max_vector_length, uint& max_vector_width) const;
|
||||
@ -194,6 +199,7 @@ private:
|
||||
bool in_bb(const Node* n) const { return _vloop.in_bb(n); }
|
||||
|
||||
void collect_nodes_without_strong_in_edges(GrowableArray<VTransformNode*>& stack) const;
|
||||
int count_alive_vtnodes() const;
|
||||
|
||||
#ifndef PRODUCT
|
||||
void print_vtnodes() const;
|
||||
@ -239,10 +245,12 @@ public:
|
||||
_aw_for_main_loop_alignment(aw_for_main_loop_alignment) {}
|
||||
|
||||
const VLoopAnalyzer& vloop_analyzer() const { return _vloop_analyzer; }
|
||||
const VLoop& vloop() const { return _vloop; }
|
||||
Arena* arena() { return &_arena; }
|
||||
DEBUG_ONLY( bool has_graph() const { return !_graph.is_empty(); } )
|
||||
VTransformGraph& graph() { return _graph; }
|
||||
|
||||
// Forwards to _graph.optimize, handing it this VTransform.
void optimize() {
  _graph.optimize(*this);
}
|
||||
bool schedule() { return _graph.schedule(); }
|
||||
bool has_store_to_load_forwarding_failure() const { return _graph.has_store_to_load_forwarding_failure(_vloop_analyzer); }
|
||||
void apply();
|
||||
@ -372,6 +380,8 @@ public:
|
||||
const VTransformNodeIDX _idx;
|
||||
|
||||
private:
|
||||
bool _is_alive;
|
||||
|
||||
// We split _in into 3 sections:
|
||||
// - data edges (req): _in[0 .. _req-1]
|
||||
// - strong memory edges: _in[_req .. _in_end_strong_memory_edges-1]
|
||||
@ -389,6 +399,7 @@ private:
|
||||
public:
|
||||
VTransformNode(VTransform& vtransform, const uint req) :
|
||||
_idx(vtransform.graph().new_idx()),
|
||||
_is_alive(true),
|
||||
_req(req),
|
||||
_in_end_strong_memory_edges(req),
|
||||
_in(vtransform.arena(), req, req, nullptr),
|
||||
@ -405,6 +416,14 @@ public:
|
||||
n->add_out_strong_edge(this);
|
||||
}
|
||||
|
||||
// Replace the req input at index i with n (n may be nullptr, which clears the slot).
// The strong out-edge lists of the previous and the new input are kept in sync.
void set_req(uint i, VTransformNode* n) {
  assert(i < _req, "must be a req");

  // Unhook this node from the input that currently occupies the slot, if any.
  VTransformNode* const previous_input = _in.at(i);
  if (previous_input != nullptr) {
    previous_input->del_out_strong_edge(this);
  }

  _in.at_put(i, n);

  // Register this node as a strong use of the new input, if any.
  if (n != nullptr) {
    n->add_out_strong_edge(this);
  }
}
|
||||
|
||||
void swap_req(uint i, uint j) {
|
||||
assert(i < _req, "must be a req");
|
||||
assert(j < _req, "must be a req");
|
||||
@ -452,6 +471,23 @@ private:
|
||||
_out.push(n);
|
||||
}
|
||||
|
||||
// Remove n from the strong out-edges of this node.
// _out stores the strong edges in [0 .. _out_end_strong_edges-1], followed by the
// remaining (weak) edges. The removal keeps both sections contiguous by moving the
// last element of each section one slot forward; element order is not preserved.
void del_out_strong_edge(VTransformNode* n) {
  const int i = _out.find(n);
  assert(0 <= i && i < (int)_out_end_strong_edges, "must be in strong edges");

  // Overwrite n with the last strong edge.
  const uint last_strong_idx = _out_end_strong_edges - 1;
  _out.at_put(i, _out.at(last_strong_idx));

  // If weak edges follow the strong section, the slot of the last strong edge is
  // refilled with the last weak edge, so that the pop below drops a duplicate.
  const bool has_weak_edges = _out_end_strong_edges < (uint)_out.length();
  if (has_weak_edges) {
    _out.at_put(last_strong_idx, _out.top());
  }

  _out.pop();
  _out_end_strong_edges--;
}
|
||||
|
||||
public:
|
||||
uint req() const { return _req; }
|
||||
uint out_strong_edges() const { return _out_end_strong_edges; }
|
||||
@ -479,6 +515,21 @@ public:
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns the one and only strong out-edge of this node.
// Asserts that the node has exactly one strong out-edge.
VTransformNode* unique_out_strong_edge() const {
  assert(out_strong_edges() == 1, "must be unique");
  VTransformNode* const only_strong_use = _out.at(0);
  return only_strong_use;
}
|
||||
|
||||
// True until mark_dead() has been called on this node (_is_alive is initialized to true in the constructor).
bool is_alive() const { return _is_alive; }
|
||||
|
||||
void mark_dead() {
|
||||
_is_alive = false;
|
||||
// Remove all inputs
|
||||
for (uint i = 0; i < req(); i++) {
|
||||
set_req(i, nullptr);
|
||||
}
|
||||
}
|
||||
|
||||
virtual VTransformMemopScalarNode* isa_MemopScalar() { return nullptr; }
|
||||
virtual VTransformLoopPhiNode* isa_LoopPhi() { return nullptr; }
|
||||
virtual VTransformCountedLoopNode* isa_CountedLoop() { return nullptr; }
|
||||
@ -496,6 +547,8 @@ public:
|
||||
virtual bool is_load_or_store_in_loop() const { return false; }
|
||||
virtual const VPointer& vpointer() const { ShouldNotReachHere(); }
|
||||
|
||||
// Per-node optimization hook. This default performs no optimization and returns false;
// VTransformReductionVectorNode overrides it.
// NOTE(review): presumably the return value reports whether the node edited the graph - confirm against the caller.
virtual bool optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) { return false; }
|
||||
|
||||
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const = 0;
|
||||
virtual void apply_backedge(VTransformApplyState& apply_state) const {};
|
||||
void apply_vtn_inputs_to_node(Node* n, VTransformApplyState& apply_state) const;
|
||||
@ -701,6 +754,7 @@ public:
|
||||
NOT_PRODUCT(virtual void print_spec() const override;)
|
||||
|
||||
protected:
|
||||
const VTransformVectorNodeProperties& properties() const { return _properties; }
|
||||
Node* approximate_origin() const { return _properties.approximate_origin(); }
|
||||
int scalar_opcode() const { return _properties.scalar_opcode(); }
|
||||
uint vector_length() const { return _properties.vector_length(); }
|
||||
@ -780,8 +834,15 @@ public:
|
||||
VTransformReductionVectorNode(VTransform& vtransform, const VTransformVectorNodeProperties properties) :
|
||||
VTransformVectorNode(vtransform, 3, properties) {}
|
||||
virtual VTransformReductionVectorNode* isa_ReductionVector() override { return this; }
|
||||
virtual bool optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) override;
|
||||
virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override;
|
||||
NOT_PRODUCT(virtual const char* name() const override { return "ReductionVector"; };)
|
||||
|
||||
private:
|
||||
int vector_reduction_opcode() const;
|
||||
bool requires_strict_order() const;
|
||||
bool optimize_move_non_strict_order_reductions_out_of_loop_preconditions(VTransform& vtransform);
|
||||
bool optimize_move_non_strict_order_reductions_out_of_loop(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform);
|
||||
};
|
||||
|
||||
class VTransformMemVectorNode : public VTransformVectorNode {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user