From 4786f8bee5c79c1bcf652758a25360b4d308ce1c Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 14 Oct 2025 08:32:32 +0000 Subject: [PATCH] 8369448: C2 SuperWord: refactor VTransform to do move_unordered_reduction_out_of_loop during VTransform::optimize Reviewed-by: chagedorn, kvn --- src/hotspot/share/opto/loopnode.cpp | 10 - src/hotspot/share/opto/loopnode.hpp | 3 - src/hotspot/share/opto/loopopts.cpp | 205 ------------ src/hotspot/share/opto/superword.cpp | 6 +- .../share/opto/traceAutoVectorizationTag.hpp | 3 +- src/hotspot/share/opto/vectorization.hpp | 4 + src/hotspot/share/opto/vectornode.cpp | 143 ++------- src/hotspot/share/opto/vectornode.hpp | 3 +- src/hotspot/share/opto/vtransform.cpp | 301 +++++++++++++++++- src/hotspot/share/opto/vtransform.hpp | 65 +++- 10 files changed, 402 insertions(+), 341 deletions(-) diff --git a/src/hotspot/share/opto/loopnode.cpp b/src/hotspot/share/opto/loopnode.cpp index a3e3be66583..4cb1862cbb9 100644 --- a/src/hotspot/share/opto/loopnode.cpp +++ b/src/hotspot/share/opto/loopnode.cpp @@ -5287,16 +5287,6 @@ void PhaseIdealLoop::build_and_optimize() { } } - // Move UnorderedReduction out of counted loop. Can be introduced by AutoVectorization. - if (C->has_loops() && !C->major_progress()) { - for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) { - IdealLoopTree* lpt = iter.current(); - if (lpt->is_counted() && lpt->is_innermost()) { - move_unordered_reduction_out_of_loop(lpt); - } - } - } - // Keep loop predicates and perform optimizations with them // until no more loop optimizations could be done. // After that switch predicates off and do more loop optimizations. 
diff --git a/src/hotspot/share/opto/loopnode.hpp b/src/hotspot/share/opto/loopnode.hpp index 2645df86d96..1101de81595 100644 --- a/src/hotspot/share/opto/loopnode.hpp +++ b/src/hotspot/share/opto/loopnode.hpp @@ -1550,9 +1550,6 @@ public: IfTrueNode* create_new_if_for_multiversion(IfTrueNode* multiversioning_fast_proj); bool try_resume_optimizations_for_delayed_slow_loop(IdealLoopTree* lpt); - // Move an unordered Reduction out of loop if possible - void move_unordered_reduction_out_of_loop(IdealLoopTree* loop); - // Create a scheduled list of nodes control dependent on ctrl set. void scheduled_nodelist( IdealLoopTree *loop, VectorSet& ctrl, Node_List &sched ); // Has a use in the vector set diff --git a/src/hotspot/share/opto/loopopts.cpp b/src/hotspot/share/opto/loopopts.cpp index a9baac394a2..ae7b318ece4 100644 --- a/src/hotspot/share/opto/loopopts.cpp +++ b/src/hotspot/share/opto/loopopts.cpp @@ -4548,211 +4548,6 @@ void PhaseIdealLoop::maybe_multiversion_for_auto_vectorization_runtime_checks(Id do_multiversioning(lpt, old_new); } -// Returns true if the Reduction node is unordered. -static bool is_unordered_reduction(Node* n) { - return n->is_Reduction() && !n->as_Reduction()->requires_strict_order(); -} - -// Having ReductionNodes in the loop is expensive. They need to recursively -// fold together the vector values, for every vectorized loop iteration. If -// we encounter the following pattern, we can vector accumulate the values -// inside the loop, and only have a single UnorderedReduction after the loop. -// -// Note: UnorderedReduction represents a ReductionNode which does not require -// calculating in strict order. -// -// CountedLoop init -// | | -// +------+ | +-----------------------+ -// | | | | -// PhiNode (s) | -// | | -// | Vector | -// | | | -// UnorderedReduction (first_ur) | -// | | -// ... 
Vector | -// | | | -// UnorderedReduction (last_ur) | -// | | -// +---------------------+ -// -// We patch the graph to look like this: -// -// CountedLoop identity_vector -// | | -// +-------+ | +---------------+ -// | | | | -// PhiNode (v) | -// | | -// | Vector | -// | | | -// VectorAccumulator | -// | | -// ... Vector | -// | | | -// init VectorAccumulator | -// | | | | -// UnorderedReduction +-----------+ -// -// We turned the scalar (s) Phi into a vectorized one (v). In the loop, we -// use vector_accumulators, which do the same reductions, but only element -// wise. This is a single operation per vector_accumulator, rather than many -// for a UnorderedReduction. We can then reduce the last vector_accumulator -// after the loop, and also reduce the init value into it. -// -// We can not do this with all reductions. Some reductions do not allow the -// reordering of operations (for example float addition/multiplication require -// strict order). -void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { - assert(!C->major_progress() && loop->is_counted() && loop->is_innermost(), "sanity"); - - // Find all Phi nodes with an unordered Reduction on backedge. - CountedLoopNode* cl = loop->_head->as_CountedLoop(); - for (DUIterator_Fast jmax, j = cl->fast_outs(jmax); j < jmax; j++) { - Node* phi = cl->fast_out(j); - // We have a phi with a single use, and an unordered Reduction on the backedge. 
- if (!phi->is_Phi() || phi->outcnt() != 1 || !is_unordered_reduction(phi->in(2))) { - continue; - } - - ReductionNode* last_ur = phi->in(2)->as_Reduction(); - assert(!last_ur->requires_strict_order(), "must be"); - - // Determine types - const TypeVect* vec_t = last_ur->vect_type(); - uint vector_length = vec_t->length(); - BasicType bt = vec_t->element_basic_type(); - - // Convert opcode from vector-reduction -> scalar -> normal-vector-op - const int sopc = VectorNode::scalar_opcode(last_ur->Opcode(), bt); - const int vopc = VectorNode::opcode(sopc, bt); - if (!Matcher::match_rule_supported_vector(vopc, vector_length, bt)) { - DEBUG_ONLY( last_ur->dump(); ) - assert(false, "do not have normal vector op for this reduction"); - continue; // not implemented -> fails - } - - // Traverse up the chain of unordered Reductions, checking that it loops back to - // the phi. Check that all unordered Reductions only have a single use, except for - // the last (last_ur), which only has phi as a use in the loop, and all other uses - // are outside the loop. - ReductionNode* current = last_ur; - ReductionNode* first_ur = nullptr; - while (true) { - assert(!current->requires_strict_order(), "sanity"); - - // Expect no ctrl and a vector_input from within the loop. - Node* ctrl = current->in(0); - Node* vector_input = current->in(2); - if (ctrl != nullptr || get_ctrl(vector_input) != cl) { - DEBUG_ONLY( current->dump(1); ) - assert(false, "reduction has ctrl or bad vector_input"); - break; // Chain traversal fails. - } - - assert(current->vect_type() != nullptr, "must have vector type"); - if (current->vect_type() != last_ur->vect_type()) { - // Reductions do not have the same vector type (length and element type). - break; // Chain traversal fails. - } - - // Expect single use of an unordered Reduction, except for last_ur. - if (current == last_ur) { - // Expect all uses to be outside the loop, except phi. 
- for (DUIterator_Fast kmax, k = current->fast_outs(kmax); k < kmax; k++) { - Node* use = current->fast_out(k); - if (use != phi && ctrl_or_self(use) == cl) { - DEBUG_ONLY( current->dump(-1); ) - assert(false, "reduction has use inside loop"); - // Should not be allowed by SuperWord::mark_reductions - return; // bail out of optimization - } - } - } else { - if (current->outcnt() != 1) { - break; // Chain traversal fails. - } - } - - // Expect another unordered Reduction or phi as the scalar input. - Node* scalar_input = current->in(1); - if (is_unordered_reduction(scalar_input) && - scalar_input->Opcode() == current->Opcode()) { - // Move up the unordered Reduction chain. - current = scalar_input->as_Reduction(); - assert(!current->requires_strict_order(), "must be"); - } else if (scalar_input == phi) { - // Chain terminates at phi. - first_ur = current; - current = nullptr; - break; // Success. - } else { - // scalar_input is neither phi nor a matching reduction - // Can for example be scalar reduction when we have - // partial vectorization. - break; // Chain traversal fails. - } - } - if (current != nullptr) { - // Chain traversal was not successful. - continue; - } - assert(first_ur != nullptr, "must have successfully terminated chain traversal"); - - Node* identity_scalar = ReductionNode::make_identity_con_scalar(_igvn, sopc, bt); - set_root_as_ctrl(identity_scalar); - VectorNode* identity_vector = VectorNode::scalar2vector(identity_scalar, vector_length, bt); - register_new_node(identity_vector, C->root()); - assert(vec_t == identity_vector->vect_type(), "matching vector type"); - VectorNode::trace_new_vector(identity_vector, "Unordered Reduction"); - - // Turn the scalar phi into a vector phi. - _igvn.rehash_node_delayed(phi); - Node* init = phi->in(1); // Remember init before replacing it. 
- phi->set_req_X(1, identity_vector, &_igvn); - phi->as_Type()->set_type(vec_t); - _igvn.set_type(phi, vec_t); - - // Traverse down the chain of unordered Reductions, and replace them with vector_accumulators. - current = first_ur; - while (true) { - // Create vector_accumulator to replace current. - Node* last_vector_accumulator = current->in(1); - Node* vector_input = current->in(2); - VectorNode* vector_accumulator = VectorNode::make(vopc, last_vector_accumulator, vector_input, vec_t); - register_new_node(vector_accumulator, cl); - _igvn.replace_node(current, vector_accumulator); - VectorNode::trace_new_vector(vector_accumulator, "Unordered Reduction"); - if (current == last_ur) { - break; - } - current = vector_accumulator->unique_out()->as_Reduction(); - assert(!current->requires_strict_order(), "must be"); - } - - // Create post-loop reduction. - Node* last_accumulator = phi->in(2); - Node* post_loop_reduction = ReductionNode::make(sopc, nullptr, init, last_accumulator, bt); - - // Take over uses of last_accumulator that are not in the loop. 
- for (DUIterator i = last_accumulator->outs(); last_accumulator->has_out(i); i++) { - Node* use = last_accumulator->out(i); - if (use != phi && use != post_loop_reduction) { - assert(ctrl_or_self(use) != cl, "use must be outside loop"); - use->replace_edge(last_accumulator, post_loop_reduction, &_igvn); - --i; - } - } - register_new_node(post_loop_reduction, get_late_ctrl(post_loop_reduction, cl)); - VectorNode::trace_new_vector(post_loop_reduction, "Unordered Reduction"); - - assert(last_accumulator->outcnt() == 2, "last_accumulator has 2 uses: phi and post_loop_reduction"); - assert(post_loop_reduction->outcnt() > 0, "should have taken over all non loop uses of last_accumulator"); - assert(phi->outcnt() == 1, "accumulator is the only use of phi"); - } -} - void DataNodeGraph::clone_data_nodes(Node* new_ctrl) { for (uint i = 0; i < _data_nodes.size(); i++) { clone(_data_nodes[i], new_ctrl); diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 41a4339e4c9..c0f005048ec 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -1606,7 +1606,7 @@ bool SuperWord::implemented(const Node_List* pack, const uint size) const { // 3 instructions (1 shuffle and two reduction ops). // However, this optimization assumes that these reductions stay in the loop // which may not be true any more in most cases after the introduction of: - // PhaseIdealLoop::move_unordered_reduction_out_of_loop + // See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop // Hence, this heuristic has room for improvement. bool is_two_element_int_or_long_reduction = (size == 2) && (arith_type->basic_type() == T_INT || @@ -1782,7 +1782,7 @@ bool SuperWord::profitable(const Node_List* p) const { // This heuristic is a bit simplistic, and assumes that the reduction // vector stays in the loop. 
But in some cases, we can move the // reduction out of the loop, replacing it with a single vector op. - // See: PhaseIdealLoop::move_unordered_reduction_out_of_loop + // See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop // Hence, this heuristic has room for improvement. #ifndef PRODUCT if (is_trace_superword_rejections()) { @@ -1947,6 +1947,8 @@ bool SuperWord::do_vtransform() const { SuperWordVTransformBuilder builder(_packset, vtransform); } + vtransform.optimize(); + if (!vtransform.schedule()) { return false; } if (vtransform.has_store_to_load_forwarding_failure()) { return false; } diff --git a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp index 6713ed6cac6..d996173aeb4 100644 --- a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp +++ b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp @@ -45,10 +45,11 @@ flags(SW_PACKSET, "Trace SuperWord packset at different stages") \ flags(SW_INFO, "Trace SuperWord info (equivalent to TraceSuperWord)") \ flags(SW_VERBOSE, "Trace SuperWord verbose (all SW tags enabled)") \ + flags(VTRANSFORM, "Trace VTransform Graph") \ + flags(OPTIMIZATION, "Trace VTransform::optimize") \ flags(ALIGN_VECTOR, "Trace AlignVector") \ flags(SPECULATIVE_ALIASING_ANALYSIS, "Trace Speculative Aliasing Analysis") \ flags(SPECULATIVE_RUNTIME_CHECKS, "Trace VTransform::apply_speculative_runtime_checks") \ - flags(VTRANSFORM, "Trace VTransform Graph") \ flags(ALL, "Trace everything (very verbose)") #define table_entry(name, description) name, diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index e006589cce9..b1be52d531a 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -205,6 +205,10 @@ public: return _vtrace.is_trace(TraceAutoVectorizationTag::POINTERS); } + bool is_trace_optimization() const { + return 
_vtrace.is_trace(TraceAutoVectorizationTag::OPTIMIZATION); + } + bool is_trace_speculative_runtime_checks() const { return _vtrace.is_trace(TraceAutoVectorizationTag::SPECULATIVE_RUNTIME_CHECKS); } diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index d656bf3127b..6ae8bbe8aa0 100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -292,121 +292,6 @@ int VectorNode::opcode(int sopc, BasicType bt) { } } -// Return the scalar opcode for the specified vector opcode -// and basic type. -int VectorNode::scalar_opcode(int sopc, BasicType bt) { - switch (sopc) { - case Op_AddReductionVI: - case Op_AddVI: - return Op_AddI; - case Op_AddReductionVL: - case Op_AddVL: - return Op_AddL; - case Op_MulReductionVI: - case Op_MulVI: - return Op_MulI; - case Op_MulReductionVL: - case Op_MulVL: - return Op_MulL; - case Op_AndReductionV: - case Op_AndV: - switch (bt) { - case T_BOOLEAN: - case T_CHAR: - case T_BYTE: - case T_SHORT: - case T_INT: - return Op_AndI; - case T_LONG: - return Op_AndL; - default: - assert(false, "basic type not handled"); - return 0; - } - case Op_OrReductionV: - case Op_OrV: - switch (bt) { - case T_BOOLEAN: - case T_CHAR: - case T_BYTE: - case T_SHORT: - case T_INT: - return Op_OrI; - case T_LONG: - return Op_OrL; - default: - assert(false, "basic type not handled"); - return 0; - } - case Op_XorReductionV: - case Op_XorV: - switch (bt) { - case T_BOOLEAN: - case T_CHAR: - case T_BYTE: - case T_SHORT: - case T_INT: - return Op_XorI; - case T_LONG: - return Op_XorL; - default: - assert(false, "basic type not handled"); - return 0; - } - case Op_MinReductionV: - case Op_MinV: - switch (bt) { - case T_BOOLEAN: - case T_CHAR: - assert(false, "boolean and char are signed, not implemented for Min"); - return 0; - case T_BYTE: - case T_SHORT: - case T_INT: - return Op_MinI; - case T_LONG: - return Op_MinL; - case T_FLOAT: - return Op_MinF; - case T_DOUBLE: - return Op_MinD; - 
default: - assert(false, "basic type not handled"); - return 0; - } - case Op_MaxReductionV: - case Op_MaxV: - switch (bt) { - case T_BOOLEAN: - case T_CHAR: - assert(false, "boolean and char are signed, not implemented for Max"); - return 0; - case T_BYTE: - case T_SHORT: - case T_INT: - return Op_MaxI; - case T_LONG: - return Op_MaxL; - case T_FLOAT: - return Op_MaxF; - case T_DOUBLE: - return Op_MaxD; - default: - assert(false, "basic type not handled"); - return 0; - } - case Op_MinVHF: - return Op_MinHF; - case Op_MaxVHF: - return Op_MaxHF; - default: - assert(false, - "Vector node %s is not handled in VectorNode::scalar_opcode", - NodeClassNames[sopc]); - return 0; // Unimplemented - } -} - // Limits on vector size (number of elements) for auto-vectorization. bool VectorNode::vector_size_supported_auto_vectorization(const BasicType bt, int size) { return Matcher::max_vector_size_auto_vectorization(bt) >= size && @@ -1727,6 +1612,34 @@ bool ReductionNode::implemented(int opc, uint vlen, BasicType bt) { return false; } +bool ReductionNode::auto_vectorization_requires_strict_order(int vopc) { + switch (vopc) { + case Op_AddReductionVI: + case Op_AddReductionVL: + case Op_MulReductionVI: + case Op_MulReductionVL: + case Op_MinReductionV: + case Op_MaxReductionV: + case Op_AndReductionV: + case Op_OrReductionV: + case Op_XorReductionV: + // These are cases that all have associative operations, which can + // thus be reordered, allowing non-strict order reductions. + return false; + case Op_AddReductionVF: + case Op_MulReductionVF: + case Op_AddReductionVD: + case Op_MulReductionVD: + // Floating-point addition and multiplication are non-associative, + // so AddReductionVF/D and MulReductionVF/D require strict ordering + // in auto-vectorization. 
+ return true; + default: + assert(false, "not handled: %s", NodeClassNames[vopc]); + return true; + } +} + MacroLogicVNode* MacroLogicVNode::make(PhaseGVN& gvn, Node* in1, Node* in2, Node* in3, Node* mask, uint truth_table, const TypeVect* vt) { assert(truth_table <= 0xFF, "invalid"); diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp index 53778b61d0e..427aeff53fc 100644 --- a/src/hotspot/share/opto/vectornode.hpp +++ b/src/hotspot/share/opto/vectornode.hpp @@ -95,7 +95,6 @@ class VectorNode : public TypeNode { static bool is_rotate_opcode(int opc); static int opcode(int sopc, BasicType bt); // scalar_opc -> vector_opc - static int scalar_opcode(int vopc, BasicType bt); // vector_opc -> scalar_opc static int shift_count_opcode(int opc); @@ -283,6 +282,8 @@ class ReductionNode : public Node { return false; } + static bool auto_vectorization_requires_strict_order(int vopc); + #ifndef PRODUCT void dump_spec(outputStream* st) const { if (requires_strict_order()) { diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 27c541c2732..46e8f43cb65 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -23,6 +23,7 @@ #include "opto/castnode.hpp" #include "opto/convertnode.hpp" +#include "opto/rootnode.hpp" #include "opto/vectorization.hpp" #include "opto/vectornode.hpp" #include "opto/vtransform.hpp" @@ -32,6 +33,45 @@ void VTransformGraph::add_vtnode(VTransformNode* vtnode) { _vtnodes.push(vtnode); } +#define TRACE_OPTIMIZE(code) \ + NOT_PRODUCT( \ + if (vtransform.vloop().is_trace_optimization()) { \ + code \ + } \ + ) + +// This is similar to IGVN optimization. But we are a bit lazy, and don't care about +// notification / worklist, since the list of nodes is rather small, and we don't +// expect optimizations that trickle over the whole graph. 
+void VTransformGraph::optimize(VTransform& vtransform) { + TRACE_OPTIMIZE( tty->print_cr("\nVTransformGraph::optimize"); ) + + bool progress = true; + DEBUG_ONLY(int pass_count = 0;) + while (progress) { + progress = false; + assert(++pass_count < 10, "ensure we do not have endless loops"); + for (int i = 0; i < _vtnodes.length(); i++) { + VTransformNode* vtn = _vtnodes.at(i); + if (!vtn->is_alive()) { continue; } + progress |= vtn->optimize(_vloop_analyzer, vtransform); + + // Nodes that have no use any more are dead. + if (vtn->out_strong_edges() == 0 && + // There are some exceptions: + // 1. Memory phi uses are not modeled, so they appear to have no use here, but must be kept alive. + // 2. Similarly, some stores may not have their memory uses modeled, but need to be kept alive. + // 3. Outer node with strong inputs: is a use after the loop that we must keep alive. + !(vtn->isa_LoopPhi() != nullptr || + vtn->is_load_or_store_in_loop() || + (vtn->isa_Outer() != nullptr && vtn->has_strong_in_edge()))) { + vtn->mark_dead(); + progress = true; + } + } + } +} + // Compute a linearization of the graph. We do this with a reverse-post-order of a DFS. // This only works if the graph is a directed acyclic graph (DAG). The C2 graph, and // the VLoopDependencyGraph are both DAGs, but after introduction of vectors/packs, the @@ -59,10 +99,11 @@ bool VTransformGraph::schedule() { VectorSet post_visited; collect_nodes_without_strong_in_edges(stack); + const int num_alive_nodes = count_alive_vtnodes(); // We create a reverse-post-visit order. This gives us a linearization, if there are // no cycles. Then, we simply reverse the order, and we have a schedule. 
+ int rpo_idx = num_alive_nodes - 1; while (!stack.is_empty()) { VTransformNode* vtn = stack.top(); if (!pre_visited.test_set(vtn->_idx)) { @@ -79,6 +120,9 @@ bool VTransformGraph::schedule() { for (uint i = 0; i < vtn->out_strong_edges(); i++) { VTransformNode* use = vtn->out_strong_edge(i); + // Skip dead nodes + if (!use->is_alive()) { continue; } + // Skip LoopPhi backedge. if ((use->isa_LoopPhi() != nullptr || use->isa_CountedLoop() != nullptr) && use->in_req(2) == vtn) { continue; } @@ -121,6 +165,7 @@ bool VTransformGraph::schedule() { void VTransformGraph::collect_nodes_without_strong_in_edges(GrowableArray& stack) const { for (int i = 0; i < _vtnodes.length(); i++) { VTransformNode* vtn = _vtnodes.at(i); + if (!vtn->is_alive()) { continue; } if (!vtn->has_strong_in_edge()) { stack.push(vtn); } @@ -132,6 +177,15 @@ void VTransformGraph::collect_nodes_without_strong_in_edges(GrowableArrayis_alive()) { count++; } + } + return count; +} + #ifndef PRODUCT void VTransformGraph::trace_schedule_cycle(const GrowableArray& stack, const VectorSet& pre_visited, @@ -801,6 +855,13 @@ VTransformApplyResult VTransformLoopPhiNode::apply(VTransformApplyState& apply_s phase->igvn().replace_input_of(_node, 0, in0); phase->igvn().replace_input_of(_node, 1, in1); // Note: the backedge is hooked up later. + + // The Phi's inputs may have been modified, and its type may have changed, + // e.g. from scalar to vector. 
+ const Type* t = in1->bottom_type(); + _node->as_Type()->set_type(t); + phase->igvn().set_type(_node, t); + return VTransformApplyResult::make_scalar(_node); } @@ -939,6 +1000,242 @@ VTransformApplyResult VTransformBoolVectorNode::apply(VTransformApplyState& appl return VTransformApplyResult::make_vector(vn); } +bool VTransformReductionVectorNode::optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) { + return optimize_move_non_strict_order_reductions_out_of_loop(vloop_analyzer, vtransform); +} + +int VTransformReductionVectorNode::vector_reduction_opcode() const { + return ReductionNode::opcode(scalar_opcode(), element_basic_type()); +} + +bool VTransformReductionVectorNode::requires_strict_order() const { + int vopc = vector_reduction_opcode(); + return ReductionNode::auto_vectorization_requires_strict_order(vopc); +} + +// Having ReductionNodes in the loop is expensive. They need to recursively +// fold together the vector values, for every vectorized loop iteration. If +// we encounter the following pattern, we can vector accumulate the values +// inside the loop, and only have a single UnorderedReduction after the loop. +// +// Note: UnorderedReduction represents a ReductionNode which does not require +// calculating in strict order. +// +// CountedLoop init +// | | +// +------+ | +------------------------+ +// | | | | +// PhiNode (s) | +// | | +// | Vector | +// | | | +// UnorderedReduction (first_red) | +// | | +// ... Vector | +// | | | +// UnorderedReduction (last_red) | +// | | +// +----------------------+ +// +// We patch the graph to look like this: +// +// CountedLoop identity_vector +// | | +// +-------+ | +---------------+ +// | | | | +// PhiNode (v) | +// | | +// | Vector | +// | | | +// VectorAccumulator | +// | | +// ... Vector | +// | | | +// init VectorAccumulator | +// | | | | +// UnorderedReduction +-----------+ +// +// We turned the scalar (s) Phi into a vectorized one (v). 
In the loop, we +// use vector_accumulators, which do the same reductions, but only +// element-wise. This is a single operation per vector_accumulator, rather than many +// for an UnorderedReduction. We can then reduce the last vector_accumulator +// after the loop, and also reduce the init value into it. +// +// We cannot do this with all reductions. Some reductions do not allow the +// reordering of operations (for example float addition/multiplication require +// strict order). +// +// Note: we must perform this optimization already during auto-vectorization, +// before we evaluate the cost-model. Without this optimization, we may +// still have expensive reduction nodes in the loop which can make +// vectorization unprofitable. Only with the optimization does vectorization +// become profitable, since the expensive reduction node is moved +// outside the loop, and instead cheaper element-wise vector accumulations +// are performed inside the loop. +bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop_preconditions(VTransform& vtransform) { + // We have a phi with a single use. 
+ VTransformLoopPhiNode* phi = in_req(1)->isa_LoopPhi(); + if (phi == nullptr) { + return false; + } + if (phi->out_strong_edges() != 1) { + TRACE_OPTIMIZE( + tty->print(" Cannot move out of loop, phi has multiple uses:"); + print(); + tty->print(" phi: "); + phi->print(); + ) + return false; + } + + if (requires_strict_order()) { + TRACE_OPTIMIZE( + tty->print(" Cannot move out of loop, strict order required: "); + print(); + ) + return false; + } + + const int sopc = scalar_opcode(); + const uint vlen = vector_length(); + const BasicType bt = element_basic_type(); + const int ropc = vector_reduction_opcode(); + const int vopc = VectorNode::opcode(sopc, bt); + if (!Matcher::match_rule_supported_vector(vopc, vlen, bt)) { + DEBUG_ONLY( this->print(); ) + assert(false, "do not have normal vector op for this reduction"); + return false; // not implemented + } + + // Traverse up the chain of non strict order reductions, checking that it loops + // back to the phi. Check that all non strict order reductions only have a single + // use, except for the last (last_red), which only has phi as a use in the loop, + // and all other uses are outside the loop. + VTransformReductionVectorNode* first_red = this; + VTransformReductionVectorNode* last_red = phi->in_req(2)->isa_ReductionVector(); + VTransformReductionVectorNode* current_red = last_red; + while (true) { + if (current_red == nullptr || + current_red->vector_reduction_opcode() != ropc || + current_red->element_basic_type() != bt || + current_red->vector_length() != vlen) { + TRACE_OPTIMIZE( + tty->print(" Cannot move out of loop, other reduction node does not match:"); + print(); + tty->print(" other: "); + current_red->print(); + ) + return false; // not compatible + } + + VTransformVectorNode* vector_input = current_red->in_req(2)->isa_Vector(); + if (vector_input == nullptr) { + assert(false, "reduction has a bad vector input"); + return false; + } + + // Expect single use of the non strict order reduction. 
Except for the last_red. + if (current_red == last_red) { + // All uses must be outside loop body, except for the phi. + for (uint i = 0; i < current_red->out_strong_edges(); i++) { + VTransformNode* use = current_red->out_strong_edge(i); + if (use->isa_LoopPhi() == nullptr && + use->isa_Outer() == nullptr) { + // Should not be allowed by SuperWord::mark_reductions + assert(false, "reduction has use inside loop"); + return false; + } + } + } else { + if (current_red->out_strong_edges() != 1) { + TRACE_OPTIMIZE( + tty->print(" Cannot move out of loop, other reduction node has use outside loop:"); + print(); + tty->print(" other: "); + current_red->print(); + ) + return false; // Only single use allowed + } + } + + // If the scalar input is a phi, we passed all checks. + VTransformNode* scalar_input = current_red->in_req(1); + if (scalar_input == phi) { + break; + } + + // We expect another non strict reduction, verify it in the next iteration. + current_red = scalar_input->isa_ReductionVector(); + } + return true; // success +} + +bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) { + if (!optimize_move_non_strict_order_reductions_out_of_loop_preconditions(vtransform)) { + return false; + } + + // All checks were successful. Edit the vtransform graph now. + TRACE_OPTIMIZE( + tty->print_cr("VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop"); + ) + + const int sopc = scalar_opcode(); + const uint vlen = vector_length(); + const BasicType bt = element_basic_type(); + const int vopc = VectorNode::opcode(sopc, bt); + PhaseIdealLoop* phase = vloop_analyzer.vloop().phase(); + + // Create a vector of identity values. 
+ Node* identity = ReductionNode::make_identity_con_scalar(phase->igvn(), sopc, bt); + phase->set_root_as_ctrl(identity); + VTransformNode* vtn_identity = new (vtransform.arena()) VTransformOuterNode(vtransform, identity); + + VTransformNode* vtn_identity_vector = new (vtransform.arena()) VTransformReplicateNode(vtransform, vlen, bt); + vtn_identity_vector->init_req(1, vtn_identity); + + // Turn the scalar phi into a vector phi. + VTransformLoopPhiNode* phi = in_req(1)->isa_LoopPhi(); + VTransformNode* init = phi->in_req(1); + phi->set_req(1, vtn_identity_vector); + + // Traverse down the chain of reductions, and replace them with vector_accumulators. + VTransformReductionVectorNode* first_red = this; + VTransformReductionVectorNode* last_red = phi->in_req(2)->isa_ReductionVector(); + VTransformReductionVectorNode* current_red = first_red; + VTransformNode* current_vector_accumulator = phi; + while (true) { + VTransformNode* vector_input = current_red->in_req(2); + VTransformVectorNode* vector_accumulator = new (vtransform.arena()) VTransformElementWiseVectorNode(vtransform, 3, current_red->properties(), vopc); + vector_accumulator->init_req(1, current_vector_accumulator); + vector_accumulator->init_req(2, vector_input); + TRACE_OPTIMIZE( + tty->print(" replace "); + current_red->print(); + tty->print(" with "); + vector_accumulator->print(); + ) + current_vector_accumulator = vector_accumulator; + if (current_red == last_red) { break; } + current_red = current_red->unique_out_strong_edge()->isa_ReductionVector(); + } + + // Feed vector accumulator into the backedge. + phi->set_req(2, current_vector_accumulator); + + // Create post-loop reduction. last_red keeps all uses outside the loop. 
+ last_red->set_req(1, init); + last_red->set_req(2, current_vector_accumulator); + + TRACE_OPTIMIZE( + tty->print(" phi "); + phi->print(); + tty->print(" after loop "); + last_red->print(); + ) + return true; // success +} + VTransformApplyResult VTransformReductionVectorNode::apply(VTransformApplyState& apply_state) const { Node* init = apply_state.transformed_node(in_req(1)); Node* vec = apply_state.transformed_node(in_req(2)); @@ -1041,7 +1338,7 @@ void VTransformNode::print() const { print_node_idx(_in.at(i)); } } - tty->print(") ["); + tty->print(") %s[", _is_alive ? "" : "dead "); for (uint i = 0; i < _out_end_strong_edges; i++) { print_node_idx(_out.at(i)); } diff --git a/src/hotspot/share/opto/vtransform.hpp b/src/hotspot/share/opto/vtransform.hpp index a004962eea7..7ad7b432e9b 100644 --- a/src/hotspot/share/opto/vtransform.hpp +++ b/src/hotspot/share/opto/vtransform.hpp @@ -41,7 +41,11 @@ // - Construction: // - From SuperWord PackSet, with the SuperWordVTransformBuilder. // -// - Future Plans: optimize, if-conversion, etc. +// - Optimize: +// - Move non-strict order reductions out of the loop. This means we have +// only element-wise operations inside the loop, rather than the much +// more expensive lane-crossing reductions. We need to do this before +// assessing profitability with the cost-model. // // - Schedule: // - Compute linearization of the VTransformGraph, into an order that respects @@ -62,12 +66,12 @@ // // Future Plans with VTransform: // - Cost model: estimate if vectorization is profitable. -// - Optimizations: moving unordered reductions out of the loop, whih decreases cost. // - Pack/Unpack/Shuffle: introduce additional nodes not present in the scalar loop. // This is difficult to do with the SuperWord packset approach. // - If-conversion: convert predicated nodes into CFG. 
typedef int VTransformNodeIDX; +class VTransform; class VTransformNode; class VTransformMemopScalarNode; class VTransformDataScalarNode; @@ -183,6 +187,7 @@ public: const GrowableArray& vtnodes() const { return _vtnodes; } const GrowableArray& get_schedule() const { return _schedule; } + void optimize(VTransform& vtransform); bool schedule(); bool has_store_to_load_forwarding_failure(const VLoopAnalyzer& vloop_analyzer) const; void apply_vectorization_for_each_vtnode(uint& max_vector_length, uint& max_vector_width) const; @@ -194,6 +199,7 @@ private: bool in_bb(const Node* n) const { return _vloop.in_bb(n); } void collect_nodes_without_strong_in_edges(GrowableArray& stack) const; + int count_alive_vtnodes() const; #ifndef PRODUCT void print_vtnodes() const; @@ -239,10 +245,12 @@ public: _aw_for_main_loop_alignment(aw_for_main_loop_alignment) {} const VLoopAnalyzer& vloop_analyzer() const { return _vloop_analyzer; } + const VLoop& vloop() const { return _vloop; } Arena* arena() { return &_arena; } DEBUG_ONLY( bool has_graph() const { return !_graph.is_empty(); } ) VTransformGraph& graph() { return _graph; } + void optimize() { return _graph.optimize(*this); } bool schedule() { return _graph.schedule(); } bool has_store_to_load_forwarding_failure() const { return _graph.has_store_to_load_forwarding_failure(_vloop_analyzer); } void apply(); @@ -372,6 +380,8 @@ public: const VTransformNodeIDX _idx; private: + bool _is_alive; + // We split _in into 3 sections: // - data edges (req): _in[0 .. _req-1] // - strong memory edges: _in[_req .. 
_in_end_strong_memory_edges-1] @@ -389,6 +399,7 @@ private: public: VTransformNode(VTransform& vtransform, const uint req) : _idx(vtransform.graph().new_idx()), + _is_alive(true), _req(req), _in_end_strong_memory_edges(req), _in(vtransform.arena(), req, req, nullptr), @@ -405,6 +416,14 @@ public: n->add_out_strong_edge(this); } + void set_req(uint i, VTransformNode* n) { + assert(i < _req, "must be a req"); + VTransformNode* old = _in.at(i); + if (old != nullptr) { old->del_out_strong_edge(this); } + _in.at_put(i, n); + if (n != nullptr) { n->add_out_strong_edge(this); } + } + void swap_req(uint i, uint j) { assert(i < _req, "must be a req"); assert(j < _req, "must be a req"); @@ -452,6 +471,23 @@ private: _out.push(n); } + void del_out_strong_edge(VTransformNode* n) { + int i = _out.find(n); + assert(0 <= i && i < (int)_out_end_strong_edges, "must be in strong edges"); + + // Replace n with the last strong edge. + VTransformNode* last_strong = _out.at(_out_end_strong_edges - 1); + _out.at_put(i, last_strong); + + if (_out_end_strong_edges < (uint)_out.length()) { + // Now replace where last_strong was with the last weak edge. 
+ VTransformNode* last_weak = _out.top(); + _out.at_put(_out_end_strong_edges - 1, last_weak); + } + _out.pop(); + _out_end_strong_edges--; + } + public: uint req() const { return _req; } uint out_strong_edges() const { return _out_end_strong_edges; } @@ -479,6 +515,21 @@ public: return false; } + VTransformNode* unique_out_strong_edge() const { + assert(out_strong_edges() == 1, "must be unique"); + return _out.at(0); + } + + bool is_alive() const { return _is_alive; } + + void mark_dead() { + _is_alive = false; + // Remove all inputs + for (uint i = 0; i < req(); i++) { + set_req(i, nullptr); + } + } + virtual VTransformMemopScalarNode* isa_MemopScalar() { return nullptr; } virtual VTransformLoopPhiNode* isa_LoopPhi() { return nullptr; } virtual VTransformCountedLoopNode* isa_CountedLoop() { return nullptr; } @@ -496,6 +547,8 @@ public: virtual bool is_load_or_store_in_loop() const { return false; } virtual const VPointer& vpointer() const { ShouldNotReachHere(); } + virtual bool optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) { return false; } + virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const = 0; virtual void apply_backedge(VTransformApplyState& apply_state) const {}; void apply_vtn_inputs_to_node(Node* n, VTransformApplyState& apply_state) const; @@ -701,6 +754,7 @@ public: NOT_PRODUCT(virtual void print_spec() const override;) protected: + const VTransformVectorNodeProperties& properties() const { return _properties; } Node* approximate_origin() const { return _properties.approximate_origin(); } int scalar_opcode() const { return _properties.scalar_opcode(); } uint vector_length() const { return _properties.vector_length(); } @@ -780,8 +834,15 @@ public: VTransformReductionVectorNode(VTransform& vtransform, const VTransformVectorNodeProperties properties) : VTransformVectorNode(vtransform, 3, properties) {} virtual VTransformReductionVectorNode* isa_ReductionVector() override { return this; } + virtual bool 
optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "ReductionVector"; };) + +private: + int vector_reduction_opcode() const; + bool requires_strict_order() const; + bool optimize_move_non_strict_order_reductions_out_of_loop_preconditions(VTransform& vtransform); + bool optimize_move_non_strict_order_reductions_out_of_loop(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform); }; class VTransformMemVectorNode : public VTransformVectorNode {