diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index f78a9b63926..ba2dd423bf5 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -22,22 +22,13 @@ */ #include "precompiled.hpp" -#include "libadt/vectset.hpp" -#include "memory/allocation.inline.hpp" -#include "memory/resourceArea.hpp" #include "opto/addnode.hpp" -#include "opto/c2compiler.hpp" #include "opto/castnode.hpp" #include "opto/convertnode.hpp" -#include "opto/matcher.hpp" -#include "opto/memnode.hpp" -#include "opto/opcodes.hpp" -#include "opto/opaquenode.hpp" -#include "opto/rootnode.hpp" #include "opto/superword.hpp" +#include "opto/superwordVTransformBuilder.hpp" #include "opto/vectornode.hpp" #include "opto/movenode.hpp" -#include "utilities/powerOfTwo.hpp" SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer) : _vloop_analyzer(vloop_analyzer), @@ -707,7 +698,7 @@ bool SuperWord::can_pack_into_pair(Node* s1, Node* s2) { } // Forbid anything that looks like a PopulateIndex to be packed. It does not need to be packed, - // and will still be vectorized by SuperWord::vector_opd. + // and will still be vectorized by SuperWordVTransformBuilder::get_or_make_vtnode_vector_input_at_index. if (isomorphic(s1, s2) && !is_populate_index(s1, s2)) { if ((independent(s1, s2) && have_similar_inputs(s1, s2)) || reduction(s1, s2)) { if (!_pairset.is_left(s1) && !_pairset.is_right(s2)) { @@ -769,8 +760,9 @@ bool SuperWord::isomorphic(Node* s1, Node* s2) { } } -// Look for pattern n1 = (iv + c) and n2 = (iv + c + 1), which may lead to PopulateIndex vector node. -// We skip the pack creation of these nodes. They will be vectorized by SuperWord::vector_opd. +// Look for pattern n1 = (iv + c) and n2 = (iv + c + 1), which may lead to +// PopulateIndex vector node. We skip the pack creation of these nodes. They +// will be vectorized by SuperWordVTransformBuilder::get_or_make_vtnode_vector_input_at_index. 
bool SuperWord::is_populate_index(const Node* n1, const Node* n2) const { return n1->is_Add() && n2->is_Add() && @@ -1858,307 +1850,74 @@ void PackSet::verify() const { } #endif -// The PacksetGraph combines the dependency graph with the packset. In the PackSet -// graph, we have two kinds of nodes: -// (1) pack-node: Represents all nodes of some pack p in a single node, which -// shall later become a vector node. -// (2) scalar-node: Represents a node that is not in any pack. -// For any edge (n1, n2) in the dependency graph, we add an edge to the PacksetGraph for -// the PacksetGraph nodes corresponding to n1 and n2. -// We work from the dependency graph, because it gives us all the data-dependencies, -// as well as more refined memory-dependencies than the C2 graph. The dependency graph -// does not have cycles. But packing nodes can introduce cyclic dependencies. Example: -// -// +--------+ -// A -> X | v -// Pack [A,B] and [X,Y] [A,B] [X,Y] -// Y -> B ^ | -// +--------+ -// -class PacksetGraph { -private: - // pid: packset graph node id. - GrowableArray _pid; // bb_idx(n) -> pid - GrowableArray _pid_to_node; // one node per pid, find rest via _packset.pack - GrowableArray> _out; // out-edges - GrowableArray _incnt; // number of (implicit) in-edges - int _max_pid = 0; +bool SuperWord::schedule_and_apply() const { + if (_packset.is_empty()) { return false; } - bool _schedule_success; - - SuperWord* _slp; -public: - PacksetGraph(SuperWord* slp) - : _pid(8, 0, /* default */ 0), _slp(slp) { - } - // Get pid, if there is a packset node that n belongs to. Else return 0. 
- int get_pid_or_zero(const Node* n) const { - if (!_slp->in_bb(n)) { - return 0; - } - int idx = _slp->bb_idx(n); - if (idx >= _pid.length()) { - return 0; - } else { - return _pid.at(idx); - } - } - int get_pid(const Node* n) { - int poz = get_pid_or_zero(n); - assert(poz != 0, "pid should not be zero"); - return poz; - } - void set_pid(Node* n, int pid) { - assert(n != nullptr && pid > 0, "sane inputs"); - assert(_slp->in_bb(n), "must be"); - int idx = _slp->bb_idx(n); - _pid.at_put_grow(idx, pid); - _pid_to_node.at_put_grow(pid - 1, n, nullptr); - } - Node* get_node(int pid) { - assert(pid > 0 && pid <= _pid_to_node.length(), "pid must be mapped"); - Node* n = _pid_to_node.at(pid - 1); - assert(n != nullptr, "sanity"); - return n; - } - int new_pid() { - _incnt.push(0); - _out.push(GrowableArray()); - return ++_max_pid; - } - int incnt(int pid) { return _incnt.at(pid - 1); } - void incnt_set(int pid, int cnt) { return _incnt.at_put(pid - 1, cnt); } - GrowableArray& out(int pid) { return _out.at(pid - 1); } - bool schedule_success() const { return _schedule_success; } - - // Create nodes (from packs and scalar-nodes), and add edges, based on the dependency graph. 
- void build() { - const PackSet& packset = _slp->packset(); - const GrowableArray& body = _slp->body(); - // Map nodes in packsets - for (int i = 0; i < packset.length(); i++) { - Node_List* p = packset.at(i); - int pid = new_pid(); - for (uint k = 0; k < p->size(); k++) { - Node* n = p->at(k); - set_pid(n, pid); - assert(packset.get_pack(n) == p, "matching packset"); - } - } - - int max_pid_packset = _max_pid; - - // Map nodes not in packset - for (int i = 0; i < body.length(); i++) { - Node* n = body.at(i); - if (n->is_Phi() || n->is_CFG()) { - continue; // ignore control flow - } - int pid = get_pid_or_zero(n); - if (pid == 0) { - pid = new_pid(); - set_pid(n, pid); - assert(packset.get_pack(n) == nullptr, "no packset"); - } - } - - // Map edges for packset nodes - VectorSet set; - for (int i = 0; i < packset.length(); i++) { - Node_List* p = packset.at(i); - set.clear(); - int pid = get_pid(p->at(0)); - for (uint k = 0; k < p->size(); k++) { - Node* n = p->at(k); - assert(pid == get_pid(n), "all nodes in pack have same pid"); - for (VLoopDependencyGraph::PredsIterator preds(_slp->dependency_graph(), n); !preds.done(); preds.next()) { - Node* pred = preds.current(); - int pred_pid = get_pid_or_zero(pred); - if (pred_pid == pid && _slp->is_marked_reduction(n)) { - continue; // reduction -> self-cycle is not a cyclic dependency - } - // Only add edges once, and only for mapped nodes (in body) - if (pred_pid > 0 && !set.test_set(pred_pid)) { - incnt_set(pid, incnt(pid) + 1); // increment - out(pred_pid).push(pid); - } - } - } - } - - // Map edges for nodes not in packset - for (int i = 0; i < body.length(); i++) { - Node* n = body.at(i); - int pid = get_pid_or_zero(n); // zero for Phi or CFG - if (pid <= max_pid_packset) { - continue; // Only scalar-nodes - } - for (VLoopDependencyGraph::PredsIterator preds(_slp->dependency_graph(), n); !preds.done(); preds.next()) { - Node* pred = preds.current(); - int pred_pid = get_pid_or_zero(pred); - // Only add edges for 
mapped nodes (in body) - if (pred_pid > 0) { - incnt_set(pid, incnt(pid) + 1); // increment - out(pred_pid).push(pid); - } - } - } - } - - // Schedule nodes of PacksetGraph to worklist, using topsort: schedule a node - // that has zero incnt. If a PacksetGraph node corresponds to memops, then add - // those to the memops_schedule. At the end, we return the memops_schedule, and - // note if topsort was successful. - Node_List schedule() { - Node_List memops_schedule; - GrowableArray worklist; - // Directly schedule all nodes without precedence - for (int pid = 1; pid <= _max_pid; pid++) { - if (incnt(pid) == 0) { - worklist.push(pid); - } - } - // Continue scheduling via topological sort - for (int i = 0; i < worklist.length(); i++) { - int pid = worklist.at(i); - - // Add memops to memops_schedule - Node* n = get_node(pid); - Node_List* p = _slp->packset().get_pack(n); - if (n->is_Mem()) { - if (p == nullptr) { - memops_schedule.push(n); - } else { - for (uint k = 0; k < p->size(); k++) { - memops_schedule.push(p->at(k)); - assert(p->at(k)->is_Mem(), "only schedule memops"); - } - } - } - - // Decrement incnt for all successors - for (int j = 0; j < out(pid).length(); j++){ - int pid_use = out(pid).at(j); - int incnt_use = incnt(pid_use) - 1; - incnt_set(pid_use, incnt_use); - // Did use lose its last input? - if (incnt_use == 0) { - worklist.push(pid_use); - } - } - } - - // Was every pid scheduled? If not, we found some cycles in the PacksetGraph. - _schedule_success = (worklist.length() == _max_pid); - return memops_schedule; - } - - // Print the PacksetGraph. - // print_nodes = true: print all C2 nodes beloning to PacksetGrahp node. - // print_zero_incnt = false: do not print nodes that have no in-edges (any more). 
- void print(bool print_nodes, bool print_zero_incnt) { - const GrowableArray &body = _slp->body(); - tty->print_cr("PacksetGraph"); - for (int pid = 1; pid <= _max_pid; pid++) { - if (incnt(pid) == 0 && !print_zero_incnt) { - continue; - } - tty->print("Node %d. incnt %d [", pid, incnt(pid)); - for (int j = 0; j < out(pid).length(); j++) { - tty->print("%d ", out(pid).at(j)); - } - tty->print_cr("]"); + // Make an empty transform. #ifndef PRODUCT - if (print_nodes) { - for (int i = 0; i < body.length(); i++) { - Node* n = body.at(i); - if (get_pid_or_zero(n) == pid) { - tty->print(" "); - n->dump(); - } - } - } + VTransformTrace trace(_vloop.vtrace(), + is_trace_superword_rejections(), + is_trace_align_vector(), + is_trace_superword_info()); #endif - } - } -}; + VTransform vtransform(_vloop_analyzer, + _mem_ref_for_main_loop_alignment, + _aw_for_main_loop_alignment + NOT_PRODUCT(COMMA trace) + ); -// We want to replace the packed scalars from the PackSet and replace them -// with vector operations. This requires scheduling and re-ordering the memory -// graph. We take these steps: -// (1) Build the PacksetGraph. It combines the dependency graph with the -// packset. The PacksetGraph gives us the dependencies that must be -// respected after scheduling. -// (2) Schedule the PacksetGraph to the memops_schedule, which represents -// a linear order of all memops in the body. The order respects the -// dependencies of the PacksetGraph. -// (3) If the PacksetGraph has cycles, we cannot schedule. Abort. -// (4) Apply the vectorization, including re-ordering the memops and replacing -// packed scalars with vector operations. -bool SuperWord::schedule_and_apply() { - if (_packset.is_empty()) { - return false; - } - ResourceMark rm; - - // (1) Build the PacksetGraph. - PacksetGraph graph(this); - graph.build(); - - // (2) Schedule the PacksetGraph. - Node_List memops_schedule = graph.schedule(); - - // (3) Check if the PacksetGraph schedule succeeded (had no cycles). 
- // We now know that we only have independent packs, see verify_packs. - // This is a necessary but not a sufficient condition for an acyclic - // graph (DAG) after scheduling. Thus, we must check if the packs have - // introduced a cycle. The SuperWord paper mentions the need for this - // in "3.7 Scheduling". - if (!graph.schedule_success()) { -#ifndef PRODUCT - if (is_trace_superword_rejections()) { - tty->print_cr("SuperWord::schedule found cycle in PacksetGraph:"); - graph.print(true, false); - tty->print_cr("removing all packs from packset."); - } -#endif - _packset.clear(); - return false; + // Build the transform from the packset. + { + ResourceMark rm; + SuperWordVTransformBuilder builder(_packset, vtransform); } - // (4) Apply the vectorization, including re-ordering the memops. - return apply(memops_schedule); + if (!vtransform.schedule()) { return false; } + vtransform.apply(); + return true; } -bool SuperWord::apply(Node_List& memops_schedule) { - Compile* C = phase()->C; - CountedLoopNode* cl = lpt()->_head->as_CountedLoop(); - C->print_method(PHASE_AUTO_VECTORIZATION1_BEFORE_APPLY, 4, cl); +// Apply the vectorization, i.e. we irreversibly edit the C2 graph. At this point, all +// correctness and profitability checks have passed, and the graph was successfully scheduled. 
+void VTransform::apply() { +#ifndef PRODUCT + if (_trace._info || TraceLoopOpts) { + tty->print_cr("\nVTransform::apply:"); + lpt()->dump_head(); + lpt()->head()->dump(); + } + assert(cl()->is_main_loop(), "auto vectorization only for main loops"); + assert(_graph.is_scheduled(), "must already be scheduled"); +#endif - apply_memops_reordering_with_schedule(memops_schedule); - C->print_method(PHASE_AUTO_VECTORIZATION2_AFTER_REORDER, 4, cl); + Compile* C = phase()->C; + C->print_method(PHASE_AUTO_VECTORIZATION1_BEFORE_APPLY, 4, cl()); + + _graph.apply_memops_reordering_with_schedule(); + C->print_method(PHASE_AUTO_VECTORIZATION2_AFTER_REORDER, 4, cl()); adjust_pre_loop_limit_to_align_main_loop_vectors(); - C->print_method(PHASE_AUTO_VECTORIZATION3_AFTER_ADJUST_LIMIT, 4, cl); + C->print_method(PHASE_AUTO_VECTORIZATION3_AFTER_ADJUST_LIMIT, 4, cl()); - bool is_success = apply_vectorization(); - C->print_method(PHASE_AUTO_VECTORIZATION4_AFTER_APPLY, 4, cl); - - return is_success; + apply_vectorization(); + C->print_method(PHASE_AUTO_VECTORIZATION4_AFTER_APPLY, 4, cl()); } -// Reorder the memory graph for all slices in parallel. We walk over the schedule once, -// and track the current memory state of each slice. -void SuperWord::apply_memops_reordering_with_schedule(Node_List& memops_schedule) { +// We prepare the memory graph for the replacement of scalar memops with vector memops. +// We reorder all slices in parallel, ensuring that the memops inside each slice are +// ordered according to the _schedule. This means that all packed memops are consecutive +// in the memory graph after the reordering. 
+void VTransformGraph::apply_memops_reordering_with_schedule() const { #ifndef PRODUCT - if (is_trace_superword_info()) { - tty->print_cr("\nSuperWord::apply_memops_reordering_with_schedule:"); - memops_schedule.dump(); + assert(is_scheduled(), "must be already scheduled"); + if (_trace._info) { + print_memops_schedule(); } #endif + ResourceMark rm; int max_slices = phase()->C->num_alias_types(); - // When iterating over the memops_schedule, we keep track of the current memory state, + // When iterating over the schedule, we keep track of the current memory state, // which is the Phi or a store in the loop. GrowableArray current_state_in_slice(max_slices, max_slices, nullptr); // The memory state after the loop is the last store inside the loop. If we reorder the @@ -2179,10 +1938,9 @@ void SuperWord::apply_memops_reordering_with_schedule(Node_List& memops_schedule old_last_store_in_slice.at_put(alias_idx, last_store); } - // (2) Walk over memops_schedule, append memops to the current state + // (2) Walk over schedule, append memops to the current state // of that slice. If it is a Store, we take it as the new state. - for (uint i = 0; i < memops_schedule.size(); i++) { - MemNode* n = memops_schedule.at(i)->as_Mem(); + for_each_memop_in_schedule([&] (MemNode* n) { assert(n->is_Load() || n->is_Store(), "only loads or stores"); int alias_idx = phase()->C->get_alias_index(n->adr_type()); Node* current_state = current_state_in_slice.at(alias_idx); @@ -2198,12 +1956,12 @@ void SuperWord::apply_memops_reordering_with_schedule(Node_List& memops_schedule current_state_in_slice.at_put(alias_idx, n); } } - } + }); // (3) For each slice, we add the current state to the backedge // in the Phi. Further, we replace uses of the old last store // with uses of the new last store (current_state). 
- Node_List uses_after_loop; + GrowableArray uses_after_loop; for (int i = 0; i < mem_slice_head.length(); i++) { Node* phi = mem_slice_head.at(i); int alias_idx = phase()->C->get_alias_index(phi->adr_type()); @@ -2225,7 +1983,7 @@ void SuperWord::apply_memops_reordering_with_schedule(Node_List& memops_schedule uses_after_loop.push(use); } } - for (uint k = 0; k < uses_after_loop.size(); k++) { + for (int k = 0; k < uses_after_loop.length(); k++) { Node* use = uses_after_loop.at(k); for (uint j = 0; j < use->req(); j++) { Node* def = use->in(j); @@ -2237,396 +1995,65 @@ void SuperWord::apply_memops_reordering_with_schedule(Node_List& memops_schedule } } -// Convert packs into vector node operations -// At this point, all correctness and profitability checks have passed. -// We start the irreversible process of editing the C2 graph. Should -// there be an unexpected situation (assert fails), then we can only -// bail out of the compilation, as the graph has already been partially -// modified. We bail out, and retry without SuperWord. -bool SuperWord::apply_vectorization() { - CountedLoopNode *cl = lpt()->_head->as_CountedLoop(); - assert(cl->is_main_loop(), "SLP should only work on main loops"); +void VTransformGraph::apply_vectorization_for_each_vtnode(uint& max_vector_length, uint& max_vector_width) const { + ResourceMark rm; + // We keep track of the resulting Nodes from every "VTransformNode::apply" call. + // Since "apply" is called on defs before uses, this allows us to find the + // generated def (input) nodes when we are generating the use nodes in "apply". 
+ int length = _vtnodes.length(); + GrowableArray vtnode_idx_to_transformed_node(length, length, nullptr); + + for (int i = 0; i < _schedule.length(); i++) { + VTransformNode* vtn = _schedule.at(i); + VTransformApplyResult result = vtn->apply(_vloop_analyzer, + vtnode_idx_to_transformed_node); + NOT_PRODUCT( if (_trace._verbose) { result.trace(vtn); } ) + + vtnode_idx_to_transformed_node.at_put(vtn->_idx, result.node()); + max_vector_length = MAX2(max_vector_length, result.vector_length()); + max_vector_width = MAX2(max_vector_width, result.vector_width()); + } +} + +// We call "apply" on every VTransformNode, which replaces the packed scalar nodes with vector nodes. +void VTransform::apply_vectorization() const { Compile* C = phase()->C; - assert(!_packset.is_empty(), "vectorization requires non-empty packset"); - #ifndef PRODUCT - if (TraceLoopOpts) { - tty->print("SuperWord::apply_vectorization "); - lpt()->dump_head(); + if (_trace._verbose) { + tty->print_cr("\nVTransform::apply_vectorization:"); } #endif - uint max_vlen_in_bytes = 0; - uint max_vlen = 0; + uint max_vector_length = 0; // number of elements + uint max_vector_width = 0; // total width in bytes + _graph.apply_vectorization_for_each_vtnode(max_vector_length, max_vector_width); - for (int i = 0; i < body().length(); i++) { - Node* n = body().at(i); - Node_List* p = get_pack(n); - if (p != nullptr && n == p->at(p->size()-1)) { - // After apply_memops_reordering_with_schedule, we know that the memops have the same order in the pack - // as in the memory slice. Hence, "first" is the first memop in the slice from the pack, - // and "n" is the last node in the slice from the pack. - Node* first = p->at(0); - uint vlen = p->size(); - uint vlen_in_bytes = 0; - Node* vn = nullptr; - int opc = n->Opcode(); - if (n->is_Load()) { - Node* ctl = n->in(MemNode::Control); - Node* mem = first->in(MemNode::Memory); - // Set the memory dependency of the LoadVector as early as possible. 
- // Walk up the memory chain, and ignore any StoreVector that provably - // does not have any memory dependency. - while (mem->is_StoreVector()) { - VPointer p_store(mem->as_Mem(), _vloop); - if (p_store.overlap_possible_with_any_in(p)) { - break; - } else { - mem = mem->in(MemNode::Memory); - } - } - Node* adr = first->in(MemNode::Address); - const TypePtr* atyp = n->adr_type(); - vn = LoadVectorNode::make(opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n), control_dependency(p)); - vlen_in_bytes = vn->as_LoadVector()->memory_size(); - } else if (n->is_Store()) { - // Promote value to be stored to vector - Node* val = vector_opd(p, MemNode::ValueIn); - if (val == nullptr) { - assert(false, "input to vector store was not created"); - C->record_failure(C2Compiler::retry_no_superword()); - return false; // bailout - } + assert(max_vector_length > 0 && max_vector_width > 0, "must have vectorized"); + cl()->mark_loop_vectorized(); - Node* ctl = n->in(MemNode::Control); - Node* mem = first->in(MemNode::Memory); - Node* adr = first->in(MemNode::Address); - const TypePtr* atyp = n->adr_type(); - vn = StoreVectorNode::make(opc, ctl, mem, adr, atyp, val, vlen); - vlen_in_bytes = vn->as_StoreVector()->memory_size(); - } else if (VectorNode::is_scalar_rotate(n)) { - Node* in1 = vector_opd(p, 1); - Node* in2 = first->in(2); - // If rotation count is non-constant or greater than 8bit value create a vector. 
- if (!in2->is_Con() || !Matcher::supports_vector_constant_rotates(in2->get_int())) { - in2 = vector_opd(p, 2); - } - vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n)); - vlen_in_bytes = vn->as_Vector()->length_in_bytes(); - } else if (VectorNode::is_roundopD(n)) { - Node* in1 = vector_opd(p, 1); - Node* in2 = first->in(2); - assert(in2->is_Con(), "Constant rounding mode expected."); - vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n)); - vlen_in_bytes = vn->as_Vector()->length_in_bytes(); - } else if (VectorNode::is_muladds2i(n)) { - assert(n->req() == 5u, "MulAddS2I should have 4 operands."); - Node* in1 = vector_opd(p, 1); - Node* in2 = vector_opd(p, 2); - vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n)); - vlen_in_bytes = vn->as_Vector()->length_in_bytes(); - } else if (opc == Op_SignumF || opc == Op_SignumD) { - assert(n->req() == 4, "four inputs expected"); - Node* in = vector_opd(p, 1); - Node* zero = vector_opd(p, 2); - Node* one = vector_opd(p, 3); - vn = VectorNode::make(opc, in, zero, one, vlen, velt_basic_type(n)); - vlen_in_bytes = vn->as_Vector()->length_in_bytes(); - } else if (n->is_Cmp()) { - // Bool + Cmp + CMove -> VectorMaskCmp + VectorBlend - continue; - } else if (n->is_Bool()) { - // Bool + Cmp + CMove -> VectorMaskCmp + VectorBlend - continue; - } else if (n->is_CMove()) { - // Bool + Cmp + CMove -> VectorMaskCmp + VectorBlend - - BoolNode* bol = n->in(1)->as_Bool(); - assert(bol != nullptr, "must have Bool above CMove"); - Node_List* bool_pack = get_pack(bol); - assert(bool_pack != nullptr, "CMove must have matching Bool pack"); - - CmpNode* cmp = bol->in(1)->as_Cmp(); - assert(cmp != nullptr, "must have cmp above CMove"); - Node_List* cmp_pack = get_pack(cmp); - assert(cmp_pack != nullptr, "Bool must have matching Cmp pack"); - - Node* cmp_in1 = vector_opd(cmp_pack, 1); - Node* cmp_in2 = vector_opd(cmp_pack, 2); - - Node* blend_in1 = vector_opd(p, 2); - Node* blend_in2 = vector_opd(p, 3); - - 
VTransformBoolTest bool_test = _packset.get_bool_test(bool_pack); - BoolTest::mask test_mask = bool_test._mask; - if (bool_test._is_negated) { - // We can cancel out the negation by swapping the blend inputs. - swap(blend_in1, blend_in2); - } - - // VectorMaskCmp - ConINode* test_mask_node = igvn().intcon((int)test_mask); - BasicType bt = velt_basic_type(cmp); - const TypeVect* vt = TypeVect::make(bt, vlen); - VectorNode* mask = new VectorMaskCmpNode(test_mask, cmp_in1, cmp_in2, test_mask_node, vt); - phase()->register_new_node_with_ctrl_of(mask, p->at(0)); - igvn()._worklist.push(mask); - - // VectorBlend - vn = new VectorBlendNode(blend_in1, blend_in2, mask); - } else if (n->req() == 3) { - // Promote operands to vector - Node* in1 = nullptr; - bool node_isa_reduction = is_marked_reduction(n); - if (node_isa_reduction) { - // the input to the first reduction operation is retained - in1 = first->in(1); - } else { - in1 = vector_opd(p, 1); - if (in1 == nullptr) { - assert(false, "input in1 to vector operand was not created"); - C->record_failure(C2Compiler::retry_no_superword()); - return false; // bailout - } - } - Node* in2 = vector_opd(p, 2); - if (in2 == nullptr) { - assert(false, "input in2 to vector operand was not created"); - C->record_failure(C2Compiler::retry_no_superword()); - return false; // bailout - } - if (in1->Opcode() == Op_Replicate && (node_isa_reduction == false) && (n->is_Add() || n->is_Mul())) { - // Move invariant vector input into second position to avoid register spilling. 
- Node* tmp = in1; - in1 = in2; - in2 = tmp; - } - if (node_isa_reduction) { - const Type *arith_type = n->bottom_type(); - vn = ReductionNode::make(opc, nullptr, in1, in2, arith_type->basic_type()); - if (in2->is_Load()) { - vlen_in_bytes = in2->as_LoadVector()->memory_size(); - } else { - vlen_in_bytes = in2->as_Vector()->length_in_bytes(); - } - } else { - if (VectorNode::can_use_RShiftI_instead_of_URShiftI(n, velt_basic_type(n))) { - opc = Op_RShiftI; - } - vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n)); - vlen_in_bytes = vn->as_Vector()->length_in_bytes(); - } - } else if (VectorNode::is_scalar_unary_op_with_equal_input_and_output_types(opc)) { - assert(n->req() == 2, "only one input expected"); - Node* in = vector_opd(p, 1); - vn = VectorNode::make(opc, in, nullptr, vlen, velt_basic_type(n)); - vlen_in_bytes = vn->as_Vector()->length_in_bytes(); - } else if (VectorNode::is_scalar_op_that_returns_int_but_vector_op_returns_long(opc)) { - assert(n->req() == 2, "only one input expected"); - Node* in = vector_opd(p, 1); - Node* longval = VectorNode::make(opc, in, nullptr, vlen, T_LONG); - phase()->register_new_node_with_ctrl_of(longval, first); - // Requires extra vector long -> int conversion. 
- vn = VectorCastNode::make(Op_VectorCastL2X, longval, T_INT, vlen); - vlen_in_bytes = vn->as_Vector()->length_in_bytes(); - } else if (VectorNode::is_convert_opcode(opc)) { - assert(n->req() == 2, "only one input expected"); - BasicType bt = velt_basic_type(n); - Node* in = vector_opd(p, 1); - int vopc = VectorCastNode::opcode(opc, in->bottom_type()->is_vect()->element_basic_type()); - vn = VectorCastNode::make(vopc, in, bt, vlen); - vlen_in_bytes = vn->as_Vector()->length_in_bytes(); - } else if (opc == Op_FmaD || opc == Op_FmaF) { - // Promote operands to vector - Node* in1 = vector_opd(p, 1); - Node* in2 = vector_opd(p, 2); - Node* in3 = vector_opd(p, 3); - vn = VectorNode::make(opc, in1, in2, in3, vlen, velt_basic_type(n)); - vlen_in_bytes = vn->as_Vector()->length_in_bytes(); - } else { - assert(false, "Unhandled scalar opcode (%s)", NodeClassNames[opc]); - C->record_failure(C2Compiler::retry_no_superword()); - return false; // bailout - } - - if (vn == nullptr) { - assert(false, "got null node instead of vector node"); - C->record_failure(C2Compiler::retry_no_superword()); - return false; // bailout - } - -#ifdef ASSERT - // Mark Load/Store Vector for alignment verification - if (VerifyAlignVector) { - if (vn->Opcode() == Op_LoadVector) { - vn->as_LoadVector()->set_must_verify_alignment(); - } else if (vn->Opcode() == Op_StoreVector) { - vn->as_StoreVector()->set_must_verify_alignment(); - } - } -#endif - - phase()->register_new_node_with_ctrl_of(vn, first); - for (uint j = 0; j < p->size(); j++) { - Node* pm = p->at(j); - igvn().replace_node(pm, vn); - } - igvn()._worklist.push(vn); - - if (vlen > max_vlen) { - max_vlen = vlen; - } - if (vlen_in_bytes > max_vlen_in_bytes) { - max_vlen_in_bytes = vlen_in_bytes; - } - VectorNode::trace_new_vector(vn, "SuperWord"); - } - }//for (int i = 0; i < body().length(); i++) - - if (max_vlen_in_bytes > C->max_vector_size()) { - C->set_max_vector_size(max_vlen_in_bytes); - } - if (max_vlen_in_bytes > 0) { - 
cl->mark_loop_vectorized(); + if (max_vector_width > C->max_vector_size()) { + C->set_max_vector_size(max_vector_width); } if (SuperWordLoopUnrollAnalysis) { - if (cl->has_passed_slp()) { - uint slp_max_unroll_factor = cl->slp_max_unroll(); - if (slp_max_unroll_factor == max_vlen) { + if (cl()->has_passed_slp()) { + uint slp_max_unroll_factor = cl()->slp_max_unroll(); + if (slp_max_unroll_factor == max_vector_length) { #ifndef PRODUCT if (TraceSuperWordLoopUnrollAnalysis) { - tty->print_cr("vector loop(unroll=%d, len=%d)\n", max_vlen, max_vlen_in_bytes*BitsPerByte); + tty->print_cr("vector loop(unroll=%d, len=%d)\n", max_vector_length, max_vector_width * BitsPerByte); } #endif // For atomic unrolled loops which are vector mapped, instigate more unrolling - cl->set_notpassed_slp(); + cl()->set_notpassed_slp(); // if vector resources are limited, do not allow additional unrolling if (Matcher::float_pressure_limit() > 8) { C->set_major_progress(); - cl->mark_do_unroll_only(); + cl()->mark_do_unroll_only(); } } } } - - return true; -} - -//------------------------------vector_opd--------------------------- -// Create a vector operand for the nodes in pack p for operand: in(opd_idx) -Node* SuperWord::vector_opd(Node_List* p, int opd_idx) { - Node* p0 = p->at(0); - uint vlen = p->size(); - Node* opd = p0->in(opd_idx); - CountedLoopNode *cl = lpt()->_head->as_CountedLoop(); - Node* same_input = _packset.same_inputs_at_index_or_null(p, opd_idx); - - // Insert index population operation to create a vector of increasing - // indices starting from the iv value. In some special unrolled loops - // (see JDK-8286125), we need scalar replications of the iv value if - // all inputs are the same iv, so we do a same inputs check here. - if (opd == iv() && same_input == nullptr) { - BasicType p0_bt = velt_basic_type(p0); - BasicType iv_bt = is_subword_type(p0_bt) ? 
p0_bt : T_INT; - assert(VectorNode::is_populate_index_supported(iv_bt), "Should support"); - const TypeVect* vt = TypeVect::make(iv_bt, vlen); - Node* vn = new PopulateIndexNode(iv(), igvn().intcon(1), vt); - VectorNode::trace_new_vector(vn, "SuperWord"); - phase()->register_new_node_with_ctrl_of(vn, opd); - return vn; - } - - if (same_input != nullptr) { - if (opd->is_Vector() || opd->is_LoadVector()) { - if (opd_idx == 2 && VectorNode::is_shift(p0)) { - assert(false, "shift's count can't be vector"); - return nullptr; - } - return opd; // input is matching vector - } - if ((opd_idx == 2) && VectorNode::is_shift(p0)) { - Node* cnt = opd; - // Vector instructions do not mask shift count, do it here. - juint mask = (p0->bottom_type() == TypeInt::INT) ? (BitsPerInt - 1) : (BitsPerLong - 1); - const TypeInt* t = opd->find_int_type(); - if (t != nullptr && t->is_con()) { - juint shift = t->get_con(); - if (shift > mask) { // Unsigned cmp - cnt = igvn().intcon(shift & mask); - phase()->set_ctrl(cnt, phase()->C->root()); - } - } else { - if (t == nullptr || t->_lo < 0 || t->_hi > (int)mask) { - cnt = igvn().intcon(mask); - cnt = new AndINode(opd, cnt); - phase()->register_new_node_with_ctrl_of(cnt, opd); - } - if (!opd->bottom_type()->isa_int()) { - assert(false, "int type only"); - return nullptr; - } - } - // Move shift count into vector register. - cnt = VectorNode::shift_count(p0->Opcode(), cnt, vlen, velt_basic_type(p0)); - phase()->register_new_node_with_ctrl_of(cnt, opd); - return cnt; - } - if (opd->is_StoreVector()) { - assert(false, "StoreVector is not expected here"); - return nullptr; - } - // Convert scalar input to vector with the same number of elements as - // p0's vector. Use p0's type because size of operand's container in - // vector should match p0's size regardless operand's size. 
- const Type* p0_t = nullptr; - VectorNode* vn = nullptr; - if (opd_idx == 2 && VectorNode::is_scalar_rotate(p0)) { - Node* conv = opd; - p0_t = TypeInt::INT; - if (p0->bottom_type()->isa_long()) { - p0_t = TypeLong::LONG; - conv = new ConvI2LNode(opd); - phase()->register_new_node_with_ctrl_of(conv, opd); - } - vn = VectorNode::scalar2vector(conv, vlen, p0_t); - } else { - p0_t = velt_type(p0); - vn = VectorNode::scalar2vector(opd, vlen, p0_t); - } - - phase()->register_new_node_with_ctrl_of(vn, opd); - VectorNode::trace_new_vector(vn, "SuperWord"); - return vn; - } - - // Insert pack operation - BasicType bt = velt_basic_type(p0); - PackNode* pk = PackNode::make(opd, vlen, bt); - DEBUG_ONLY( const BasicType opd_bt = opd->bottom_type()->basic_type(); ) - - for (uint i = 1; i < vlen; i++) { - Node* pi = p->at(i); - Node* in = pi->in(opd_idx); - if (get_pack(in) != nullptr) { - assert(false, "Should already have been unpacked"); - return nullptr; - } - assert(opd_bt == in->bottom_type()->basic_type(), "all same type"); - pk->add_opd(in); - if (VectorNode::is_muladds2i(pi)) { - Node* in2 = pi->in(opd_idx + 2); - if (get_pack(in2) != nullptr) { - assert(false, "Should already have been unpacked"); - return nullptr; - } - assert(opd_bt == in2->bottom_type()->basic_type(), "all same type"); - pk->add_opd(in2); - } - } - phase()->register_new_node_with_ctrl_of(pk, opd); - VectorNode::trace_new_vector(pk, "SuperWord"); - return pk; } #ifdef ASSERT @@ -2797,18 +2224,7 @@ bool SuperWord::is_vector_use(Node* use, int u_idx) const { return _packset.is_muladds2i_pack_with_pack_inputs(u_pk); } - if (u_pk->size() != d_pk->size()) { - return false; - } - - for (uint i = 0; i < u_pk->size(); i++) { - Node* ui = u_pk->at(i); - Node* di = d_pk->at(i); - if (ui->in(u_idx) != di) { - return false; - } - } - return true; + return _packset.pack_input_at_index_or_null(u_pk, u_idx) != nullptr; } // MulAddS2I takes 4 shorts and produces an int. 
We can reinterpret @@ -3182,10 +2598,10 @@ bool VLoopMemorySlices::same_memory_slice(MemNode* m1, MemNode* m2) const { _vloop.phase()->C->get_alias_index(m2->adr_type()); } -LoadNode::ControlDependency SuperWord::control_dependency(Node_List* p) { +LoadNode::ControlDependency VTransformLoadVectorNode::control_dependency() const { LoadNode::ControlDependency dep = LoadNode::DependsOnlyOnTest; - for (uint i = 0; i < p->size(); i++) { - Node* n = p->at(i); + for (int i = 0; i < nodes().length(); i++) { + Node* n = nodes().at(i); assert(n->is_Load(), "only meaningful for loads"); if (!n->depends_only_on_test()) { if (n->as_Load()->has_unknown_control_dependency() && @@ -3202,8 +2618,8 @@ LoadNode::ControlDependency SuperWord::control_dependency(Node_List* p) { } // Find the memop pack with the maximum vector width, unless they were already -// determined by SuperWord::filter_packs_for_alignment(). -void SuperWord::determine_mem_ref_and_aw_for_main_loop_alignment() { +// determined (e.g. by SuperWord::filter_packs_for_alignment()). 
+void VTransform::determine_mem_ref_and_aw_for_main_loop_alignment() { if (_mem_ref_for_main_loop_alignment != nullptr) { assert(VLoop::vectors_should_be_aligned(), "mem_ref only set if filtered for alignment"); return; @@ -3211,15 +2627,18 @@ void SuperWord::determine_mem_ref_and_aw_for_main_loop_alignment() { MemNode const* mem_ref = nullptr; int max_aw = 0; - for (int i = 0; i < _packset.length(); i++) { - Node_List* pack = _packset.at(i); - MemNode* first = pack->at(0)->isa_Mem(); - if (first == nullptr) { continue; } - int vw = first->memory_size() * pack->size(); + const GrowableArray& vtnodes = _graph.vtnodes(); + for (int i = 0; i < vtnodes.length(); i++) { + VTransformVectorNode* vtn = vtnodes.at(i)->isa_Vector(); + if (vtn == nullptr) { continue; } + MemNode* p0 = vtn->nodes().at(0)->isa_Mem(); + if (p0 == nullptr) { continue; } + + int vw = p0->memory_size() * vtn->nodes().length(); if (vw > max_aw) { max_aw = vw; - mem_ref = first; + mem_ref = p0; } } assert(mem_ref != nullptr && max_aw > 0, "found mem_ref and aw"); @@ -3229,7 +2648,7 @@ void SuperWord::determine_mem_ref_and_aw_for_main_loop_alignment() { #define TRACE_ALIGN_VECTOR_NODE(node) { \ DEBUG_ONLY( \ - if (is_trace_align_vector()) { \ + if (_trace._align_vector) { \ tty->print(" " #node ": "); \ node->dump(); \ } \ @@ -3240,7 +2659,7 @@ void SuperWord::determine_mem_ref_and_aw_for_main_loop_alignment() { // the address of "_mem_ref_for_main_loop_alignment" to "_aw_for_main_loop_alignment", which is a // sufficiently large alignment width. We adjust the pre-loop iteration count by adjusting the // pre-loop limit. 
-void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() { +void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { determine_mem_ref_and_aw_for_main_loop_alignment(); const MemNode* align_to_ref = _mem_ref_for_main_loop_alignment; const int aw = _aw_for_main_loop_alignment; @@ -3397,8 +2816,8 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() { Node* invar = align_to_ref_p.invar(); #ifdef ASSERT - if (is_trace_align_vector()) { - tty->print_cr("\nadjust_pre_loop_limit_to_align_main_loop_vectors:"); + if (_trace._align_vector) { + tty->print_cr("\nVTransform::adjust_pre_loop_limit_to_align_main_loop_vectors:"); tty->print(" align_to_ref:"); align_to_ref->dump(); tty->print_cr(" aw: %d", aw); @@ -3424,7 +2843,7 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() { scale == 0 || !is_power_of_2(abs(scale)) || abs(scale) >= aw) { #ifdef ASSERT - if (is_trace_align_vector()) { + if (_trace._align_vector) { tty->print_cr(" Alignment cannot be affected by changing pre-loop limit because"); tty->print_cr(" stride or scale are not power of 2, or abs(scale) >= aw."); } @@ -3440,7 +2859,7 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() { const int AW = aw / abs(scale); #ifdef ASSERT - if (is_trace_align_vector()) { + if (_trace._align_vector) { tty->print_cr(" AW = aw(%d) / abs(scale(%d)) = %d", aw, scale, AW); } #endif @@ -3595,10 +3014,10 @@ void PackSet::print_pack(Node_List* pack) { #ifndef PRODUCT void VLoopBody::print() const { - tty->print_cr("\nBlock"); + tty->print_cr("\nVLoopBody::print"); for (int i = 0; i < body().length(); i++) { Node* n = body().at(i); - tty->print("%d ", i); + tty->print("%4d ", i); if (n != nullptr) { n->dump(); } @@ -3615,3 +3034,4 @@ bool SuperWord::same_origin_idx(Node* a, Node* b) const { bool SuperWord::same_generation(Node* a, Node* b) const { return a != nullptr && b != nullptr && _clone_map.same_gen(a->_idx, b->_idx); } + diff --git 
a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index fb91d014fae..65f87082525 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -25,6 +25,7 @@ #define SHARE_OPTO_SUPERWORD_HPP #include "opto/vectorization.hpp" +#include "opto/vtransform.hpp" #include "utilities/growableArray.hpp" // @@ -367,6 +368,10 @@ public: Node* same_inputs_at_index_or_null(const Node_List* pack, const int index) const; VTransformBoolTest get_bool_test(const Node_List* bool_pack) const; + Node_List* pack_input_at_index_or_null(const Node_List* pack, const int index) const { + return strided_pack_input_at_index_or_null(pack, index, 1, 0); + } + private: SplitStatus split_pack(const char* split_name, Node_List* pack, SplitTask task); public: @@ -599,13 +604,6 @@ private: DEBUG_ONLY(void verify_packs() const;) - bool schedule_and_apply(); - bool apply(Node_List& memops_schedule); - void apply_memops_reordering_with_schedule(Node_List& memops_schedule); - bool apply_vectorization(); - // Create a vector operand for the nodes in pack p for operand: in(opd_idx) - Node* vector_opd(Node_List* p, int opd_idx); - // Can code be generated for the pack, restricted to size nodes? bool implemented(const Node_List* pack, const uint size) const; // Find the maximal implemented size smaller or equal to the packs size @@ -630,11 +628,7 @@ private: bool is_velt_basic_type_compatible_use_def(Node* use, Node* def) const; - static LoadNode::ControlDependency control_dependency(Node_List* p); - - // Ensure that the main loop vectors are aligned by adjusting the pre loop limit. 
- void determine_mem_ref_and_aw_for_main_loop_alignment(); - void adjust_pre_loop_limit_to_align_main_loop_vectors(); + bool schedule_and_apply() const; }; #endif // SHARE_OPTO_SUPERWORD_HPP diff --git a/src/hotspot/share/opto/superwordVTransformBuilder.cpp b/src/hotspot/share/opto/superwordVTransformBuilder.cpp new file mode 100644 index 00000000000..b0a0c97cb16 --- /dev/null +++ b/src/hotspot/share/opto/superwordVTransformBuilder.cpp @@ -0,0 +1,308 @@ +/* + * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +#include "precompiled.hpp" +#include "opto/superwordVTransformBuilder.hpp" +#include "opto/vectornode.hpp" + +void SuperWordVTransformBuilder::build() { + assert(!_packset.is_empty(), "must have non-empty packset"); + assert(!_vtransform.has_graph(), "start with empty vtransform"); + + // Create vtnodes for all nodes in the loop. 
+ build_vector_vtnodes_for_packed_nodes(); + build_scalar_vtnodes_for_non_packed_nodes(); + + // Connect all vtnodes with their inputs. Possibly create vtnodes for input + // nodes that are outside the loop. + VectorSet vtn_dependencies; // Shared, but cleared for every vtnode. + build_inputs_for_vector_vtnodes(vtn_dependencies); + build_inputs_for_scalar_vtnodes(vtn_dependencies); +} + +void SuperWordVTransformBuilder::build_vector_vtnodes_for_packed_nodes() { + for (int i = 0; i < _packset.length(); i++) { + Node_List* pack = _packset.at(i); + VTransformVectorNode* vtn = make_vector_vtnode_for_pack(pack); + for (uint k = 0; k < pack->size(); k++) { + map_node_to_vtnode(pack->at(k), vtn); + } + } +} + +void SuperWordVTransformBuilder::build_scalar_vtnodes_for_non_packed_nodes() { + for (int i = 0; i < _vloop_analyzer.body().body().length(); i++) { + Node* n = _vloop_analyzer.body().body().at(i); + if (_packset.get_pack(n) != nullptr) { continue; } + VTransformScalarNode* vtn = new (_vtransform.arena()) VTransformScalarNode(_vtransform, n); + map_node_to_vtnode(n, vtn); + } +} + +void SuperWordVTransformBuilder::build_inputs_for_vector_vtnodes(VectorSet& vtn_dependencies) { + for (int i = 0; i < _packset.length(); i++) { + Node_List* pack = _packset.at(i); + Node* p0 = pack->at(0); + + VTransformVectorNode* vtn = get_vtnode(p0)->isa_Vector(); + assert(vtn != nullptr, "all packs must have vector vtnodes"); + vtn_dependencies.clear(); // Add every dependency only once per vtn. 
+ + if (p0->is_Load()) { + set_req_with_scalar(p0, vtn, vtn_dependencies, MemNode::Address); + } else if (p0->is_Store()) { + set_req_with_scalar(p0, vtn, vtn_dependencies, MemNode::Address); + set_req_with_vector(pack, vtn, vtn_dependencies, MemNode::ValueIn); + } else if (vtn->isa_ReductionVector() != nullptr) { + set_req_with_scalar(p0, vtn, vtn_dependencies, 1); // scalar init + set_req_with_vector(pack, vtn, vtn_dependencies, 2); // vector + } else { + assert(vtn->isa_ElementWiseVector() != nullptr, "all other vtnodes are handled above"); + if (VectorNode::is_scalar_rotate(p0) && + p0->in(2)->is_Con() && + Matcher::supports_vector_constant_rotates(p0->in(2)->get_int())) { + set_req_with_vector(pack, vtn, vtn_dependencies, 1); + set_req_with_scalar(p0, vtn, vtn_dependencies, 2); // constant rotation + } else if (VectorNode::is_roundopD(p0)) { + set_req_with_vector(pack, vtn, vtn_dependencies, 1); + set_req_with_scalar(p0, vtn, vtn_dependencies, 2); // constant rounding mode + } else if (p0->is_CMove()) { + // Cmp + Bool + CMove -> VectorMaskCmp + VectorBlend. + set_all_req_with_vectors(pack, vtn, vtn_dependencies); + VTransformBoolVectorNode* vtn_mask_cmp = vtn->in(1)->isa_BoolVector(); + if (vtn_mask_cmp->test()._is_negated) { + vtn->swap_req(2, 3); // swap if test was negated. + } + } else { + set_all_req_with_vectors(pack, vtn, vtn_dependencies); + } + } + + for (uint k = 0; k < pack->size(); k++) { + add_dependencies_of_node_to_vtnode(pack->at(k), vtn, vtn_dependencies); + } + } +} + +void SuperWordVTransformBuilder::build_inputs_for_scalar_vtnodes(VectorSet& vtn_dependencies) { + for (int i = 0; i < _vloop_analyzer.body().body().length(); i++) { + Node* n = _vloop_analyzer.body().body().at(i); + VTransformScalarNode* vtn = get_vtnode(n)->isa_Scalar(); + if (vtn == nullptr) { continue; } + vtn_dependencies.clear(); // Add every dependency only once per vtn. 
+ + if (n->is_Load()) { + set_req_with_scalar(n, vtn, vtn_dependencies, MemNode::Address); + } else if (n->is_Store()) { + set_req_with_scalar(n, vtn, vtn_dependencies, MemNode::Address); + set_req_with_scalar(n, vtn, vtn_dependencies, MemNode::ValueIn); + } else if (n->is_CountedLoop()) { + continue; // Is "root", has no dependency. + } else if (n->is_Phi()) { + // CountedLoop Phi's: ignore backedge (and entry value). + assert(n->in(0) == _vloop.cl(), "only Phi's from the CountedLoop allowed"); + set_req_with_scalar(n, vtn, vtn_dependencies, 0); + continue; + } else { + set_all_req_with_scalars(n, vtn, vtn_dependencies); + } + + add_dependencies_of_node_to_vtnode(n, vtn, vtn_dependencies); + } +} + +// Create a vtnode for each pack. No in/out edges set yet. +VTransformVectorNode* SuperWordVTransformBuilder::make_vector_vtnode_for_pack(const Node_List* pack) const { + uint pack_size = pack->size(); + Node* p0 = pack->at(0); + int opc = p0->Opcode(); + VTransformVectorNode* vtn = nullptr; + + if (p0->is_Load()) { + vtn = new (_vtransform.arena()) VTransformLoadVectorNode(_vtransform, pack_size); + } else if (p0->is_Store()) { + vtn = new (_vtransform.arena()) VTransformStoreVectorNode(_vtransform, pack_size); + } else if (p0->is_Bool()) { + VTransformBoolTest kind = _packset.get_bool_test(pack); + vtn = new (_vtransform.arena()) VTransformBoolVectorNode(_vtransform, pack_size, kind); + } else if (_vloop_analyzer.reductions().is_marked_reduction(p0)) { + vtn = new (_vtransform.arena()) VTransformReductionVectorNode(_vtransform, pack_size); + } else if (VectorNode::is_muladds2i(p0)) { + // A special kind of binary element-wise vector op: the inputs are "ints" a and b, + // but reinterpreted as two "shorts" [a0, a1] and [b0, b1]: + // v = MulAddS2I(a, b) = a0 * b0 + a1 * b1 + assert(p0->req() == 5, "MulAddS2I should have 4 operands"); + vtn = new (_vtransform.arena()) VTransformElementWiseVectorNode(_vtransform, 3, pack_size); + } else { + assert(p0->req() == 3 || +
p0->is_CMove() || + VectorNode::is_scalar_op_that_returns_int_but_vector_op_returns_long(opc) || + VectorNode::is_convert_opcode(opc) || + VectorNode::is_scalar_unary_op_with_equal_input_and_output_types(opc) || + opc == Op_FmaD || + opc == Op_FmaF || + opc == Op_SignumF || + opc == Op_SignumD, + "pack type must be in this list"); + vtn = new (_vtransform.arena()) VTransformElementWiseVectorNode(_vtransform, p0->req(), pack_size); + } + vtn->set_nodes(pack); + return vtn; +} + +void SuperWordVTransformBuilder::set_req_with_scalar(Node* n, VTransformNode* vtn, VectorSet& vtn_dependencies, const int index) { + VTransformNode* req = get_vtnode_or_wrap_as_input_scalar(n->in(index)); + vtn->set_req(index, req); + vtn_dependencies.set(req->_idx); +} + +// Either get the existing vtnode vector input (when input is a pack), or else make a +// new vector vtnode for the input (e.g. for Replicate or PopulateIndex). +VTransformNode* SuperWordVTransformBuilder::get_or_make_vtnode_vector_input_at_index(const Node_List* pack, const int index) { + Node* p0 = pack->at(0); + + Node_List* pack_in = _packset.pack_input_at_index_or_null(pack, index); + if (pack_in != nullptr) { + // Input is a matching pack -> vtnode already exists. + assert(index != 2 || !VectorNode::is_shift(p0), "shift's count cannot be vector"); + return get_vtnode(pack_in->at(0)); + } + + if (VectorNode::is_muladds2i(p0)) { + assert(_packset.is_muladds2i_pack_with_pack_inputs(pack), "inputs must all be packs"); + // All inputs are strided (stride = 2), either with offset 0 or 1. 
+ Node_List* pack_in0 = _packset.strided_pack_input_at_index_or_null(pack, index, 2, 0); + if (pack_in0 != nullptr) { + return get_vtnode(pack_in0->at(0)); + } + Node_List* pack_in1 = _packset.strided_pack_input_at_index_or_null(pack, index, 2, 1); + if (pack_in1 != nullptr) { + return get_vtnode(pack_in1->at(0)); + } + } + + Node* same_input = _packset.same_inputs_at_index_or_null(pack, index); + if (same_input == nullptr && p0->in(index) == _vloop.iv()) { + // PopulateIndex: [iv+0, iv+1, iv+2, ...] + VTransformNode* iv_vtn = get_vtnode_or_wrap_as_input_scalar(_vloop.iv()); + BasicType p0_bt = _vloop_analyzer.types().velt_basic_type(p0); + // If we have subword type, take that type directly. If p0 is some ConvI2L/F/D, + // then the p0_bt can also be L/F/D but we need to produce ints for the input of + // the ConvI2L/F/D. + BasicType element_bt = is_subword_type(p0_bt) ? p0_bt : T_INT; + VTransformNode* populate_index = new (_vtransform.arena()) VTransformPopulateIndexNode(_vtransform, pack->size(), element_bt); + populate_index->set_req(1, iv_vtn); + return populate_index; + } + + if (same_input != nullptr) { + VTransformNode* same_input_vtn = get_vtnode_or_wrap_as_input_scalar(same_input); + if (index == 2 && VectorNode::is_shift(p0)) { + // Scalar shift count for vector shift operation: vec2 = shiftV(vec1, scalar_count) + // Scalar shift operations mask the shift count, but the vector shift does not, so + // create a special ShiftCount node. + BasicType element_bt = _vloop_analyzer.types().velt_basic_type(p0); + juint mask = (p0->bottom_type() == TypeInt::INT) ? (BitsPerInt - 1) : (BitsPerLong - 1); + VTransformNode* shift_count = new (_vtransform.arena()) VTransformShiftCountNode(_vtransform, pack->size(), element_bt, mask, p0->Opcode()); + shift_count->set_req(1, same_input_vtn); + return shift_count; + } else { + // Replicate the scalar same_input to every vector element.
+ const Type* element_type = _vloop_analyzer.types().velt_type(p0); + if (index == 2 && VectorNode::is_scalar_rotate(p0) && element_type->isa_long()) { + // Scalar rotate has int rotation value, but the vector rotate expects longs. + assert(same_input->bottom_type()->isa_int(), "scalar rotate expects int rotation"); + VTransformNode* conv = new (_vtransform.arena()) VTransformConvI2LNode(_vtransform); + conv->set_req(1, same_input_vtn); + same_input_vtn = conv; + } + VTransformNode* replicate = new (_vtransform.arena()) VTransformReplicateNode(_vtransform, pack->size(), element_type); + replicate->set_req(1, same_input_vtn); + return replicate; + } + } + + // The input is neither a pack nor a same_input node. SuperWord::profitable does not allow + // any other case. In the future, we could insert a PackNode. +#ifdef ASSERT + tty->print_cr("\nSuperWordVTransformBuilder::get_or_make_vtnode_vector_input_at_index: index=%d", index); + pack->dump(); + assert(false, "Pack input was neither a pack nor a same_input node"); +#endif + ShouldNotReachHere(); +} + +VTransformNode* SuperWordVTransformBuilder::get_vtnode_or_wrap_as_input_scalar(Node* n) { + VTransformNode* vtn = get_vtnode_or_null(n); + if (vtn != nullptr) { return vtn; } + + assert(!_vloop.in_bb(n), "only nodes outside the loop can be input nodes to the loop"); + vtn = new (_vtransform.arena()) VTransformInputScalarNode(_vtransform, n); + map_node_to_vtnode(n, vtn); + return vtn; +} + +void SuperWordVTransformBuilder::set_req_with_vector(const Node_List* pack, VTransformNode* vtn, VectorSet& vtn_dependencies, int j) { + VTransformNode* req = get_or_make_vtnode_vector_input_at_index(pack, j); + vtn->set_req(j, req); + vtn_dependencies.set(req->_idx); +} + +void SuperWordVTransformBuilder::set_all_req_with_scalars(Node* n, VTransformNode* vtn, VectorSet& vtn_dependencies) { + assert(vtn->req() == n->req(), "scalars must have same number of reqs"); + for (uint j = 0; j < n->req(); j++) { + Node* def = n->in(j); +
if (def == nullptr) { continue; } + set_req_with_scalar(n, vtn, vtn_dependencies, j); + } +} + +void SuperWordVTransformBuilder::set_all_req_with_vectors(const Node_List* pack, VTransformNode* vtn, VectorSet& vtn_dependencies) { + Node* p0 = pack->at(0); + assert(vtn->req() <= p0->req(), "must have at at most as many reqs"); + // Vectors have no ctrl, so ignore it. + for (uint j = 1; j < vtn->req(); j++) { + Node* def = p0->in(j); + if (def == nullptr) { continue; } + set_req_with_vector(pack, vtn, vtn_dependencies, j); + } +} + +void SuperWordVTransformBuilder::add_dependencies_of_node_to_vtnode(Node*n, VTransformNode* vtn, VectorSet& vtn_dependencies) { + for (VLoopDependencyGraph::PredsIterator preds(_vloop_analyzer.dependency_graph(), n); !preds.done(); preds.next()) { + Node* pred = preds.current(); + if (!_vloop.in_bb(pred)) { continue; } + + // Only add memory dependencies to memory nodes. All others are taken care of with the req. + if (n->is_Mem() && !pred->is_Mem()) { continue; } + + VTransformNode* dependency = get_vtnode(pred); + + // Reduction self-cycle? + if (vtn == dependency && _vloop_analyzer.reductions().is_marked_reduction(n)) { continue; } + + if (vtn_dependencies.test_set(dependency->_idx)) { continue; } + vtn->add_dependency(dependency); // Add every dependency only once per vtn. + } +} + diff --git a/src/hotspot/share/opto/superwordVTransformBuilder.hpp b/src/hotspot/share/opto/superwordVTransformBuilder.hpp new file mode 100644 index 00000000000..847f870bef6 --- /dev/null +++ b/src/hotspot/share/opto/superwordVTransformBuilder.hpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. 
+ * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +#include "opto/vtransform.hpp" +#include "opto/superword.hpp" + +#ifndef SHARE_OPTO_SUPERWORD_VTRANSFORM_BUILDER_HPP +#define SHARE_OPTO_SUPERWORD_VTRANSFORM_BUILDER_HPP + +// Facility class that builds a VTransform from a SuperWord PackSet. +class SuperWordVTransformBuilder : public StackObj { +private: + const VLoopAnalyzer& _vloop_analyzer; + const VLoop& _vloop; + const PackSet& _packset; + VTransform& _vtransform; + + ResourceHashtable _idx_to_vtnode; + +public: + SuperWordVTransformBuilder(const PackSet& packset, + VTransform& vtransform) : + _vloop_analyzer(vtransform.vloop_analyzer()), + _vloop(_vloop_analyzer.vloop()), + _packset(packset), + _vtransform(vtransform) + { + assert(!_vtransform.has_graph(), "constructor is passed an empty vtransform"); + build(); + assert(_vtransform.has_graph(), "vtransform must contain some vtnodes now"); + } + +private: + void build(); + void build_vector_vtnodes_for_packed_nodes(); + void build_scalar_vtnodes_for_non_packed_nodes(); + void build_inputs_for_vector_vtnodes(VectorSet& vtn_dependencies); + void build_inputs_for_scalar_vtnodes(VectorSet& vtn_dependencies); + + // Helper methods for building VTransform. 
+ VTransformNode* get_vtnode_or_null(Node* n) const { + VTransformNode** ptr = _idx_to_vtnode.get(n->_idx); + return (ptr == nullptr) ? nullptr : *ptr; + } + + VTransformNode* get_vtnode(Node* n) const { + VTransformNode* vtn = get_vtnode_or_null(n); + assert(vtn != nullptr, "expect non-null vtnode"); + return vtn; + } + + void map_node_to_vtnode(Node* n, VTransformNode* vtn) { + assert(vtn != nullptr, "only set non-null vtnodes"); + _idx_to_vtnode.put_when_absent(n->_idx, vtn); + } + + VTransformVectorNode* make_vector_vtnode_for_pack(const Node_List* pack) const; + VTransformNode* get_or_make_vtnode_vector_input_at_index(const Node_List* pack, const int index); + VTransformNode* get_vtnode_or_wrap_as_input_scalar(Node* n); + void set_req_with_scalar(Node* n, VTransformNode* vtn, VectorSet& vtn_dependencies, const int index); + void set_req_with_vector(const Node_List* pack, VTransformNode* vtn, VectorSet& vtn_dependencies, const int index); + void set_all_req_with_scalars(Node* n, VTransformNode* vtn, VectorSet& vtn_dependencies); + void set_all_req_with_vectors(const Node_List* pack, VTransformNode* vtn, VectorSet& vtn_dependencies); + void add_dependencies_of_node_to_vtnode(Node* n, VTransformNode* vtn, VectorSet& vtn_dependencies); +}; + +#endif // SHARE_OPTO_SUPERWORD_VTRANSFORM_BUILDER_HPP diff --git a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp index 3aae04c9453..038e04fe0c5 100644 --- a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp +++ b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp @@ -43,6 +43,7 @@ flags(SW_INFO, "Trace SuperWord info (equivalent to TraceSuperWord)") \ flags(SW_VERBOSE, "Trace SuperWord verbose (all SW tags enabled)") \ flags(ALIGN_VECTOR, "Trace AlignVector") \ + flags(VTRANSFORM, "Trace VTransform Graph") \ flags(ALL, "Trace everything (very verbose)") #define table_entry(name, description) name, diff --git a/src/hotspot/share/opto/vectorization.cpp 
b/src/hotspot/share/opto/vectorization.cpp index 01b235e8b27..8d2d3868fe6 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -26,7 +26,6 @@ #include "opto/addnode.hpp" #include "opto/connode.hpp" #include "opto/convertnode.hpp" -#include "opto/matcher.hpp" #include "opto/mulnode.hpp" #include "opto/rootnode.hpp" #include "opto/vectorization.hpp" diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index c9f54594910..3984407c565 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -25,7 +25,7 @@ #ifndef SHARE_OPTO_VECTORIZATION_HPP #define SHARE_OPTO_VECTORIZATION_HPP -#include "opto/node.hpp" +#include "opto/matcher.hpp" #include "opto/loopnode.hpp" #include "opto/traceAutoVectorizationTag.hpp" #include "utilities/pair.hpp" @@ -763,9 +763,9 @@ class VPointer : public ArenaObj { } } - bool overlap_possible_with_any_in(const Node_List* p) const { - for (uint k = 0; k < p->size(); k++) { - MemNode* mem = p->at(k)->as_Mem(); + bool overlap_possible_with_any_in(const GrowableArray& nodes) const { + for (int i = 0; i < nodes.length(); i++) { + MemNode* mem = nodes.at(i)->as_Mem(); VPointer p_mem(mem, _vloop); // Only if we know that we have Less or Greater can we // be sure that there can never be an overlap between @@ -1323,12 +1323,4 @@ private: #endif }; -struct VTransformBoolTest { - const BoolTest::mask _mask; - const bool _is_negated; - - VTransformBoolTest(const BoolTest::mask mask, bool is_negated) : - _mask(mask), _is_negated(is_negated) {} -}; - #endif // SHARE_OPTO_VECTORIZATION_HPP diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp new file mode 100644 index 00000000000..e40157caa36 --- /dev/null +++ b/src/hotspot/share/opto/vtransform.cpp @@ -0,0 +1,450 @@ +/* + * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. 
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +#include "precompiled.hpp" +#include "opto/vtransform.hpp" +#include "opto/vectornode.hpp" +#include "opto/convertnode.hpp" + +void VTransformGraph::add_vtnode(VTransformNode* vtnode) { + assert(vtnode->_idx == _vtnodes.length(), "position must match idx"); + _vtnodes.push(vtnode); +} + +// Compute a linearization of the graph. We do this with a reverse-post-order of a DFS. +// This only works if the graph is a directed acyclic graph (DAG). The C2 graph, and +// the VLoopDependencyGraph are both DAGs, but after introduction of vectors/packs, the +// graph has additional constraints which can introduce cycles. Example: +// +// +--------+ +// A -> X | v +// Pack [A,B] and [X,Y] [A,B] [X,Y] +// Y -> B ^ | +// +--------+ +// +// We return "true" IFF we find no cycle, i.e. if the linearization succeeds. 
+bool VTransformGraph::schedule() { + assert(!is_scheduled(), "not yet scheduled"); + +#ifndef PRODUCT + if (_trace._verbose) { + print_vtnodes(); + } +#endif + + ResourceMark rm; + GrowableArray stack; + VectorSet pre_visited; + VectorSet post_visited; + + collect_nodes_without_req_or_dependency(stack); + + // We create a reverse-post-visit order. This gives us a linearization, if there are + // no cycles. Then, we simply reverse the order, and we have a schedule. + int rpo_idx = _vtnodes.length() - 1; + while (!stack.is_empty()) { + VTransformNode* vtn = stack.top(); + if (!pre_visited.test_set(vtn->_idx)) { + // Forward arc in graph (pre-visit). + } else if (!post_visited.test(vtn->_idx)) { + // Forward arc in graph. Check if all uses were already visited: + // Yes -> post-visit. + // No -> we are mid-visit. + bool all_uses_already_visited = true; + + for (int i = 0; i < vtn->outs(); i++) { + VTransformNode* use = vtn->out(i); + if (post_visited.test(use->_idx)) { continue; } + if (pre_visited.test(use->_idx)) { + // Cycle detected! + // The nodes that are pre_visited but not yet post_visited form a path from + // the "root" to the current vtn. Now, we are looking at an edge (vtn, use), + // and discover that use is also pre_visited but not post_visited. Thus, use + // lies on that path from "root" to vtn, and the edge (vtn, use) closes a + // cycle. + NOT_PRODUCT(if (_trace._rejections) { trace_schedule_cycle(stack, pre_visited, post_visited); } ) + return false; + } + stack.push(use); + all_uses_already_visited = false; + } + + if (all_uses_already_visited) { + stack.pop(); + post_visited.set(vtn->_idx); // post-visit + _schedule.at_put_grow(rpo_idx--, vtn); // assign rpo_idx + } + } else { + stack.pop(); // Already post-visited. Ignore secondary edge. + } + } + +#ifndef PRODUCT + if (_trace._verbose) { + print_schedule(); + } +#endif + + assert(rpo_idx == -1, "used up all rpo_idx, rpo_idx=%d", rpo_idx); + return true; +} + +// Push all "root" nodes, i.e. 
those that have no inputs (req or dependency): +void VTransformGraph::collect_nodes_without_req_or_dependency(GrowableArray& stack) const { + for (int i = 0; i < _vtnodes.length(); i++) { + VTransformNode* vtn = _vtnodes.at(i); + if (!vtn->has_req_or_dependency()) { + stack.push(vtn); + } + } +} + +#ifndef PRODUCT +void VTransformGraph::trace_schedule_cycle(const GrowableArray& stack, + const VectorSet& pre_visited, + const VectorSet& post_visited) const { + tty->print_cr("\nVTransform::schedule found a cycle on path (P), vectorization attempt fails."); + for (int j = 0; j < stack.length(); j++) { + VTransformNode* n = stack.at(j); + bool on_path = pre_visited.test(n->_idx) && !post_visited.test(n->_idx); + tty->print(" %s ", on_path ? "P" : "_"); + n->print(); + } +} + +void VTransformApplyResult::trace(VTransformNode* vtnode) const { + tty->print(" apply: "); + vtnode->print(); + tty->print(" -> "); + if (_node == nullptr) { + tty->print_cr("nullptr"); + } else { + _node->dump(); + } +} +#endif + +Node* VTransformNode::find_transformed_input(int i, const GrowableArray& vnode_idx_to_transformed_node) const { + Node* n = vnode_idx_to_transformed_node.at(in(i)->_idx); + assert(n != nullptr, "must find input IR node"); + return n; +} + +VTransformApplyResult VTransformScalarNode::apply(const VLoopAnalyzer& vloop_analyzer, + const GrowableArray& vnode_idx_to_transformed_node) const { + // This was just wrapped. Now we simply unwrap without touching the inputs.
+ return VTransformApplyResult::make_scalar(_node); +} + +VTransformApplyResult VTransformReplicateNode::apply(const VLoopAnalyzer& vloop_analyzer, + const GrowableArray& vnode_idx_to_transformed_node) const { + Node* val = find_transformed_input(1, vnode_idx_to_transformed_node); + VectorNode* vn = VectorNode::scalar2vector(val, _vlen, _element_type); + register_new_node_from_vectorization(vloop_analyzer, vn, val); + return VTransformApplyResult::make_vector(vn, _vlen, vn->length_in_bytes()); +} + +VTransformApplyResult VTransformConvI2LNode::apply(const VLoopAnalyzer& vloop_analyzer, + const GrowableArray& vnode_idx_to_transformed_node) const { + Node* val = find_transformed_input(1, vnode_idx_to_transformed_node); + Node* n = new ConvI2LNode(val); + register_new_node_from_vectorization(vloop_analyzer, n, val); + return VTransformApplyResult::make_scalar(n); +} + +VTransformApplyResult VTransformShiftCountNode::apply(const VLoopAnalyzer& vloop_analyzer, + const GrowableArray& vnode_idx_to_transformed_node) const { + PhaseIdealLoop* phase = vloop_analyzer.vloop().phase(); + Node* shift_count_in = find_transformed_input(1, vnode_idx_to_transformed_node); + assert(shift_count_in->bottom_type()->isa_int(), "int type only for shift count"); + // The shift_count_in would be automatically truncated to the lowest _mask + // bits in a scalar shift operation. But vector shift does not truncate, so + // we must apply the mask now. + Node* shift_count_masked = new AndINode(shift_count_in, phase->igvn().intcon(_mask)); + register_new_node_from_vectorization(vloop_analyzer, shift_count_masked, shift_count_in); + // Now that masked value is "boadcast" (some platforms only set the lowest element). 
+ VectorNode* vn = VectorNode::shift_count(_shift_opcode, shift_count_masked, _vlen, _element_bt); + register_new_node_from_vectorization(vloop_analyzer, vn, shift_count_in); + return VTransformApplyResult::make_vector(vn, _vlen, vn->length_in_bytes()); +} + + +VTransformApplyResult VTransformPopulateIndexNode::apply(const VLoopAnalyzer& vloop_analyzer, + const GrowableArray& vnode_idx_to_transformed_node) const { + PhaseIdealLoop* phase = vloop_analyzer.vloop().phase(); + Node* val = find_transformed_input(1, vnode_idx_to_transformed_node); + assert(val->is_Phi(), "expected to be iv"); + assert(VectorNode::is_populate_index_supported(_element_bt), "should support"); + const TypeVect* vt = TypeVect::make(_element_bt, _vlen); + VectorNode* vn = new PopulateIndexNode(val, phase->igvn().intcon(1), vt); + register_new_node_from_vectorization(vloop_analyzer, vn, val); + return VTransformApplyResult::make_vector(vn, _vlen, vn->length_in_bytes()); +} + +VTransformApplyResult VTransformElementWiseVectorNode::apply(const VLoopAnalyzer& vloop_analyzer, + const GrowableArray& vnode_idx_to_transformed_node) const { + Node* first = nodes().at(0); + uint vlen = nodes().length(); + int opc = first->Opcode(); + BasicType bt = vloop_analyzer.types().velt_basic_type(first); + + if (first->is_Cmp()) { + // Cmp + Bool -> VectorMaskCmp + // Handled by Bool / VTransformBoolVectorNode, so we do not generate any nodes here. + return VTransformApplyResult::make_empty(); + } + + assert(2 <= req() && req() <= 4, "Must have 1-3 inputs"); + VectorNode* vn = nullptr; + Node* in1 = find_transformed_input(1, vnode_idx_to_transformed_node); + Node* in2 = (req() >= 3) ? find_transformed_input(2, vnode_idx_to_transformed_node) : nullptr; + Node* in3 = (req() >= 4) ? 
find_transformed_input(3, vnode_idx_to_transformed_node) : nullptr; + + if (first->is_CMove()) { + assert(req() == 4, "three inputs expected: mask, blend1, blend2"); + vn = new VectorBlendNode(/* blend1 */ in2, /* blend2 */ in3, /* mask */ in1); + } else if (VectorNode::is_convert_opcode(opc)) { + assert(first->req() == 2 && req() == 2, "only one input expected"); + int vopc = VectorCastNode::opcode(opc, in1->bottom_type()->is_vect()->element_basic_type()); + vn = VectorCastNode::make(vopc, in1, bt, vlen); + } else if (VectorNode::can_use_RShiftI_instead_of_URShiftI(first, bt)) { + opc = Op_RShiftI; + vn = VectorNode::make(opc, in1, in2, vlen, bt); + } else if (VectorNode::is_scalar_op_that_returns_int_but_vector_op_returns_long(opc)) { + // The scalar operation was a long -> int operation. + // However, the vector operation is long -> long. + VectorNode* long_vn = VectorNode::make(opc, in1, nullptr, vlen, T_LONG); + register_new_node_from_vectorization(vloop_analyzer, long_vn, first); + // Cast long -> int, to mimic the scalar long -> int operation. 
+ vn = VectorCastNode::make(Op_VectorCastL2X, long_vn, T_INT, vlen); + } else if (req() == 3 || + VectorNode::is_scalar_unary_op_with_equal_input_and_output_types(opc)) { + assert(!VectorNode::is_roundopD(first) || in2->is_Con(), "rounding mode must be constant"); + vn = VectorNode::make(opc, in1, in2, vlen, bt); // unary and binary + } else { + assert(req() == 4, "three inputs expected"); + assert(opc == Op_FmaD || + opc == Op_FmaF || + opc == Op_SignumF || + opc == Op_SignumD, + "element wise operation must be from this list"); + vn = VectorNode::make(opc, in1, in2, in3, vlen, bt); // ternary + } + + register_new_node_from_vectorization_and_replace_scalar_nodes(vloop_analyzer, vn); + return VTransformApplyResult::make_vector(vn, vlen, vn->length_in_bytes()); +} + +VTransformApplyResult VTransformBoolVectorNode::apply(const VLoopAnalyzer& vloop_analyzer, + const GrowableArray& vnode_idx_to_transformed_node) const { + BoolNode* first = nodes().at(0)->as_Bool(); + uint vlen = nodes().length(); + BasicType bt = vloop_analyzer.types().velt_basic_type(first); + + // Cmp + Bool -> VectorMaskCmp + VTransformElementWiseVectorNode* vtn_cmp = in(1)->isa_ElementWiseVector(); + assert(vtn_cmp != nullptr && vtn_cmp->nodes().at(0)->is_Cmp(), + "bool vtn expects cmp vtn as input"); + + Node* cmp_in1 = vtn_cmp->find_transformed_input(1, vnode_idx_to_transformed_node); + Node* cmp_in2 = vtn_cmp->find_transformed_input(2, vnode_idx_to_transformed_node); + BoolTest::mask mask = test()._mask; + + PhaseIdealLoop* phase = vloop_analyzer.vloop().phase(); + ConINode* mask_node = phase->igvn().intcon((int)mask); + const TypeVect* vt = TypeVect::make(bt, vlen); + VectorNode* vn = new VectorMaskCmpNode(mask, cmp_in1, cmp_in2, mask_node, vt); + register_new_node_from_vectorization_and_replace_scalar_nodes(vloop_analyzer, vn); + return VTransformApplyResult::make_vector(vn, vlen, vn->vect_type()->length_in_bytes()); +} + +VTransformApplyResult VTransformReductionVectorNode::apply(const 
VLoopAnalyzer& vloop_analyzer, + const GrowableArray& vnode_idx_to_transformed_node) const { + Node* first = nodes().at(0); + uint vlen = nodes().length(); + int opc = first->Opcode(); + BasicType bt = first->bottom_type()->basic_type(); + + Node* init = find_transformed_input(1, vnode_idx_to_transformed_node); + Node* vec = find_transformed_input(2, vnode_idx_to_transformed_node); + + ReductionNode* vn = ReductionNode::make(opc, nullptr, init, vec, bt); + register_new_node_from_vectorization_and_replace_scalar_nodes(vloop_analyzer, vn); + return VTransformApplyResult::make_vector(vn, vlen, vn->vect_type()->length_in_bytes()); +} + +VTransformApplyResult VTransformLoadVectorNode::apply(const VLoopAnalyzer& vloop_analyzer, + const GrowableArray& vnode_idx_to_transformed_node) const { + LoadNode* first = nodes().at(0)->as_Load(); + uint vlen = nodes().length(); + Node* ctrl = first->in(MemNode::Control); + Node* mem = first->in(MemNode::Memory); + Node* adr = first->in(MemNode::Address); + int opc = first->Opcode(); + const TypePtr* adr_type = first->adr_type(); + BasicType bt = vloop_analyzer.types().velt_basic_type(first); + + // Set the memory dependency of the LoadVector as early as possible. + // Walk up the memory chain, and ignore any StoreVector that provably + // does not have any memory dependency. 
+ while (mem->is_StoreVector()) { + VPointer p_store(mem->as_Mem(), vloop_analyzer.vloop()); + if (p_store.overlap_possible_with_any_in(nodes())) { + break; + } else { + mem = mem->in(MemNode::Memory); + } + } + + LoadVectorNode* vn = LoadVectorNode::make(opc, ctrl, mem, adr, adr_type, vlen, bt, + control_dependency()); + DEBUG_ONLY( if (VerifyAlignVector) { vn->set_must_verify_alignment(); } ) + register_new_node_from_vectorization_and_replace_scalar_nodes(vloop_analyzer, vn); + return VTransformApplyResult::make_vector(vn, vlen, vn->memory_size()); +} + +VTransformApplyResult VTransformStoreVectorNode::apply(const VLoopAnalyzer& vloop_analyzer, + const GrowableArray& vnode_idx_to_transformed_node) const { + StoreNode* first = nodes().at(0)->as_Store(); + uint vlen = nodes().length(); + Node* ctrl = first->in(MemNode::Control); + Node* mem = first->in(MemNode::Memory); + Node* adr = first->in(MemNode::Address); + int opc = first->Opcode(); + const TypePtr* adr_type = first->adr_type(); + + Node* value = find_transformed_input(MemNode::ValueIn, vnode_idx_to_transformed_node); + StoreVectorNode* vn = StoreVectorNode::make(opc, ctrl, mem, adr, adr_type, value, vlen); + DEBUG_ONLY( if (VerifyAlignVector) { vn->set_must_verify_alignment(); } ) + register_new_node_from_vectorization_and_replace_scalar_nodes(vloop_analyzer, vn); + return VTransformApplyResult::make_vector(vn, vlen, vn->memory_size()); +} + +void VTransformVectorNode::register_new_node_from_vectorization_and_replace_scalar_nodes(const VLoopAnalyzer& vloop_analyzer, Node* vn) const { + PhaseIdealLoop* phase = vloop_analyzer.vloop().phase(); + Node* first = nodes().at(0); + + register_new_node_from_vectorization(vloop_analyzer, vn, first); + + for (int i = 0; i < _nodes.length(); i++) { + Node* n = _nodes.at(i); + phase->igvn().replace_node(n, vn); + } +} + +void VTransformNode::register_new_node_from_vectorization(const VLoopAnalyzer& vloop_analyzer, Node* vn, Node* old_node) const { + PhaseIdealLoop* 
phase = vloop_analyzer.vloop().phase(); + phase->register_new_node_with_ctrl_of(vn, old_node); + phase->igvn()._worklist.push(vn); + VectorNode::trace_new_vector(vn, "AutoVectorization"); +} + +#ifndef PRODUCT +void VTransformGraph::print_vtnodes() const { + tty->print_cr("\nVTransformGraph::print_vtnodes:"); + for (int i = 0; i < _vtnodes.length(); i++) { + _vtnodes.at(i)->print(); + } +} + +void VTransformGraph::print_schedule() const { + tty->print_cr("\nVTransformGraph::print_schedule:"); + for (int i = 0; i < _schedule.length(); i++) { + tty->print(" %3d: ", i); + VTransformNode* vtn = _schedule.at(i); + if (vtn == nullptr) { + tty->print_cr("nullptr"); + } else { + vtn->print(); + } + } +} + +void VTransformGraph::print_memops_schedule() const { + tty->print_cr("\nVTransformGraph::print_memops_schedule:"); + int i = 0; + for_each_memop_in_schedule([&] (MemNode* mem) { + tty->print(" %3d: ", i++); + mem->dump(); + }); +} + +void VTransformNode::print() const { + tty->print("%3d %s (", _idx, name()); + for (uint i = 0; i < _req; i++) { + print_node_idx(_in.at(i)); + } + if ((uint)_in.length() > _req) { + tty->print(" |"); + for (int i = _req; i < _in.length(); i++) { + print_node_idx(_in.at(i)); + } + } + tty->print(") ["); + for (int i = 0; i < _out.length(); i++) { + print_node_idx(_out.at(i)); + } + tty->print("] "); + print_spec(); + tty->cr(); +} + +void VTransformNode::print_node_idx(const VTransformNode* vtn) { + if (vtn == nullptr) { + tty->print(" _"); + } else { + tty->print(" %d", vtn->_idx); + } +} + +void VTransformScalarNode::print_spec() const { + tty->print("node[%d %s]", _node->_idx, _node->Name()); +} + +void VTransformReplicateNode::print_spec() const { + tty->print("vlen=%d element_type=", _vlen); + _element_type->dump(); +} + +void VTransformShiftCountNode::print_spec() const { + tty->print("vlen=%d element_bt=%s mask=%d shift_opcode=%s", + _vlen, type2name(_element_bt), _mask, + NodeClassNames[_shift_opcode]); +} + +void 
VTransformPopulateIndexNode::print_spec() const { + tty->print("vlen=%d element_bt=%s", _vlen, type2name(_element_bt)); +} + +void VTransformVectorNode::print_spec() const { + tty->print("%d-pack[", _nodes.length()); + for (int i = 0; i < _nodes.length(); i++) { + Node* n = _nodes.at(i); + if (i > 0) { + tty->print(", "); + } + tty->print("%d %s", n->_idx, n->Name()); + } + tty->print("]"); +} +#endif diff --git a/src/hotspot/share/opto/vtransform.hpp b/src/hotspot/share/opto/vtransform.hpp new file mode 100644 index 00000000000..071674533a7 --- /dev/null +++ b/src/hotspot/share/opto/vtransform.hpp @@ -0,0 +1,515 @@ +/* + * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +#ifndef SHARE_OPTO_VTRANSFORM_HPP +#define SHARE_OPTO_VTRANSFORM_HPP + +#include "opto/node.hpp" +#include "opto/vectorization.hpp" + +// VTransform: +// - Models the transformation of the scalar loop to vectorized loop: +// It is a "C2 subgraph" -> "C2 subgraph" mapping. 
// - The VTransform contains a graph (VTransformGraph), which consists of
//   many vtnodes (VTransformNode).
// - Each vtnode models a part of the transformation, and is supposed
//   to represent the output C2 nodes after the vectorization as closely
//   as possible.
//
// This is the life-cycle of a VTransform:
// - Construction:
//   - From SuperWord, with the SuperWordVTransformBuilder.
//
// - Future Plans: optimize, if-conversion, etc.
//
// - Schedule:
//   - Compute linearization of the VTransformGraph, into an order that respects
//     all edges in the graph (bailout if cycle detected).
//
// - Apply:
//   - Changes to the C2 IR are only made once the "apply" method is called.
//   - Each vtnode generates its corresponding scalar and vector C2 nodes,
//     possibly replacing old scalar C2 nodes.
//
// Future Plans with VTransform:
// - Cost model: estimate if vectorization is profitable.
// - Optimizations: moving unordered reductions out of the loop, which decreases cost.
// - Pack/Unpack/Shuffle: introduce additional nodes not present in the scalar loop.
//                        This is difficult to do with the SuperWord packset approach.
// - If-conversion: convert predicated nodes into CFG.
+ +typedef int VTransformNodeIDX; +class VTransformNode; +class VTransformScalarNode; +class VTransformInputScalarNode; +class VTransformVectorNode; +class VTransformElementWiseVectorNode; +class VTransformBoolVectorNode; +class VTransformReductionVectorNode; + +// Result from VTransformNode::apply +class VTransformApplyResult { +private: + Node* const _node; + const uint _vector_length; // number of elements + const uint _vector_width; // total width in bytes + + VTransformApplyResult(Node* n, uint vector_length, uint vector_width) : + _node(n), + _vector_length(vector_length), + _vector_width(vector_width) {} + +public: + static VTransformApplyResult make_scalar(Node* n) { + return VTransformApplyResult(n, 0, 0); + } + + static VTransformApplyResult make_vector(Node* n, uint vector_length, uint vector_width) { + assert(vector_length > 0 && vector_width > 0, "must have nonzero size"); + return VTransformApplyResult(n, vector_length, vector_width); + } + + static VTransformApplyResult make_empty() { + return VTransformApplyResult(nullptr, 0, 0); + } + + Node* node() const { return _node; } + uint vector_length() const { return _vector_length; } + uint vector_width() const { return _vector_width; } + NOT_PRODUCT( void trace(VTransformNode* vtnode) const; ) +}; + +#ifndef PRODUCT +// Convenience class for tracing flags. 
+class VTransformTrace { +public: + const bool _verbose; + const bool _rejections; + const bool _align_vector; + const bool _info; + + VTransformTrace(const VTrace& vtrace, + const bool is_trace_rejections, + const bool is_trace_align_vector, + const bool is_trace_info) : + _verbose (vtrace.is_trace(TraceAutoVectorizationTag::ALL)), + _rejections (_verbose | is_trace_vtransform(vtrace) | is_trace_rejections), + _align_vector(_verbose | is_trace_vtransform(vtrace) | is_trace_align_vector), + _info (_verbose | is_trace_vtransform(vtrace) | is_trace_info) {} + + static bool is_trace_vtransform(const VTrace& vtrace) { + return vtrace.is_trace(TraceAutoVectorizationTag::VTRANSFORM); + } +}; +#endif + +// VTransformGraph: component of VTransform +// See description at top of this file. +class VTransformGraph : public StackObj { +private: + const VLoopAnalyzer& _vloop_analyzer; + const VLoop& _vloop; + + NOT_PRODUCT(const VTransformTrace _trace;) + + VTransformNodeIDX _next_idx; + GrowableArray _vtnodes; + + // Schedule (linearization) of the graph. We use this to reorder the memory graph + // before inserting vector operations. 
+ GrowableArray _schedule; + +public: + VTransformGraph(const VLoopAnalyzer& vloop_analyzer, + Arena& arena + NOT_PRODUCT( COMMA const VTransformTrace trace)) : + _vloop_analyzer(vloop_analyzer), + _vloop(vloop_analyzer.vloop()), + NOT_PRODUCT(_trace(trace) COMMA) + _next_idx(0), + _vtnodes(&arena, _vloop.estimated_body_length(), 0, nullptr), + _schedule(&arena, _vloop.estimated_body_length(), 0, nullptr) {} + + VTransformNodeIDX new_idx() { return _next_idx++; } + void add_vtnode(VTransformNode* vtnode); + DEBUG_ONLY( bool is_empty() const { return _vtnodes.is_empty(); } ) + DEBUG_ONLY( bool is_scheduled() const { return _schedule.is_nonempty(); } ) + const GrowableArray& vtnodes() const { return _vtnodes; } + + bool schedule(); + void apply_memops_reordering_with_schedule() const; + void apply_vectorization_for_each_vtnode(uint& max_vector_length, uint& max_vector_width) const; + +private: + // VLoop accessors + PhaseIdealLoop* phase() const { return _vloop.phase(); } + PhaseIterGVN& igvn() const { return _vloop.phase()->igvn(); } + bool in_bb(const Node* n) const { return _vloop.in_bb(n); } + + void collect_nodes_without_req_or_dependency(GrowableArray& stack) const; + + template + void for_each_memop_in_schedule(Callback callback) const; + +#ifndef PRODUCT + void print_vtnodes() const; + void print_schedule() const; + void print_memops_schedule() const; + void trace_schedule_cycle(const GrowableArray& stack, + const VectorSet& pre_visited, + const VectorSet& post_visited) const; +#endif +}; + +// VTransform: models the transformation of the scalar loop to vectorized loop. +// It is a "C2 subgraph" to "C2 subgraph" mapping. +// See description at top of this file. +class VTransform : public StackObj { +private: + const VLoopAnalyzer& _vloop_analyzer; + const VLoop& _vloop; + + NOT_PRODUCT(const VTransformTrace _trace;) + + // Everything in the vtransform is allocated from this arena, including all vtnodes. 
+ Arena _arena; + + VTransformGraph _graph; + + // Memory reference, and the alignment width (aw) for which we align the main-loop, + // by adjusting the pre-loop limit. + MemNode const* _mem_ref_for_main_loop_alignment; + int _aw_for_main_loop_alignment; + +public: + VTransform(const VLoopAnalyzer& vloop_analyzer, + MemNode const* mem_ref_for_main_loop_alignment, + int aw_for_main_loop_alignment + NOT_PRODUCT( COMMA const VTransformTrace trace) + ) : + _vloop_analyzer(vloop_analyzer), + _vloop(vloop_analyzer.vloop()), + NOT_PRODUCT(_trace(trace) COMMA) + _arena(mtCompiler), + _graph(_vloop_analyzer, _arena NOT_PRODUCT(COMMA _trace)), + _mem_ref_for_main_loop_alignment(mem_ref_for_main_loop_alignment), + _aw_for_main_loop_alignment(aw_for_main_loop_alignment) {} + + const VLoopAnalyzer& vloop_analyzer() const { return _vloop_analyzer; } + Arena* arena() { return &_arena; } + DEBUG_ONLY( bool has_graph() const { return !_graph.is_empty(); } ) + VTransformGraph& graph() { return _graph; } + + bool schedule() { return _graph.schedule(); } + void apply(); + +private: + // VLoop accessors + PhaseIdealLoop* phase() const { return _vloop.phase(); } + PhaseIterGVN& igvn() const { return _vloop.phase()->igvn(); } + IdealLoopTree* lpt() const { return _vloop.lpt(); } + CountedLoopNode* cl() const { return _vloop.cl(); } + int iv_stride() const { return cl()->stride_con(); } + + // VLoopVPointers accessors + const VPointer& vpointer(const MemNode* mem) const { + return _vloop_analyzer.vpointers().vpointer(mem); + } + + // Ensure that the main loop vectors are aligned by adjusting the pre loop limit. + void determine_mem_ref_and_aw_for_main_loop_alignment(); + void adjust_pre_loop_limit_to_align_main_loop_vectors(); + + void apply_vectorization() const; +}; + +// The vtnodes (VTransformNode) resemble the C2 IR Nodes, and model a part of the +// VTransform. Many such vtnodes make up the VTransformGraph. 
The vtnodes represent +// the resulting scalar and vector nodes as closely as possible. +// See description at top of this file. +class VTransformNode : public ArenaObj { +public: + const VTransformNodeIDX _idx; + +private: + // _in is split into required inputs (_req), and additional dependencies. + const uint _req; + GrowableArray _in; + GrowableArray _out; + +public: + VTransformNode(VTransform& vtransform, const uint req) : + _idx(vtransform.graph().new_idx()), + _req(req), + _in(vtransform.arena(), req, req, nullptr), + _out(vtransform.arena(), 4, 0, nullptr) + { + vtransform.graph().add_vtnode(this); + } + + void set_req(uint i, VTransformNode* n) { + assert(i < _req, "must be a req"); + assert(_in.at(i) == nullptr && n != nullptr, "only set once"); + _in.at_put(i, n); + n->add_out(this); + } + + void swap_req(uint i, uint j) { + assert(i < _req, "must be a req"); + assert(j < _req, "must be a req"); + VTransformNode* tmp = _in.at(i); + _in.at_put(i, _in.at(j)); + _in.at_put(j, tmp); + } + + void add_dependency(VTransformNode* n) { + assert(n != nullptr, "no need to add nullptr"); + _in.push(n); + n->add_out(this); + } + + void add_out(VTransformNode* n) { + _out.push(n); + } + + uint req() const { return _req; } + VTransformNode* in(int i) const { return _in.at(i); } + int outs() const { return _out.length(); } + VTransformNode* out(int i) const { return _out.at(i); } + + bool has_req_or_dependency() const { + for (int i = 0; i < _in.length(); i++) { + if (_in.at(i) != nullptr) { return true; } + } + return false; + } + + virtual VTransformScalarNode* isa_Scalar() { return nullptr; } + virtual VTransformInputScalarNode* isa_InputScalar() { return nullptr; } + virtual VTransformVectorNode* isa_Vector() { return nullptr; } + virtual VTransformElementWiseVectorNode* isa_ElementWiseVector() { return nullptr; } + virtual VTransformBoolVectorNode* isa_BoolVector() { return nullptr; } + virtual VTransformReductionVectorNode* isa_ReductionVector() { return nullptr; 
} + + virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer, + const GrowableArray& vnode_idx_to_transformed_node) const = 0; + + Node* find_transformed_input(int i, const GrowableArray& vnode_idx_to_transformed_node) const; + + void register_new_node_from_vectorization(const VLoopAnalyzer& vloop_analyzer, Node* vn, Node* old_node) const; + + NOT_PRODUCT(virtual const char* name() const = 0;) + NOT_PRODUCT(void print() const;) + NOT_PRODUCT(virtual void print_spec() const {};) + NOT_PRODUCT(static void print_node_idx(const VTransformNode* vtn);) +}; + +// Identity transform for scalar nodes. +class VTransformScalarNode : public VTransformNode { +private: + Node* _node; +public: + VTransformScalarNode(VTransform& vtransform, Node* n) : + VTransformNode(vtransform, n->req()), _node(n) {} + Node* node() const { return _node; } + virtual VTransformScalarNode* isa_Scalar() override { return this; } + virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer, + const GrowableArray& vnode_idx_to_transformed_node) const override; + NOT_PRODUCT(virtual const char* name() const override { return "Scalar"; };) + NOT_PRODUCT(virtual void print_spec() const override;) +}; + +// Wrapper node for nodes outside the loop that are inputs to nodes in the loop. +// Since we want the loop-internal nodes to be able to reference all inputs as vtnodes, +// we must wrap the inputs that are outside the loop into special vtnodes, too. +class VTransformInputScalarNode : public VTransformScalarNode { +public: + VTransformInputScalarNode(VTransform& vtransform, Node* n) : + VTransformScalarNode(vtransform, n) {} + virtual VTransformInputScalarNode* isa_InputScalar() override { return this; } + NOT_PRODUCT(virtual const char* name() const override { return "InputScalar"; };) +}; + +// Transform produces a ReplicateNode, replicating the input to all vector lanes. 
+class VTransformReplicateNode : public VTransformNode { +private: + int _vlen; + const Type* _element_type; +public: + VTransformReplicateNode(VTransform& vtransform, int vlen, const Type* element_type) : + VTransformNode(vtransform, 2), _vlen(vlen), _element_type(element_type) {} + virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer, + const GrowableArray& vnode_idx_to_transformed_node) const override; + NOT_PRODUCT(virtual const char* name() const override { return "Replicate"; };) + NOT_PRODUCT(virtual void print_spec() const override;) +}; + +// Transform introduces a scalar ConvI2LNode that was not previously in the C2 graph. +class VTransformConvI2LNode : public VTransformNode { +public: + VTransformConvI2LNode(VTransform& vtransform) : VTransformNode(vtransform, 2) {} + virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer, + const GrowableArray& vnode_idx_to_transformed_node) const override; + NOT_PRODUCT(virtual const char* name() const override { return "ConvI2L"; };) +}; + +// Transform introduces a shift-count node that truncates the shift count for a vector shift. +class VTransformShiftCountNode : public VTransformNode { +private: + int _vlen; + const BasicType _element_bt; + juint _mask; + int _shift_opcode; +public: + VTransformShiftCountNode(VTransform& vtransform, int vlen, BasicType element_bt, juint mask, int shift_opcode) : + VTransformNode(vtransform, 2), _vlen(vlen), _element_bt(element_bt), _mask(mask), _shift_opcode(shift_opcode) {} + virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer, + const GrowableArray& vnode_idx_to_transformed_node) const override; + NOT_PRODUCT(virtual const char* name() const override { return "ShiftCount"; };) + NOT_PRODUCT(virtual void print_spec() const override;) +}; + +// Transform introduces a PopulateIndex node: [phi, phi+1, phi+2, phi+3, ...]. 
+class VTransformPopulateIndexNode : public VTransformNode { +private: + int _vlen; + const BasicType _element_bt; +public: + VTransformPopulateIndexNode(VTransform& vtransform, int vlen, const BasicType element_bt) : + VTransformNode(vtransform, 2), _vlen(vlen), _element_bt(element_bt) {} + virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer, + const GrowableArray& vnode_idx_to_transformed_node) const override; + NOT_PRODUCT(virtual const char* name() const override { return "PopulateIndex"; };) + NOT_PRODUCT(virtual void print_spec() const override;) +}; + +// Base class for all vector vtnodes. +class VTransformVectorNode : public VTransformNode { +private: + GrowableArray _nodes; +public: + VTransformVectorNode(VTransform& vtransform, const uint req, const uint number_of_nodes) : + VTransformNode(vtransform, req), _nodes(vtransform.arena(), number_of_nodes, number_of_nodes, nullptr) {} + + void set_nodes(const Node_List* pack) { + for (uint k = 0; k < pack->size(); k++) { + _nodes.at_put(k, pack->at(k)); + } + } + + const GrowableArray& nodes() const { return _nodes; } + virtual VTransformVectorNode* isa_Vector() override { return this; } + void register_new_node_from_vectorization_and_replace_scalar_nodes(const VLoopAnalyzer& vloop_analyzer, Node* vn) const; + NOT_PRODUCT(virtual void print_spec() const override;) +}; + +// Catch all for all element-wise vector operations. 
+class VTransformElementWiseVectorNode : public VTransformVectorNode { +public: + VTransformElementWiseVectorNode(VTransform& vtransform, uint req, uint number_of_nodes) : + VTransformVectorNode(vtransform, req, number_of_nodes) {} + virtual VTransformElementWiseVectorNode* isa_ElementWiseVector() override { return this; } + virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer, + const GrowableArray& vnode_idx_to_transformed_node) const override; + NOT_PRODUCT(virtual const char* name() const override { return "ElementWiseVector"; };) +}; + +struct VTransformBoolTest { + const BoolTest::mask _mask; + const bool _is_negated; + + VTransformBoolTest(const BoolTest::mask mask, bool is_negated) : + _mask(mask), _is_negated(is_negated) {} +}; + +class VTransformBoolVectorNode : public VTransformElementWiseVectorNode { +private: + const VTransformBoolTest _test; +public: + VTransformBoolVectorNode(VTransform& vtransform, uint number_of_nodes, VTransformBoolTest test) : + VTransformElementWiseVectorNode(vtransform, 2, number_of_nodes), _test(test) {} + VTransformBoolTest test() const { return _test; } + virtual VTransformBoolVectorNode* isa_BoolVector() override { return this; } + virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer, + const GrowableArray& vnode_idx_to_transformed_node) const override; + NOT_PRODUCT(virtual const char* name() const override { return "BoolVector"; };) +}; + +class VTransformReductionVectorNode : public VTransformVectorNode { +public: + // req = 3 -> [ctrl, scalar init, vector] + VTransformReductionVectorNode(VTransform& vtransform, uint number_of_nodes) : + VTransformVectorNode(vtransform, 3, number_of_nodes) {} + virtual VTransformReductionVectorNode* isa_ReductionVector() override { return this; } + virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer, + const GrowableArray& vnode_idx_to_transformed_node) const override; + NOT_PRODUCT(virtual const char* name() const override { 
return "ReductionVector"; };) +}; + +class VTransformLoadVectorNode : public VTransformVectorNode { +public: + // req = 3 -> [ctrl, mem, adr] + VTransformLoadVectorNode(VTransform& vtransform, uint number_of_nodes) : + VTransformVectorNode(vtransform, 3, number_of_nodes) {} + LoadNode::ControlDependency control_dependency() const; + virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer, + const GrowableArray& vnode_idx_to_transformed_node) const override; + NOT_PRODUCT(virtual const char* name() const override { return "LoadVector"; };) +}; + +class VTransformStoreVectorNode : public VTransformVectorNode { +public: + // req = 4 -> [ctrl, mem, adr, val] + VTransformStoreVectorNode(VTransform& vtransform, uint number_of_nodes) : + VTransformVectorNode(vtransform, 4, number_of_nodes) {} + virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer, + const GrowableArray& vnode_idx_to_transformed_node) const override; + NOT_PRODUCT(virtual const char* name() const override { return "StoreVector"; };) +}; + +// Invoke callback on all memops, in the order of the schedule. +template +void VTransformGraph::for_each_memop_in_schedule(Callback callback) const { + assert(_schedule.length() == _vtnodes.length(), "schedule was computed"); + + for (int i = 0; i < _schedule.length(); i++) { + VTransformNode* vtn = _schedule.at(i); + + // We can ignore input nodes, they are outside the loop. + if (vtn->isa_InputScalar() != nullptr) { continue; } + + VTransformScalarNode* scalar = vtn->isa_Scalar(); + if (scalar != nullptr && scalar->node()->is_Mem()) { + callback(scalar->node()->as_Mem()); + } + + VTransformVectorNode* vector = vtn->isa_Vector(); + if (vector != nullptr && vector->nodes().at(0)->is_Mem()) { + for (int j = 0; j < vector->nodes().length(); j++) { + callback(vector->nodes().at(j)->as_Mem()); + } + } + } +} + +#endif // SHARE_OPTO_VTRANSFORM_HPP