8260943: C2 SuperWord: Remove dead vectorization optimization added by 8076284

Reviewed-by: kvn, thartmann
This commit is contained in:
Emanuel Peter 2023-05-23 09:51:19 +00:00
parent 4f0f776187
commit bdd240283e
2 changed files with 1 additions and 494 deletions

View File

@ -80,10 +80,7 @@ SuperWord::SuperWord(PhaseIdealLoop* phase) :
_do_vector_loop(phase->C->do_vector_loop()), // whether to do vectorization/simd style
_do_reserve_copy(DoReserveCopyInSuperWord),
_num_work_vecs(0), // amount of vector work we have
_num_reductions(0), // amount of reduction work we have
_ii_first(-1), // first loop generation index - only if do_vector_loop()
_ii_last(-1), // last loop generation index - only if do_vector_loop()
_ii_order(arena(), 8, 0, 0)
_num_reductions(0) // amount of reduction work we have
{
#ifndef PRODUCT
_vector_loop_debug = 0;
@ -94,8 +91,6 @@ SuperWord::SuperWord(PhaseIdealLoop* phase) :
#endif
}
static const bool _do_vector_loop_experimental = false; // Experimental vectorization which uses data from loop unrolling.
//------------------------------transform_loop---------------------------
bool SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
assert(UseSuperWord, "should be");
@ -609,32 +604,6 @@ bool SuperWord::SLP_extract() {
CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
if (cl->is_main_loop()) {
if (_do_vector_loop_experimental) {
if (mark_generations() != -1) {
hoist_loads_in_graph(); // this only rebuild the graph; all basic structs need rebuild explicitly
if (!construct_bb()) {
return false; // Exit if no interesting nodes or complex graph.
}
dependence_graph();
compute_max_depth();
}
#ifndef PRODUCT
if (TraceSuperWord) {
tty->print_cr("\nSuperWord::_do_vector_loop: graph after hoist_loads_in_graph");
_lpt->dump_head();
for (int j = 0; j < _block.length(); j++) {
Node* n = _block.at(j);
int d = depth(n);
for (int i = 0; i < d; i++) tty->print("%s", " ");
tty->print("%d :", d);
n->dump();
}
}
#endif
}
compute_vector_element_type();
// Attempt vectorization
@ -647,17 +616,6 @@ bool SuperWord::SLP_extract() {
extend_packlist();
if (_do_vector_loop_experimental) {
if (_packset.length() == 0) {
#ifndef PRODUCT
if (TraceSuperWord) {
tty->print_cr("\nSuperWord::_do_vector_loop DFA could not build packset, now trying to build anyway");
}
#endif
pack_parallel();
}
}
combine_packs();
construct_my_pack_map();
@ -3722,35 +3680,16 @@ bool SuperWord::construct_bb() {
int ii_current = -1;
unsigned int load_idx = (unsigned int)-1;
// Build iterations order if needed
bool build_ii_order = _do_vector_loop_experimental && _ii_order.is_empty();
// Create real map of block indices for nodes
for (int j = 0; j < _block.length(); j++) {
Node* n = _block.at(j);
set_bb_idx(n, j);
if (build_ii_order && n->is_Load()) {
if (ii_current == -1) {
ii_current = _clone_map.gen(n->_idx);
_ii_order.push(ii_current);
load_idx = _clone_map.idx(n->_idx);
} else if (_clone_map.idx(n->_idx) == load_idx && _clone_map.gen(n->_idx) != ii_current) {
ii_current = _clone_map.gen(n->_idx);
_ii_order.push(ii_current);
}
}
}//for
// Ensure extra info is allocated.
initialize_bb();
#ifndef PRODUCT
if (_vector_loop_debug && _ii_order.length() > 0) {
tty->print("SuperWord::construct_bb: List of generations: ");
for (int jj = 0; jj < _ii_order.length(); ++jj) {
tty->print(" %d:%d", jj, _ii_order.at(jj));
}
tty->print_cr(" ");
}
if (TraceSuperWord) {
print_bb();
tty->print_cr("\ndata entry nodes: %s", _data_entry.length() > 0 ? "" : "NONE");
@ -4284,8 +4223,6 @@ void SuperWord::init() {
_data_entry.clear();
_mem_slice_head.clear();
_mem_slice_tail.clear();
_iteration_first.clear();
_iteration_last.clear();
_node_info.clear();
_align_to_ref = nullptr;
_race_possible = 0;
@ -4294,19 +4231,6 @@ void SuperWord::init() {
_num_reductions = 0;
}
//------------------------------restart---------------------------
void SuperWord::restart() {
_dg.init();
_packset.clear();
_disjoint_ptrs.clear();
_block.clear();
_post_block.clear();
_data_entry.clear();
_mem_slice_head.clear();
_mem_slice_tail.clear();
_node_info.clear();
}
//------------------------------print_packset---------------------------
void SuperWord::print_packset() {
#ifndef PRODUCT
@ -5200,397 +5124,3 @@ bool SuperWord::same_generation(Node* a, Node* b) const {
return a != nullptr && b != nullptr && _clone_map.same_gen(a->_idx, b->_idx);
}
Node* SuperWord::find_phi_for_mem_dep(LoadNode* ld) {
assert(in_bb(ld), "must be in block");
if (_clone_map.gen(ld->_idx) == _ii_first) {
#ifndef PRODUCT
if (_vector_loop_debug) {
tty->print_cr("SuperWord::find_phi_for_mem_dep _clone_map.gen(ld->_idx)=%d",
_clone_map.gen(ld->_idx));
}
#endif
return nullptr; //we think that any ld in the first gen being vectorizable
}
Node* mem = ld->in(MemNode::Memory);
if (mem->outcnt() <= 1) {
// we don't want to remove the only edge from mem node to load
#ifndef PRODUCT
if (_vector_loop_debug) {
tty->print_cr("SuperWord::find_phi_for_mem_dep input node %d to load %d has no other outputs and edge mem->load cannot be removed",
mem->_idx, ld->_idx);
ld->dump();
mem->dump();
}
#endif
return nullptr;
}
if (!in_bb(mem) || same_generation(mem, ld)) {
#ifndef PRODUCT
if (_vector_loop_debug) {
tty->print_cr("SuperWord::find_phi_for_mem_dep _clone_map.gen(mem->_idx)=%d",
_clone_map.gen(mem->_idx));
}
#endif
return nullptr; // does not depend on loop volatile node or depends on the same generation
}
//otherwise first node should depend on mem-phi
Node* first = first_node(ld);
assert(first->is_Load(), "must be Load");
Node* phi = first->as_Load()->in(MemNode::Memory);
if (!phi->is_Phi() || phi->bottom_type() != Type::MEMORY) {
#ifndef PRODUCT
if (_vector_loop_debug) {
tty->print_cr("SuperWord::find_phi_for_mem_dep load is not vectorizable node, since it's `first` does not take input from mem phi");
ld->dump();
first->dump();
}
#endif
return nullptr;
}
Node* tail = 0;
for (int m = 0; m < _mem_slice_head.length(); m++) {
if (_mem_slice_head.at(m) == phi) {
tail = _mem_slice_tail.at(m);
}
}
if (tail == 0) { //test that found phi is in the list _mem_slice_head
#ifndef PRODUCT
if (_vector_loop_debug) {
tty->print_cr("SuperWord::find_phi_for_mem_dep load %d is not vectorizable node, its phi %d is not _mem_slice_head",
ld->_idx, phi->_idx);
ld->dump();
phi->dump();
}
#endif
return nullptr;
}
// now all conditions are met
return phi;
}
Node* SuperWord::first_node(Node* nd) {
for (int ii = 0; ii < _iteration_first.length(); ii++) {
Node* nnn = _iteration_first.at(ii);
if (same_origin_idx(nnn, nd)) {
#ifndef PRODUCT
if (_vector_loop_debug) {
tty->print_cr("SuperWord::first_node: %d is the first iteration node for %d (_clone_map.idx(nnn->_idx) = %d)",
nnn->_idx, nd->_idx, _clone_map.idx(nnn->_idx));
}
#endif
return nnn;
}
}
#ifndef PRODUCT
if (_vector_loop_debug) {
tty->print_cr("SuperWord::first_node: did not find first iteration node for %d (_clone_map.idx(nd->_idx)=%d)",
nd->_idx, _clone_map.idx(nd->_idx));
}
#endif
return 0;
}
Node* SuperWord::last_node(Node* nd) {
for (int ii = 0; ii < _iteration_last.length(); ii++) {
Node* nnn = _iteration_last.at(ii);
if (same_origin_idx(nnn, nd)) {
#ifndef PRODUCT
if (_vector_loop_debug) {
tty->print_cr("SuperWord::last_node _clone_map.idx(nnn->_idx)=%d, _clone_map.idx(nd->_idx)=%d",
_clone_map.idx(nnn->_idx), _clone_map.idx(nd->_idx));
}
#endif
return nnn;
}
}
return 0;
}
int SuperWord::mark_generations() {
Node *ii_err = nullptr, *tail_err = nullptr;
for (int i = 0; i < _mem_slice_head.length(); i++) {
Node* phi = _mem_slice_head.at(i);
assert(phi->is_Phi(), "must be phi");
Node* tail = _mem_slice_tail.at(i);
if (_ii_last == -1) {
tail_err = tail;
_ii_last = _clone_map.gen(tail->_idx);
}
else if (_ii_last != _clone_map.gen(tail->_idx)) {
#ifndef PRODUCT
if (TraceSuperWord && Verbose) {
tty->print_cr("SuperWord::mark_generations _ii_last error - found different generations in two tail nodes ");
tail->dump();
tail_err->dump();
}
#endif
return -1;
}
// find first iteration in the loop
for (DUIterator_Fast imax, i = phi->fast_outs(imax); i < imax; i++) {
Node* ii = phi->fast_out(i);
if (in_bb(ii) && ii->is_Store()) { // we speculate that normally Stores of one and one only generation have deps from mem phi
if (_ii_first == -1) {
ii_err = ii;
_ii_first = _clone_map.gen(ii->_idx);
} else if (_ii_first != _clone_map.gen(ii->_idx)) {
#ifndef PRODUCT
if (TraceSuperWord && Verbose) {
tty->print_cr("SuperWord::mark_generations: _ii_first was found before and not equal to one in this node (%d)", _ii_first);
ii->dump();
if (ii_err!= 0) {
ii_err->dump();
}
}
#endif
return -1; // this phi has Stores from different generations of unroll and cannot be simd/vectorized
}
}
}//for (DUIterator_Fast imax,
}//for (int i...
if (_ii_first == -1 || _ii_last == -1) {
if (TraceSuperWord && Verbose) {
tty->print_cr("SuperWord::mark_generations unknown error, something vent wrong");
}
return -1; // something vent wrong
}
// collect nodes in the first and last generations
assert(_iteration_first.length() == 0, "_iteration_first must be empty");
assert(_iteration_last.length() == 0, "_iteration_last must be empty");
for (int j = 0; j < _block.length(); j++) {
Node* n = _block.at(j);
node_idx_t gen = _clone_map.gen(n->_idx);
if ((signed)gen == _ii_first) {
_iteration_first.push(n);
} else if ((signed)gen == _ii_last) {
_iteration_last.push(n);
}
}
// building order of iterations
if (_ii_order.length() == 0 && ii_err != 0) {
assert(in_bb(ii_err) && ii_err->is_Store(), "should be Store in bb");
Node* nd = ii_err;
while(_clone_map.gen(nd->_idx) != _ii_last) {
_ii_order.push(_clone_map.gen(nd->_idx));
bool found = false;
for (DUIterator_Fast imax, i = nd->fast_outs(imax); i < imax; i++) {
Node* use = nd->fast_out(i);
if (same_origin_idx(use, nd) && use->as_Store()->in(MemNode::Memory) == nd) {
found = true;
nd = use;
break;
}
}//for
if (found == false) {
if (TraceSuperWord && Verbose) {
tty->print_cr("SuperWord::mark_generations: Cannot build order of iterations - no dependent Store for %d", nd->_idx);
}
_ii_order.clear();
return -1;
}
} //while
_ii_order.push(_clone_map.gen(nd->_idx));
}
#ifndef PRODUCT
if (_vector_loop_debug) {
tty->print_cr("SuperWord::mark_generations");
tty->print_cr("First generation (%d) nodes:", _ii_first);
for (int ii = 0; ii < _iteration_first.length(); ii++) _iteration_first.at(ii)->dump();
tty->print_cr("Last generation (%d) nodes:", _ii_last);
for (int ii = 0; ii < _iteration_last.length(); ii++) _iteration_last.at(ii)->dump();
tty->print_cr(" ");
tty->print("SuperWord::List of generations: ");
for (int jj = 0; jj < _ii_order.length(); ++jj) {
tty->print("%d:%d ", jj, _ii_order.at(jj));
}
tty->print_cr(" ");
}
#endif
return _ii_first;
}
bool SuperWord::fix_commutative_inputs(Node* gold, Node* fix) {
assert(gold->is_Add() && fix->is_Add() || gold->is_Mul() && fix->is_Mul(), "should be only Add or Mul nodes");
assert(same_origin_idx(gold, fix), "should be clones of the same node");
Node* gin1 = gold->in(1);
Node* gin2 = gold->in(2);
Node* fin1 = fix->in(1);
Node* fin2 = fix->in(2);
bool swapped = false;
if (in_bb(gin1) && in_bb(gin2) && in_bb(fin1) && in_bb(fin2)) {
if (same_origin_idx(gin1, fin1) &&
same_origin_idx(gin2, fin2)) {
return true; // nothing to fix
}
if (same_origin_idx(gin1, fin2) &&
same_origin_idx(gin2, fin1)) {
fix->swap_edges(1, 2);
swapped = true;
}
}
// at least one input comes from outside of bb
if (gin1->_idx == fin1->_idx) {
return true; // nothing to fix
}
if (!swapped && (gin1->_idx == fin2->_idx || gin2->_idx == fin1->_idx)) { //swapping is expensive, check condition first
fix->swap_edges(1, 2);
swapped = true;
}
if (swapped) {
#ifndef PRODUCT
if (_vector_loop_debug) {
tty->print_cr("SuperWord::fix_commutative_inputs: fixed node %d", fix->_idx);
}
#endif
return true;
}
if (TraceSuperWord && Verbose) {
tty->print_cr("SuperWord::fix_commutative_inputs: cannot fix node %d", fix->_idx);
}
return false;
}
bool SuperWord::pack_parallel() {
#ifndef PRODUCT
if (_vector_loop_debug) {
tty->print_cr("SuperWord::pack_parallel: START");
}
#endif
_packset.clear();
if (_ii_order.is_empty()) {
#ifndef PRODUCT
if (_vector_loop_debug) {
tty->print_cr("SuperWord::pack_parallel: EMPTY");
}
#endif
return false;
}
for (int ii = 0; ii < _iteration_first.length(); ii++) {
Node* nd = _iteration_first.at(ii);
if (in_bb(nd) && (nd->is_Load() || nd->is_Store() || nd->is_Add() || nd->is_Mul())) {
Node_List* pk = new Node_List();
pk->push(nd);
for (int gen = 1; gen < _ii_order.length(); ++gen) {
for (int kk = 0; kk < _block.length(); kk++) {
Node* clone = _block.at(kk);
if (same_origin_idx(clone, nd) &&
_clone_map.gen(clone->_idx) == _ii_order.at(gen)) {
if (nd->is_Add() || nd->is_Mul()) {
fix_commutative_inputs(nd, clone);
}
pk->push(clone);
if (pk->size() == 4) {
_packset.append(pk);
#ifndef PRODUCT
if (_vector_loop_debug) {
tty->print_cr("SuperWord::pack_parallel: added pack ");
pk->dump();
}
#endif
if (_clone_map.gen(clone->_idx) != _ii_last) {
pk = new Node_List();
}
}
break;
}
}
}//for
}//if
}//for
#ifndef PRODUCT
if (_vector_loop_debug) {
tty->print_cr("SuperWord::pack_parallel: END");
}
#endif
return true;
}
bool SuperWord::hoist_loads_in_graph() {
GrowableArray<Node*> loads;
#ifndef PRODUCT
if (_vector_loop_debug) {
tty->print_cr("SuperWord::hoist_loads_in_graph: total number _mem_slice_head.length() = %d", _mem_slice_head.length());
}
#endif
for (int i = 0; i < _mem_slice_head.length(); i++) {
Node* n = _mem_slice_head.at(i);
if ( !in_bb(n) || !n->is_Phi() || n->bottom_type() != Type::MEMORY) {
if (TraceSuperWord && Verbose) {
tty->print_cr("SuperWord::hoist_loads_in_graph: skipping unexpected node n=%d", n->_idx);
}
continue;
}
#ifndef PRODUCT
if (_vector_loop_debug) {
tty->print_cr("SuperWord::hoist_loads_in_graph: processing phi %d = _mem_slice_head.at(%d);", n->_idx, i);
}
#endif
for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
Node* ld = n->fast_out(i);
if (ld->is_Load() && ld->as_Load()->in(MemNode::Memory) == n && in_bb(ld)) {
for (int i = 0; i < _block.length(); i++) {
Node* ld2 = _block.at(i);
if (ld2->is_Load() && same_origin_idx(ld, ld2) &&
!same_generation(ld, ld2)) { // <= do not collect the first generation ld
#ifndef PRODUCT
if (_vector_loop_debug) {
tty->print_cr("SuperWord::hoist_loads_in_graph: will try to hoist load ld2->_idx=%d, cloned from %d (ld->_idx=%d)",
ld2->_idx, _clone_map.idx(ld->_idx), ld->_idx);
}
#endif
// could not do on-the-fly, since iterator is immutable
loads.push(ld2);
}
}// for
}//if
}//for (DUIterator_Fast imax,
}//for (int i = 0; i
for (int i = 0; i < loads.length(); i++) {
LoadNode* ld = loads.at(i)->as_Load();
Node* phi = find_phi_for_mem_dep(ld);
if (phi != nullptr) {
#ifndef PRODUCT
if (_vector_loop_debug) {
tty->print_cr("SuperWord::hoist_loads_in_graph replacing MemNode::Memory(%d) edge in %d with one from %d",
MemNode::Memory, ld->_idx, phi->_idx);
}
#endif
_igvn.replace_input_of(ld, MemNode::Memory, phi);
}
}//for
restart(); // invalidate all basic structures, since we rebuilt the graph
if (TraceSuperWord && Verbose) {
tty->print_cr("\nSuperWord::hoist_loads_in_graph() the graph was rebuilt, all structures invalidated and need rebuild");
}
return true;
}

View File

@ -307,8 +307,6 @@ class SuperWord : public ResourceObj {
GrowableArray<Node*> _data_entry; // Nodes with all inputs from outside
GrowableArray<Node*> _mem_slice_head; // Memory slice head nodes
GrowableArray<Node*> _mem_slice_tail; // Memory slice tail nodes
GrowableArray<Node*> _iteration_first; // nodes in the generation that has deps from phi
GrowableArray<Node*> _iteration_last; // nodes in the generation that has deps to phi
GrowableArray<SWNodeInfo> _node_info; // Info needed per node
CloneMap& _clone_map; // map of nodes created in cloning
CMoveKit _cmovev_kit; // support for vectorization of CMov
@ -367,9 +365,6 @@ class SuperWord : public ResourceObj {
bool _do_reserve_copy; // do reserve copy of the graph(loop) before final modification in output
int _num_work_vecs; // Number of non memory vector operations
int _num_reductions; // Number of reduction expressions applied
int _ii_first; // generation with direct deps from mem phi
int _ii_last; // generation with direct deps to mem phi
GrowableArray<int> _ii_order;
#ifndef PRODUCT
uintx _vector_loop_debug; // provide more printing in debug mode
#endif
@ -555,22 +550,6 @@ private:
int get_iv_adjustment(MemNode* mem);
// Can the preloop align the reference to position zero in the vector?
bool ref_is_alignable(SWPointer& p);
// rebuild the graph so all loads in different iterations of cloned loop become dependent on phi node (in _do_vector_loop only)
bool hoist_loads_in_graph();
// Test whether MemNode::Memory dependency to the same load but in the first iteration of this loop is coming from memory phi
// Return false if failed
Node* find_phi_for_mem_dep(LoadNode* ld);
// Return same node but from the first generation. Return 0, if not found
Node* first_node(Node* nd);
// Return same node as this but from the last generation. Return 0, if not found
Node* last_node(Node* n);
// Mark nodes belonging to first and last generation
// returns first generation index or -1 if vectorization/simd is impossible
int mark_generations();
// swapping inputs of commutative instruction (Add or Mul)
bool fix_commutative_inputs(Node* gold, Node* fix);
// make packs forcefully (in _do_vector_loop only)
bool pack_parallel();
// Construct dependency graph.
void dependence_graph();
// Return a memory slice (node list) in predecessor order starting at "start"
@ -670,8 +649,6 @@ private:
// Is the use of d1 in u1 at the same operand position as d2 in u2?
bool opnd_positions_match(Node* d1, Node* u1, Node* d2, Node* u2);
void init();
// clean up some basic structures - used if the ideal graph was rebuilt
void restart();
// print methods
void print_packset();