diff --git a/src/hotspot/share/adlc/output_h.cpp b/src/hotspot/share/adlc/output_h.cpp index dd149064a5a..b62bc43791f 100644 --- a/src/hotspot/share/adlc/output_h.cpp +++ b/src/hotspot/share/adlc/output_h.cpp @@ -1962,7 +1962,7 @@ void ArchDesc::declareClasses(FILE *fp) { else if( instr->is_ideal_box() ) { // BoxNode provides the address of a stack slot. // Define its bottom type to be TypeRawPtr::BOTTOM instead of TypePtr::BOTTOM - // This prevent s insert_anti_dependencies from complaining. It will + // This prevents raise_above_anti_dependences from complaining. It will // complain if it sees that the pointer base is TypePtr::BOTTOM since // it doesn't understand what that might alias. fprintf(fp," const Type *bottom_type() const { return TypeRawPtr::BOTTOM; } // Box?\n"); diff --git a/src/hotspot/share/opto/block.hpp b/src/hotspot/share/opto/block.hpp index 5baa72dfffb..66ce1f51f24 100644 --- a/src/hotspot/share/opto/block.hpp +++ b/src/hotspot/share/opto/block.hpp @@ -210,7 +210,7 @@ public: uint _freg_pressure; uint _fhrp_index; - // Mark and visited bits for an LCA calculation in insert_anti_dependences. + // Mark and visited bits for an LCA calculation in raise_above_anti_dependences. // Since they hold unique node indexes, they do not need reinitialization. node_idx_t _raise_LCA_mark; void set_raise_LCA_mark(node_idx_t x) { _raise_LCA_mark = x; } @@ -487,10 +487,10 @@ class PhaseCFG : public Phase { // Used when building the CFG and creating end nodes for blocks. 
MachNode* _goto; - Block* insert_anti_dependences(Block* LCA, Node* load, bool verify = false); + Block* raise_above_anti_dependences(Block* LCA, Node* load, bool verify = false); void verify_anti_dependences(Block* LCA, Node* load) const { assert(LCA == get_block_for_node(load), "should already be scheduled"); - const_cast<PhaseCFG*>(this)->insert_anti_dependences(LCA, load, true); + const_cast<PhaseCFG*>(this)->raise_above_anti_dependences(LCA, load, true); } bool move_to_next(Block* bx, uint b_index); diff --git a/src/hotspot/share/opto/gcm.cpp b/src/hotspot/share/opto/gcm.cpp index a8080d5adb9..6576196268e 100644 --- a/src/hotspot/share/opto/gcm.cpp +++ b/src/hotspot/share/opto/gcm.cpp @@ -456,6 +456,7 @@ static Block* raise_LCA_above_use(Block* LCA, Node* use, Node* def, const PhaseC // of all marked blocks. If there are none marked, return the original // LCA. static Block* raise_LCA_above_marks(Block* LCA, node_idx_t mark, Block* early, const PhaseCFG* cfg) { + assert(early->dominates(LCA), "precondition failed"); Block_List worklist; worklist.push(LCA); while (worklist.size() > 0) { @@ -470,7 +471,7 @@ static Block* raise_LCA_above_marks(Block* LCA, node_idx_t mark, Block* early, c // Raise the LCA. LCA = mid->dom_lca(LCA); if (LCA == early) break; // stop searching everywhere - assert(early->dominates(LCA), "early is high enough"); + assert(early->dominates(LCA), "unsound LCA update"); // Resume searching at that point, skipping intermediate levels. worklist.push(LCA); if (LCA == mid) @@ -543,7 +544,7 @@ static Block* memory_early_block(Node* load, Block* early, const PhaseCFG* cfg) return early; } -// This function is used by insert_anti_dependences to find unrelated loads for stores in implicit null checks. +// This function is used by raise_above_anti_dependences to find unrelated loads for stores in implicit null checks. 
bool PhaseCFG::unrelated_load_in_store_null_block(Node* store, Node* load) { // We expect an anti-dependence edge from 'load' to 'store', except when // implicit_null_check() has hoisted 'store' above its early block to @@ -597,7 +598,7 @@ private: // def_mem is one of the inputs of use_phi and at least one input of use_phi is // not def_mem. It's however possible that use_phi has def_mem as input multiple // times. If that happens, use_phi is recorded as a use of def_mem multiple - // times as well. When PhaseCFG::insert_anti_dependences() goes over + // times as well. When PhaseCFG::raise_above_anti_dependences() goes over // uses of def_mem and enqueues them for processing, use_phi would then be // enqueued for processing multiple times when it only needs to be // processed once. The code below checks if use_phi as a use of def_mem was @@ -662,23 +663,67 @@ public: } }; -//--------------------------insert_anti_dependences--------------------------- -// A load may need to witness memory that nearby stores can overwrite. -// For each nearby store, either insert an "anti-dependence" edge -// from the load to the store, or else move LCA upward to force the -// load to (eventually) be scheduled in a block above the store. +// Enforce a scheduling of the given 'load' that ensures anti-dependent stores +// do not overwrite the load's input memory state before the load executes. // -// Do not add edges to stores on distinct control-flow paths; -// only add edges to stores which might interfere. +// The given 'load' has a current scheduling range in the dominator tree that +// starts at the load's early block (computed in schedule_early) and ends at +// the given 'LCA' block for the load. However, there may still exist +// anti-dependent stores between the early block and the LCA that overwrite +// memory that the load must witness. For such stores, we must // -// Return the (updated) LCA. 
There will not be any possibly interfering -// store between the load's "early block" and the updated LCA. -// Any stores in the updated LCA will have new precedence edges -// back to the load. The caller is expected to schedule the load -// in the LCA, in which case the precedence edges will make LCM -// preserve anti-dependences. The caller may also hoist the load -// above the LCA, if it is not the early block. -Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) { +// 1. raise the load's LCA to force the load to (eventually) be scheduled at +// latest in the store's block, and +// 2. if the load may get scheduled in the store's block, additionally insert +// an anti-dependence edge (i.e., precedence edge) from the load to the +// store to ensure LCM schedules the load before the store within the +// block. +// +// For a given store, we say that the store is on a _distinct_ control-flow +// path relative to the load if there are no paths from early to LCA that go +// through the store's block. Such stores are not anti-dependent, and there is +// no need to update the LCA nor to add anti-dependence edges. +// +// Due to the presence of loops, we must also raise the LCA above +// anti-dependent memory Phis. We defer the details (see later comments in the +// method) and for now look at an example without loops. +// +// CFG DOMINATOR TREE +// +// B1 (early,L) B1 +// |\________ /\\___ +// | \ / \ \ +// B2 (L,S) \ B2 B7 B6 +// / \ \ /\\___ +// B3 B4 (S) B7 (S) / \ \ +// \ / / B3 B4 B5 +// B5 (LCA,L) / +// \ ____/ +// \ / +// B6 +// +// Here, the load's scheduling range when calling raise_above_anti_dependences +// is between early and LCA in the dominator tree, i.e., in block B1, B2, or B5 +// (indicated with "L"). However, there are a number of stores (indicated with +// "S") that overwrite the memory which the load must witness. First, consider +// the store in B4. 
We cannot legally schedule the load in B4, so an +// anti-dependence edge is redundant. However, we must raise the LCA above +// B4, which means that the updated LCA is B2. Now, consider the store in B2. +// The LCA is already B2, so we do not need to raise it any further. +// If we, eventually, decide to schedule the load in B2, it could happen that +// LCM decides to place the load after the anti-dependent store in B2. +// Therefore, we now need to add an anti-dependence edge between the load and +// the B2 store, ensuring that the load is scheduled before the store. Finally, +// the store in B7 is on a distinct control-flow path. Therefore, B7 requires +// no action. +// +// The raise_above_anti_dependences method returns the updated LCA and ensures +// there are no anti-dependent stores in any block between the load's early +// block and the updated LCA. Any stores in the updated LCA will have new +// anti-dependence edges back to the load. The caller may schedule the load in +// the updated LCA, or it may hoist the load above the updated LCA, if the +// updated LCA is not the early block. +Block* PhaseCFG::raise_above_anti_dependences(Block* LCA, Node* load, const bool verify) { ResourceMark rm; assert(load->needs_anti_dependence_check(), "must be a load of some sort"); assert(LCA != nullptr, ""); @@ -711,16 +756,16 @@ Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) { node_idx_t load_index = load->_idx; - // Note the earliest legal placement of 'load', as determined by - // by the unique point in the dom tree where all memory effects - // and other inputs are first available. (Computed by schedule_early.) - // For normal loads, 'early' is the shallowest place (dom graph wise) - // to look for anti-deps between this load and any store. 
+ // Record the earliest legal placement of 'load', as determined by the unique + // point in the dominator tree where all memory effects and other inputs are + // first available (computed by schedule_early). For normal loads, 'early' is + // the shallowest place (dominator-tree wise) to look for anti-dependences + // between this load and any store. Block* early = get_block_for_node(load); // If we are subsuming loads, compute an "early" block that only considers - // memory or address inputs. This block may be different than the - // schedule_early block in that it could be at an even shallower depth in the + // memory or address inputs. This block may be different from the + // schedule_early block when it is at an even shallower depth in the // dominator tree, and allow for a broader discovery of anti-dependences. if (C->subsume_loads()) { early = memory_early_block(load, early, this); @@ -729,29 +774,47 @@ Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) { } } + assert(early->dominates(LCA_orig), "precondition failed"); + ResourceArea* area = Thread::current()->resource_area(); - DefUseMemStatesQueue worklist_def_use_mem_states(area); // prior memory state to store and possible-def to explore - Node_List non_early_stores(area); // all relevant stores outside of early - bool must_raise_LCA = false; - // 'load' uses some memory state; look for users of the same state. - // Recurse through MergeMem nodes to the stores that use them. + // Bookkeeping of possibly anti-dependent stores that we find outside the + // early block and that may need anti-dependence edges. Note that stores in + // non_early_stores are not necessarily dominated by early. The search starts + // from initial_mem, which can reside in a block that dominates early, and + // therefore, stores we find may be in blocks that are on completely distinct + // control-flow paths compared to early. However, in the end, only stores in + // blocks dominated by early matter. 
The reason for bookkeeping not only + // relevant stores is efficiency: we lazily record all possible + // anti-dependent stores and add anti-dependence edges only to the relevant + // ones at the very end of this method when we know the final updated LCA. + Node_List non_early_stores(area); - // Each of these stores is a possible definition of memory - // that 'load' needs to use. We need to force 'load' - // to occur before each such store. When the store is in - // the same block as 'load', we insert an anti-dependence - // edge load->store. - - // The relevant stores "nearby" the load consist of a tree rooted - // at initial_mem, with internal nodes of type MergeMem. - // Therefore, the branches visited by the worklist are of this form: - // initial_mem -> (MergeMem ->)* Memory state modifying node - // Memory state modifying nodes include Store and Phi nodes and any node for which needs_anti_dependence_check() - // returns false. - // The anti-dependence constraints apply only to the fringe of this tree. + // Whether we must raise the LCA after the main worklist loop below. + bool must_raise_LCA_above_marks = false; + // The input load uses some memory state (initial_mem). Node* initial_mem = load->in(MemNode::Memory); + // To find anti-dependences we must look for users of the same memory state. + // To do this, we search the memory graph downwards from initial_mem. During + // this search, we encounter different types of nodes that we handle + // according to the following three categories: + // + // - MergeMems + // - Memory-state-modifying nodes (informally referred to as "stores" above + // and below) + // - Memory Phis + // + // MergeMems do not modify the memory state. Anti-dependent stores or memory + // Phis may, however, exist downstream of MergeMems. Therefore, we must + // permit the search to continue through MergeMems. Stores may raise the LCA + // and may potentially also require an anti-dependence edge. 
Memory Phis may + // raise the LCA but never require anti-dependence edges. See the comments + // throughout the worklist loop below for further details. + // + // It may be useful to think of the anti-dependence search as traversing a + // tree rooted at initial_mem, with internal nodes of type MergeMem and + // memory Phis and stores as (potentially repeated) leaves. // We don't optimize the memory graph for pinned loads, so we may need to raise the // root of our search tree through the corresponding slices of MergeMem nodes to @@ -767,14 +830,32 @@ Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) { } } } - worklist_def_use_mem_states.push(nullptr, initial_mem); - while (worklist_def_use_mem_states.is_nonempty()) { - // Examine a nearby store to see if it might interfere with our load. - Node* def_mem_state = worklist_def_use_mem_states.top_def(); - Node* use_mem_state = worklist_def_use_mem_states.top_use(); - worklist_def_use_mem_states.pop(); + // To administer the search, we use a worklist consisting of (def,use)-pairs + // of memory states, corresponding to edges in the search tree (and edges + // in the memory graph). We need to keep track of search tree edges in the + // worklist rather than individual nodes due to memory Phis (see details + // below). + DefUseMemStatesQueue worklist(area); + // We start the search at initial_mem and indicate the search root with the + // edge (nullptr, initial_mem). + worklist.push(nullptr, initial_mem); - uint op = use_mem_state->Opcode(); + // The worklist loop + while (worklist.is_nonempty()) { + // Pop the next edge from the worklist + Node* def_mem_state = worklist.top_def(); + Node* use_mem_state = worklist.top_use(); + worklist.pop(); + + // We are either + // - at the root of the search with the edge (nullptr, initial_mem), + // - just past initial_mem with the edge (initial_mem, use_mem_state), or + // - just past a MergeMem with the edge (MergeMem, use_mem_state). 
+ assert(def_mem_state == nullptr || def_mem_state == initial_mem || + def_mem_state->is_MergeMem(), + "unexpected memory state"); + + const uint op = use_mem_state->Opcode(); #ifdef ASSERT // CacheWB nodes are peculiar in a sense that they both are anti-dependent and produce memory. @@ -787,132 +868,179 @@ Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) { assert(!use_mem_state->needs_anti_dependence_check() || is_cache_wb, "no loads"); #endif - // MergeMems do not directly have anti-deps. - // Treat them as internal nodes in a forward tree of memory states, - // the leaves of which are each a 'possible-def'. - if (use_mem_state == initial_mem // root (exclusive) of tree we are searching - || op == Op_MergeMem // internal node of tree we are searching - ) { - def_mem_state = use_mem_state; // It's not a possibly interfering store. - if (use_mem_state == initial_mem) - initial_mem = nullptr; // only process initial memory once + // If we are either at the search root or have found a MergeMem, we step + // past use_mem_state and populate the search worklist with edges + // (use_mem_state, child) for use_mem_state's children. + if (def_mem_state == nullptr // root (exclusive) of tree we are searching + || op == Op_MergeMem // internal node of tree we are searching + ) { + def_mem_state = use_mem_state; for (DUIterator_Fast imax, i = def_mem_state->fast_outs(imax); i < imax; i++) { use_mem_state = def_mem_state->fast_out(i); if (use_mem_state->needs_anti_dependence_check()) { - // use_mem_state is also a kind of load (i.e. needs_anti_dependence_check), and it is not a memory state - // modifying node (store, Phi or MergeMem). Hence, load can't be anti dependent on this node. + // use_mem_state is also a kind of load (i.e., + // needs_anti_dependence_check), and it is not a store nor a memory + // Phi. Hence, it is not anti-dependent on the load. 
continue; } - worklist_def_use_mem_states.push(def_mem_state, use_mem_state); + worklist.push(def_mem_state, use_mem_state); } + // Nothing more to do for the current (nullptr, initial_mem) or + // (initial_mem/MergeMem, MergeMem) edge, move on. continue; } + assert(!use_mem_state->is_MergeMem(), + "use_mem_state should be either a store or a memory Phi"); + if (op == Op_MachProj || op == Op_Catch) continue; - // Compute the alias index. Loads and stores with different alias - // indices do not need anti-dependence edges. Wide MemBar's are - // anti-dependent on everything (except immutable memories). + // Compute the alias index. If the use_mem_state has an alias index + // different from the load's, it is not anti-dependent. Wide MemBar's + // are anti-dependent with everything (except immutable memories). const TypePtr* adr_type = use_mem_state->adr_type(); if (!C->can_alias(adr_type, load_alias_idx)) continue; // Most slow-path runtime calls do NOT modify Java memory, but // they can block and so write Raw memory. if (use_mem_state->is_Mach()) { - MachNode* mstore = use_mem_state->as_Mach(); + MachNode* muse = use_mem_state->as_Mach(); if (load_alias_idx != Compile::AliasIdxRaw) { // Check for call into the runtime using the Java calling // convention (and from there into a wrapper); it has no // _method. Can't do this optimization for Native calls because // they CAN write to Java memory. - if (mstore->ideal_Opcode() == Op_CallStaticJava) { - assert(mstore->is_MachSafePoint(), ""); - MachSafePointNode* ms = (MachSafePointNode*) mstore; + if (muse->ideal_Opcode() == Op_CallStaticJava) { + assert(muse->is_MachSafePoint(), ""); + MachSafePointNode* ms = (MachSafePointNode*)muse; assert(ms->is_MachCallJava(), ""); MachCallJavaNode* mcj = (MachCallJavaNode*) ms; if (mcj->_method == nullptr) { // These runtime calls do not write to Java visible memory - // (other than Raw) and so do not require anti-dependence edges. 
+ // (other than Raw) and so are not anti-dependent. continue; } } // Same for SafePoints: they read/write Raw but only read otherwise. // This is basically a workaround for SafePoints only defining control // instead of control + memory. - if (mstore->ideal_Opcode() == Op_SafePoint) + if (muse->ideal_Opcode() == Op_SafePoint) { continue; + } } else { // Some raw memory, such as the load of "top" at an allocation, // can be control dependent on the previous safepoint. See // comments in GraphKit::allocate_heap() about control input. - // Inserting an anti-dep between such a safepoint and a use + // Inserting an anti-dependence edge between such a safepoint and a use // creates a cycle, and will cause a subsequent failure in // local scheduling. (BugId 4919904) // (%%% How can a control input be a safepoint and not a projection??) - if (mstore->ideal_Opcode() == Op_SafePoint && load->in(0) == mstore) + if (muse->ideal_Opcode() == Op_SafePoint && load->in(0) == muse) { continue; + } } } - // Identify a block that the current load must be above, - // or else observe that 'store' is all the way up in the - // earliest legal block for 'load'. In the latter case, - // immediately insert an anti-dependence edge. - Block* store_block = get_block_for_node(use_mem_state); - assert(store_block != nullptr, "unused killing projections skipped above"); + // Determine the block of the use_mem_state. + Block* use_mem_state_block = get_block_for_node(use_mem_state); + assert(use_mem_state_block != nullptr, + "unused killing projections skipped above"); + + // For efficiency, we take a lazy approach to both raising the LCA and + // adding anti-dependence edges. In this worklist loop, we only mark blocks + // which we must raise the LCA above (set_raise_LCA_mark), and keep + // track of nodes that potentially need anti-dependence edges + // (non_early_stores). 
The only exceptions to this are if we + // immediately see that we have to raise the LCA all the way to the early + // block, and if we find stores in the early block (which always need + // anti-dependence edges). + // + // After the worklist loop, we perform an efficient combined LCA-raising + // operation over all marks and only then add anti-dependence edges where + // strictly necessary according to the new raised LCA. if (use_mem_state->is_Phi()) { - // Loop-phis need to raise load before input. (Other phis are treated - // as store below.) + // We have reached a memory Phi node. On our search from initial_mem to + // the Phi, we have found no anti-dependences (otherwise, we would have + // already terminated the search along this branch). Consider the example + // below, indicating a Phi node and its node inputs (we omit the control + // input). // - // 'load' uses memory which is one (or more) of the Phi's inputs. - // It must be scheduled not before the Phi, but rather before - // each of the relevant Phi inputs. + // def_mem_state + // | + // | ? ? + // \ | / + // Phi // - // Instead of finding the LCA of all inputs to a Phi that match 'mem', - // we mark each corresponding predecessor block and do a combined - // hoisting operation later (raise_LCA_above_marks). + // We reached the Phi from def_mem_state and know that, on this + // particular input, the memory that the load must witness is not + // overwritten. However, for the Phi's other inputs (? in the + // illustration), we have no information and must thus conservatively + // assume that the load's memory is overwritten at and below the Phi. // - // Do not assert(store_block != early, "Phi merging memory after access") + // It is impossible to schedule the load before the Phi in + // the same block as the Phi (use_mem_state_block), and anti-dependence + // edges are, therefore, redundant. 
We must, however, find the + // predecessor block of use_mem_state_block that corresponds to + // def_mem_state, and raise the LCA above that block. Note that this block + // is not necessarily def_mem_state's block! See the continuation of our + // previous example below (now illustrating blocks instead of nodes) + // + // def_mem_state's block + // | + // | + // pred_block + // | + // | ? ? + // | | | + // use_mem_state_block + // + // Here, we must raise the LCA above pred_block rather than + // def_mem_state's block. + // + // Do not assert(use_mem_state_block != early, "Phi merging memory after access") // PhiNode may be at start of block 'early' with backedge to 'early' + if (LCA == early) { + // Don't bother if LCA is already raised all the way + continue; + } DEBUG_ONLY(bool found_match = false); for (uint j = PhiNode::Input, jmax = use_mem_state->req(); j < jmax; j++) { if (use_mem_state->in(j) == def_mem_state) { // Found matching input? DEBUG_ONLY(found_match = true); - Block* pred_block = get_block_for_node(store_block->pred(j)); + Block* pred_block = get_block_for_node(use_mem_state_block->pred(j)); if (pred_block != early) { - // If any predecessor of the Phi matches the load's "early block", - // we do not need a precedence edge between the Phi and 'load' - // since the load will be forced into a block preceding the Phi. + // Lazily set the LCA mark pred_block->set_raise_LCA_mark(load_index); - assert(!LCA_orig->dominates(pred_block) || - early->dominates(pred_block), "early is high enough"); - must_raise_LCA = true; - } else { - // anti-dependent upon PHI pinned below 'early', no edge needed - LCA = early; // but can not schedule below 'early' + must_raise_LCA_above_marks = true; + } else /* if (pred_block == early) */ { + // We know already now that we must raise LCA all the way to early. + LCA = early; + // This turns off the process of gathering non_early_stores. 
} } } assert(found_match, "no worklist bug"); - } else if (store_block != early) { - // 'store' is between the current LCA and earliest possible block. - // Label its block, and decide later on how to raise the LCA - // to include the effect on LCA of this store. - // If this store's block gets chosen as the raised LCA, we - // will find him on the non_early_stores list and stick him - // with a precedence edge. - // (But, don't bother if LCA is already raised all the way.) - if (LCA != early && !unrelated_load_in_store_null_block(use_mem_state, load)) { - store_block->set_raise_LCA_mark(load_index); - must_raise_LCA = true; - non_early_stores.push(use_mem_state); + } else if (use_mem_state_block != early) { + // We found an anti-dependent store outside the load's 'early' block. The + // store may be between the current LCA and the earliest possible block + // (but it could very well also be on a distinct control-flow path). + // Lazily set the LCA mark and push to non_early_stores. + if (LCA == early) { + // Don't bother if LCA is already raised all the way + continue; } - } else { - // Found a possibly-interfering store in the load's 'early' block. - // This means 'load' cannot sink at all in the dominator tree. - // Add an anti-dep edge, and squeeze 'load' into the highest block. + if (unrelated_load_in_store_null_block(use_mem_state, load)) { + continue; + } + use_mem_state_block->set_raise_LCA_mark(load_index); + must_raise_LCA_above_marks = true; + non_early_stores.push(use_mem_state); + } else /* if (use_mem_state_block == early) */ { + // We found an anti-dependent store in the load's 'early' block. + // Therefore, we know already now that we must raise LCA all the way to + // early and that we need to add an anti-dependence edge to the store. 
assert(use_mem_state != load->find_exact_control(load->in(0)), "dependence cycle found"); if (verify) { assert(use_mem_state->find_edge(load) != -1 || unrelated_load_in_store_null_block(use_mem_state, load), @@ -924,36 +1052,54 @@ Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) { // This turns off the process of gathering non_early_stores. } } - // (Worklist is now empty; all nearby stores have been visited.) + // Worklist is now empty; we have visited all possible anti-dependences. // Finished if 'load' must be scheduled in its 'early' block. // If we found any stores there, they have already been given - // precedence edges. - if (LCA == early) return LCA; + // anti-dependence edges. + if (LCA == early) { + return LCA; + } - // We get here only if there are no possibly-interfering stores - // in the load's 'early' block. Move LCA up above all predecessors - // which contain stores we have noted. - // - // The raised LCA block can be a home to such interfering stores, - // but its predecessors must not contain any such stores. - // - // The raised LCA will be a lower bound for placing the load, - // preventing the load from sinking past any block containing - // a store that may invalidate the memory state required by 'load'. - if (must_raise_LCA) + // We get here only if there are no anti-dependent stores in the load's + // 'early' block and if no memory Phi has forced LCA to the early block. Now + // we must raise the LCA above the blocks for all the anti-dependent stores + // and above the predecessor blocks of anti-dependent memory Phis we reached + // during the search. + if (must_raise_LCA_above_marks) { LCA = raise_LCA_above_marks(LCA, load->_idx, early, this); - if (LCA == early) return LCA; + } - // Insert anti-dependence edges from 'load' to each store - // in the non-early LCA block. - // Mine the non_early_stores list for such stores. 
+ // If LCA == early at this point, there were no stores that required + // anti-dependence edges in the early block. Otherwise, we would have eagerly + // raised the LCA to early already in the worklist loop. + if (LCA == early) { + return LCA; + } + + // The raised LCA block can now be a home to anti-dependent stores for which + // we still need to add anti-dependence edges, but no LCA predecessor block + // contains any such stores (otherwise, we would have raised the LCA even + // higher). + // + // The raised LCA will be a lower bound for placing the load, preventing the + // load from sinking past any block containing a store that may overwrite + // memory that the load must witness. + // + // Now we need to insert the necessary anti-dependence edges from 'load' to + // each store in the non-early LCA block. We have recorded all such potential + // stores in non_early_stores. + // + // If LCA->raise_LCA_mark() != load_index, it means that we raised the LCA to + // a block in which we did not find any anti-dependent stores. So, no need to + // search for any such stores. if (LCA->raise_LCA_mark() == load_index) { while (non_early_stores.size() > 0) { Node* store = non_early_stores.pop(); Block* store_block = get_block_for_node(store); if (store_block == LCA) { - // add anti_dependence from store to load in its own block + // Add anti-dependence edge from the load to the store in the non-early + // LCA. assert(store != load->find_exact_control(load->in(0)), "dependence cycle found"); if (verify) { assert(store->find_edge(load) != -1, "missing precedence edge"); @@ -962,15 +1108,12 @@ Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) { } } else { assert(store_block->raise_LCA_mark() == load_index, "block was marked"); - // Any other stores we found must be either inside the new LCA - // or else outside the original LCA. In the latter case, they - // did not interfere with any use of 'load'. 
- assert(LCA->dominates(store_block) - || !LCA_orig->dominates(store_block), "no stray stores"); } } } + assert(LCA->dominates(LCA_orig), "unsound updated LCA"); + // Return the highest block containing stores; any stores // within that block have been given anti-dependence edges. return LCA; @@ -1532,7 +1675,7 @@ void PhaseCFG::schedule_late(VectorSet &visited, Node_Stack &stack) { if (self->needs_anti_dependence_check()) { // Hoist LCA above possible-defs and insert anti-dependences to // defs in new LCA block. - LCA = insert_anti_dependences(LCA, self); + LCA = raise_above_anti_dependences(LCA, self); if (C->failing()) { return; } diff --git a/src/hotspot/share/opto/lcm.cpp b/src/hotspot/share/opto/lcm.cpp index 8d2809f987c..69732245dcf 100644 --- a/src/hotspot/share/opto/lcm.cpp +++ b/src/hotspot/share/opto/lcm.cpp @@ -491,7 +491,7 @@ void PhaseCFG::implicit_null_check(Block* block, Node *proj, Node *val, int allo if (n->needs_anti_dependence_check() && n->in(LoadNode::Memory) == best->in(StoreNode::Memory)) { // Found anti-dependent load - insert_anti_dependences(block, n); + raise_above_anti_dependences(block, n); if (C->failing()) { return; } @@ -1363,7 +1363,7 @@ void PhaseCFG::call_catch_cleanup(Block* block) { sb->insert_node(clone, 1); map_node_to_block(clone, sb); if (clone->needs_anti_dependence_check()) { - insert_anti_dependences(sb, clone); + raise_above_anti_dependences(sb, clone); if (C->failing()) { return; } diff --git a/test/hotspot/jtreg/compiler/codegen/TestAntiDependenciesHighMemUsage.java b/test/hotspot/jtreg/compiler/codegen/TestAntiDependenciesHighMemUsage.java index b8db6581b61..5ca215d95ef 100644 --- a/test/hotspot/jtreg/compiler/codegen/TestAntiDependenciesHighMemUsage.java +++ b/test/hotspot/jtreg/compiler/codegen/TestAntiDependenciesHighMemUsage.java @@ -24,7 +24,7 @@ /* * @test * @bug 8333258 - * @summary C2: high memory usage in PhaseCFG::insert_anti_dependences() + * @summary C2: high memory usage in 
PhaseCFG::raise_above_anti_dependences() * @run main/othervm -XX:CompileOnly=TestAntiDependenciesHighMemUsage::test1 -Xcomp TestAntiDependenciesHighMemUsage */ diff --git a/test/hotspot/jtreg/compiler/codegen/TestAntiDependenciesHighMemUsage2.java b/test/hotspot/jtreg/compiler/codegen/TestAntiDependenciesHighMemUsage2.java index 3f7dbf9918b..d5f220f4628 100644 --- a/test/hotspot/jtreg/compiler/codegen/TestAntiDependenciesHighMemUsage2.java +++ b/test/hotspot/jtreg/compiler/codegen/TestAntiDependenciesHighMemUsage2.java @@ -24,7 +24,7 @@ /* * @test * @bug 8333258 - * @summary C2: high memory usage in PhaseCFG::insert_anti_dependences() + * @summary C2: high memory usage in PhaseCFG::raise_above_anti_dependences() * @run main/othervm -XX:CompileOnly=TestAntiDependenciesHighMemUsage2::test1 -XX:-ClipInlining * -XX:-BackgroundCompilation -XX:-TieredCompilation -XX:-UseOnStackReplacement TestAntiDependenciesHighMemUsage2 */ diff --git a/test/hotspot/jtreg/compiler/loopopts/TestSplitIfPinnedLoadInStripMinedLoop.java b/test/hotspot/jtreg/compiler/loopopts/TestSplitIfPinnedLoadInStripMinedLoop.java index 839413fe135..1af24555049 100644 --- a/test/hotspot/jtreg/compiler/loopopts/TestSplitIfPinnedLoadInStripMinedLoop.java +++ b/test/hotspot/jtreg/compiler/loopopts/TestSplitIfPinnedLoadInStripMinedLoop.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -138,7 +138,7 @@ public class TestSplitIfPinnedLoadInStripMinedLoop { } // Same as test2 but with reference to inner loop induction variable 'j' and different order of instructions. 
- // Triggers an assert in PhaseCFG::insert_anti_dependences if loop strip mining verification is disabled: + // Triggers an assert in PhaseCFG::raise_above_anti_dependences if loop strip mining verification is disabled: // assert(!LCA_orig->dominates(pred_block) || early->dominates(pred_block)) failed: early is high enough int test4(MyClass obj1, MyClass obj2) { for (int i = 0; i < 10; ++i) {