8351568: Improve source code documentation for PhaseCFG::insert_anti_dependences

Reviewed-by: rcastanedalo, chagedorn
2026-05-15 16:09:44 +00:00 · 2025-05-15 12:54:49 +00:00 · 2025-05-15 12:54:49 +00:00 · 5cb231714f
commit 5cb231714f
parent 1d36f173c4
7 changed files with 292 additions and 149 deletions
--- a/src/hotspot/share/adlc/output_h.cpp
+++ b/src/hotspot/share/adlc/output_h.cpp
@ -1962,7 +1962,7 @@ void ArchDesc::declareClasses(FILE *fp) {
    else if( instr->is_ideal_box() ) {
      // BoxNode provides the address of a stack slot.
      // Define its bottom type to be TypeRawPtr::BOTTOM instead of TypePtr::BOTTOM
-      // This prevent s insert_anti_dependencies from complaining. It will
+      // This prevents raise_above_anti_dependences from complaining. It will
      // complain if it sees that the pointer base is TypePtr::BOTTOM since
      // it doesn't understand what that might alias.
      fprintf(fp,"  const Type            *bottom_type() const { return TypeRawPtr::BOTTOM; } // Box?\n");
--- a/src/hotspot/share/opto/block.hpp
+++ b/src/hotspot/share/opto/block.hpp
@ -210,7 +210,7 @@ public:
  uint _freg_pressure;
  uint _fhrp_index;

-  // Mark and visited bits for an LCA calculation in insert_anti_dependences.
+  // Mark and visited bits for an LCA calculation in raise_above_anti_dependences.
  // Since they hold unique node indexes, they do not need reinitialization.
  node_idx_t _raise_LCA_mark;
  void    set_raise_LCA_mark(node_idx_t x)    { _raise_LCA_mark = x; }
@ -487,10 +487,10 @@ class PhaseCFG : public Phase {
  // Used when building the CFG and creating end nodes for blocks.
  MachNode* _goto;

-  Block* insert_anti_dependences(Block* LCA, Node* load, bool verify = false);
+  Block* raise_above_anti_dependences(Block* LCA, Node* load, bool verify = false);
  void verify_anti_dependences(Block* LCA, Node* load) const {
    assert(LCA == get_block_for_node(load), "should already be scheduled");
-    const_cast<PhaseCFG*>(this)->insert_anti_dependences(LCA, load, true);
+    const_cast<PhaseCFG*>(this)->raise_above_anti_dependences(LCA, load, true);
  }

  bool move_to_next(Block* bx, uint b_index);
--- a/src/hotspot/share/opto/gcm.cpp
+++ b/src/hotspot/share/opto/gcm.cpp
@ -456,6 +456,7 @@ static Block* raise_LCA_above_use(Block* LCA, Node* use, Node* def, const PhaseC
 // of all marked blocks.  If there are none marked, return the original
 // LCA.
 static Block* raise_LCA_above_marks(Block* LCA, node_idx_t mark, Block* early, const PhaseCFG* cfg) {
+  assert(early->dominates(LCA), "precondition failed");
  Block_List worklist;
  worklist.push(LCA);
  while (worklist.size() > 0) {
@ -470,7 +471,7 @@ static Block* raise_LCA_above_marks(Block* LCA, node_idx_t mark, Block* early, c
      // Raise the LCA.
      LCA = mid->dom_lca(LCA);
      if (LCA == early)  break;   // stop searching everywhere
-      assert(early->dominates(LCA), "early is high enough");
+      assert(early->dominates(LCA), "unsound LCA update");
      // Resume searching at that point, skipping intermediate levels.
      worklist.push(LCA);
      if (LCA == mid)
@ -543,7 +544,7 @@ static Block* memory_early_block(Node* load, Block* early, const PhaseCFG* cfg)
  return early;
 }

-// This function is used by insert_anti_dependences to find unrelated loads for stores in implicit null checks.
+// This function is used by raise_above_anti_dependences to find unrelated loads for stores in implicit null checks.
 bool PhaseCFG::unrelated_load_in_store_null_block(Node* store, Node* load) {
  // We expect an anti-dependence edge from 'load' to 'store', except when
  // implicit_null_check() has hoisted 'store' above its early block to
@ -597,7 +598,7 @@ private:
    // def_mem is one of the inputs of use_phi and at least one input of use_phi is
    // not def_mem. It's however possible that use_phi has def_mem as input multiple
    // times. If that happens, use_phi is recorded as a use of def_mem multiple
-    // times as well. When PhaseCFG::insert_anti_dependences() goes over
+    // times as well. When PhaseCFG::raise_above_anti_dependences() goes over
    // uses of def_mem and enqueues them for processing, use_phi would then be
    // enqueued for processing multiple times when it only needs to be
    // processed once. The code below checks if use_phi as a use of def_mem was
@ -662,23 +663,67 @@ public:
  }
 };

-//--------------------------insert_anti_dependences---------------------------
-// A load may need to witness memory that nearby stores can overwrite.
-// For each nearby store, either insert an "anti-dependence" edge
-// from the load to the store, or else move LCA upward to force the
-// load to (eventually) be scheduled in a block above the store.
+// Enforce a scheduling of the given 'load' that ensures anti-dependent stores
+// do not overwrite the load's input memory state before the load executes.
 //
-// Do not add edges to stores on distinct control-flow paths;
-// only add edges to stores which might interfere.
+// The given 'load' has a current scheduling range in the dominator tree that
+// starts at the load's early block (computed in schedule_early) and ends at
+// the given 'LCA' block for the load. However, there may still exist
+// anti-dependent stores between the early block and the LCA that overwrite
+// memory that the load must witness. For such stores, we must
 //
-// Return the (updated) LCA.  There will not be any possibly interfering
-// store between the load's "early block" and the updated LCA.
-// Any stores in the updated LCA will have new precedence edges
-// back to the load.  The caller is expected to schedule the load
-// in the LCA, in which case the precedence edges will make LCM
-// preserve anti-dependences.  The caller may also hoist the load
-// above the LCA, if it is not the early block.
-Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) {
+//   1. raise the load's LCA to force the load to (eventually) be scheduled in
+//      the store's block at the latest, and
+//   2. if the load may get scheduled in the store's block, additionally insert
+//      an anti-dependence edge (i.e., precedence edge) from the load to the
+//      store to ensure LCM schedules the load before the store within the
+//      block.
+//
+// For a given store, we say that the store is on a _distinct_ control-flow
+// path relative to the load if there are no paths from early to LCA that go
+// through the store's block. Such stores are not anti-dependent, and there is
+// no need to update the LCA nor to add anti-dependence edges.
+//
+// Due to the presence of loops, we must also raise the LCA above
+// anti-dependent memory Phis. We defer the details (see later comments in the
+// method) and for now look at an example without loops.
+//
+//          CFG               DOMINATOR TREE
+//
+//       B1 (early,L)              B1
+//       |\________                /\\___
+//       |         \              /  \   \
+//       B2 (L,S)   \            B2  B7  B6
+//      /  \         \           /\\___
+//     B3  B4 (S)    B7 (S)     /  \   \
+//      \  /         /         B3  B4  B5
+//       B5 (LCA,L) /
+//        \    ____/
+//         \  /
+//          B6
+//
+// Here, the load's scheduling range when calling raise_above_anti_dependences
+// is between early and LCA in the dominator tree, i.e., in block B1, B2, or B5
+// (indicated with "L"). However, there are a number of stores (indicated with
+// "S") that overwrite the memory which the load must witness. First, consider
+// the store in B4. We cannot legally schedule the load in B4, so an
+// anti-dependence edge is redundant. However, we must raise the LCA above
+// B4, which means that the updated LCA is B2. Now, consider the store in B2.
+// The LCA is already B2, so we do not need to raise it any further.
+// If we, eventually, decide to schedule the load in B2, it could happen that
+// LCM decides to place the load after the anti-dependent store in B2.
+// Therefore, we now need to add an anti-dependence edge between the load and
+// the B2 store, ensuring that the load is scheduled before the store. Finally,
+// the store in B7 is on a distinct control-flow path. Therefore, B7 requires
+// no action.
+//
+// The raise_above_anti_dependences method returns the updated LCA and ensures
+// there are no anti-dependent stores in any block between the load's early
+// block and the updated LCA. Any stores in the updated LCA will have new
+// anti-dependence edges back to the load. The caller may schedule the load in
+// the updated LCA, or it may hoist the load above the updated LCA, if the
+// updated LCA is not the early block.
+Block* PhaseCFG::raise_above_anti_dependences(Block* LCA, Node* load, const bool verify) {
  ResourceMark rm;
  assert(load->needs_anti_dependence_check(), "must be a load of some sort");
  assert(LCA != nullptr, "");
@ -711,16 +756,16 @@ Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) {

  node_idx_t load_index = load->_idx;

-  // Note the earliest legal placement of 'load', as determined by
-  // by the unique point in the dom tree where all memory effects
-  // and other inputs are first available.  (Computed by schedule_early.)
-  // For normal loads, 'early' is the shallowest place (dom graph wise)
-  // to look for anti-deps between this load and any store.
+  // Record the earliest legal placement of 'load', as determined by the unique
+  // point in the dominator tree where all memory effects and other inputs are
+  // first available (computed by schedule_early). For normal loads, 'early' is
+  // the shallowest place (dominator-tree wise) to look for anti-dependences
+  // between this load and any store.
  Block* early = get_block_for_node(load);

  // If we are subsuming loads, compute an "early" block that only considers
-  // memory or address inputs. This block may be different than the
-  // schedule_early block in that it could be at an even shallower depth in the
+  // memory or address inputs. This block may be different from the
+  // schedule_early block in that it may be at an even shallower depth in the
  // dominator tree, and allow for a broader discovery of anti-dependences.
  if (C->subsume_loads()) {
    early = memory_early_block(load, early, this);
@ -729,29 +774,47 @@ Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) {
    }
  }

+  assert(early->dominates(LCA_orig), "precondition failed");
+
  ResourceArea* area = Thread::current()->resource_area();
-  DefUseMemStatesQueue worklist_def_use_mem_states(area); // prior memory state to store and possible-def to explore
-  Node_List non_early_stores(area); // all relevant stores outside of early
-  bool must_raise_LCA = false;

-  // 'load' uses some memory state; look for users of the same state.
-  // Recurse through MergeMem nodes to the stores that use them.
+  // Bookkeeping of possibly anti-dependent stores that we find outside the
+  // early block and that may need anti-dependence edges. Note that stores in
+  // non_early_stores are not necessarily dominated by early. The search starts
+  // from initial_mem, which can reside in a block that dominates early, and
+  // therefore, stores we find may be in blocks that are on completely distinct
+  // control-flow paths compared to early. However, in the end, only stores in
+  // blocks dominated by early matter. The reason for recording more than just
+  // the relevant stores is efficiency: we lazily record all possibly
+  // anti-dependent stores and add anti-dependence edges only to the relevant
+  // ones at the very end of this method when we know the final updated LCA.
+  Node_List non_early_stores(area);

-  // Each of these stores is a possible definition of memory
-  // that 'load' needs to use.  We need to force 'load'
-  // to occur before each such store.  When the store is in
-  // the same block as 'load', we insert an anti-dependence
-  // edge load->store.
-
-  // The relevant stores "nearby" the load consist of a tree rooted
-  // at initial_mem, with internal nodes of type MergeMem.
-  // Therefore, the branches visited by the worklist are of this form:
-  //    initial_mem -> (MergeMem ->)* Memory state modifying node
-  // Memory state modifying nodes include Store and Phi nodes and any node for which needs_anti_dependence_check()
-  // returns false.
-  // The anti-dependence constraints apply only to the fringe of this tree.
+  // Whether we must raise the LCA after the main worklist loop below.
+  bool must_raise_LCA_above_marks = false;

+  // The input load uses some memory state (initial_mem).
  Node* initial_mem = load->in(MemNode::Memory);
+  // To find anti-dependences we must look for users of the same memory state.
+  // To do this, we search the memory graph downwards from initial_mem. During
+  // this search, we encounter different types of nodes that we handle
+  // according to the following three categories:
+  //
+  // - MergeMems
+  // - Memory-state-modifying nodes (informally referred to as "stores" above
+  //   and below)
+  // - Memory Phis
+  //
+  // MergeMems do not modify the memory state. Anti-dependent stores or memory
+  // Phis may, however, exist downstream of MergeMems. Therefore, we must
+  // permit the search to continue through MergeMems. Stores may raise the LCA
+  // and may potentially also require an anti-dependence edge. Memory Phis may
+  // raise the LCA but never require anti-dependence edges. See the comments
+  // throughout the worklist loop below for further details.
+  //
+  // It may be useful to think of the anti-dependence search as traversing a
+  // tree rooted at initial_mem, with internal nodes of type MergeMem and
+  // memory Phis and stores as (potentially repeated) leaves.

  // We don't optimize the memory graph for pinned loads, so we may need to raise the
  // root of our search tree through the corresponding slices of MergeMem nodes to
@ -767,14 +830,32 @@ Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) {
      }
    }
  }
-  worklist_def_use_mem_states.push(nullptr, initial_mem);
-  while (worklist_def_use_mem_states.is_nonempty()) {
-    // Examine a nearby store to see if it might interfere with our load.
-    Node* def_mem_state = worklist_def_use_mem_states.top_def();
-    Node* use_mem_state = worklist_def_use_mem_states.top_use();
-    worklist_def_use_mem_states.pop();
+  // To administer the search, we use a worklist consisting of (def,use)-pairs
+  // of memory states, corresponding to edges in the search tree (and edges
+  // in the memory graph). We need to keep track of search tree edges in the
+  // worklist rather than individual nodes due to memory Phis (see details
+  // below).
+  DefUseMemStatesQueue worklist(area);
+  // We start the search at initial_mem and indicate the search root with the
+  // edge (nullptr, initial_mem).
+  worklist.push(nullptr, initial_mem);

-    uint op = use_mem_state->Opcode();
+  // The worklist loop
+  while (worklist.is_nonempty()) {
+    // Pop the next edge from the worklist
+    Node* def_mem_state = worklist.top_def();
+    Node* use_mem_state = worklist.top_use();
+    worklist.pop();
+
+    // We are either
+    // - at the root of the search with the edge (nullptr, initial_mem),
+    // - just past initial_mem with the edge (initial_mem, use_mem_state), or
+    // - just past a MergeMem with the edge (MergeMem, use_mem_state).
+    assert(def_mem_state == nullptr || def_mem_state == initial_mem ||
+           def_mem_state->is_MergeMem(),
+           "unexpected memory state");
+
+    const uint op = use_mem_state->Opcode();

 #ifdef ASSERT
    // CacheWB nodes are peculiar in a sense that they both are anti-dependent and produce memory.
@ -787,132 +868,179 @@ Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) {
    assert(!use_mem_state->needs_anti_dependence_check() || is_cache_wb, "no loads");
 #endif

-    // MergeMems do not directly have anti-deps.
-    // Treat them as internal nodes in a forward tree of memory states,
-    // the leaves of which are each a 'possible-def'.
-    if (use_mem_state == initial_mem    // root (exclusive) of tree we are searching
-        || op == Op_MergeMem    // internal node of tree we are searching
-        ) {
-      def_mem_state = use_mem_state;   // It's not a possibly interfering store.
-      if (use_mem_state == initial_mem)
-        initial_mem = nullptr;  // only process initial memory once
+    // If we are either at the search root or have found a MergeMem, we step
+    // past use_mem_state and populate the search worklist with edges
+    // (use_mem_state, child) for use_mem_state's children.
+    if (def_mem_state == nullptr // root (exclusive) of tree we are searching
+        || op == Op_MergeMem     // internal node of tree we are searching
+    ) {
+      def_mem_state = use_mem_state;

      for (DUIterator_Fast imax, i = def_mem_state->fast_outs(imax); i < imax; i++) {
        use_mem_state = def_mem_state->fast_out(i);
        if (use_mem_state->needs_anti_dependence_check()) {
-          // use_mem_state is also a kind of load (i.e. needs_anti_dependence_check), and it is not a memory state
-          // modifying node (store, Phi or MergeMem). Hence, load can't be anti dependent on this node.
+          // use_mem_state is also a kind of load (i.e.,
+          // needs_anti_dependence_check), and it is neither a store nor a
+          // memory Phi. Hence, it is not anti-dependent on the load.
          continue;
        }
-        worklist_def_use_mem_states.push(def_mem_state, use_mem_state);
+        worklist.push(def_mem_state, use_mem_state);
      }
+      // Nothing more to do for the current (nullptr, initial_mem) or
+      // (initial_mem/MergeMem, MergeMem) edge, move on.
      continue;
    }

+    assert(!use_mem_state->is_MergeMem(),
+           "use_mem_state should be either a store or a memory Phi");
+
    if (op == Op_MachProj || op == Op_Catch)   continue;

-    // Compute the alias index.  Loads and stores with different alias
-    // indices do not need anti-dependence edges.  Wide MemBar's are
-    // anti-dependent on everything (except immutable memories).
+    // Compute the alias index. If the use_mem_state has an alias index
+    // different from the load's, it is not anti-dependent. Wide MemBars
+    // are anti-dependent on everything (except immutable memories).
    const TypePtr* adr_type = use_mem_state->adr_type();
    if (!C->can_alias(adr_type, load_alias_idx))  continue;

    // Most slow-path runtime calls do NOT modify Java memory, but
    // they can block and so write Raw memory.
    if (use_mem_state->is_Mach()) {
-      MachNode* mstore = use_mem_state->as_Mach();
+      MachNode* muse = use_mem_state->as_Mach();
      if (load_alias_idx != Compile::AliasIdxRaw) {
        // Check for call into the runtime using the Java calling
        // convention (and from there into a wrapper); it has no
        // _method.  Can't do this optimization for Native calls because
        // they CAN write to Java memory.
-        if (mstore->ideal_Opcode() == Op_CallStaticJava) {
-          assert(mstore->is_MachSafePoint(), "");
-          MachSafePointNode* ms = (MachSafePointNode*) mstore;
+        if (muse->ideal_Opcode() == Op_CallStaticJava) {
+          assert(muse->is_MachSafePoint(), "");
+          MachSafePointNode* ms = (MachSafePointNode*)muse;
          assert(ms->is_MachCallJava(), "");
          MachCallJavaNode* mcj = (MachCallJavaNode*) ms;
          if (mcj->_method == nullptr) {
            // These runtime calls do not write to Java visible memory
-            // (other than Raw) and so do not require anti-dependence edges.
+            // (other than Raw) and so are not anti-dependent.
            continue;
          }
        }
        // Same for SafePoints: they read/write Raw but only read otherwise.
        // This is basically a workaround for SafePoints only defining control
        // instead of control + memory.
-        if (mstore->ideal_Opcode() == Op_SafePoint)
+        if (muse->ideal_Opcode() == Op_SafePoint) {
          continue;
+        }
      } else {
        // Some raw memory, such as the load of "top" at an allocation,
        // can be control dependent on the previous safepoint. See
        // comments in GraphKit::allocate_heap() about control input.
-        // Inserting an anti-dep between such a safepoint and a use
+        // Inserting an anti-dependence edge between such a safepoint and a use
        // creates a cycle, and will cause a subsequent failure in
        // local scheduling.  (BugId 4919904)
        // (%%% How can a control input be a safepoint and not a projection??)
-        if (mstore->ideal_Opcode() == Op_SafePoint && load->in(0) == mstore)
+        if (muse->ideal_Opcode() == Op_SafePoint && load->in(0) == muse) {
          continue;
+        }
      }
    }

-    // Identify a block that the current load must be above,
-    // or else observe that 'store' is all the way up in the
-    // earliest legal block for 'load'.  In the latter case,
-    // immediately insert an anti-dependence edge.
-    Block* store_block = get_block_for_node(use_mem_state);
-    assert(store_block != nullptr, "unused killing projections skipped above");
+    // Determine the block of the use_mem_state.
+    Block* use_mem_state_block = get_block_for_node(use_mem_state);
+    assert(use_mem_state_block != nullptr,
+           "unused killing projections skipped above");
+
+    // For efficiency, we take a lazy approach to both raising the LCA and
+    // adding anti-dependence edges. In this worklist loop, we only mark blocks
+    // which we must raise the LCA above (set_raise_LCA_mark), and keep
+    // track of nodes that potentially need anti-dependence edges
+    // (non_early_stores). The only exceptions to this are if we
+    // immediately see that we have to raise the LCA all the way to the early
+    // block, and if we find stores in the early block (which always need
+    // anti-dependence edges).
+    //
+    // After the worklist loop, we perform an efficient combined LCA-raising
+    // operation over all marks and only then add anti-dependence edges where
+    // strictly necessary according to the new raised LCA.

    if (use_mem_state->is_Phi()) {
-      // Loop-phis need to raise load before input. (Other phis are treated
-      // as store below.)
+      // We have reached a memory Phi node. On our search from initial_mem to
+      // the Phi, we have found no anti-dependences (otherwise, we would have
+      // already terminated the search along this branch). Consider the example
+      // below, indicating a Phi node and its node inputs (we omit the control
+      // input).
      //
-      // 'load' uses memory which is one (or more) of the Phi's inputs.
-      // It must be scheduled not before the Phi, but rather before
-      // each of the relevant Phi inputs.
+      //    def_mem_state
+      //          |
+      //          | ? ?
+      //          \ | /
+      //           Phi
      //
-      // Instead of finding the LCA of all inputs to a Phi that match 'mem',
-      // we mark each corresponding predecessor block and do a combined
-      // hoisting operation later (raise_LCA_above_marks).
+      // We reached the Phi from def_mem_state and know that, on this
+      // particular input, the memory that the load must witness is not
+      // overwritten. However, for the Phi's other inputs (? in the
+      // illustration), we have no information and must thus conservatively
+      // assume that the load's memory is overwritten at and below the Phi.
      //
-      // Do not assert(store_block != early, "Phi merging memory after access")
+      // It is impossible to schedule the load before the Phi in
+      // the same block as the Phi (use_mem_state_block), and anti-dependence
+      // edges are, therefore, redundant. We must, however, find the
+      // predecessor block of use_mem_state_block that corresponds to
+      // def_mem_state, and raise the LCA above that block. Note that this block
+      // is not necessarily def_mem_state's block! See the continuation of our
+      // previous example below (now illustrating blocks instead of nodes):
+      //
+      //    def_mem_state's block
+      //          |
+      //          |
+      //      pred_block
+      //          |
+      //          |   ?   ?
+      //          |   |   |
+      //      use_mem_state_block
+      //
+      // Here, we must raise the LCA above pred_block rather than
+      // def_mem_state's block.
+      //
+      // Do not assert(use_mem_state_block != early, "Phi merging memory after access")
      // PhiNode may be at start of block 'early' with backedge to 'early'
+      if (LCA == early) {
+        // Don't bother if LCA is already raised all the way
+        continue;
+      }
      DEBUG_ONLY(bool found_match = false);
      for (uint j = PhiNode::Input, jmax = use_mem_state->req(); j < jmax; j++) {
        if (use_mem_state->in(j) == def_mem_state) {   // Found matching input?
          DEBUG_ONLY(found_match = true);
-          Block* pred_block = get_block_for_node(store_block->pred(j));
+          Block* pred_block = get_block_for_node(use_mem_state_block->pred(j));
          if (pred_block != early) {
-            // If any predecessor of the Phi matches the load's "early block",
-            // we do not need a precedence edge between the Phi and 'load'
-            // since the load will be forced into a block preceding the Phi.
+            // Lazily set the LCA mark
            pred_block->set_raise_LCA_mark(load_index);
-            assert(!LCA_orig->dominates(pred_block) ||
-                   early->dominates(pred_block), "early is high enough");
-            must_raise_LCA = true;
-          } else {
-            // anti-dependent upon PHI pinned below 'early', no edge needed
-            LCA = early;             // but can not schedule below 'early'
+            must_raise_LCA_above_marks = true;
+          } else /* if (pred_block == early) */ {
+            // We already know that we must raise LCA all the way to early.
+            LCA = early;
+            // This turns off the process of gathering non_early_stores.
          }
        }
      }
      assert(found_match, "no worklist bug");
-    } else if (store_block != early) {
-      // 'store' is between the current LCA and earliest possible block.
-      // Label its block, and decide later on how to raise the LCA
-      // to include the effect on LCA of this store.
-      // If this store's block gets chosen as the raised LCA, we
-      // will find him on the non_early_stores list and stick him
-      // with a precedence edge.
-      // (But, don't bother if LCA is already raised all the way.)
-      if (LCA != early && !unrelated_load_in_store_null_block(use_mem_state, load)) {
-        store_block->set_raise_LCA_mark(load_index);
-        must_raise_LCA = true;
-        non_early_stores.push(use_mem_state);
+    } else if (use_mem_state_block != early) {
+      // We found an anti-dependent store outside the load's 'early' block. The
+      // store may be between the current LCA and the earliest possible block
+      // (but it could very well also be on a distinct control-flow path).
+      // Lazily set the LCA mark and push to non_early_stores.
+      if (LCA == early) {
+        // Don't bother if LCA is already raised all the way
+        continue;
      }
-    } else {
-      // Found a possibly-interfering store in the load's 'early' block.
-      // This means 'load' cannot sink at all in the dominator tree.
-      // Add an anti-dep edge, and squeeze 'load' into the highest block.
+      if (unrelated_load_in_store_null_block(use_mem_state, load)) {
+        continue;
+      }
+      use_mem_state_block->set_raise_LCA_mark(load_index);
+      must_raise_LCA_above_marks = true;
+      non_early_stores.push(use_mem_state);
+    } else /* if (use_mem_state_block == early) */ {
+      // We found an anti-dependent store in the load's 'early' block.
+      // Therefore, we already know that we must raise LCA all the way to
+      // early and that we need to add an anti-dependence edge to the store.
      assert(use_mem_state != load->find_exact_control(load->in(0)), "dependence cycle found");
      if (verify) {
        assert(use_mem_state->find_edge(load) != -1 || unrelated_load_in_store_null_block(use_mem_state, load),
@ -924,36 +1052,54 @@ Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) {
      // This turns off the process of gathering non_early_stores.
    }
  }
-  // (Worklist is now empty; all nearby stores have been visited.)
+  // Worklist is now empty; we have visited all possible anti-dependences.

  // Finished if 'load' must be scheduled in its 'early' block.
  // If we found any stores there, they have already been given
-  // precedence edges.
-  if (LCA == early)  return LCA;
+  // anti-dependence edges.
+  if (LCA == early) {
+    return LCA;
+  }

-  // We get here only if there are no possibly-interfering stores
-  // in the load's 'early' block.  Move LCA up above all predecessors
-  // which contain stores we have noted.
-  //
-  // The raised LCA block can be a home to such interfering stores,
-  // but its predecessors must not contain any such stores.
-  //
-  // The raised LCA will be a lower bound for placing the load,
-  // preventing the load from sinking past any block containing
-  // a store that may invalidate the memory state required by 'load'.
-  if (must_raise_LCA)
+  // We get here only if there are no anti-dependent stores in the load's
+  // 'early' block and if no memory Phi has forced LCA to the early block. Now
+  // we must raise the LCA above the blocks for all the anti-dependent stores
+  // and above the predecessor blocks of anti-dependent memory Phis we reached
+  // during the search.
+  if (must_raise_LCA_above_marks) {
    LCA = raise_LCA_above_marks(LCA, load->_idx, early, this);
-  if (LCA == early)  return LCA;
+  }

-  // Insert anti-dependence edges from 'load' to each store
-  // in the non-early LCA block.
-  // Mine the non_early_stores list for such stores.
+  // If LCA == early at this point, there were no stores that required
+  // anti-dependence edges in the early block. Otherwise, we would have eagerly
+  // raised the LCA to early already in the worklist loop.
+  if (LCA == early) {
+    return LCA;
+  }
+
+  // The raised LCA block can now be a home to anti-dependent stores for which
+  // we still need to add anti-dependence edges, but no LCA predecessor block
+  // contains any such stores (otherwise, we would have raised the LCA even
+  // higher).
+  //
+  // The raised LCA will be a lower bound for placing the load, preventing the
+  // load from sinking past any block containing a store that may overwrite
+  // memory that the load must witness.
+  //
+  // Now we need to insert the necessary anti-dependence edges from 'load' to
+  // each store in the non-early LCA block. We have recorded all such potential
+  // stores in non_early_stores.
+  //
+  // If LCA->raise_LCA_mark() != load_index, it means that we raised the LCA to
+  // a block in which we did not find any anti-dependent stores. So, no need to
+  // search for any such stores.
  if (LCA->raise_LCA_mark() == load_index) {
    while (non_early_stores.size() > 0) {
      Node* store = non_early_stores.pop();
      Block* store_block = get_block_for_node(store);
      if (store_block == LCA) {
-        // add anti_dependence from store to load in its own block
+        // Add anti-dependence edge from the load to the store in the non-early
+        // LCA.
        assert(store != load->find_exact_control(load->in(0)), "dependence cycle found");
        if (verify) {
          assert(store->find_edge(load) != -1, "missing precedence edge");
@ -962,15 +1108,12 @@ Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) {
        }
      } else {
        assert(store_block->raise_LCA_mark() == load_index, "block was marked");
-        // Any other stores we found must be either inside the new LCA
-        // or else outside the original LCA.  In the latter case, they
-        // did not interfere with any use of 'load'.
-        assert(LCA->dominates(store_block)
-               || !LCA_orig->dominates(store_block), "no stray stores");
      }
    }
  }

+  assert(LCA->dominates(LCA_orig), "unsound updated LCA");
+
  // Return the highest block containing stores; any stores
  // within that block have been given anti-dependence edges.
  return LCA;
@ -1532,7 +1675,7 @@ void PhaseCFG::schedule_late(VectorSet &visited, Node_Stack &stack) {
    if (self->needs_anti_dependence_check()) {
      // Hoist LCA above possible-defs and insert anti-dependences to
      // defs in new LCA block.
-      LCA = insert_anti_dependences(LCA, self);
+      LCA = raise_above_anti_dependences(LCA, self);
      if (C->failing()) {
        return;
      }
--- a/src/hotspot/share/opto/lcm.cpp
+++ b/src/hotspot/share/opto/lcm.cpp
@ -491,7 +491,7 @@ void PhaseCFG::implicit_null_check(Block* block, Node *proj, Node *val, int allo
      if (n->needs_anti_dependence_check() &&
          n->in(LoadNode::Memory) == best->in(StoreNode::Memory)) {
        // Found anti-dependent load
-        insert_anti_dependences(block, n);
+        raise_above_anti_dependences(block, n);
        if (C->failing()) {
          return;
        }
@ -1363,7 +1363,7 @@ void PhaseCFG::call_catch_cleanup(Block* block) {
      sb->insert_node(clone, 1);
      map_node_to_block(clone, sb);
      if (clone->needs_anti_dependence_check()) {
-        insert_anti_dependences(sb, clone);
+        raise_above_anti_dependences(sb, clone);
        if (C->failing()) {
          return;
        }
--- a/test/hotspot/jtreg/compiler/codegen/TestAntiDependenciesHighMemUsage.java
+++ b/test/hotspot/jtreg/compiler/codegen/TestAntiDependenciesHighMemUsage.java
@ -24,7 +24,7 @@
 /*
 * @test
 * @bug 8333258
- * @summary C2: high memory usage in PhaseCFG::insert_anti_dependences()
+ * @summary C2: high memory usage in PhaseCFG::raise_above_anti_dependences()
 * @run main/othervm -XX:CompileOnly=TestAntiDependenciesHighMemUsage::test1 -Xcomp TestAntiDependenciesHighMemUsage
 */

--- a/test/hotspot/jtreg/compiler/codegen/TestAntiDependenciesHighMemUsage2.java
+++ b/test/hotspot/jtreg/compiler/codegen/TestAntiDependenciesHighMemUsage2.java
@ -24,7 +24,7 @@
 /*
 * @test
 * @bug 8333258
- * @summary C2: high memory usage in PhaseCFG::insert_anti_dependences()
+ * @summary C2: high memory usage in PhaseCFG::raise_above_anti_dependences()
 * @run main/othervm -XX:CompileOnly=TestAntiDependenciesHighMemUsage2::test1 -XX:-ClipInlining
 *                   -XX:-BackgroundCompilation -XX:-TieredCompilation -XX:-UseOnStackReplacement TestAntiDependenciesHighMemUsage2
 */
--- a/test/hotspot/jtreg/compiler/loopopts/TestSplitIfPinnedLoadInStripMinedLoop.java
+++ b/test/hotspot/jtreg/compiler/loopopts/TestSplitIfPinnedLoadInStripMinedLoop.java
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -138,7 +138,7 @@ public class TestSplitIfPinnedLoadInStripMinedLoop {
    }

    // Same as test2 but with reference to inner loop induction variable 'j' and different order of instructions.
-    // Triggers an assert in PhaseCFG::insert_anti_dependences if loop strip mining verification is disabled:
+    // Triggers an assert in PhaseCFG::raise_above_anti_dependences if loop strip mining verification is disabled:
    // assert(!LCA_orig->dominates(pred_block) || early->dominates(pred_block)) failed: early is high enough
    int test4(MyClass obj1, MyClass obj2) {
        for (int i = 0; i < 10; ++i) {