From 8d5c0056420731cbbd83f2d23837bbb5cdc9e4cc Mon Sep 17 00:00:00 2001
From: Thomas Schatzl
Date: Mon, 22 Sep 2025 13:47:45 +0000
Subject: [PATCH] 8342382: Implement JEP 522: G1 GC: Improve Throughput by
 Reducing Synchronization
Co-authored-by: Amit Kumar
Co-authored-by: Martin Doerr
Co-authored-by: Carlo Refice
Co-authored-by: Fei Yang
Reviewed-by: iwalulya, rcastanedalo, aph, ayang
---
 .../gc/g1/g1BarrierSetAssembler_aarch64.cpp | 239 ++--
 .../gc/g1/g1BarrierSetAssembler_aarch64.hpp | 17 +-
 src/hotspot/cpu/aarch64/gc/g1/g1_aarch64.ad | 8 +-
 .../arm/gc/g1/g1BarrierSetAssembler_arm.cpp | 239 +-----
 .../arm/gc/g1/g1BarrierSetAssembler_arm.hpp | 17 +-
 src/hotspot/cpu/arm/gc/g1/g1_arm.ad | 8 +-
 .../ppc/gc/g1/g1BarrierSetAssembler_ppc.cpp | 270 ++--
 .../ppc/gc/g1/g1BarrierSetAssembler_ppc.hpp | 21 +-
 src/hotspot/cpu/ppc/gc/g1/g1_ppc.ad | 8 +-
 .../gc/g1/g1BarrierSetAssembler_riscv.cpp | 263 ++--
 .../gc/g1/g1BarrierSetAssembler_riscv.hpp | 18 +-
 src/hotspot/cpu/riscv/gc/g1/g1_riscv.ad | 8 +-
 .../s390/gc/g1/g1BarrierSetAssembler_s390.cpp | 351 ++------
 .../s390/gc/g1/g1BarrierSetAssembler_s390.hpp | 18 +-
 src/hotspot/cpu/s390/gc/g1/g1_s390.ad | 8 +-
 .../x86/gc/g1/g1BarrierSetAssembler_x86.cpp | 265 ++--
 .../x86/gc/g1/g1BarrierSetAssembler_x86.hpp | 31 +-
 src/hotspot/cpu/x86/gc/g1/g1_x86_64.ad | 38 +-
 src/hotspot/share/code/aotCodeCache.cpp | 1 -
 src/hotspot/share/gc/g1/c1/g1BarrierSetC1.cpp | 130 ++-
 src/hotspot/share/gc/g1/c1/g1BarrierSetC1.hpp | 33 +-
 src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp | 69 +-
 src/hotspot/share/gc/g1/c2/g1BarrierSetC2.hpp | 25 +-
 src/hotspot/share/gc/g1/g1Allocator.cpp | 3 -
 src/hotspot/share/gc/g1/g1Analytics.cpp | 40 +-
 src/hotspot/share/gc/g1/g1Analytics.hpp | 19 +-
 src/hotspot/share/gc/g1/g1Arguments.cpp | 9 +-
 src/hotspot/share/gc/g1/g1BarrierSet.cpp | 91 +-
 src/hotspot/share/gc/g1/g1BarrierSet.hpp | 60 +-
 .../share/gc/g1/g1BarrierSet.inline.hpp | 7 +-
 .../share/gc/g1/g1BarrierSetRuntime.cpp | 14 +-
 .../share/gc/g1/g1BarrierSetRuntime.hpp | 3 +-
 src/hotspot/share/gc/g1/g1CardTable.cpp | 40 +-
 src/hotspot/share/gc/g1/g1CardTable.hpp | 59 +-
 .../share/gc/g1/g1CardTable.inline.hpp | 52 +-
 .../share/gc/g1/g1CardTableClaimTable.cpp | 97 +++
 .../share/gc/g1/g1CardTableClaimTable.hpp | 137 ++++
 .../gc/g1/g1CardTableClaimTable.inline.hpp | 128 +++
 src/hotspot/share/gc/g1/g1CollectedHeap.cpp | 125 ++-
 src/hotspot/share/gc/g1/g1CollectedHeap.hpp | 46 +-
 .../share/gc/g1/g1CollectedHeap.inline.hpp | 24 -
 src/hotspot/share/gc/g1/g1CollectionSet.cpp | 9 +-
 src/hotspot/share/gc/g1/g1ConcurrentMark.cpp | 7 +-
 src/hotspot/share/gc/g1/g1ConcurrentMark.hpp | 2 +
 .../gc/g1/g1ConcurrentMarkRemarkTasks.cpp | 10 +-
 .../gc/g1/g1ConcurrentRebuildAndScrub.cpp | 2 +-
 .../share/gc/g1/g1ConcurrentRefine.cpp | 674 +++++++------
 .../share/gc/g1/g1ConcurrentRefine.hpp | 247 ++++--
 .../share/gc/g1/g1ConcurrentRefineStats.cpp | 50 +-
 .../share/gc/g1/g1ConcurrentRefineStats.hpp | 71 +-
 .../gc/g1/g1ConcurrentRefineSweepTask.cpp | 191 +++++
 .../g1ConcurrentRefineSweepTask.hpp} | 31 +-
 .../share/gc/g1/g1ConcurrentRefineThread.cpp | 270 +++---
 .../share/gc/g1/g1ConcurrentRefineThread.hpp | 42 +-
 .../gc/g1/g1ConcurrentRefineThreadsNeeded.cpp | 52 +-
 src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp | 599 --------------
 src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp | 302 -------
 src/hotspot/share/gc/g1/g1FromCardCache.cpp | 4 +-
 .../share/gc/g1/g1FullGCCompactTask.cpp | 4 +
 .../gc/g1/g1FullGCPrepareTask.inline.hpp | 4 +
 .../share/gc/g1/g1FullGCResetMetadataTask.cpp | 2 +-
 src/hotspot/share/gc/g1/g1GCPhaseTimes.cpp | 34 +-
 src/hotspot/share/gc/g1/g1GCPhaseTimes.hpp | 43 +-
 src/hotspot/share/gc/g1/g1HeapRegion.cpp | 38 +-
 src/hotspot/share/gc/g1/g1HeapRegion.hpp | 6 +-
 .../share/gc/g1/g1HeapRegionManager.cpp | 25 +-
 .../share/gc/g1/g1HeapRegionManager.hpp | 6 +-
 src/hotspot/share/gc/g1/g1HeapVerifier.cpp | 106 ++-
 src/hotspot/share/gc/g1/g1HeapVerifier.hpp | 15 +-
 src/hotspot/share/gc/g1/g1OopClosures.hpp | 36 +-
 .../share/gc/g1/g1OopClosures.inline.hpp | 31 +-
 .../share/gc/g1/g1ParScanThreadState.cpp | 56 +-
 .../share/gc/g1/g1ParScanThreadState.hpp | 50 +-
 .../gc/g1/g1ParScanThreadState.inline.hpp | 34 +-
 src/hotspot/share/gc/g1/g1Policy.cpp | 407 +++++----
 src/hotspot/share/gc/g1/g1Policy.hpp | 67 +-
 .../share/gc/g1/g1RedirtyCardsQueue.cpp | 148 ----
 .../share/gc/g1/g1RedirtyCardsQueue.hpp | 98 ---
 src/hotspot/share/gc/g1/g1RemSet.cpp | 776 ++++++------
 src/hotspot/share/gc/g1/g1RemSet.hpp | 29 +-
 src/hotspot/share/gc/g1/g1RemSetSummary.cpp | 74 +-
 src/hotspot/share/gc/g1/g1RemSetSummary.hpp | 11 +-
 .../share/gc/g1/g1ReviseYoungLengthTask.cpp | 96 +++
 .../share/gc/g1/g1ReviseYoungLengthTask.hpp | 63 ++
 src/hotspot/share/gc/g1/g1ThreadLocalData.hpp | 32 +-
 src/hotspot/share/gc/g1/g1YoungCollector.cpp | 10 +-
 src/hotspot/share/gc/g1/g1YoungCollector.hpp | 1 -
 .../gc/g1/g1YoungGCPostEvacuateTasks.cpp | 113 +--
 .../gc/g1/g1YoungGCPostEvacuateTasks.hpp | 8 +-
 .../share/gc/g1/g1YoungGCPreEvacuateTasks.cpp | 99 +--
 .../share/gc/g1/g1YoungGCPreEvacuateTasks.hpp | 13 +-
 src/hotspot/share/gc/g1/g1_globals.hpp | 9 +-
 .../share/gc/g1/jvmFlagConstraintsG1.cpp | 6 -
 .../share/gc/g1/jvmFlagConstraintsG1.hpp | 1 -
 src/hotspot/share/gc/g1/vmStructs_g1.hpp | 4 +-
 .../share/gc/shared/bufferNodeList.cpp | 38 -
 src/hotspot/share/gc/shared/cardTable.cpp | 8 +-
 src/hotspot/share/gc/shared/cardTable.hpp | 8 +-
 .../share/gc/shared/workerDataArray.hpp | 4 +-
 src/hotspot/share/jvmci/jvmciRuntime.cpp | 4 -
 src/hotspot/share/jvmci/vmStructs_jvmci.cpp | 6 +-
 src/hotspot/share/oops/oop.cpp | 11 +-
 src/hotspot/share/runtime/arguments.cpp | 1 +
 src/hotspot/share/runtime/cpuTimeCounters.cpp | 3 +
 src/hotspot/share/runtime/cpuTimeCounters.hpp | 1 +
 src/hotspot/share/runtime/mutexLocker.cpp | 10 +-
 src/hotspot/share/runtime/mutexLocker.hpp | 2 +-
 src/hotspot/share/runtime/vmOperation.hpp | 3 +-
 .../gcbarriers/TestG1BarrierGeneration.java | 4 +-
 .../jtreg/gc/g1/TestGCLogMessages.java | 25 +-
 .../TestOptionsWithRanges.java | 1 -
 .../ir_framework/tests/TestIRMatching.java | 2 +-
 .../vmTestbase/gc/ArrayJuggle/Juggle2.java | 7 +-
 .../gc/collection/TestG1ParallelPhases.java | 11 +-
 114 files changed, 3625 insertions(+), 4681 deletions(-)
 create mode 100644 src/hotspot/share/gc/g1/g1CardTableClaimTable.cpp
 create mode 100644 src/hotspot/share/gc/g1/g1CardTableClaimTable.hpp
 create mode 100644 src/hotspot/share/gc/g1/g1CardTableClaimTable.inline.hpp
 create mode 100644 src/hotspot/share/gc/g1/g1ConcurrentRefineSweepTask.cpp
 rename src/hotspot/share/gc/{shared/bufferNodeList.hpp => g1/g1ConcurrentRefineSweepTask.hpp} (57%)
 delete mode 100644 src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp
 delete mode 100644 src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp
 delete mode 100644 src/hotspot/share/gc/g1/g1RedirtyCardsQueue.cpp
 delete mode 100644 src/hotspot/share/gc/g1/g1RedirtyCardsQueue.hpp
 delete mode 100644 src/hotspot/share/gc/shared/bufferNodeList.cpp
diff --git
a/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.cpp index 42f3c4a015a..9950feb7470 100644 --- a/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.cpp @@ -86,15 +86,48 @@ void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm } } -void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm, DecoratorSet decorators, - Register start, Register count, Register scratch, RegSet saved_regs) { - __ push(saved_regs, sp); - assert_different_registers(start, count, scratch); - assert_different_registers(c_rarg0, count); - __ mov(c_rarg0, start); - __ mov(c_rarg1, count); - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_array_post_entry), 2); - __ pop(saved_regs, sp); +void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm, + DecoratorSet decorators, + Register start, + Register count, + Register scratch, + RegSet saved_regs) { + + Label done; + Label loop; + Label next; + + __ cbz(count, done); + + // Calculate the number of card marks to set. Since the object might start and + // end within a card, we need to calculate this via the card table indexes of + // the actual start and last addresses covered by the object. + // Temporarily use the count register for the last element address. + __ lea(count, Address(start, count, Address::lsl(LogBytesPerHeapOop))); // end = start + count << LogBytesPerHeapOop + __ sub(count, count, BytesPerHeapOop); // Use last element address for end. + + __ lsr(start, start, CardTable::card_shift()); + __ lsr(count, count, CardTable::card_shift()); + __ sub(count, count, start); // Number of bytes to mark - 1. + + // Add card table base offset to start. + __ ldr(scratch, Address(rthread, in_bytes(G1ThreadLocalData::card_table_base_offset()))); + __ add(start, start, scratch); + + __ bind(loop); + if (UseCondCardMark) { + __ ldrb(scratch, Address(start, count)); + // Instead of loading clean_card_val and comparing, we exploit the fact that + // the LSB of non-clean cards is always 0, and the LSB of clean cards 1. + __ tbz(scratch, 0, next); + } + static_assert(G1CardTable::dirty_card_val() == 0, "must be to use zr"); + __ strb(zr, Address(start, count)); + __ bind(next); + __ subs(count, count, 1); + __ br(Assembler::GE, loop); + + __ bind(done); } static void generate_queue_test_and_insertion(MacroAssembler* masm, ByteSize index_offset, ByteSize buffer_offset, Label& runtime, @@ -202,10 +235,14 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, static void generate_post_barrier_fast_path(MacroAssembler* masm, const Register store_addr, const Register new_val, + const Register thread, const Register tmp1, const Register tmp2, Label& done, bool new_val_may_be_null) { + assert(thread == rthread, "must be"); + assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, noreg, rscratch1); + // Does store cross heap regions? __ eor(tmp1, store_addr, new_val); // tmp1 := store address ^ new value __ lsr(tmp1, tmp1, G1HeapRegion::LogOfHRGrainBytes); // tmp1 := ((store address ^ new value) >> LogOfHRGrainBytes) @@ -214,33 +251,19 @@ static void generate_post_barrier_fast_path(MacroAssembler* masm, if (new_val_may_be_null) { __ cbz(new_val, done); } - // Storing region crossing non-null, is card young? + // Storing region crossing non-null. 
__ lsr(tmp1, store_addr, CardTable::card_shift()); // tmp1 := card address relative to card table base - __ load_byte_map_base(tmp2); // tmp2 := card table base address - __ add(tmp1, tmp1, tmp2); // tmp1 := card address - __ ldrb(tmp2, Address(tmp1)); // tmp2 := card - __ cmpw(tmp2, (int)G1CardTable::g1_young_card_val()); // tmp2 := card == young_card_val? -} -static void generate_post_barrier_slow_path(MacroAssembler* masm, - const Register thread, - const Register tmp1, - const Register tmp2, - Label& done, - Label& runtime) { - __ membar(Assembler::StoreLoad); // StoreLoad membar - __ ldrb(tmp2, Address(tmp1)); // tmp2 := card - __ cbzw(tmp2, done); - // Storing a region crossing, non-null oop, card is clean. - // Dirty card and log. - STATIC_ASSERT(CardTable::dirty_card_val() == 0); - __ strb(zr, Address(tmp1)); // *(card address) := dirty_card_val - generate_queue_test_and_insertion(masm, - G1ThreadLocalData::dirty_card_queue_index_offset(), - G1ThreadLocalData::dirty_card_queue_buffer_offset(), - runtime, - thread, tmp1, tmp2, rscratch1); - __ b(done); + Address card_table_addr(thread, in_bytes(G1ThreadLocalData::card_table_base_offset())); + __ ldr(tmp2, card_table_addr); // tmp2 := card table base address + if (UseCondCardMark) { + __ ldrb(rscratch1, Address(tmp1, tmp2)); // rscratch1 := card + // Instead of loading clean_card_val and comparing, we exploit the fact that + // the LSB of non-clean cards is always 0, and the LSB of clean cards 1. + __ tbz(rscratch1, 0, done); + } + static_assert(G1CardTable::dirty_card_val() == 0, "must be to use zr"); + __ strb(zr, Address(tmp1, tmp2)); // *(card address) := dirty_card_val } void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, @@ -249,27 +272,8 @@ void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, Register thread, Register tmp1, Register tmp2) { - assert(thread == rthread, "must be"); - assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, - rscratch1); - assert(store_addr != noreg && new_val != noreg && tmp1 != noreg - && tmp2 != noreg, "expecting a register"); - Label done; - Label runtime; - - generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, done, true /* new_val_may_be_null */); - // If card is young, jump to done - __ br(Assembler::EQ, done); - generate_post_barrier_slow_path(masm, thread, tmp1, tmp2, done, runtime); - - __ bind(runtime); - // save the live input values - RegSet saved = RegSet::of(store_addr); - __ push(saved, sp); - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), tmp1, thread); - __ pop(saved, sp); - + generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, false /* new_val_may_be_null */); __ bind(done); } @@ -329,38 +333,10 @@ void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm, Register thread, Register tmp1, Register tmp2, - G1PostBarrierStubC2* stub) { - assert(thread == rthread, "must be"); - assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, - rscratch1); - assert(store_addr != noreg && new_val != noreg && tmp1 != noreg - && tmp2 != noreg, "expecting a register"); - - stub->initialize_registers(thread, tmp1, tmp2); - - bool new_val_may_be_null = (stub->barrier_data() & G1C2BarrierPostNotNull) == 0; - generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, *stub->continuation(), new_val_may_be_null); - // If card is not young, jump to stub (slow path) - __ br(Assembler::NE, *stub->entry()); - - __ 
bind(*stub->continuation()); -} - -void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm, - G1PostBarrierStubC2* stub) const { - Assembler::InlineSkippedInstructionsCounter skip_counter(masm); - Label runtime; - Register thread = stub->thread(); - Register tmp1 = stub->tmp1(); // tmp1 holds the card address. - Register tmp2 = stub->tmp2(); - assert(stub->tmp3() == noreg, "not needed in this platform"); - - __ bind(*stub->entry()); - generate_post_barrier_slow_path(masm, thread, tmp1, tmp2, *stub->continuation(), runtime); - - __ bind(runtime); - generate_c2_barrier_runtime_call(masm, stub, tmp1, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry)); - __ b(*stub->continuation()); + bool new_val_may_be_null) { + Label done; + generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, new_val_may_be_null); + __ bind(done); } #endif // COMPILER2 @@ -456,20 +432,19 @@ void G1BarrierSetAssembler::gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrier __ b(*stub->continuation()); } -void G1BarrierSetAssembler::gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub) { - G1BarrierSetC1* bs = (G1BarrierSetC1*)BarrierSet::barrier_set()->barrier_set_c1(); - __ bind(*stub->entry()); - assert(stub->addr()->is_register(), "Precondition."); - assert(stub->new_val()->is_register(), "Precondition."); - Register new_val_reg = stub->new_val()->as_register(); - __ cbz(new_val_reg, *stub->continuation()); - ce->store_parameter(stub->addr()->as_pointer_register(), 0); - __ far_call(RuntimeAddress(bs->post_barrier_c1_runtime_code_blob()->code_begin())); - __ b(*stub->continuation()); -} - #undef __ +void G1BarrierSetAssembler::g1_write_barrier_post_c1(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2) { + Label done; + generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, true /* new_val_may_be_null */); + masm->bind(done); +} + #define __ sasm-> void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm) { @@ -521,74 +496,6 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* __ epilogue(); } -void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* sasm) { - __ prologue("g1_post_barrier", false); - - // arg0: store_address - Address store_addr(rfp, 2*BytesPerWord); - - BarrierSet* bs = BarrierSet::barrier_set(); - CardTableBarrierSet* ctbs = barrier_set_cast(bs); - CardTable* ct = ctbs->card_table(); - - Label done; - Label runtime; - - // At this point we know new_value is non-null and the new_value crosses regions. - // Must check to see if card is already dirty - - const Register thread = rthread; - - Address queue_index(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset())); - Address buffer(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset())); - - const Register card_offset = rscratch2; - // LR is free here, so we can use it to hold the byte_map_base. 
- const Register byte_map_base = lr; - - assert_different_registers(card_offset, byte_map_base, rscratch1); - - __ load_parameter(0, card_offset); - __ lsr(card_offset, card_offset, CardTable::card_shift()); - __ load_byte_map_base(byte_map_base); - __ ldrb(rscratch1, Address(byte_map_base, card_offset)); - __ cmpw(rscratch1, (int)G1CardTable::g1_young_card_val()); - __ br(Assembler::EQ, done); - - assert((int)CardTable::dirty_card_val() == 0, "must be 0"); - - __ membar(Assembler::StoreLoad); - __ ldrb(rscratch1, Address(byte_map_base, card_offset)); - __ cbzw(rscratch1, done); - - // storing region crossing non-null, card is clean. - // dirty card and log. - __ strb(zr, Address(byte_map_base, card_offset)); - - // Convert card offset into an address in card_addr - Register card_addr = card_offset; - __ add(card_addr, byte_map_base, card_addr); - - __ ldr(rscratch1, queue_index); - __ cbz(rscratch1, runtime); - __ sub(rscratch1, rscratch1, wordSize); - __ str(rscratch1, queue_index); - - // Reuse LR to hold buffer_addr - const Register buffer_addr = lr; - - __ ldr(buffer_addr, buffer); - __ str(card_addr, Address(buffer_addr, rscratch1)); - __ b(done); - - __ bind(runtime); - __ push_call_clobbered_registers(); - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), card_addr, thread); - __ pop_call_clobbered_registers(); - __ bind(done); - __ epilogue(); -} - #undef __ #endif // COMPILER1 diff --git a/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.hpp index 04ac2096096..72040cd7ad2 100644 --- a/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -32,9 +32,7 @@ class LIR_Assembler; class StubAssembler; class G1PreBarrierStub; -class G1PostBarrierStub; class G1PreBarrierStubC2; -class G1PostBarrierStubC2; class G1BarrierSetAssembler: public ModRefBarrierSetAssembler { protected: @@ -65,10 +63,15 @@ protected: public: #ifdef COMPILER1 void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub); - void gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub); void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm); - void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm); + + void g1_write_barrier_post_c1(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2); #endif #ifdef COMPILER2 @@ -87,9 +90,7 @@ public: Register thread, Register tmp1, Register tmp2, - G1PostBarrierStubC2* c2_stub); - void generate_c2_post_barrier_stub(MacroAssembler* masm, - G1PostBarrierStubC2* stub) const; + bool new_val_may_be_null); #endif void load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, diff --git a/src/hotspot/cpu/aarch64/gc/g1/g1_aarch64.ad b/src/hotspot/cpu/aarch64/gc/g1/g1_aarch64.ad index 081a67d6880..18fc27a4af4 100644 --- a/src/hotspot/cpu/aarch64/gc/g1/g1_aarch64.ad +++ b/src/hotspot/cpu/aarch64/gc/g1/g1_aarch64.ad @@ -1,5 +1,5 @@ // -// Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2024, 2025, Oracle and/or its affiliates. 
All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // // This code is free software; you can redistribute it and/or modify it @@ -62,13 +62,13 @@ static void write_barrier_post(MacroAssembler* masm, Register new_val, Register tmp1, Register tmp2) { - if (!G1PostBarrierStubC2::needs_barrier(node)) { + if (!G1BarrierStubC2::needs_post_barrier(node)) { return; } Assembler::InlineSkippedInstructionsCounter skip_counter(masm); G1BarrierSetAssembler* g1_asm = static_cast(BarrierSet::barrier_set()->barrier_set_assembler()); - G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node); - g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, rthread, tmp1, tmp2, stub); + bool new_val_may_be_null = G1BarrierStubC2::post_new_val_may_be_null(node); + g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, rthread, tmp1, tmp2, new_val_may_be_null); } %} diff --git a/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.cpp b/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.cpp index 049477cda76..71f8931eb5f 100644 --- a/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.cpp +++ b/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.cpp @@ -201,12 +201,15 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, static void generate_post_barrier_fast_path(MacroAssembler* masm, const Register store_addr, const Register new_val, + const Register thread, const Register tmp1, const Register tmp2, Label& done, bool new_val_may_be_null) { - // Does store cross heap regions? + assert(thread == Rthread, "must be"); + assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, noreg); + // Does store cross heap regions? __ eor(tmp1, store_addr, new_val); __ movs(tmp1, AsmOperand(tmp1, lsr, G1HeapRegion::LogOfHRGrainBytes)); __ b(done, eq); @@ -215,76 +218,34 @@ static void generate_post_barrier_fast_path(MacroAssembler* masm, if (new_val_may_be_null) { __ cbz(new_val, done); } - // storing region crossing non-null, is card already dirty? - const Register card_addr = tmp1; - CardTableBarrierSet* ct = barrier_set_cast(BarrierSet::barrier_set()); - __ mov_address(tmp2, (address)ct->card_table()->byte_map_base()); - __ add(card_addr, tmp2, AsmOperand(store_addr, lsr, CardTable::card_shift())); + // storing region crossing non-null, is card already non-clean? + Address card_table_addr(thread, in_bytes(G1ThreadLocalData::card_table_base_offset())); + __ ldr(tmp2, card_table_addr); + __ add(tmp1, tmp2, AsmOperand(store_addr, lsr, CardTable::card_shift())); - __ ldrb(tmp2, Address(card_addr)); - __ cmp(tmp2, (int)G1CardTable::g1_young_card_val()); + if (UseCondCardMark) { + __ ldrb(tmp2, Address(tmp1)); + // Instead of loading clean_card_val and comparing, we exploit the fact that + // the LSB of non-clean cards is always 0, and the LSB of clean cards 1. 
+ __ tbz(tmp2, 0, done); + } + + static_assert(G1CardTable::dirty_card_val() == 0, "must be to use zero_register()"); + __ zero_register(tmp2); + __ strb(tmp2, Address(tmp1)); // *(card address) := dirty_card_val } -static void generate_post_barrier_slow_path(MacroAssembler* masm, - const Register thread, - const Register tmp1, - const Register tmp2, - const Register tmp3, - Label& done, - Label& runtime) { - __ membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad), tmp2); - assert(CardTable::dirty_card_val() == 0, "adjust this code"); - // card_addr is loaded by generate_post_barrier_fast_path - const Register card_addr = tmp1; - __ ldrb(tmp2, Address(card_addr)); - __ cbz(tmp2, done); - - // storing a region crossing, non-null oop, card is clean. - // dirty card and log. - - __ strb(__ zero_register(tmp2), Address(card_addr)); - generate_queue_test_and_insertion(masm, - G1ThreadLocalData::dirty_card_queue_index_offset(), - G1ThreadLocalData::dirty_card_queue_buffer_offset(), - runtime, - thread, card_addr, tmp2, tmp3); - __ b(done); -} - - // G1 post-barrier. // Blows all volatile registers R0-R3, LR). void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, - Register store_addr, - Register new_val, - Register tmp1, - Register tmp2, - Register tmp3) { + Register store_addr, + Register new_val, + Register tmp1, + Register tmp2, + Register tmp3) { Label done; - Label runtime; - - generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, done, true /* new_val_may_be_null */); - // If card is young, jump to done - // card_addr and card are loaded by generate_post_barrier_fast_path - const Register card = tmp2; - const Register card_addr = tmp1; - __ b(done, eq); - generate_post_barrier_slow_path(masm, Rthread, card_addr, tmp2, tmp3, done, runtime); - - __ bind(runtime); - - RegisterSet set = RegisterSet(store_addr) | RegisterSet(R0, R3) | RegisterSet(R12); - __ push(set); - - if (card_addr != R0) { - __ mov(R0, card_addr); - } - __ mov(R1, Rthread); - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), R0, R1); - - __ pop(set); - + generate_post_barrier_fast_path(masm, store_addr, new_val, Rthread, tmp1, tmp2, done, true /* new_val_may_be_null */); __ bind(done); } @@ -344,35 +305,10 @@ void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm, Register tmp1, Register tmp2, Register tmp3, - G1PostBarrierStubC2* stub) { - assert(thread == Rthread, "must be"); - assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, noreg); - - stub->initialize_registers(thread, tmp1, tmp2, tmp3); - - bool new_val_may_be_null = (stub->barrier_data() & G1C2BarrierPostNotNull) == 0; - generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, *stub->continuation(), new_val_may_be_null); - // If card is not young, jump to stub (slow path) - __ b(*stub->entry(), ne); - - __ bind(*stub->continuation()); -} - -void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm, - G1PostBarrierStubC2* stub) const { - Assembler::InlineSkippedInstructionsCounter skip_counter(masm); - Label runtime; - Register thread = stub->thread(); - Register tmp1 = stub->tmp1(); // tmp1 holds the card address. 
- Register tmp2 = stub->tmp2(); - Register tmp3 = stub->tmp3(); - - __ bind(*stub->entry()); - generate_post_barrier_slow_path(masm, thread, tmp1, tmp2, tmp3, *stub->continuation(), runtime); - - __ bind(runtime); - generate_c2_barrier_runtime_call(masm, stub, tmp1, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), tmp2); - __ b(*stub->continuation()); + bool new_val_may_be_null) { + Label done; + generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, new_val_may_be_null); + __ bind(done); } #endif // COMPILER2 @@ -463,20 +399,19 @@ void G1BarrierSetAssembler::gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrier __ b(*stub->continuation()); } -void G1BarrierSetAssembler::gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub) { - G1BarrierSetC1* bs = (G1BarrierSetC1*)BarrierSet::barrier_set()->barrier_set_c1(); - __ bind(*stub->entry()); - assert(stub->addr()->is_register(), "Precondition."); - assert(stub->new_val()->is_register(), "Precondition."); - Register new_val_reg = stub->new_val()->as_register(); - __ cbz(new_val_reg, *stub->continuation()); - ce->verify_reserved_argument_area_size(1); - __ str(stub->addr()->as_pointer_register(), Address(SP)); - __ call(bs->post_barrier_c1_runtime_code_blob()->code_begin(), relocInfo::runtime_call_type); - __ b(*stub->continuation()); +#undef __ + +void G1BarrierSetAssembler::g1_write_barrier_post_c1(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2) { + Label done; + generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, true /* new_val_may_be_null */); + masm->bind(done); } -#undef __ #define __ sasm-> void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm) { @@ -536,102 +471,6 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* __ b(done); } -void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* sasm) { - // Input: - // - store_addr, pushed on the stack - - __ set_info("g1_post_barrier_slow_id", false); - - Label done; - Label recheck; - Label runtime; - - Address queue_index(Rthread, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset())); - Address buffer(Rthread, in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset())); - - AddressLiteral cardtable(ci_card_table_address_as
(), relocInfo::none); - - // save at least the registers that need saving if the runtime is called - const RegisterSet saved_regs = RegisterSet(R0,R3) | RegisterSet(R12) | RegisterSet(LR); - const int nb_saved_regs = 6; - assert(nb_saved_regs == saved_regs.size(), "fix nb_saved_regs"); - __ push(saved_regs); - - const Register r_card_addr_0 = R0; // must be R0 for the slow case - const Register r_obj_0 = R0; - const Register r_card_base_1 = R1; - const Register r_tmp2 = R2; - const Register r_index_2 = R2; - const Register r_buffer_3 = R3; - const Register tmp1 = Rtemp; - - __ ldr(r_obj_0, Address(SP, nb_saved_regs*wordSize)); - // Note: there is a comment in x86 code about not using - // ExternalAddress / lea, due to relocation not working - // properly for that address. Should be OK for arm, where we - // explicitly specify that 'cardtable' has a relocInfo::none - // type. - __ lea(r_card_base_1, cardtable); - __ add(r_card_addr_0, r_card_base_1, AsmOperand(r_obj_0, lsr, CardTable::card_shift())); - - // first quick check without barrier - __ ldrb(r_tmp2, Address(r_card_addr_0)); - - __ cmp(r_tmp2, (int)G1CardTable::g1_young_card_val()); - __ b(recheck, ne); - - __ bind(done); - - __ pop(saved_regs); - - __ ret(); - - __ bind(recheck); - - __ membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad), tmp1); - - // reload card state after the barrier that ensures the stored oop was visible - __ ldrb(r_tmp2, Address(r_card_addr_0)); - - assert(CardTable::dirty_card_val() == 0, "adjust this code"); - __ cbz(r_tmp2, done); - - // storing region crossing non-null, card is clean. - // dirty card and log. - - assert(0 == (int)CardTable::dirty_card_val(), "adjust this code"); - if ((ci_card_table_address_as() & 0xff) == 0) { - // Card table is aligned so the lowest byte of the table address base is zero. - __ strb(r_card_base_1, Address(r_card_addr_0)); - } else { - __ strb(__ zero_register(r_tmp2), Address(r_card_addr_0)); - } - - __ ldr(r_index_2, queue_index); - __ ldr(r_buffer_3, buffer); - - __ subs(r_index_2, r_index_2, wordSize); - __ b(runtime, lt); // go to runtime if now negative - - __ str(r_index_2, queue_index); - - __ str(r_card_addr_0, Address(r_buffer_3, r_index_2)); - - __ b(done); - - __ bind(runtime); - - __ save_live_registers(); - - assert(r_card_addr_0 == c_rarg0, "card_addr should be in R0"); - __ mov(c_rarg1, Rthread); - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), c_rarg0, c_rarg1); - - __ restore_live_registers_without_return(); - - __ b(done); -} - #undef __ #endif // COMPILER1 diff --git a/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.hpp b/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.hpp index 4e49e655e3e..9e0eff4601b 100644 --- a/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.hpp +++ b/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -32,9 +32,7 @@ class LIR_Assembler; class StubAssembler; class G1PreBarrierStub; -class G1PostBarrierStub; class G1PreBarrierStubC2; -class G1PostBarrierStubC2; class G1BarrierSetAssembler: public ModRefBarrierSetAssembler { protected: @@ -66,10 +64,15 @@ public: #ifdef COMPILER1 public: void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub); - void gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub); void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm); - void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm); + + void g1_write_barrier_post_c1(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2); #endif #ifdef COMPILER2 @@ -89,9 +92,7 @@ public: Register tmp1, Register tmp2, Register tmp3, - G1PostBarrierStubC2* c2_stub); - void generate_c2_post_barrier_stub(MacroAssembler* masm, - G1PostBarrierStubC2* stub) const; + bool new_val_may_be_null); #endif }; diff --git a/src/hotspot/cpu/arm/gc/g1/g1_arm.ad b/src/hotspot/cpu/arm/gc/g1/g1_arm.ad index 8a0a9e1aa53..e905ba9ff67 100644 --- a/src/hotspot/cpu/arm/gc/g1/g1_arm.ad +++ b/src/hotspot/cpu/arm/gc/g1/g1_arm.ad @@ -1,5 +1,5 @@ // -// Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // // This code is free software; you can redistribute it and/or modify it @@ -63,13 +63,13 @@ static void write_barrier_post(MacroAssembler* masm, Register tmp1, Register tmp2, Register tmp3) { - if (!G1PostBarrierStubC2::needs_barrier(node)) { + if (!G1BarrierStubC2::needs_post_barrier(node)) { return; } Assembler::InlineSkippedInstructionsCounter skip_counter(masm); G1BarrierSetAssembler* g1_asm = static_cast(BarrierSet::barrier_set()->barrier_set_assembler()); - G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node); - g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, Rthread, tmp1, tmp2, tmp3, stub); + bool new_val_may_be_null = G1BarrierStubC2::post_new_val_may_be_null(node); + g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, Rthread, tmp1, tmp2, tmp3, new_val_may_be_null); } %} diff --git a/src/hotspot/cpu/ppc/gc/g1/g1BarrierSetAssembler_ppc.cpp b/src/hotspot/cpu/ppc/gc/g1/g1BarrierSetAssembler_ppc.cpp index 4fb13422f59..262bb1eae89 100644 --- a/src/hotspot/cpu/ppc/gc/g1/g1BarrierSetAssembler_ppc.cpp +++ b/src/hotspot/cpu/ppc/gc/g1/g1BarrierSetAssembler_ppc.cpp @@ -28,7 +28,6 @@ #include "gc/g1/g1BarrierSetAssembler.hpp" #include "gc/g1/g1BarrierSetRuntime.hpp" #include "gc/g1/g1CardTable.hpp" -#include "gc/g1/g1DirtyCardQueue.hpp" #include "gc/g1/g1HeapRegion.hpp" #include "gc/g1/g1SATBMarkQueueSet.hpp" #include "gc/g1/g1ThreadLocalData.hpp" @@ -230,78 +229,52 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, Decorator __ bind(filtered); } -static void generate_region_crossing_test(MacroAssembler* masm, const Register store_addr, const Register new_val) { - __ xorr(R0, store_addr, new_val); // tmp1 := store address ^ new value - __ srdi_(R0, R0, G1HeapRegion::LogOfHRGrainBytes); // tmp1 := ((store address ^ new value) >> LogOfHRGrainBytes) -} +static void generate_post_barrier_fast_path(MacroAssembler* masm, + const Register store_addr, + const Register new_val, + const Register thread, + const Register tmp1, + const Register tmp2, + Label& done, + 
bool new_val_may_be_null) { + assert_different_registers(store_addr, new_val, tmp1, R0); + assert_different_registers(store_addr, tmp1, tmp2, R0); -static Address generate_card_young_test(MacroAssembler* masm, const Register store_addr, const Register tmp1, const Register tmp2) { - CardTableBarrierSet* ct = barrier_set_cast(BarrierSet::barrier_set()); - __ load_const_optimized(tmp1, (address)(ct->card_table()->byte_map_base()), tmp2); - __ srdi(tmp2, store_addr, CardTable::card_shift()); // tmp1 := card address relative to card table base - __ lbzx(R0, tmp1, tmp2); // tmp1 := card address - __ cmpwi(CR0, R0, (int)G1CardTable::g1_young_card_val()); - return Address(tmp1, tmp2); // return card address -} + __ xorr(R0, store_addr, new_val); // R0 := store address ^ new value + __ srdi_(R0, R0, G1HeapRegion::LogOfHRGrainBytes); // R0 := ((store address ^ new value) >> LogOfHRGrainBytes) + __ beq(CR0, done); -static void generate_card_dirty_test(MacroAssembler* masm, Address card_addr) { - __ membar(Assembler::StoreLoad); // Must reload after StoreLoad membar due to concurrent refinement - __ lbzx(R0, card_addr.base(), card_addr.index()); // tmp2 := card - __ cmpwi(CR0, R0, (int)G1CardTable::dirty_card_val()); // tmp2 := card == dirty_card_val? + // Crosses regions, storing null? + if (!new_val_may_be_null) { +#ifdef ASSERT + __ cmpdi(CR0, new_val, 0); + __ asm_assert_ne("null oop not allowed (G1 post)"); // Checked by caller. +#endif + } else { + __ cmpdi(CR0, new_val, 0); + __ beq(CR0, done); + } + + __ ld(tmp1, G1ThreadLocalData::card_table_base_offset(), thread); + __ srdi(tmp2, store_addr, CardTable::card_shift()); // tmp2 := card address relative to card table base + if (UseCondCardMark) { + __ lbzx(R0, tmp1, tmp2); + __ cmpwi(CR0, R0, (int)G1CardTable::clean_card_val()); + __ bne(CR0, done); + } + + __ li(R0, G1CardTable::dirty_card_val()); + __ stbx(R0, tmp1, tmp2); } void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, DecoratorSet decorators, Register store_addr, Register new_val, - Register tmp1, Register tmp2, Register tmp3, - MacroAssembler::PreservationLevel preservation_level) { + Register tmp1, Register tmp2) { bool not_null = (decorators & IS_NOT_NULL) != 0; - Label runtime, filtered; - assert_different_registers(store_addr, new_val, tmp1, tmp2); - - CardTableBarrierSet* ct = barrier_set_cast(BarrierSet::barrier_set()); - - generate_region_crossing_test(masm, store_addr, new_val); - __ beq(CR0, filtered); - - // Crosses regions, storing null? - if (not_null) { -#ifdef ASSERT - __ cmpdi(CR0, new_val, 0); - __ asm_assert_ne("null oop not allowed (G1 post)"); // Checked by caller. -#endif - } else { - __ cmpdi(CR0, new_val, 0); - __ beq(CR0, filtered); - } - - Address card_addr = generate_card_young_test(masm, store_addr, tmp1, tmp2); - __ beq(CR0, filtered); - - generate_card_dirty_test(masm, card_addr); - __ beq(CR0, filtered); - - __ li(R0, (int)G1CardTable::dirty_card_val()); - __ stbx(R0, card_addr.base(), card_addr.index()); // *(card address) := dirty_card_val - - Register Rcard_addr = tmp3; - __ add(Rcard_addr, card_addr.base(), card_addr.index()); // This is the address which needs to get enqueued. 
- - generate_queue_insertion(masm, - G1ThreadLocalData::dirty_card_queue_index_offset(), - G1ThreadLocalData::dirty_card_queue_buffer_offset(), - runtime, Rcard_addr, tmp1); - __ b(filtered); - - __ bind(runtime); - - assert(preservation_level == MacroAssembler::PRESERVATION_NONE, - "g1_write_barrier_post doesn't support preservation levels higher than PRESERVATION_NONE"); - - // Save the live input values. - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), Rcard_addr, R16_thread); - - __ bind(filtered); + Label done; + generate_post_barrier_fast_path(masm, store_addr, new_val, R16_thread, tmp1, tmp2, done, !not_null); + __ bind(done); } void G1BarrierSetAssembler::oop_store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, @@ -333,8 +306,7 @@ void G1BarrierSetAssembler::oop_store_at(MacroAssembler* masm, DecoratorSet deco } g1_write_barrier_post(masm, decorators, base, val, - tmp1, tmp2, tmp3, - preservation_level); + tmp1, tmp2); } } @@ -457,70 +429,29 @@ void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm, Register new_val, Register tmp1, Register tmp2, - G1PostBarrierStubC2* stub, + bool new_val_may_be_null, bool decode_new_val) { assert_different_registers(store_addr, new_val, tmp1, R0); assert_different_registers(store_addr, tmp1, tmp2, R0); - stub->initialize_registers(R16_thread, tmp1, tmp2); + Label done; - bool null_check_required = (stub->barrier_data() & G1C2BarrierPostNotNull) == 0; Register new_val_decoded = new_val; if (decode_new_val) { assert(UseCompressedOops, "or should not be here"); - if (null_check_required && CompressedOops::base() != nullptr) { + if (new_val_may_be_null && CompressedOops::base() != nullptr) { // We prefer doing the null check after the region crossing check. // Only compressed oop modes with base != null require a null check here. __ cmpwi(CR0, new_val, 0); - __ beq(CR0, *stub->continuation()); - null_check_required = false; + __ beq(CR0, done); + new_val_may_be_null = false; } new_val_decoded = __ decode_heap_oop_not_null(tmp2, new_val); } - generate_region_crossing_test(masm, store_addr, new_val_decoded); - __ beq(CR0, *stub->continuation()); - - // crosses regions, storing null? - if (null_check_required) { - __ cmpdi(CR0, new_val_decoded, 0); - __ beq(CR0, *stub->continuation()); - } - - Address card_addr = generate_card_young_test(masm, store_addr, tmp1, tmp2); - assert(card_addr.base() == tmp1 && card_addr.index() == tmp2, "needed by post barrier stub"); - __ bc_far_optimized(Assembler::bcondCRbiIs0, __ bi0(CR0, Assembler::equal), *stub->entry()); - - __ bind(*stub->continuation()); -} - -void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm, - G1PostBarrierStubC2* stub) const { - Assembler::InlineSkippedInstructionsCounter skip_counter(masm); - Label runtime; - Address card_addr(stub->tmp1(), stub->tmp2()); // See above. - - __ bind(*stub->entry()); - - generate_card_dirty_test(masm, card_addr); - __ bc_far_optimized(Assembler::bcondCRbiIs1, __ bi0(CR0, Assembler::equal), *stub->continuation()); - - __ li(R0, (int)G1CardTable::dirty_card_val()); - __ stbx(R0, card_addr.base(), card_addr.index()); // *(card address) := dirty_card_val - - Register Rcard_addr = stub->tmp1(); - __ add(Rcard_addr, card_addr.base(), card_addr.index()); // This is the address which needs to get enqueued. 
- - generate_queue_insertion(masm, - G1ThreadLocalData::dirty_card_queue_index_offset(), - G1ThreadLocalData::dirty_card_queue_buffer_offset(), - runtime, Rcard_addr, stub->tmp2()); - __ b(*stub->continuation()); - - __ bind(runtime); - generate_c2_barrier_runtime_call(masm, stub, Rcard_addr, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry)); - __ b(*stub->continuation()); + generate_post_barrier_fast_path(masm, store_addr, new_val_decoded, R16_thread, tmp1, tmp2, done, new_val_may_be_null); + __ bind(done); } #endif // COMPILER2 @@ -558,28 +489,19 @@ void G1BarrierSetAssembler::gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrier __ b(*stub->continuation()); } -void G1BarrierSetAssembler::gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub) { - G1BarrierSetC1* bs = (G1BarrierSetC1*)BarrierSet::barrier_set()->barrier_set_c1(); - __ bind(*stub->entry()); +#undef __ - assert(stub->addr()->is_register(), "Precondition."); - assert(stub->new_val()->is_register(), "Precondition."); - Register addr_reg = stub->addr()->as_pointer_register(); - Register new_val_reg = stub->new_val()->as_register(); - - __ cmpdi(CR0, new_val_reg, 0); - __ bc_far_optimized(Assembler::bcondCRbiIs1, __ bi0(CR0, Assembler::equal), *stub->continuation()); - - address c_code = bs->post_barrier_c1_runtime_code_blob()->code_begin(); - //__ load_const_optimized(R0, c_code); - __ add_const_optimized(R0, R29_TOC, MacroAssembler::offset_to_global_toc(c_code)); - __ mtctr(R0); - __ mr(R0, addr_reg); // Pass addr in R0. - __ bctrl(); - __ b(*stub->continuation()); +void G1BarrierSetAssembler::g1_write_barrier_post_c1(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2) { + Label done; + generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, true /* new_val_may_be_null */); + masm->bind(done); } -#undef __ #define __ sasm-> void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm) { @@ -642,86 +564,6 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* __ b(restart); } -void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* sasm) { - G1BarrierSet* bs = barrier_set_cast(BarrierSet::barrier_set()); - - __ set_info("g1_post_barrier_slow_id", false); - - // Using stack slots: spill addr, spill tmp2 - const int stack_slots = 2; - Register tmp = R0; - Register addr = R14; - Register tmp2 = R15; - CardTable::CardValue* byte_map_base = bs->card_table()->byte_map_base(); - - Label restart, refill, ret; - - // Spill - __ std(addr, -8, R1_SP); - __ std(tmp2, -16, R1_SP); - - __ srdi(addr, R0, CardTable::card_shift()); // Addr is passed in R0. - __ load_const_optimized(/*cardtable*/ tmp2, byte_map_base, tmp); - __ add(addr, tmp2, addr); - __ lbz(tmp, 0, addr); // tmp := [addr + cardtable] - - // Return if young card. - __ cmpwi(CR0, tmp, G1CardTable::g1_young_card_val()); - __ beq(CR0, ret); - - // Return if sequential consistent value is already dirty. - __ membar(Assembler::StoreLoad); - __ lbz(tmp, 0, addr); // tmp := [addr + cardtable] - - __ cmpwi(CR0, tmp, G1CardTable::dirty_card_val()); - __ beq(CR0, ret); - - // Not dirty. - - // First, dirty it. 
- __ li(tmp, G1CardTable::dirty_card_val()); - __ stb(tmp, 0, addr); - - int dirty_card_q_index_byte_offset = in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset()); - int dirty_card_q_buf_byte_offset = in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset()); - - __ bind(restart); - - // Get the index into the update buffer. G1DirtyCardQueue::_index is - // a size_t so ld_ptr is appropriate here. - __ ld(tmp2, dirty_card_q_index_byte_offset, R16_thread); - - // index == 0? - __ cmpdi(CR0, tmp2, 0); - __ beq(CR0, refill); - - __ ld(tmp, dirty_card_q_buf_byte_offset, R16_thread); - __ addi(tmp2, tmp2, -oopSize); - - __ std(tmp2, dirty_card_q_index_byte_offset, R16_thread); - __ add(tmp2, tmp, tmp2); - __ std(addr, 0, tmp2); // [_buf + index] := - - // Restore temp registers and return-from-leaf. - __ bind(ret); - __ ld(tmp2, -16, R1_SP); - __ ld(addr, -8, R1_SP); - __ blr(); - - __ bind(refill); - const int nbytes_save = (MacroAssembler::num_volatile_regs + stack_slots) * BytesPerWord; - __ save_volatile_gprs(R1_SP, -nbytes_save); // except R0 - __ mflr(R0); - __ std(R0, _abi0(lr), R1_SP); - __ push_frame_reg_args(nbytes_save, R0); // dummy frame for C call - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1DirtyCardQueueSet::handle_zero_index_for_thread), R16_thread); - __ pop_frame(); - __ ld(R0, _abi0(lr), R1_SP); - __ mtlr(R0); - __ restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 - __ b(restart); -} - #undef __ #endif // COMPILER1 diff --git a/src/hotspot/cpu/ppc/gc/g1/g1BarrierSetAssembler_ppc.hpp b/src/hotspot/cpu/ppc/gc/g1/g1BarrierSetAssembler_ppc.hpp index 33cb89dacc6..e059cc661af 100644 --- a/src/hotspot/cpu/ppc/gc/g1/g1BarrierSetAssembler_ppc.hpp +++ b/src/hotspot/cpu/ppc/gc/g1/g1BarrierSetAssembler_ppc.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018, 2021 SAP SE. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* @@ -37,9 +37,7 @@ class LIR_Assembler; class StubAssembler; class G1PreBarrierStub; -class G1PostBarrierStub; class G1PreBarrierStubC2; -class G1PostBarrierStubC2; class G1BarrierSetAssembler: public ModRefBarrierSetAssembler { protected: @@ -56,8 +54,7 @@ protected: MacroAssembler::PreservationLevel preservation_level); void g1_write_barrier_post(MacroAssembler* masm, DecoratorSet decorators, Register store_addr, Register new_val, - Register tmp1, Register tmp2, Register tmp3, - MacroAssembler::PreservationLevel preservation_level); + Register tmp1, Register tmp2); virtual void oop_store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, Register base, RegisterOrConstant ind_or_offs, Register val, @@ -79,17 +76,21 @@ public: Register new_val, Register tmp1, Register tmp2, - G1PostBarrierStubC2* c2_stub, + bool new_val_may_be_null, bool decode_new_val); - void generate_c2_post_barrier_stub(MacroAssembler* masm, - G1PostBarrierStubC2* stub) const; #endif #ifdef COMPILER1 void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub); - void gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub); void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm); - void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm); + + void g1_write_barrier_post_c1(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2); + #endif virtual void load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, diff --git a/src/hotspot/cpu/ppc/gc/g1/g1_ppc.ad b/src/hotspot/cpu/ppc/gc/g1/g1_ppc.ad index 4f24efe872b..0a4a9442855 100644 --- a/src/hotspot/cpu/ppc/gc/g1/g1_ppc.ad +++ b/src/hotspot/cpu/ppc/gc/g1/g1_ppc.ad @@ -1,5 +1,5 @@ // -// Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. // Copyright (c) 2025 SAP SE. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
// @@ -64,13 +64,13 @@ static void post_write_barrier(MacroAssembler* masm, Register tmp1, Register tmp2, bool decode_new_val = false) { - if (!G1PostBarrierStubC2::needs_barrier(node)) { + if (!G1BarrierStubC2::needs_post_barrier(node)) { return; } Assembler::InlineSkippedInstructionsCounter skip_counter(masm); G1BarrierSetAssembler* g1_asm = static_cast(BarrierSet::barrier_set()->barrier_set_assembler()); - G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node); - g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, tmp1, tmp2, stub, decode_new_val); + bool new_val_may_be_null = G1BarrierStubC2::post_new_val_may_be_null(node); + g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, tmp1, tmp2, new_val_may_be_null, decode_new_val); } %} diff --git a/src/hotspot/cpu/riscv/gc/g1/g1BarrierSetAssembler_riscv.cpp b/src/hotspot/cpu/riscv/gc/g1/g1BarrierSetAssembler_riscv.cpp index ef5dcdd8074..9c3bd93f8a6 100644 --- a/src/hotspot/cpu/riscv/gc/g1/g1BarrierSetAssembler_riscv.cpp +++ b/src/hotspot/cpu/riscv/gc/g1/g1BarrierSetAssembler_riscv.cpp @@ -87,15 +87,54 @@ void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm } } -void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm, DecoratorSet decorators, - Register start, Register count, Register tmp, RegSet saved_regs) { - __ push_reg(saved_regs, sp); +void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm, + DecoratorSet decorators, + Register start, + Register count, + Register tmp, + RegSet saved_regs) { assert_different_registers(start, count, tmp); - assert_different_registers(c_rarg0, count); - __ mv(c_rarg0, start); - __ mv(c_rarg1, count); - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_array_post_entry), 2); - __ pop_reg(saved_regs, sp); + + Label loop, next, done; + + // Zero count? Nothing to do. + __ beqz(count, done); + + // Calculate the number of card marks to set. Since the object might start and + // end within a card, we need to calculate this via the card table indexes of + // the actual start and last addresses covered by the object. + // Temporarily use the count register for the last element address. + __ shadd(count, count, start, tmp, LogBytesPerHeapOop); // end = start + count << LogBytesPerHeapOop + __ subi(count, count, BytesPerHeapOop); // Use last element address for end. + + __ srli(start, start, CardTable::card_shift()); + __ srli(count, count, CardTable::card_shift()); + __ sub(count, count, start); // Number of bytes to mark - 1. + + // Add card table base offset to start. + Address card_table_address(xthread, G1ThreadLocalData::card_table_base_offset()); + __ ld(tmp, card_table_address); + __ add(start, start, tmp); + + __ bind(loop); + if (UseCondCardMark) { + __ add(tmp, start, count); + __ lbu(tmp, Address(tmp, 0)); + static_assert((uint)G1CardTable::clean_card_val() == 0xff, "must be"); + __ subi(tmp, tmp, G1CardTable::clean_card_val()); // Convert to clean_card_value() to a comparison + // against zero to avoid use of an extra temp. 
+ __ bnez(tmp, next); + } + + __ add(tmp, start, count); + static_assert(G1CardTable::dirty_card_val() == 0, "must be to use zr"); + __ sb(zr, Address(tmp, 0)); + + __ bind(next); + __ subi(count, count, 1); + __ bgez(count, loop); + + __ bind(done); } static void generate_queue_test_and_insertion(MacroAssembler* masm, ByteSize index_offset, ByteSize buffer_offset, Label& runtime, @@ -192,44 +231,37 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, static void generate_post_barrier_fast_path(MacroAssembler* masm, const Register store_addr, const Register new_val, - const Register tmp1, - const Register tmp2, - Label& done, - bool new_val_may_be_null) { - // Does store cross heap regions? - __ xorr(tmp1, store_addr, new_val); // tmp1 := store address ^ new value - __ srli(tmp1, tmp1, G1HeapRegion::LogOfHRGrainBytes); // tmp1 := ((store address ^ new value) >> LogOfHRGrainBytes) - __ beqz(tmp1, done); - // Crosses regions, storing null? - if (new_val_may_be_null) { - __ beqz(new_val, done); - } - // Storing region crossing non-null, is card young? - __ srli(tmp1, store_addr, CardTable::card_shift()); // tmp1 := card address relative to card table base - __ load_byte_map_base(tmp2); // tmp2 := card table base address - __ add(tmp1, tmp1, tmp2); // tmp1 := card address - __ lbu(tmp2, Address(tmp1)); // tmp2 := card -} - -static void generate_post_barrier_slow_path(MacroAssembler* masm, const Register thread, const Register tmp1, const Register tmp2, Label& done, - Label& runtime) { - __ membar(MacroAssembler::StoreLoad); // StoreLoad membar - __ lbu(tmp2, Address(tmp1)); // tmp2 := card - __ beqz(tmp2, done, true); - // Storing a region crossing, non-null oop, card is clean. - // Dirty card and log. - STATIC_ASSERT(CardTable::dirty_card_val() == 0); - __ sb(zr, Address(tmp1)); // *(card address) := dirty_card_val - generate_queue_test_and_insertion(masm, - G1ThreadLocalData::dirty_card_queue_index_offset(), - G1ThreadLocalData::dirty_card_queue_buffer_offset(), - runtime, - thread, tmp1, tmp2, t0); - __ j(done); + bool new_val_may_be_null) { + assert(thread == xthread, "must be"); + assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, noreg); + // Does store cross heap regions? + __ xorr(tmp1, store_addr, new_val); // tmp1 := store address ^ new value + __ srli(tmp1, tmp1, G1HeapRegion::LogOfHRGrainBytes); // tmp1 := ((store address ^ new value) >> LogOfHRGrainBytes) + __ beqz(tmp1, done); + + // Crosses regions, storing null? + if (new_val_may_be_null) { + __ beqz(new_val, done); + } + // Storing region crossing non-null, is card clean? + __ srli(tmp1, store_addr, CardTable::card_shift()); // tmp1 := card address relative to card table base + + Address card_table_address(xthread, G1ThreadLocalData::card_table_base_offset()); + __ ld(tmp2, card_table_address); // tmp2 := card table base address + __ add(tmp1, tmp1, tmp2); // tmp1 := card address + if (UseCondCardMark) { + static_assert((uint)G1CardTable::clean_card_val() == 0xff, "must be"); + __ lbu(tmp2, Address(tmp1, 0)); // tmp2 := card + __ subi(tmp2, tmp2, G1CardTable::clean_card_val()); // Convert to clean_card_value() to a comparison + // against zero to avoid use of an extra temp. 
+ __ bnez(tmp2, done); + } + static_assert((uint)G1CardTable::dirty_card_val() == 0, "must be to use zr"); + __ sb(zr, Address(tmp1, 0)); } void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, @@ -238,27 +270,8 @@ void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, Register thread, Register tmp1, Register tmp2) { - assert(thread == xthread, "must be"); - assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, t0); - assert(store_addr != noreg && new_val != noreg && tmp1 != noreg && tmp2 != noreg, - "expecting a register"); - Label done; - Label runtime; - - generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, done, true /* new_val_may_be_null */); - // If card is young, jump to done (tmp2 holds the card value) - __ mv(t0, (int)G1CardTable::g1_young_card_val()); - __ beq(tmp2, t0, done); // card == young_card_val? - generate_post_barrier_slow_path(masm, thread, tmp1, tmp2, done, runtime); - - __ bind(runtime); - // save the live input values - RegSet saved = RegSet::of(store_addr); - __ push_reg(saved, sp); - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), tmp1, thread); - __ pop_reg(saved, sp); - + generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, true /* new_val_may_be_null */); __ bind(done); } @@ -318,37 +331,10 @@ void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm, Register thread, Register tmp1, Register tmp2, - G1PostBarrierStubC2* stub) { - assert(thread == xthread, "must be"); - assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, t0); - assert(store_addr != noreg && new_val != noreg && tmp1 != noreg && tmp2 != noreg, - "expecting a register"); - - stub->initialize_registers(thread, tmp1, tmp2); - - bool new_val_may_be_null = (stub->barrier_data() & G1C2BarrierPostNotNull) == 0; - generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, *stub->continuation(), new_val_may_be_null); - // If card is not young, jump to stub (slow path) (tmp2 holds the card value) - __ mv(t0, (int)G1CardTable::g1_young_card_val()); - __ bne(tmp2, t0, *stub->entry(), true); - - __ bind(*stub->continuation()); -} - -void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm, - G1PostBarrierStubC2* stub) const { - Assembler::InlineSkippedInstructionsCounter skip_counter(masm); - Label runtime; - Register thread = stub->thread(); - Register tmp1 = stub->tmp1(); // tmp1 holds the card address. 
- Register tmp2 = stub->tmp2(); - - __ bind(*stub->entry()); - generate_post_barrier_slow_path(masm, thread, tmp1, tmp2, *stub->continuation(), runtime); - - __ bind(runtime); - generate_c2_barrier_runtime_call(masm, stub, tmp1, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry)); - __ j(*stub->continuation()); + bool new_val_may_be_null) { + Label done; + generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, new_val_may_be_null); + __ bind(done); } #endif // COMPILER2 @@ -443,20 +429,19 @@ void G1BarrierSetAssembler::gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrier __ j(*stub->continuation()); } -void G1BarrierSetAssembler::gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub) { - G1BarrierSetC1* bs = (G1BarrierSetC1*)BarrierSet::barrier_set()->barrier_set_c1(); - __ bind(*stub->entry()); - assert(stub->addr()->is_register(), "Precondition"); - assert(stub->new_val()->is_register(), "Precondition"); - Register new_val_reg = stub->new_val()->as_register(); - __ beqz(new_val_reg, *stub->continuation(), /* is_far */ true); - ce->store_parameter(stub->addr()->as_pointer_register(), 0); - __ far_call(RuntimeAddress(bs->post_barrier_c1_runtime_code_blob()->code_begin())); - __ j(*stub->continuation()); -} - #undef __ +void G1BarrierSetAssembler::g1_write_barrier_post_c1(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2) { + Label done; + generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, true /* new_val_may_be_null */); + masm->bind(done); +} + #define __ sasm-> void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm) { @@ -507,74 +492,6 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* __ epilogue(); } -void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* sasm) { - __ prologue("g1_post_barrier", false); - - // arg0 : store_address - Address store_addr(fp, 2 * BytesPerWord); // 2 BytesPerWord from fp - - BarrierSet* bs = BarrierSet::barrier_set(); - CardTableBarrierSet* ctbs = barrier_set_cast(bs); - - Label done; - Label runtime; - - // At this point we know new_value is non-null and the new_value crosses regions. - // Must check to see if card is already dirty - const Register thread = xthread; - - Address queue_index(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset())); - Address buffer(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset())); - - const Register card_offset = t1; - // RA is free here, so we can use it to hold the byte_map_base. - const Register byte_map_base = ra; - - assert_different_registers(card_offset, byte_map_base, t0); - - __ load_parameter(0, card_offset); - __ srli(card_offset, card_offset, CardTable::card_shift()); - __ load_byte_map_base(byte_map_base); - - // Convert card offset into an address in card_addr - Register card_addr = card_offset; - __ add(card_addr, byte_map_base, card_addr); - - __ lbu(t0, Address(card_addr, 0)); - __ sub(t0, t0, (int)G1CardTable::g1_young_card_val()); - __ beqz(t0, done); - - assert((int)CardTable::dirty_card_val() == 0, "must be 0"); - - __ membar(MacroAssembler::StoreLoad); - __ lbu(t0, Address(card_addr, 0)); - __ beqz(t0, done); - - // storing region crossing non-null, card is clean. - // dirty card and log. 
- __ sb(zr, Address(card_addr, 0)); - - __ ld(t0, queue_index); - __ beqz(t0, runtime); - __ subi(t0, t0, wordSize); - __ sd(t0, queue_index); - - // Reuse RA to hold buffer_addr - const Register buffer_addr = ra; - - __ ld(buffer_addr, buffer); - __ add(t0, buffer_addr, t0); - __ sd(card_addr, Address(t0, 0)); - __ j(done); - - __ bind(runtime); - __ push_call_clobbered_registers(); - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), card_addr, thread); - __ pop_call_clobbered_registers(); - __ bind(done); - __ epilogue(); -} - #undef __ #endif // COMPILER1 diff --git a/src/hotspot/cpu/riscv/gc/g1/g1BarrierSetAssembler_riscv.hpp b/src/hotspot/cpu/riscv/gc/g1/g1BarrierSetAssembler_riscv.hpp index 26310231362..654ba934242 100644 --- a/src/hotspot/cpu/riscv/gc/g1/g1BarrierSetAssembler_riscv.hpp +++ b/src/hotspot/cpu/riscv/gc/g1/g1BarrierSetAssembler_riscv.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2020, 2024, Huawei Technologies Co., Ltd. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * @@ -35,9 +35,7 @@ class LIR_Assembler; #endif class StubAssembler; class G1PreBarrierStub; -class G1PostBarrierStub; class G1PreBarrierStubC2; -class G1PostBarrierStubC2; class G1BarrierSetAssembler: public ModRefBarrierSetAssembler { protected: @@ -68,10 +66,16 @@ protected: public: #ifdef COMPILER1 void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub); - void gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub); void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm); - void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm); + + void g1_write_barrier_post_c1(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2); + #endif #ifdef COMPILER2 @@ -90,9 +94,7 @@ public: Register thread, Register tmp1, Register tmp2, - G1PostBarrierStubC2* c2_stub); - void generate_c2_post_barrier_stub(MacroAssembler* masm, - G1PostBarrierStubC2* stub) const; + bool new_val_may_be_null); #endif void load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, diff --git a/src/hotspot/cpu/riscv/gc/g1/g1_riscv.ad b/src/hotspot/cpu/riscv/gc/g1/g1_riscv.ad index 7a525323021..8461a36e68c 100644 --- a/src/hotspot/cpu/riscv/gc/g1/g1_riscv.ad +++ b/src/hotspot/cpu/riscv/gc/g1/g1_riscv.ad @@ -1,5 +1,5 @@ // -// Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. // Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
// @@ -63,13 +63,13 @@ static void write_barrier_post(MacroAssembler* masm, Register new_val, Register tmp1, Register tmp2) { - if (!G1PostBarrierStubC2::needs_barrier(node)) { + if (!G1BarrierStubC2::needs_post_barrier(node)) { return; } Assembler::InlineSkippedInstructionsCounter skip_counter(masm); G1BarrierSetAssembler* g1_asm = static_cast(BarrierSet::barrier_set()->barrier_set_assembler()); - G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node); - g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, xthread, tmp1, tmp2, stub); + bool new_val_may_be_null = G1BarrierStubC2::post_new_val_may_be_null(node); + g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, xthread, tmp1, tmp2, new_val_may_be_null); } %} diff --git a/src/hotspot/cpu/s390/gc/g1/g1BarrierSetAssembler_s390.cpp b/src/hotspot/cpu/s390/gc/g1/g1BarrierSetAssembler_s390.cpp index dea3317270e..3e176309c27 100644 --- a/src/hotspot/cpu/s390/gc/g1/g1BarrierSetAssembler_s390.cpp +++ b/src/hotspot/cpu/s390/gc/g1/g1BarrierSetAssembler_s390.cpp @@ -28,7 +28,6 @@ #include "gc/g1/g1BarrierSetAssembler.hpp" #include "gc/g1/g1BarrierSetRuntime.hpp" #include "gc/g1/g1CardTable.hpp" -#include "gc/g1/g1DirtyCardQueue.hpp" #include "gc/g1/g1HeapRegion.hpp" #include "gc/g1/g1SATBMarkQueueSet.hpp" #include "gc/g1/g1ThreadLocalData.hpp" @@ -205,104 +204,71 @@ void G1BarrierSetAssembler::generate_c2_pre_barrier_stub(MacroAssembler* masm, BLOCK_COMMENT("} generate_c2_pre_barrier_stub"); } +static void generate_post_barrier_fast_path(MacroAssembler* masm, + const Register store_addr, + const Register new_val, + const Register thread, + const Register tmp1, + const Register tmp2, + Label& done, + bool new_val_may_be_null) { + + __ block_comment("generate_post_barrier_fast_path {"); + + assert(thread == Z_thread, "must be"); + assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, noreg); + + // Does store cross heap regions? + if (VM_Version::has_DistinctOpnds()) { + __ z_xgrk(tmp1, store_addr, new_val); // tmp1 := store address ^ new value + } else { + __ z_lgr(tmp1, store_addr); + __ z_xgr(tmp1, new_val); + } + __ z_srag(tmp1, tmp1, G1HeapRegion::LogOfHRGrainBytes); // tmp1 := ((store address ^ new value) >> LogOfHRGrainBytes) + __ branch_optimized(Assembler::bcondEqual, done); + + // Crosses regions, storing null? + if (new_val_may_be_null) { + __ z_ltgr(new_val, new_val); + __ z_bre(done); + } else { +#ifdef ASSERT + __ z_ltgr(new_val, new_val); + __ asm_assert(Assembler::bcondNotZero, "null oop not allowed (G1 post)", 0x322); // Checked by caller. 
+#endif + } + + __ z_srag(tmp1, store_addr, CardTable::card_shift()); + + Address card_table_addr(thread, in_bytes(G1ThreadLocalData::card_table_base_offset())); + __ z_alg(tmp1, card_table_addr); // tmp1 := card address + + if(UseCondCardMark) { + __ z_cli(0, tmp1, G1CardTable::clean_card_val()); + __ branch_optimized(Assembler::bcondNotEqual, done); + } + + static_assert(G1CardTable::dirty_card_val() == 0, "must be to use z_mvi"); + __ z_mvi(0, tmp1, G1CardTable::dirty_card_val()); // *(card address) := dirty_card_val + + __ block_comment("} generate_post_barrier_fast_path"); +} + void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm, Register store_addr, Register new_val, Register thread, Register tmp1, Register tmp2, - G1PostBarrierStubC2* stub) { + bool new_val_may_be_null) { BLOCK_COMMENT("g1_write_barrier_post_c2 {"); - - assert(thread == Z_thread, "must be"); - assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, Z_R1_scratch); - - assert(store_addr != noreg && new_val != noreg && tmp1 != noreg && tmp2 != noreg, "expecting a register"); - - stub->initialize_registers(thread, tmp1, tmp2); - - BLOCK_COMMENT("generate_region_crossing_test {"); - if (VM_Version::has_DistinctOpnds()) { - __ z_xgrk(tmp1, store_addr, new_val); - } else { - __ z_lgr(tmp1, store_addr); - __ z_xgr(tmp1, new_val); - } - __ z_srag(tmp1, tmp1, G1HeapRegion::LogOfHRGrainBytes); - __ branch_optimized(Assembler::bcondEqual, *stub->continuation()); - BLOCK_COMMENT("} generate_region_crossing_test"); - - // crosses regions, storing null? - if ((stub->barrier_data() & G1C2BarrierPostNotNull) == 0) { - __ z_ltgr(new_val, new_val); - __ branch_optimized(Assembler::bcondEqual, *stub->continuation()); - } - - BLOCK_COMMENT("generate_card_young_test {"); - CardTableBarrierSet* ct = barrier_set_cast(BarrierSet::barrier_set()); - // calculate address of card - __ load_const_optimized(tmp2, (address)ct->card_table()->byte_map_base()); // Card table base. - __ z_srlg(tmp1, store_addr, CardTable::card_shift()); // Index into card table. - __ z_algr(tmp1, tmp2); // Explicit calculation needed for cli. - - // Filter young. - __ z_cli(0, tmp1, G1CardTable::g1_young_card_val()); - - BLOCK_COMMENT("} generate_card_young_test"); - - // From here on, tmp1 holds the card address. - __ branch_optimized(Assembler::bcondNotEqual, *stub->entry()); - - __ bind(*stub->continuation()); - + Label done; + generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, new_val_may_be_null); + __ bind(done); BLOCK_COMMENT("} g1_write_barrier_post_c2"); } -void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm, - G1PostBarrierStubC2* stub) const { - - BLOCK_COMMENT("generate_c2_post_barrier_stub {"); - - Assembler::InlineSkippedInstructionsCounter skip_counter(masm); - Label runtime; - - Register thread = stub->thread(); - Register tmp1 = stub->tmp1(); // tmp1 holds the card address. - Register tmp2 = stub->tmp2(); - Register Rcard_addr = tmp1; - - __ bind(*stub->entry()); - - BLOCK_COMMENT("generate_card_clean_test {"); - __ z_sync(); // Required to support concurrent cleaning. - __ z_cli(0, Rcard_addr, 0); // Reload after membar. - __ branch_optimized(Assembler::bcondEqual, *stub->continuation()); - BLOCK_COMMENT("} generate_card_clean_test"); - - BLOCK_COMMENT("generate_dirty_card {"); - // Storing a region crossing, non-null oop, card is clean. - // Dirty card and log. 
- STATIC_ASSERT(CardTable::dirty_card_val() == 0); - __ z_mvi(0, Rcard_addr, CardTable::dirty_card_val()); - BLOCK_COMMENT("} generate_dirty_card"); - - generate_queue_test_and_insertion(masm, - G1ThreadLocalData::dirty_card_queue_index_offset(), - G1ThreadLocalData::dirty_card_queue_buffer_offset(), - runtime, - Z_thread, tmp1, tmp2); - - __ branch_optimized(Assembler::bcondAlways, *stub->continuation()); - - __ bind(runtime); - - generate_c2_barrier_runtime_call(masm, stub, tmp1, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry)); - - __ branch_optimized(Assembler::bcondAlways, *stub->continuation()); - - BLOCK_COMMENT("} generate_c2_post_barrier_stub"); -} - #endif //COMPILER2 void G1BarrierSetAssembler::load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, @@ -451,99 +417,9 @@ void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, Decorato Register Rtmp1, Register Rtmp2, Register Rtmp3) { bool not_null = (decorators & IS_NOT_NULL) != 0; - assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2); // Most probably, Rnew_val == Rtmp3. - - Label callRuntime, filtered; - - CardTableBarrierSet* ct = barrier_set_cast(BarrierSet::barrier_set()); - - BLOCK_COMMENT("g1_write_barrier_post {"); - - // Does store cross heap regions? - // It does if the two addresses specify different grain addresses. - if (VM_Version::has_DistinctOpnds()) { - __ z_xgrk(Rtmp1, Rstore_addr, Rnew_val); - } else { - __ z_lgr(Rtmp1, Rstore_addr); - __ z_xgr(Rtmp1, Rnew_val); - } - __ z_srag(Rtmp1, Rtmp1, G1HeapRegion::LogOfHRGrainBytes); - __ z_bre(filtered); - - // Crosses regions, storing null? - if (not_null) { -#ifdef ASSERT - __ z_ltgr(Rnew_val, Rnew_val); - __ asm_assert(Assembler::bcondNotZero, "null oop not allowed (G1 post)", 0x322); // Checked by caller. -#endif - } else { - __ z_ltgr(Rnew_val, Rnew_val); - __ z_bre(filtered); - } - - Rnew_val = noreg; // end of lifetime - - // Storing region crossing non-null, is card already dirty? - assert_different_registers(Rtmp1, Rtmp2, Rtmp3); - // Make sure not to use Z_R0 for any of these registers. - Register Rcard_addr = (Rtmp1 != Z_R0_scratch) ? Rtmp1 : Rtmp3; - Register Rbase = (Rtmp2 != Z_R0_scratch) ? Rtmp2 : Rtmp3; - - // calculate address of card - __ load_const_optimized(Rbase, (address)ct->card_table()->byte_map_base()); // Card table base. - __ z_srlg(Rcard_addr, Rstore_addr, CardTable::card_shift()); // Index into card table. - __ z_algr(Rcard_addr, Rbase); // Explicit calculation needed for cli. - Rbase = noreg; // end of lifetime - - // Filter young. - __ z_cli(0, Rcard_addr, G1CardTable::g1_young_card_val()); - __ z_bre(filtered); - - // Check the card value. If dirty, we're done. - // This also avoids false sharing of the (already dirty) card. - __ z_sync(); // Required to support concurrent cleaning. - __ z_cli(0, Rcard_addr, G1CardTable::dirty_card_val()); // Reload after membar. - __ z_bre(filtered); - - // Storing a region crossing, non-null oop, card is clean. - // Dirty card and log. - __ z_mvi(0, Rcard_addr, G1CardTable::dirty_card_val()); - - Register Rcard_addr_x = Rcard_addr; - Register Rqueue_index = (Rtmp2 != Z_R0_scratch) ? Rtmp2 : Rtmp1; - if (Rcard_addr == Rqueue_index) { - Rcard_addr_x = Z_R0_scratch; // Register shortage. We have to use Z_R0. 
- } - __ lgr_if_needed(Rcard_addr_x, Rcard_addr); - - generate_queue_test_and_insertion(masm, - G1ThreadLocalData::dirty_card_queue_index_offset(), - G1ThreadLocalData::dirty_card_queue_buffer_offset(), - callRuntime, - Z_thread, Rcard_addr_x, Rqueue_index); - __ z_bru(filtered); - - __ bind(callRuntime); - - // TODO: do we need a frame? Introduced to be on the safe side. - bool needs_frame = true; - __ lgr_if_needed(Rcard_addr, Rcard_addr_x); // copy back asap. push_frame will destroy Z_R0_scratch! - - // VM call need frame to access(write) O register. - if (needs_frame) { - __ save_return_pc(); - __ push_frame_abi160(0); // Will use Z_R0 as tmp on old CPUs. - } - - // Save the live input values. - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), Rcard_addr, Z_thread); - - if (needs_frame) { - __ pop_frame(); - __ restore_return_pc(); - } - - __ bind(filtered); + Label done; + generate_post_barrier_fast_path(masm, Rstore_addr, Rnew_val, Z_thread, Rtmp1, Rtmp2, done, !not_null); + __ bind(done); BLOCK_COMMENT("} g1_write_barrier_post"); } @@ -615,22 +491,19 @@ void G1BarrierSetAssembler::gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrier __ branch_optimized(Assembler::bcondAlways, *stub->continuation()); } -void G1BarrierSetAssembler::gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub) { - G1BarrierSetC1* bs = (G1BarrierSetC1*)BarrierSet::barrier_set()->barrier_set_c1(); - __ bind(*stub->entry()); - ce->check_reserved_argument_area(16); // RT stub needs 2 spill slots. - assert(stub->addr()->is_register(), "Precondition."); - assert(stub->new_val()->is_register(), "Precondition."); - Register new_val_reg = stub->new_val()->as_register(); - __ z_ltgr(new_val_reg, new_val_reg); - __ branch_optimized(Assembler::bcondZero, *stub->continuation()); - __ z_lgr(Z_R1_scratch, stub->addr()->as_pointer_register()); - ce->emit_call_c(bs->post_barrier_c1_runtime_code_blob()->code_begin()); - __ branch_optimized(Assembler::bcondAlways, *stub->continuation()); -} - #undef __ +void G1BarrierSetAssembler::g1_write_barrier_post_c1(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2) { + Label done; + generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, true /* new_val_may_be_null */); + masm->bind(done); +} + #define __ sasm-> static OopMap* save_volatile_registers(StubAssembler* sasm, Register return_pc = Z_R14) { @@ -705,92 +578,6 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* __ z_bru(restart); } -void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* sasm) { - // Z_R1_scratch: oop address, address of updated memory slot - - BarrierSet* bs = BarrierSet::barrier_set(); - __ set_info("g1_post_barrier_slow_id", false); - - Register addr_oop = Z_R1_scratch; - Register addr_card = Z_R1_scratch; - Register r1 = Z_R6; // Must be saved/restored. - Register r2 = Z_R7; // Must be saved/restored. - Register cardtable = r1; // Must be non-volatile, because it is used to save addr_card. - CardTableBarrierSet* ctbs = barrier_set_cast(bs); - CardTable* ct = ctbs->card_table(); - CardTable::CardValue* byte_map_base = ct->byte_map_base(); - - // Save registers used below (see assertion in G1PreBarrierStub::emit_code()). 
- __ z_stg(r1, 0*BytesPerWord + FrameMap::first_available_sp_in_frame, Z_SP); - - Label not_already_dirty, restart, refill, young_card; - - // Calculate address of card corresponding to the updated oop slot. - AddressLiteral rs(byte_map_base); - __ z_srlg(addr_card, addr_oop, CardTable::card_shift()); - addr_oop = noreg; // dead now - __ load_const_optimized(cardtable, rs); // cardtable := - __ z_agr(addr_card, cardtable); // addr_card := addr_oop>>card_shift + cardtable - - __ z_cli(0, addr_card, (int)G1CardTable::g1_young_card_val()); - __ z_bre(young_card); - - __ z_sync(); // Required to support concurrent cleaning. - - __ z_cli(0, addr_card, (int)CardTable::dirty_card_val()); - __ z_brne(not_already_dirty); - - __ bind(young_card); - // We didn't take the branch, so we're already dirty: restore - // used registers and return. - __ z_lg(r1, 0*BytesPerWord + FrameMap::first_available_sp_in_frame, Z_SP); - __ z_br(Z_R14); - - // Not dirty. - __ bind(not_already_dirty); - - // First, dirty it: [addr_card] := 0 - __ z_mvi(0, addr_card, CardTable::dirty_card_val()); - - Register idx = cardtable; // Must be non-volatile, because it is used to save addr_card. - Register buf = r2; - cardtable = noreg; // now dead - - // Save registers used below (see assertion in G1PreBarrierStub::emit_code()). - __ z_stg(r2, 1*BytesPerWord + FrameMap::first_available_sp_in_frame, Z_SP); - - ByteSize dirty_card_q_index_byte_offset = G1ThreadLocalData::dirty_card_queue_index_offset(); - ByteSize dirty_card_q_buf_byte_offset = G1ThreadLocalData::dirty_card_queue_buffer_offset(); - - __ bind(restart); - - // Get the index into the update buffer. G1DirtyCardQueue::_index is - // a size_t so z_ltg is appropriate here. - __ z_ltg(idx, Address(Z_thread, dirty_card_q_index_byte_offset)); - - // index == 0? - __ z_brz(refill); - - __ z_lg(buf, Address(Z_thread, dirty_card_q_buf_byte_offset)); - __ add2reg(idx, -oopSize); - - __ z_stg(addr_card, 0, idx, buf); // [_buf + index] := - __ z_stg(idx, Address(Z_thread, dirty_card_q_index_byte_offset)); - // Restore killed registers and return. - __ z_lg(r1, 0*BytesPerWord + FrameMap::first_available_sp_in_frame, Z_SP); - __ z_lg(r2, 1*BytesPerWord + FrameMap::first_available_sp_in_frame, Z_SP); - __ z_br(Z_R14); - - __ bind(refill); - save_volatile_registers(sasm); - __ z_lgr(idx, addr_card); // Save addr_card, tmp3 must be non-volatile. - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1DirtyCardQueueSet::handle_zero_index_for_thread), - Z_thread); - __ z_lgr(addr_card, idx); - restore_volatile_registers(sasm); // Restore addr_card. - __ z_bru(restart); -} - #undef __ #endif // COMPILER1 diff --git a/src/hotspot/cpu/s390/gc/g1/g1BarrierSetAssembler_s390.hpp b/src/hotspot/cpu/s390/gc/g1/g1BarrierSetAssembler_s390.hpp index 0f0bdd8b83c..fdec751c43b 100644 --- a/src/hotspot/cpu/s390/gc/g1/g1BarrierSetAssembler_s390.hpp +++ b/src/hotspot/cpu/s390/gc/g1/g1BarrierSetAssembler_s390.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018, 2024 SAP SE. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* @@ -33,9 +33,7 @@ class LIR_Assembler; class StubAssembler; class G1PreBarrierStub; -class G1PostBarrierStub; class G1PreBarrierStubC2; -class G1PostBarrierStubC2; class G1BarrierSetAssembler: public ModRefBarrierSetAssembler { protected: @@ -60,10 +58,16 @@ class G1BarrierSetAssembler: public ModRefBarrierSetAssembler { public: #ifdef COMPILER1 void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub); - void gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub); void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm); - void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm); + + void g1_write_barrier_post_c1(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2); + #endif // COMPILER1 #ifdef COMPILER2 @@ -81,9 +85,7 @@ class G1BarrierSetAssembler: public ModRefBarrierSetAssembler { Register thread, Register tmp1, Register tmp2, - G1PostBarrierStubC2* c2_stub); - void generate_c2_post_barrier_stub(MacroAssembler* masm, - G1PostBarrierStubC2* stub) const; + bool new_val_may_be_null); #endif // COMPILER2 virtual void load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, diff --git a/src/hotspot/cpu/s390/gc/g1/g1_s390.ad b/src/hotspot/cpu/s390/gc/g1/g1_s390.ad index 31f60c4aeff..7aed374fdae 100644 --- a/src/hotspot/cpu/s390/gc/g1/g1_s390.ad +++ b/src/hotspot/cpu/s390/gc/g1/g1_s390.ad @@ -1,5 +1,5 @@ // -// Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. // Copyright 2024 IBM Corporation. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // @@ -62,13 +62,13 @@ static void write_barrier_post(MacroAssembler* masm, Register new_val, Register tmp1, Register tmp2) { - if (!G1PostBarrierStubC2::needs_barrier(node)) { + if (!G1BarrierStubC2::needs_post_barrier(node)) { return; } Assembler::InlineSkippedInstructionsCounter skip_counter(masm); G1BarrierSetAssembler* g1_asm = static_cast(BarrierSet::barrier_set()->barrier_set_assembler()); - G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node); - g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, Z_thread, tmp1, tmp2, stub); + bool new_val_may_be_null = G1BarrierStubC2::post_new_val_may_be_null(node); + g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, Z_thread, tmp1, tmp2, new_val_may_be_null); } %} // source diff --git a/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.cpp b/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.cpp index c1920b52837..31f27e140e0 100644 --- a/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.cpp @@ -89,19 +89,53 @@ void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm, DecoratorSet decorators, Register addr, Register count, Register tmp) { - __ push_call_clobbered_registers(false /* save_fpu */); - if (c_rarg0 == count) { // On win64 c_rarg0 == rcx - assert_different_registers(c_rarg1, addr); - __ mov(c_rarg1, count); - __ mov(c_rarg0, addr); - } else { - assert_different_registers(c_rarg0, count); - __ mov(c_rarg0, addr); - __ mov(c_rarg1, count); - } - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_array_post_entry), 2); - __ pop_call_clobbered_registers(false /* save_fpu */); + Label done; + __ testptr(count, count); + __ 
jcc(Assembler::zero, done); + + // Calculate end address in "count". + Address::ScaleFactor scale = UseCompressedOops ? Address::times_4 : Address::times_8; + __ leaq(count, Address(addr, count, scale)); + + // Calculate start card address in "addr". + __ shrptr(addr, CardTable::card_shift()); + + Register thread = r15_thread; + + __ movptr(tmp, Address(thread, in_bytes(G1ThreadLocalData::card_table_base_offset()))); + __ addptr(addr, tmp); + + // Calculate address of card of last word in the array. + __ subptr(count, 1); + __ shrptr(count, CardTable::card_shift()); + __ addptr(count, tmp); + + Label loop; + // Iterate from start card to end card (inclusive). + __ bind(loop); + + Label is_clean_card; + if (UseCondCardMark) { + __ cmpb(Address(addr, 0), G1CardTable::clean_card_val()); + __ jcc(Assembler::equal, is_clean_card); + } else { + __ movb(Address(addr, 0), G1CardTable::dirty_card_val()); + } + + Label next_card; + __ bind(next_card); + __ addptr(addr, sizeof(CardTable::CardValue)); + __ cmpptr(addr, count); + __ jcc(Assembler::belowEqual, loop); + __ jmp(done); + + __ bind(is_clean_card); + // Card was clean. Dirty card and go to next.. + __ movb(Address(addr, 0), G1CardTable::dirty_card_val()); + __ jmp(next_card); + + __ bind(done); } void G1BarrierSetAssembler::load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, @@ -182,7 +216,6 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, // If expand_call is true then we expand the call_VM_leaf macro // directly to skip generating the check by // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp. - const Register thread = r15_thread; Label done; @@ -238,73 +271,46 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, static void generate_post_barrier_fast_path(MacroAssembler* masm, const Register store_addr, const Register new_val, - const Register tmp, - const Register tmp2, + const Register tmp1, Label& done, bool new_val_may_be_null) { - CardTableBarrierSet* ct = barrier_set_cast(BarrierSet::barrier_set()); + + assert_different_registers(store_addr, new_val, tmp1, noreg); + + Register thread = r15_thread; + // Does store cross heap regions? - __ movptr(tmp, store_addr); // tmp := store address - __ xorptr(tmp, new_val); // tmp := store address ^ new value - __ shrptr(tmp, G1HeapRegion::LogOfHRGrainBytes); // ((store address ^ new value) >> LogOfHRGrainBytes) == 0? + __ movptr(tmp1, store_addr); // tmp1 := store address + __ xorptr(tmp1, new_val); // tmp1 := store address ^ new value + __ shrptr(tmp1, G1HeapRegion::LogOfHRGrainBytes); // ((store address ^ new value) >> LogOfHRGrainBytes) == 0? __ jcc(Assembler::equal, done); + // Crosses regions, storing null? if (new_val_may_be_null) { - __ cmpptr(new_val, NULL_WORD); // new value == null? + __ cmpptr(new_val, NULL_WORD); // new value == null? __ jcc(Assembler::equal, done); } - // Storing region crossing non-null, is card young? - __ movptr(tmp, store_addr); // tmp := store address - __ shrptr(tmp, CardTable::card_shift()); // tmp := card address relative to card table base - // Do not use ExternalAddress to load 'byte_map_base', since 'byte_map_base' is NOT - // a valid address and therefore is not properly handled by the relocation code. - __ movptr(tmp2, (intptr_t)ct->card_table()->byte_map_base()); // tmp2 := card table base address - __ addptr(tmp, tmp2); // tmp := card address - __ cmpb(Address(tmp, 0), G1CardTable::g1_young_card_val()); // *(card address) == young_card_val? 
-} -static void generate_post_barrier_slow_path(MacroAssembler* masm, - const Register thread, - const Register tmp, - const Register tmp2, - Label& done, - Label& runtime) { - __ membar(Assembler::Membar_mask_bits(Assembler::StoreLoad)); // StoreLoad membar - __ cmpb(Address(tmp, 0), G1CardTable::dirty_card_val()); // *(card address) == dirty_card_val? - __ jcc(Assembler::equal, done); + __ movptr(tmp1, store_addr); // tmp1 := store address + __ shrptr(tmp1, CardTable::card_shift()); // tmp1 := card address relative to card table base + + Address card_table_addr(thread, in_bytes(G1ThreadLocalData::card_table_base_offset())); + __ addptr(tmp1, card_table_addr); // tmp1 := card address + if (UseCondCardMark) { + __ cmpb(Address(tmp1, 0), G1CardTable::clean_card_val()); // *(card address) == clean_card_val? + __ jcc(Assembler::notEqual, done); + } // Storing a region crossing, non-null oop, card is clean. - // Dirty card and log. - __ movb(Address(tmp, 0), G1CardTable::dirty_card_val()); // *(card address) := dirty_card_val - generate_queue_insertion(masm, - G1ThreadLocalData::dirty_card_queue_index_offset(), - G1ThreadLocalData::dirty_card_queue_buffer_offset(), - runtime, - thread, tmp, tmp2); - __ jmp(done); + // Dirty card. + __ movb(Address(tmp1, 0), G1CardTable::dirty_card_val()); // *(card address) := dirty_card_val } void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, Register store_addr, Register new_val, - Register tmp, - Register tmp2) { - const Register thread = r15_thread; - + Register tmp) { Label done; - Label runtime; - - generate_post_barrier_fast_path(masm, store_addr, new_val, tmp, tmp2, done, true /* new_val_may_be_null */); - // If card is young, jump to done - __ jcc(Assembler::equal, done); - generate_post_barrier_slow_path(masm, thread, tmp, tmp2, done, runtime); - - __ bind(runtime); - // save the live input values - RegSet saved = RegSet::of(store_addr); - __ push_set(saved); - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), tmp, thread); - __ pop_set(saved); - + generate_post_barrier_fast_path(masm, store_addr, new_val, tmp, done, true /* new_val_may_be_null */); __ bind(done); } @@ -367,34 +373,10 @@ void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm, Register store_addr, Register new_val, Register tmp, - Register tmp2, - G1PostBarrierStubC2* stub) { - const Register thread = r15_thread; - stub->initialize_registers(thread, tmp, tmp2); - - bool new_val_may_be_null = (stub->barrier_data() & G1C2BarrierPostNotNull) == 0; - generate_post_barrier_fast_path(masm, store_addr, new_val, tmp, tmp2, *stub->continuation(), new_val_may_be_null); - // If card is not young, jump to stub (slow path) - __ jcc(Assembler::notEqual, *stub->entry()); - - __ bind(*stub->continuation()); -} - -void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm, - G1PostBarrierStubC2* stub) const { - Assembler::InlineSkippedInstructionsCounter skip_counter(masm); - Label runtime; - Register thread = stub->thread(); - Register tmp = stub->tmp1(); // tmp holds the card address. 
- Register tmp2 = stub->tmp2(); - assert(stub->tmp3() == noreg, "not needed in this platform"); - - __ bind(*stub->entry()); - generate_post_barrier_slow_path(masm, thread, tmp, tmp2, *stub->continuation(), runtime); - - __ bind(runtime); - generate_c2_barrier_runtime_call(masm, stub, tmp, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry)); - __ jmp(*stub->continuation()); + bool new_val_may_be_null) { + Label done; + generate_post_barrier_fast_path(masm, store_addr, new_val, tmp, done, new_val_may_be_null); + __ bind(done); } #endif // COMPILER2 @@ -441,8 +423,7 @@ void G1BarrierSetAssembler::oop_store_at(MacroAssembler* masm, DecoratorSet deco g1_write_barrier_post(masm /*masm*/, tmp1 /* store_adr */, new_val /* new_val */, - tmp3 /* tmp */, - tmp2 /* tmp2 */); + tmp3 /* tmp */); } } } @@ -476,21 +457,19 @@ void G1BarrierSetAssembler::gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrier } -void G1BarrierSetAssembler::gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub) { - G1BarrierSetC1* bs = (G1BarrierSetC1*)BarrierSet::barrier_set()->barrier_set_c1(); - __ bind(*stub->entry()); - assert(stub->addr()->is_register(), "Precondition."); - assert(stub->new_val()->is_register(), "Precondition."); - Register new_val_reg = stub->new_val()->as_register(); - __ cmpptr(new_val_reg, NULL_WORD); - __ jcc(Assembler::equal, *stub->continuation()); - ce->store_parameter(stub->addr()->as_pointer_register(), 0); - __ call(RuntimeAddress(bs->post_barrier_c1_runtime_code_blob()->code_begin())); - __ jmp(*stub->continuation()); -} - #undef __ +void G1BarrierSetAssembler::g1_write_barrier_post_c1(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2 /* unused on x86 */) { + Label done; + generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, done, true /* new_val_may_be_null */); + masm->bind(done); +} + #define __ sasm-> void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm) { @@ -555,78 +534,6 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* __ epilogue(); } -void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* sasm) { - __ prologue("g1_post_barrier", false); - - CardTableBarrierSet* ct = - barrier_set_cast(BarrierSet::barrier_set()); - - Label done; - Label enqueued; - Label runtime; - - // At this point we know new_value is non-null and the new_value crosses regions. - // Must check to see if card is already dirty - - const Register thread = r15_thread; - - Address queue_index(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset())); - Address buffer(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset())); - - __ push_ppx(rax); - __ push_ppx(rcx); - - const Register cardtable = rax; - const Register card_addr = rcx; - - __ load_parameter(0, card_addr); - __ shrptr(card_addr, CardTable::card_shift()); - // Do not use ExternalAddress to load 'byte_map_base', since 'byte_map_base' is NOT - // a valid address and therefore is not properly handled by the relocation code. 
- __ movptr(cardtable, (intptr_t)ct->card_table()->byte_map_base()); - __ addptr(card_addr, cardtable); - - __ cmpb(Address(card_addr, 0), G1CardTable::g1_young_card_val()); - __ jcc(Assembler::equal, done); - - __ membar(Assembler::Membar_mask_bits(Assembler::StoreLoad)); - __ cmpb(Address(card_addr, 0), CardTable::dirty_card_val()); - __ jcc(Assembler::equal, done); - - // storing region crossing non-null, card is clean. - // dirty card and log. - - __ movb(Address(card_addr, 0), CardTable::dirty_card_val()); - - const Register tmp = rdx; - __ push_ppx(rdx); - - __ movptr(tmp, queue_index); - __ testptr(tmp, tmp); - __ jcc(Assembler::zero, runtime); - __ subptr(tmp, wordSize); - __ movptr(queue_index, tmp); - __ addptr(tmp, buffer); - __ movptr(Address(tmp, 0), card_addr); - __ jmp(enqueued); - - __ bind(runtime); - __ push_call_clobbered_registers(); - - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), card_addr, thread); - - __ pop_call_clobbered_registers(); - - __ bind(enqueued); - __ pop_ppx(rdx); - - __ bind(done); - __ pop_ppx(rcx); - __ pop_ppx(rax); - - __ epilogue(); -} - #undef __ #endif // COMPILER1 diff --git a/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.hpp b/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.hpp index 774e87b916c..4b2de41de69 100644 --- a/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -31,10 +31,8 @@ class LIR_Assembler; class StubAssembler; class G1PreBarrierStub; -class G1PostBarrierStub; class G1BarrierStubC2; class G1PreBarrierStubC2; -class G1PostBarrierStubC2; class G1BarrierSetAssembler: public ModRefBarrierSetAssembler { protected: @@ -51,22 +49,28 @@ class G1BarrierSetAssembler: public ModRefBarrierSetAssembler { void g1_write_barrier_post(MacroAssembler* masm, Register store_addr, Register new_val, - Register tmp, - Register tmp2); + Register tmp); virtual void oop_store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, Address dst, Register val, Register tmp1, Register tmp2, Register tmp3); public: - void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub); - void gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub); - - void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm); - void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm); - virtual void load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, Register dst, Address src, Register tmp1); +#ifdef COMPILER1 + void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub); + + void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm); + + void g1_write_barrier_post_c1(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2); +#endif + #ifdef COMPILER2 void g1_write_barrier_pre_c2(MacroAssembler* masm, Register obj, @@ -79,10 +83,7 @@ class G1BarrierSetAssembler: public ModRefBarrierSetAssembler { Register store_addr, Register new_val, Register tmp, - Register tmp2, - G1PostBarrierStubC2* c2_stub); - void generate_c2_post_barrier_stub(MacroAssembler* masm, - G1PostBarrierStubC2* stub) const; + bool 
new_val_may_be_null); #endif // COMPILER2 }; diff --git a/src/hotspot/cpu/x86/gc/g1/g1_x86_64.ad b/src/hotspot/cpu/x86/gc/g1/g1_x86_64.ad index 819cd97696c..94607cd6796 100644 --- a/src/hotspot/cpu/x86/gc/g1/g1_x86_64.ad +++ b/src/hotspot/cpu/x86/gc/g1/g1_x86_64.ad @@ -1,5 +1,5 @@ // -// Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // // This code is free software; you can redistribute it and/or modify it @@ -59,15 +59,14 @@ static void write_barrier_post(MacroAssembler* masm, const MachNode* node, Register store_addr, Register new_val, - Register tmp1, - Register tmp2) { - if (!G1PostBarrierStubC2::needs_barrier(node)) { + Register tmp1) { + if (!G1BarrierStubC2::needs_post_barrier(node)) { return; } Assembler::InlineSkippedInstructionsCounter skip_counter(masm); G1BarrierSetAssembler* g1_asm = static_cast(BarrierSet::barrier_set()->barrier_set_assembler()); - G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node); - g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, tmp1, tmp2, stub); + bool new_val_may_be_null = G1BarrierStubC2::post_new_val_may_be_null(node); + g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, tmp1, new_val_may_be_null); } %} @@ -95,8 +94,7 @@ instruct g1StoreP(memory mem, any_RegP src, rRegP tmp1, rRegP tmp2, rRegP tmp3, write_barrier_post(masm, this, $tmp1$$Register /* store_addr */, $src$$Register /* new_val */, - $tmp3$$Register /* tmp1 */, - $tmp2$$Register /* tmp2 */); + $tmp3$$Register /* tmp1 */); %} ins_pipe(ialu_mem_reg); %} @@ -127,8 +125,7 @@ instruct g1StoreN(memory mem, rRegN src, rRegP tmp1, rRegP tmp2, rRegP tmp3, rFl write_barrier_post(masm, this, $tmp1$$Register /* store_addr */, $tmp2$$Register /* new_val */, - $tmp3$$Register /* tmp1 */, - $tmp2$$Register /* tmp2 */); + $tmp3$$Register /* tmp1 */); %} ins_pipe(ialu_mem_reg); %} @@ -158,8 +155,7 @@ instruct g1EncodePAndStoreN(memory mem, any_RegP src, rRegP tmp1, rRegP tmp2, rR write_barrier_post(masm, this, $tmp1$$Register /* store_addr */, $src$$Register /* new_val */, - $tmp3$$Register /* tmp1 */, - $tmp2$$Register /* tmp2 */); + $tmp3$$Register /* tmp1 */); %} ins_pipe(ialu_mem_reg); %} @@ -187,8 +183,7 @@ instruct g1CompareAndExchangeP(indirect mem, rRegP newval, rRegP tmp1, rRegP tmp write_barrier_post(masm, this, $mem$$Register /* store_addr */, $tmp1$$Register /* new_val */, - $tmp2$$Register /* tmp1 */, - $tmp3$$Register /* tmp2 */); + $tmp2$$Register /* tmp1 */); %} ins_pipe(pipe_cmpxchg); %} @@ -214,8 +209,7 @@ instruct g1CompareAndExchangeN(indirect mem, rRegN newval, rRegP tmp1, rRegP tmp write_barrier_post(masm, this, $mem$$Register /* store_addr */, $tmp1$$Register /* new_val */, - $tmp2$$Register /* tmp1 */, - $tmp3$$Register /* tmp2 */); + $tmp2$$Register /* tmp1 */); %} ins_pipe(pipe_cmpxchg); %} @@ -246,8 +240,7 @@ instruct g1CompareAndSwapP(rRegI res, indirect mem, rRegP newval, rRegP tmp1, rR write_barrier_post(masm, this, $mem$$Register /* store_addr */, $tmp1$$Register /* new_val */, - $tmp2$$Register /* tmp1 */, - $tmp3$$Register /* tmp2 */); + $tmp2$$Register /* tmp1 */); %} ins_pipe(pipe_cmpxchg); %} @@ -279,8 +272,7 @@ instruct g1CompareAndSwapN(rRegI res, indirect mem, rRegN newval, rRegP tmp1, rR write_barrier_post(masm, this, $mem$$Register /* store_addr */, $tmp1$$Register /* new_val */, - $tmp2$$Register /* tmp1 */, - $tmp3$$Register /* tmp2 */); + $tmp2$$Register /* 
tmp1 */); %} ins_pipe(pipe_cmpxchg); %} @@ -303,8 +295,7 @@ instruct g1GetAndSetP(indirect mem, rRegP newval, rRegP tmp1, rRegP tmp2, rRegP write_barrier_post(masm, this, $mem$$Register /* store_addr */, $tmp1$$Register /* new_val */, - $tmp2$$Register /* tmp1 */, - $tmp3$$Register /* tmp2 */); + $tmp2$$Register /* tmp1 */); %} ins_pipe(pipe_cmpxchg); %} @@ -328,8 +319,7 @@ instruct g1GetAndSetN(indirect mem, rRegN newval, rRegP tmp1, rRegP tmp2, rRegP write_barrier_post(masm, this, $mem$$Register /* store_addr */, $tmp1$$Register /* new_val */, - $tmp2$$Register /* tmp1 */, - $tmp3$$Register /* tmp2 */); + $tmp2$$Register /* tmp1 */); %} ins_pipe(pipe_cmpxchg); %} diff --git a/src/hotspot/share/code/aotCodeCache.cpp b/src/hotspot/share/code/aotCodeCache.cpp index a24bae03137..04776f4c16c 100644 --- a/src/hotspot/share/code/aotCodeCache.cpp +++ b/src/hotspot/share/code/aotCodeCache.cpp @@ -1365,7 +1365,6 @@ void AOTCodeAddressTable::init_extrs() { #endif // COMPILER2 #if INCLUDE_G1GC - SET_ADDRESS(_extrs, G1BarrierSetRuntime::write_ref_field_post_entry); SET_ADDRESS(_extrs, G1BarrierSetRuntime::write_ref_field_pre_entry); #endif #if INCLUDE_SHENANDOAHGC diff --git a/src/hotspot/share/gc/g1/c1/g1BarrierSetC1.cpp b/src/hotspot/share/gc/g1/c1/g1BarrierSetC1.cpp index 425be474602..51c8a53b54a 100644 --- a/src/hotspot/share/gc/g1/c1/g1BarrierSetC1.cpp +++ b/src/hotspot/share/gc/g1/c1/g1BarrierSetC1.cpp @@ -23,12 +23,15 @@ */ #include "c1/c1_CodeStubs.hpp" +#include "c1/c1_LIRAssembler.hpp" #include "c1/c1_LIRGenerator.hpp" +#include "c1/c1_MacroAssembler.hpp" #include "gc/g1/c1/g1BarrierSetC1.hpp" #include "gc/g1/g1BarrierSet.hpp" #include "gc/g1/g1BarrierSetAssembler.hpp" #include "gc/g1/g1HeapRegion.hpp" #include "gc/g1/g1ThreadLocalData.hpp" +#include "utilities/formatBuffer.hpp" #include "utilities/macros.hpp" #ifdef ASSERT @@ -42,11 +45,6 @@ void G1PreBarrierStub::emit_code(LIR_Assembler* ce) { bs->gen_pre_barrier_stub(ce, this); } -void G1PostBarrierStub::emit_code(LIR_Assembler* ce) { - G1BarrierSetAssembler* bs = (G1BarrierSetAssembler*)BarrierSet::barrier_set()->barrier_set_assembler(); - bs->gen_post_barrier_stub(ce, this); -} - void G1BarrierSetC1::pre_barrier(LIRAccess& access, LIR_Opr addr_opr, LIR_Opr pre_val, CodeEmitInfo* info) { LIRGenerator* gen = access.gen(); @@ -114,6 +112,87 @@ void G1BarrierSetC1::pre_barrier(LIRAccess& access, LIR_Opr addr_opr, __ branch_destination(slow->continuation()); } +class LIR_OpG1PostBarrier : public LIR_Op { + friend class LIR_OpVisitState; + +private: + LIR_Opr _addr; + LIR_Opr _new_val; + LIR_Opr _thread; + LIR_Opr _tmp1; + LIR_Opr _tmp2; + +public: + LIR_OpG1PostBarrier(LIR_Opr addr, + LIR_Opr new_val, + LIR_Opr thread, + LIR_Opr tmp1, + LIR_Opr tmp2) + : LIR_Op(lir_none, lir_none, nullptr), + _addr(addr), + _new_val(new_val), + _thread(thread), + _tmp1(tmp1), + _tmp2(tmp2) + {} + + virtual void visit(LIR_OpVisitState* state) { + state->do_input(_addr); + state->do_input(_new_val); + state->do_input(_thread); + + // Use temps to enforce different registers. 
+ state->do_temp(_addr); + state->do_temp(_new_val); + state->do_temp(_thread); + state->do_temp(_tmp1); + state->do_temp(_tmp2); + + if (_info != nullptr) { + state->do_info(_info); + } + } + + virtual void emit_code(LIR_Assembler* ce) { + if (_info != nullptr) { + ce->add_debug_info_for_null_check_here(_info); + } + + Register addr = _addr->as_pointer_register(); + Register new_val = _new_val->as_pointer_register(); + Register thread = _thread->as_pointer_register(); + Register tmp1 = _tmp1->as_pointer_register(); + Register tmp2 = _tmp2->as_pointer_register(); + + // This may happen for a store of x.a = x - we do not need a post barrier for those + // as the cross-region test will always exit early anyway. + // The post barrier implementations can assume that addr and new_val are different + // then. + if (addr == new_val) { + ce->masm()->block_comment(err_msg("same addr/new_val due to self-referential store with imprecise card mark %s", addr->name())); + return; + } + + G1BarrierSetAssembler* bs_asm = static_cast(BarrierSet::barrier_set()->barrier_set_assembler()); + bs_asm->g1_write_barrier_post_c1(ce->masm(), addr, new_val, thread, tmp1, tmp2); + } + + virtual void print_instr(outputStream* out) const { + _addr->print(out); out->print(" "); + _new_val->print(out); out->print(" "); + _thread->print(out); out->print(" "); + _tmp1->print(out); out->print(" "); + _tmp2->print(out); out->print(" "); + out->cr(); + } + +#ifndef PRODUCT + virtual const char* name() const { + return "lir_g1_post_barrier"; + } +#endif // PRODUCT +}; + void G1BarrierSetC1::post_barrier(LIRAccess& access, LIR_Opr addr, LIR_Opr new_val) { LIRGenerator* gen = access.gen(); DecoratorSet decorators = access.decorators(); @@ -150,29 +229,11 @@ void G1BarrierSetC1::post_barrier(LIRAccess& access, LIR_Opr addr, LIR_Opr new_v } assert(addr->is_register(), "must be a register at this point"); - LIR_Opr xor_res = gen->new_pointer_register(); - LIR_Opr xor_shift_res = gen->new_pointer_register(); - if (two_operand_lir_form) { - __ move(addr, xor_res); - __ logical_xor(xor_res, new_val, xor_res); - __ move(xor_res, xor_shift_res); - __ unsigned_shift_right(xor_shift_res, - LIR_OprFact::intConst(checked_cast(G1HeapRegion::LogOfHRGrainBytes)), - xor_shift_res, - LIR_Opr::illegalOpr()); - } else { - __ logical_xor(addr, new_val, xor_res); - __ unsigned_shift_right(xor_res, - LIR_OprFact::intConst(checked_cast(G1HeapRegion::LogOfHRGrainBytes)), - xor_shift_res, - LIR_Opr::illegalOpr()); - } - - __ cmp(lir_cond_notEqual, xor_shift_res, LIR_OprFact::intptrConst(NULL_WORD)); - - CodeStub* slow = new G1PostBarrierStub(addr, new_val); - __ branch(lir_cond_notEqual, slow); - __ branch_destination(slow->continuation()); + __ append(new LIR_OpG1PostBarrier(addr, + new_val, + gen->getThreadPointer() /* thread */, + gen->new_pointer_register() /* tmp1 */, + gen->new_pointer_register() /* tmp2 */)); } void G1BarrierSetC1::load_at_resolved(LIRAccess& access, LIR_Opr result) { @@ -207,20 +268,9 @@ class C1G1PreBarrierCodeGenClosure : public StubAssemblerCodeGenClosure { } }; -class C1G1PostBarrierCodeGenClosure : public StubAssemblerCodeGenClosure { - virtual OopMapSet* generate_code(StubAssembler* sasm) { - G1BarrierSetAssembler* bs = (G1BarrierSetAssembler*)BarrierSet::barrier_set()->barrier_set_assembler(); - bs->generate_c1_post_barrier_runtime_stub(sasm); - return nullptr; - } -}; - bool G1BarrierSetC1::generate_c1_runtime_stubs(BufferBlob* buffer_blob) { C1G1PreBarrierCodeGenClosure pre_code_gen_cl; - C1G1PostBarrierCodeGenClosure 
post_code_gen_cl; _pre_barrier_c1_runtime_code_blob = Runtime1::generate_blob(buffer_blob, StubId::NO_STUBID, "g1_pre_barrier_slow", false, &pre_code_gen_cl); - _post_barrier_c1_runtime_code_blob = Runtime1::generate_blob(buffer_blob, StubId::NO_STUBID, "g1_post_barrier_slow", - false, &post_code_gen_cl); - return _pre_barrier_c1_runtime_code_blob != nullptr && _post_barrier_c1_runtime_code_blob != nullptr; + return _pre_barrier_c1_runtime_code_blob != nullptr; } diff --git a/src/hotspot/share/gc/g1/c1/g1BarrierSetC1.hpp b/src/hotspot/share/gc/g1/c1/g1BarrierSetC1.hpp index 4baaf8ac58c..89f5676a2d2 100644 --- a/src/hotspot/share/gc/g1/c1/g1BarrierSetC1.hpp +++ b/src/hotspot/share/gc/g1/c1/g1BarrierSetC1.hpp @@ -91,40 +91,11 @@ class G1PreBarrierStub: public CodeStub { #endif // PRODUCT }; -class G1PostBarrierStub: public CodeStub { - friend class G1BarrierSetC1; - private: - LIR_Opr _addr; - LIR_Opr _new_val; - - public: - // addr (the address of the object head) and new_val must be registers. - G1PostBarrierStub(LIR_Opr addr, LIR_Opr new_val): _addr(addr), _new_val(new_val) { - FrameMap* f = Compilation::current()->frame_map(); - f->update_reserved_argument_area_size(2 * BytesPerWord); - } - - LIR_Opr addr() const { return _addr; } - LIR_Opr new_val() const { return _new_val; } - - virtual void emit_code(LIR_Assembler* e); - virtual void visit(LIR_OpVisitState* visitor) { - // don't pass in the code emit info since it's processed in the fast path - visitor->do_slow_case(); - visitor->do_input(_addr); - visitor->do_input(_new_val); - } -#ifndef PRODUCT - virtual void print_name(outputStream* out) const { out->print("G1PostBarrierStub"); } -#endif // PRODUCT -}; - class CodeBlob; class G1BarrierSetC1 : public ModRefBarrierSetC1 { protected: CodeBlob* _pre_barrier_c1_runtime_code_blob; - CodeBlob* _post_barrier_c1_runtime_code_blob; virtual void pre_barrier(LIRAccess& access, LIR_Opr addr_opr, LIR_Opr pre_val, CodeEmitInfo* info); @@ -134,11 +105,9 @@ class G1BarrierSetC1 : public ModRefBarrierSetC1 { public: G1BarrierSetC1() - : _pre_barrier_c1_runtime_code_blob(nullptr), - _post_barrier_c1_runtime_code_blob(nullptr) {} + : _pre_barrier_c1_runtime_code_blob(nullptr) {} CodeBlob* pre_barrier_c1_runtime_code_blob() { return _pre_barrier_c1_runtime_code_blob; } - CodeBlob* post_barrier_c1_runtime_code_blob() { return _post_barrier_c1_runtime_code_blob; } virtual bool generate_c1_runtime_stubs(BufferBlob* buffer_blob); }; diff --git a/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp b/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp index bca2255479b..61402301eb1 100644 --- a/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp +++ b/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp @@ -298,7 +298,13 @@ uint G1BarrierSetC2::estimated_barrier_size(const Node* node) const { nodes += 6; } if ((barrier_data & G1C2BarrierPost) != 0) { - nodes += 60; + // Approximate the number of nodes needed; an if costs 4 nodes (Cmp, Bool, + // If, If projection), any other (Assembly) instruction is approximated with + // a cost of 1. + nodes += 4 // base cost for the card write containing getting base offset, address calculation and the card write; + + 6 // same region check: Uncompress (new_val) oop, xor, shr, (cmp), jmp + + 4 // new_val is null check + + (UseCondCardMark ? 4 : 0); // card not clean check. 
} return nodes; } @@ -386,8 +392,9 @@ public: } bool needs_liveness_data(const MachNode* mach) const { - return G1PreBarrierStubC2::needs_barrier(mach) || - G1PostBarrierStubC2::needs_barrier(mach); + // Liveness data is only required to compute registers that must be preserved + // across the runtime call in the pre-barrier stub. + return G1BarrierStubC2::needs_pre_barrier(mach); } bool needs_livein_data() const { @@ -401,10 +408,22 @@ static G1BarrierSetC2State* barrier_set_state() { G1BarrierStubC2::G1BarrierStubC2(const MachNode* node) : BarrierStubC2(node) {} +bool G1BarrierStubC2::needs_pre_barrier(const MachNode* node) { + return (node->barrier_data() & G1C2BarrierPre) != 0; +} + +bool G1BarrierStubC2::needs_post_barrier(const MachNode* node) { + return (node->barrier_data() & G1C2BarrierPost) != 0; +} + +bool G1BarrierStubC2::post_new_val_may_be_null(const MachNode* node) { + return (node->barrier_data() & G1C2BarrierPostNotNull) == 0; +} + G1PreBarrierStubC2::G1PreBarrierStubC2(const MachNode* node) : G1BarrierStubC2(node) {} bool G1PreBarrierStubC2::needs_barrier(const MachNode* node) { - return (node->barrier_data() & G1C2BarrierPre) != 0; + return needs_pre_barrier(node); } G1PreBarrierStubC2* G1PreBarrierStubC2::create(const MachNode* node) { @@ -448,48 +467,6 @@ void G1PreBarrierStubC2::emit_code(MacroAssembler& masm) { bs->generate_c2_pre_barrier_stub(&masm, this); } -G1PostBarrierStubC2::G1PostBarrierStubC2(const MachNode* node) : G1BarrierStubC2(node) {} - -bool G1PostBarrierStubC2::needs_barrier(const MachNode* node) { - return (node->barrier_data() & G1C2BarrierPost) != 0; -} - -G1PostBarrierStubC2* G1PostBarrierStubC2::create(const MachNode* node) { - G1PostBarrierStubC2* const stub = new (Compile::current()->comp_arena()) G1PostBarrierStubC2(node); - if (!Compile::current()->output()->in_scratch_emit_size()) { - barrier_set_state()->stubs()->append(stub); - } - return stub; -} - -void G1PostBarrierStubC2::initialize_registers(Register thread, Register tmp1, Register tmp2, Register tmp3) { - _thread = thread; - _tmp1 = tmp1; - _tmp2 = tmp2; - _tmp3 = tmp3; -} - -Register G1PostBarrierStubC2::thread() const { - return _thread; -} - -Register G1PostBarrierStubC2::tmp1() const { - return _tmp1; -} - -Register G1PostBarrierStubC2::tmp2() const { - return _tmp2; -} - -Register G1PostBarrierStubC2::tmp3() const { - return _tmp3; -} - -void G1PostBarrierStubC2::emit_code(MacroAssembler& masm) { - G1BarrierSetAssembler* bs = static_cast(BarrierSet::barrier_set()->barrier_set_assembler()); - bs->generate_c2_post_barrier_stub(&masm, this); -} - void* G1BarrierSetC2::create_barrier_state(Arena* comp_arena) const { return new (comp_arena) G1BarrierSetC2State(comp_arena); } diff --git a/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.hpp b/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.hpp index 5f85714d889..601d0f1138e 100644 --- a/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.hpp +++ b/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.hpp @@ -37,6 +37,10 @@ const int G1C2BarrierPostNotNull = 4; class G1BarrierStubC2 : public BarrierStubC2 { public: + static bool needs_pre_barrier(const MachNode* node); + static bool needs_post_barrier(const MachNode* node); + static bool post_new_val_may_be_null(const MachNode* node); + G1BarrierStubC2(const MachNode* node); virtual void emit_code(MacroAssembler& masm) = 0; }; @@ -64,27 +68,6 @@ public: virtual void emit_code(MacroAssembler& masm); }; -class G1PostBarrierStubC2 : public G1BarrierStubC2 { -private: - Register _thread; - Register _tmp1; - Register _tmp2; 
- Register _tmp3; - -protected: - G1PostBarrierStubC2(const MachNode* node); - -public: - static bool needs_barrier(const MachNode* node); - static G1PostBarrierStubC2* create(const MachNode* node); - void initialize_registers(Register thread, Register tmp1 = noreg, Register tmp2 = noreg, Register tmp3 = noreg); - Register thread() const; - Register tmp1() const; - Register tmp2() const; - Register tmp3() const; - virtual void emit_code(MacroAssembler& masm); -}; - class G1BarrierSetC2: public CardTableBarrierSetC2 { private: void analyze_dominating_barriers() const; diff --git a/src/hotspot/share/gc/g1/g1Allocator.cpp b/src/hotspot/share/gc/g1/g1Allocator.cpp index 7f2916ae895..713bafd4782 100644 --- a/src/hotspot/share/gc/g1/g1Allocator.cpp +++ b/src/hotspot/share/gc/g1/g1Allocator.cpp @@ -262,9 +262,6 @@ HeapWord* G1Allocator::survivor_attempt_allocation(uint node_index, } } } - if (result != nullptr) { - _g1h->dirty_young_block(result, *actual_word_size); - } return result; } diff --git a/src/hotspot/share/gc/g1/g1Analytics.cpp b/src/hotspot/share/gc/g1/g1Analytics.cpp index 8fe0b25ceb7..6e7f46ca1d1 100644 --- a/src/hotspot/share/gc/g1/g1Analytics.cpp +++ b/src/hotspot/share/gc/g1/g1Analytics.cpp @@ -37,12 +37,10 @@ // They were chosen by running GCOld and SPECjbb on debris with different // numbers of GC threads and choosing them based on the results -static double cost_per_logged_card_ms_defaults[] = { - 0.01, 0.005, 0.005, 0.003, 0.003, 0.002, 0.002, 0.0015 -}; +static double cost_per_pending_card_ms_default = 0.01; // all the same -static double young_card_scan_to_merge_ratio_defaults[] = { +static double young_card_merge_to_scan_ratio_defaults[] = { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 }; @@ -78,8 +76,7 @@ G1Analytics::G1Analytics(const G1Predictions* predictor) : _concurrent_gc_cpu_time_ms(), _concurrent_refine_rate_ms_seq(TruncatedSeqLength), _dirtied_cards_rate_ms_seq(TruncatedSeqLength), - _dirtied_cards_in_thread_buffers_seq(TruncatedSeqLength), - _card_scan_to_merge_ratio_seq(TruncatedSeqLength), + _card_merge_to_scan_ratio_seq(TruncatedSeqLength), _cost_per_card_scan_ms_seq(TruncatedSeqLength), _cost_per_card_merge_ms_seq(TruncatedSeqLength), _cost_per_code_root_ms_seq(TruncatedSeqLength), @@ -87,6 +84,7 @@ G1Analytics::G1Analytics(const G1Predictions* predictor) : _pending_cards_seq(TruncatedSeqLength), _card_rs_length_seq(TruncatedSeqLength), _code_root_rs_length_seq(TruncatedSeqLength), + _merge_refinement_table_ms_seq(TruncatedSeqLength), _constant_other_time_ms_seq(TruncatedSeqLength), _young_other_cost_per_region_ms_seq(TruncatedSeqLength), _non_young_other_cost_per_region_ms_seq(TruncatedSeqLength), @@ -100,17 +98,17 @@ G1Analytics::G1Analytics(const G1Predictions* predictor) : uint index = MIN2(ParallelGCThreads - 1, 7u); - // Start with inverse of maximum STW cost. - _concurrent_refine_rate_ms_seq.add(1/cost_per_logged_card_ms_defaults[0]); - // Some applications have very low rates for logging cards. + _concurrent_refine_rate_ms_seq.add(1 / cost_per_pending_card_ms_default); + // Some applications have very low rates for dirtying cards. 
_dirtied_cards_rate_ms_seq.add(0.0); - _card_scan_to_merge_ratio_seq.set_initial(young_card_scan_to_merge_ratio_defaults[index]); + _card_merge_to_scan_ratio_seq.set_initial(young_card_merge_to_scan_ratio_defaults[index]); _cost_per_card_scan_ms_seq.set_initial(young_only_cost_per_card_scan_ms_defaults[index]); _card_rs_length_seq.set_initial(0); _code_root_rs_length_seq.set_initial(0); _cost_per_byte_copied_ms_seq.set_initial(cost_per_byte_ms_defaults[index]); + _merge_refinement_table_ms_seq.add(0); _constant_other_time_ms_seq.add(constant_other_time_ms_defaults[index]); _young_other_cost_per_region_ms_seq.add(young_other_cost_per_region_ms_defaults[index]); _non_young_other_cost_per_region_ms_seq.add(non_young_other_cost_per_region_ms_defaults[index]); @@ -196,10 +194,6 @@ void G1Analytics::report_dirtied_cards_rate_ms(double cards_per_ms) { _dirtied_cards_rate_ms_seq.add(cards_per_ms); } -void G1Analytics::report_dirtied_cards_in_thread_buffers(size_t cards) { - _dirtied_cards_in_thread_buffers_seq.add(double(cards)); -} - void G1Analytics::report_cost_per_card_scan_ms(double cost_per_card_ms, bool for_young_only_phase) { _cost_per_card_scan_ms_seq.add(cost_per_card_ms, for_young_only_phase); } @@ -212,8 +206,8 @@ void G1Analytics::report_cost_per_code_root_scan_ms(double cost_per_code_root_ms _cost_per_code_root_ms_seq.add(cost_per_code_root_ms, for_young_only_phase); } -void G1Analytics::report_card_scan_to_merge_ratio(double merge_to_scan_ratio, bool for_young_only_phase) { - _card_scan_to_merge_ratio_seq.add(merge_to_scan_ratio, for_young_only_phase); +void G1Analytics::report_card_merge_to_scan_ratio(double merge_to_scan_ratio, bool for_young_only_phase) { + _card_merge_to_scan_ratio_seq.add(merge_to_scan_ratio, for_young_only_phase); } void G1Analytics::report_cost_per_byte_ms(double cost_per_byte_ms, bool for_young_only_phase) { @@ -228,6 +222,10 @@ void G1Analytics::report_non_young_other_cost_per_region_ms(double other_cost_pe _non_young_other_cost_per_region_ms_seq.add(other_cost_per_region_ms); } +void G1Analytics::report_merge_refinement_table_time_ms(double merge_refinement_table_time_ms) { + _merge_refinement_table_ms_seq.add(merge_refinement_table_time_ms); +} + void G1Analytics::report_constant_other_time_ms(double constant_other_time_ms) { _constant_other_time_ms_seq.add(constant_other_time_ms); } @@ -260,12 +258,8 @@ double G1Analytics::predict_dirtied_cards_rate_ms() const { return predict_zero_bounded(&_dirtied_cards_rate_ms_seq); } -size_t G1Analytics::predict_dirtied_cards_in_thread_buffers() const { - return predict_size(&_dirtied_cards_in_thread_buffers_seq); -} - size_t G1Analytics::predict_scan_card_num(size_t card_rs_length, bool for_young_only_phase) const { - return card_rs_length * predict_in_unit_interval(&_card_scan_to_merge_ratio_seq, for_young_only_phase); + return card_rs_length * predict_in_unit_interval(&_card_merge_to_scan_ratio_seq, for_young_only_phase); } double G1Analytics::predict_card_merge_time_ms(size_t card_num, bool for_young_only_phase) const { @@ -284,6 +278,10 @@ double G1Analytics::predict_object_copy_time_ms(size_t bytes_to_copy, bool for_y return bytes_to_copy * predict_zero_bounded(&_cost_per_byte_copied_ms_seq, for_young_only_phase); } +double G1Analytics::predict_merge_refinement_table_time_ms() const { + return predict_zero_bounded(&_merge_refinement_table_ms_seq); +} + double G1Analytics::predict_constant_other_time_ms() const { return predict_zero_bounded(&_constant_other_time_ms_seq); } diff --git 
a/src/hotspot/share/gc/g1/g1Analytics.hpp b/src/hotspot/share/gc/g1/g1Analytics.hpp index e5e2dd74101..1f609815632 100644 --- a/src/hotspot/share/gc/g1/g1Analytics.hpp +++ b/src/hotspot/share/gc/g1/g1Analytics.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2016, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -56,14 +56,13 @@ class G1Analytics: public CHeapObj { TruncatedSeq _concurrent_refine_rate_ms_seq; TruncatedSeq _dirtied_cards_rate_ms_seq; - TruncatedSeq _dirtied_cards_in_thread_buffers_seq; - // The ratio between the number of scanned cards and actually merged cards, for - // young-only and mixed gcs. - G1PhaseDependentSeq _card_scan_to_merge_ratio_seq; + // The ratio between the number of merged cards to actually scanned cards for + // card based remembered sets, for young-only and mixed gcs. + G1PhaseDependentSeq _card_merge_to_scan_ratio_seq; // The cost to scan a card during young-only and mixed gcs in ms. G1PhaseDependentSeq _cost_per_card_scan_ms_seq; - // The cost to merge a card during young-only and mixed gcs in ms. + // The cost to merge a card from the remembered sets for non-young regions in ms. G1PhaseDependentSeq _cost_per_card_merge_ms_seq; // The cost to scan entries in the code root remembered set in ms. G1PhaseDependentSeq _cost_per_code_root_ms_seq; @@ -74,6 +73,8 @@ class G1Analytics: public CHeapObj { G1PhaseDependentSeq _card_rs_length_seq; G1PhaseDependentSeq _code_root_rs_length_seq; + // Prediction for merging the refinement table to the card table during GC. + TruncatedSeq _merge_refinement_table_ms_seq; TruncatedSeq _constant_other_time_ms_seq; TruncatedSeq _young_other_cost_per_region_ms_seq; TruncatedSeq _non_young_other_cost_per_region_ms_seq; @@ -149,14 +150,14 @@ public: void report_alloc_rate_ms(double alloc_rate); void report_concurrent_refine_rate_ms(double cards_per_ms); void report_dirtied_cards_rate_ms(double cards_per_ms); - void report_dirtied_cards_in_thread_buffers(size_t num_cards); void report_cost_per_card_scan_ms(double cost_per_remset_card_ms, bool for_young_only_phase); void report_cost_per_card_merge_ms(double cost_per_card_ms, bool for_young_only_phase); void report_cost_per_code_root_scan_ms(double cost_per_code_root_ms, bool for_young_only_phase); - void report_card_scan_to_merge_ratio(double cards_per_entry_ratio, bool for_young_only_phase); + void report_card_merge_to_scan_ratio(double merge_to_scan_ratio, bool for_young_only_phase); void report_cost_per_byte_ms(double cost_per_byte_ms, bool for_young_only_phase); void report_young_other_cost_per_region_ms(double other_cost_per_region_ms); void report_non_young_other_cost_per_region_ms(double other_cost_per_region_ms); + void report_merge_refinement_table_time_ms(double pending_card_merge_time_ms); void report_constant_other_time_ms(double constant_other_time_ms); void report_pending_cards(double pending_cards, bool for_young_only_phase); void report_card_rs_length(double card_rs_length, bool for_young_only_phase); @@ -167,7 +168,6 @@ public: double predict_concurrent_refine_rate_ms() const; double predict_dirtied_cards_rate_ms() const; - size_t predict_dirtied_cards_in_thread_buffers() const; // Predict how many of the given remembered set of length card_rs_length will add to // the number of total cards scanned. 
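For orientation, the renamed _card_merge_to_scan_ratio_seq above feeds predict_scan_card_num(), which scales the remembered-set length by a ratio bounded to the unit interval. The following is a minimal standalone sketch of that kind of bounded-ratio prediction, not HotSpot code; RatioSeq and predict_scan_cards are made-up names and a plain running average stands in for the real predictor.

#include <algorithm>
#include <cstddef>
#include <cstdio>

class RatioSeq {
  double _sum = 0.0;
  size_t _num = 0;
public:
  void add(double v) { _sum += v; _num++; }
  // Average clamped to [0,1], mirroring the idea of predict_in_unit_interval().
  double predict_in_unit_interval() const {
    double avg = (_num == 0) ? 1.0 : _sum / _num;
    return std::clamp(avg, 0.0, 1.0);
  }
};

static size_t predict_scan_cards(size_t card_rs_length, const RatioSeq& ratio) {
  // Predicted number of scanned cards = remembered set length * merge-to-scan ratio.
  return (size_t)((double)card_rs_length * ratio.predict_in_unit_interval());
}

int main() {
  RatioSeq r;
  r.add(0.7);  // in one GC, 70% of merged cards were actually scanned
  r.add(0.9);
  std::printf("%zu\n", predict_scan_cards(10000, r));  // roughly 8000
  return 0;
}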
@@ -180,6 +180,7 @@ public: double predict_object_copy_time_ms(size_t bytes_to_copy, bool for_young_only_phase) const; + double predict_merge_refinement_table_time_ms() const; double predict_constant_other_time_ms() const; double predict_young_other_time_ms(size_t young_num) const; diff --git a/src/hotspot/share/gc/g1/g1Arguments.cpp b/src/hotspot/share/gc/g1/g1Arguments.cpp index ee91c327337..5cbafd2ae94 100644 --- a/src/hotspot/share/gc/g1/g1Arguments.cpp +++ b/src/hotspot/share/gc/g1/g1Arguments.cpp @@ -68,6 +68,12 @@ void G1Arguments::initialize_alignments() { if (FLAG_IS_DEFAULT(G1EagerReclaimRemSetThreshold)) { FLAG_SET_ERGO(G1EagerReclaimRemSetThreshold, G1RemSetArrayOfCardsEntries); } + // G1 prefers to use conditional card marking to avoid overwriting cards that + // have already been found to contain a to-collection set reference. This reduces + // refinement effort. + if (FLAG_IS_DEFAULT(UseCondCardMark)) { + FLAG_SET_ERGO(UseCondCardMark, true); + } } size_t G1Arguments::conservative_max_heap_alignment() { @@ -241,9 +247,8 @@ void G1Arguments::initialize() { // Verify that the maximum parallelism isn't too high to eventually overflow // the refcount in G1CardSetContainer. - uint max_parallel_refinement_threads = G1ConcRefinementThreads + G1DirtyCardQueueSet::num_par_ids(); uint const divisor = 3; // Safe divisor; we increment by 2 for each claim, but there is a small initial value. - if (max_parallel_refinement_threads > UINT_MAX / divisor) { + if (G1ConcRefinementThreads > UINT_MAX / divisor) { vm_exit_during_initialization("Too large parallelism for remembered sets."); } diff --git a/src/hotspot/share/gc/g1/g1BarrierSet.cpp b/src/hotspot/share/gc/g1/g1BarrierSet.cpp index c56434340cd..ab7d6febf4c 100644 --- a/src/hotspot/share/gc/g1/g1BarrierSet.cpp +++ b/src/hotspot/share/gc/g1/g1BarrierSet.cpp @@ -32,12 +32,14 @@ #include "gc/g1/g1ThreadLocalData.hpp" #include "gc/shared/satbMarkQueue.hpp" #include "logging/log.hpp" +#include "memory/iterator.hpp" #include "oops/access.inline.hpp" #include "oops/compressedOops.inline.hpp" #include "oops/oop.inline.hpp" #include "runtime/interfaceSupport.inline.hpp" #include "runtime/javaThread.hpp" #include "runtime/orderAccess.hpp" +#include "runtime/threads.hpp" #include "utilities/macros.hpp" #ifdef COMPILER1 #include "gc/g1/c1/g1BarrierSetC1.hpp" @@ -49,18 +51,38 @@ class G1BarrierSetC1; class G1BarrierSetC2; -G1BarrierSet::G1BarrierSet(G1CardTable* card_table) : +G1BarrierSet::G1BarrierSet(G1CardTable* card_table, + G1CardTable* refinement_table) : CardTableBarrierSet(make_barrier_set_assembler(), make_barrier_set_c1(), make_barrier_set_c2(), card_table, BarrierSet::FakeRtti(BarrierSet::G1BarrierSet)), _satb_mark_queue_buffer_allocator("SATB Buffer Allocator", G1SATBBufferSize), - _dirty_card_queue_buffer_allocator("DC Buffer Allocator", G1UpdateBufferSize), _satb_mark_queue_set(&_satb_mark_queue_buffer_allocator), - _dirty_card_queue_set(&_dirty_card_queue_buffer_allocator) + _refinement_table(refinement_table) {} +G1BarrierSet::~G1BarrierSet() { + delete _refinement_table; +} + +void G1BarrierSet::swap_global_card_table() { + G1CardTable* temp = static_cast(_card_table); + _card_table = _refinement_table; + _refinement_table = temp; +} + +void G1BarrierSet::update_card_table_base(Thread* thread) { +#ifdef ASSERT + { + ResourceMark rm; + assert(thread->is_Java_thread(), "may only update card table base of JavaThreads, not %s", thread->name()); + } +#endif + G1ThreadLocalData::set_byte_map_base(thread, _card_table->byte_map_base()); +} 
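The swap_global_card_table() and update_card_table_base() changes above amount to flipping a pointer pair and then re-pointing each Java thread's cached table base. A minimal standalone model of that scheme follows; MockThread and TwoTableBarrierSet are illustrative stand-ins, not the HotSpot types, and the per-thread update is done here with a simple loop where HotSpot uses a handshake or safepoint.

#include <cstdint>
#include <vector>

using CardValue = uint8_t;

struct MockThread {
  CardValue* byte_map_base;   // per-thread cached base used by the write barrier
};

struct TwoTableBarrierSet {
  CardValue* _card_table;        // table the mutator currently dirties
  CardValue* _refinement_table;  // table the refinement threads sweep

  void swap_global_card_table() {
    CardValue* tmp = _card_table;
    _card_table = _refinement_table;
    _refinement_table = tmp;
  }

  // Model of re-pointing every thread's cached base after a swap.
  void update_card_table_bases(std::vector<MockThread>& threads) {
    for (MockThread& t : threads) {
      t.byte_map_base = _card_table;
    }
  }
};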
+ template void G1BarrierSet::write_ref_array_pre_work(T* dst, size_t count) { G1SATBMarkQueueSet& queue_set = G1BarrierSet::satb_mark_queue_set(); @@ -89,28 +111,14 @@ void G1BarrierSet::write_ref_array_pre(narrowOop* dst, size_t count, bool dest_u } } -void G1BarrierSet::write_ref_field_post_slow(volatile CardValue* byte) { - // In the slow path, we know a card is not young - assert(*byte != G1CardTable::g1_young_card_val(), "slow path invoked without filtering"); - OrderAccess::storeload(); - if (*byte != G1CardTable::dirty_card_val()) { - *byte = G1CardTable::dirty_card_val(); - Thread* thr = Thread::current(); - G1DirtyCardQueue& queue = G1ThreadLocalData::dirty_card_queue(thr); - G1BarrierSet::dirty_card_queue_set().enqueue(queue, byte); - } -} - void G1BarrierSet::write_region(JavaThread* thread, MemRegion mr) { if (mr.is_empty()) { return; } - volatile CardValue* byte = _card_table->byte_for(mr.start()); - CardValue* last_byte = _card_table->byte_for(mr.last()); - // skip young gen cards - if (*byte == G1CardTable::g1_young_card_val()) { - // MemRegion should not span multiple regions for the young gen. + // Skip writes to young gen. + if (G1CollectedHeap::heap()->heap_region_containing(mr.start())->is_young()) { + // MemRegion should not span multiple regions for arrays in young gen. DEBUG_ONLY(G1HeapRegion* containing_hr = G1CollectedHeap::heap()->heap_region_containing(mr.start());) assert(containing_hr->is_young(), "it should be young"); assert(containing_hr->is_in(mr.start()), "it should contain start"); @@ -118,16 +126,25 @@ void G1BarrierSet::write_region(JavaThread* thread, MemRegion mr) { return; } - OrderAccess::storeload(); - // Enqueue if necessary. - G1DirtyCardQueueSet& qset = G1BarrierSet::dirty_card_queue_set(); - G1DirtyCardQueue& queue = G1ThreadLocalData::dirty_card_queue(thread); + // We need to make sure that we get the start/end byte information for the area + // to mark from the same card table to avoid getting confused in the mark loop + // further below - we might execute while the global card table is being switched. + // + // It does not matter which card table we write to: at worst we may write to the + // new card table (after the switching), which means that we will catch the + // marks next time. + // If we write to the old card table (after the switching, then the refinement + // table) the oncoming handshake will do the memory synchronization. + CardTable* card_table = AtomicAccess::load(&_card_table); + + volatile CardValue* byte = card_table->byte_for(mr.start()); + CardValue* last_byte = card_table->byte_for(mr.last()); + + // Dirty cards only if necessary. 
for (; byte <= last_byte; byte++) { CardValue bv = *byte; - assert(bv != G1CardTable::g1_young_card_val(), "Invalid card"); - if (bv != G1CardTable::dirty_card_val()) { + if (bv == G1CardTable::clean_card_val()) { *byte = G1CardTable::dirty_card_val(); - qset.enqueue(queue, byte); } } } @@ -148,14 +165,15 @@ void G1BarrierSet::on_thread_attach(Thread* thread) { assert(!satbq.is_active(), "SATB queue should not be active"); assert(satbq.buffer() == nullptr, "SATB queue should not have a buffer"); assert(satbq.index() == 0, "SATB queue index should be zero"); - G1DirtyCardQueue& dirtyq = G1ThreadLocalData::dirty_card_queue(thread); - assert(dirtyq.buffer() == nullptr, "Dirty Card queue should not have a buffer"); - assert(dirtyq.index() == 0, "Dirty Card queue index should be zero"); - // If we are creating the thread during a marking cycle, we should // set the active field of the SATB queue to true. That involves // copying the global is_active value to this thread's queue. satbq.set_active(_satb_mark_queue_set.is_active()); + + if (thread->is_Java_thread()) { + assert(Threads_lock->is_locked(), "must be, synchronization with refinement."); + update_card_table_base(thread); + } } void G1BarrierSet::on_thread_detach(Thread* thread) { @@ -165,14 +183,13 @@ void G1BarrierSet::on_thread_detach(Thread* thread) { SATBMarkQueue& queue = G1ThreadLocalData::satb_mark_queue(thread); G1BarrierSet::satb_mark_queue_set().flush_queue(queue); } - { - G1DirtyCardQueue& queue = G1ThreadLocalData::dirty_card_queue(thread); - G1DirtyCardQueueSet& qset = G1BarrierSet::dirty_card_queue_set(); - qset.flush_queue(queue); - qset.record_detached_refinement_stats(queue.refinement_stats()); - } { G1RegionPinCache& cache = G1ThreadLocalData::pin_count_cache(thread); cache.flush(); } } + +void G1BarrierSet::print_on(outputStream* st) const { + _card_table->print_on(st, "Card"); + _refinement_table->print_on(st, "Refinement"); +} diff --git a/src/hotspot/share/gc/g1/g1BarrierSet.hpp b/src/hotspot/share/gc/g1/g1BarrierSet.hpp index 2b1074fcd7a..40e87c373b7 100644 --- a/src/hotspot/share/gc/g1/g1BarrierSet.hpp +++ b/src/hotspot/share/gc/g1/g1BarrierSet.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -25,32 +25,65 @@ #ifndef SHARE_GC_G1_G1BARRIERSET_HPP #define SHARE_GC_G1_G1BARRIERSET_HPP -#include "gc/g1/g1DirtyCardQueue.hpp" #include "gc/g1/g1SATBMarkQueueSet.hpp" #include "gc/shared/bufferNode.hpp" #include "gc/shared/cardTable.hpp" #include "gc/shared/cardTableBarrierSet.hpp" class G1CardTable; +class Thread; -// This barrier is specialized to use a logging barrier to support -// snapshot-at-the-beginning marking. - +// This barrier set is specialized to manage two card tables: +// * one the mutator is currently working on ("card table") +// * one the refinement threads or GC during pause are working on ("refinement table") +// +// The card table acts like a regular card table where the mutator dirties cards +// containing potentially interesting references. +// +// When the amount of dirty cards on the card table exceeds a threshold, G1 swaps +// the card tables and has the refinement threads reduce them by "refining" +// them. +// I.e. 
refinement looks at all dirty cards on the refinement table, and updates +// the remembered sets accordingly, clearing the cards on the refinement table. +// +// Meanwhile the mutator continues dirtying the now empty card table. +// +// This separation of data the mutator and refinement threads are working on +// removes the need for any fine-grained (per mutator write) synchronization between +// them, keeping the write barrier simple. +// +// The refinement threads mark cards in the current collection set specially on the +// card table - this is fine wrt synchronization with the mutator, because at +// most the mutator will overwrite it again if there is a race, as G1 will scan the +// entire card either way during the GC pause. +// +// During garbage collection, if the refinement table is known to be non-empty, G1 +// merges it back (and cleaning it) to the card table which is scanned for dirty +// cards. +// class G1BarrierSet: public CardTableBarrierSet { friend class VMStructs; private: BufferNode::Allocator _satb_mark_queue_buffer_allocator; - BufferNode::Allocator _dirty_card_queue_buffer_allocator; G1SATBMarkQueueSet _satb_mark_queue_set; - G1DirtyCardQueueSet _dirty_card_queue_set; + + G1CardTable* _refinement_table; + + public: + G1BarrierSet(G1CardTable* card_table, G1CardTable* refinement_table); + virtual ~G1BarrierSet(); static G1BarrierSet* g1_barrier_set() { return barrier_set_cast(BarrierSet::barrier_set()); } - public: - G1BarrierSet(G1CardTable* table); - ~G1BarrierSet() { } + G1CardTable* refinement_table() const { return _refinement_table; } + + // Swap the global card table references, without synchronization. + void swap_global_card_table(); + + // Update the given thread's card table (byte map) base to the current card table's. + void update_card_table_base(Thread* thread); virtual bool card_mark_must_follow_store() const { return true; @@ -74,9 +107,8 @@ class G1BarrierSet: public CardTableBarrierSet { inline void write_region(MemRegion mr); void write_region(JavaThread* thread, MemRegion mr); - template + template void write_ref_field_post(T* field); - void write_ref_field_post_slow(volatile CardValue* byte); virtual void on_thread_create(Thread* thread); virtual void on_thread_destroy(Thread* thread); @@ -87,9 +119,7 @@ class G1BarrierSet: public CardTableBarrierSet { return g1_barrier_set()->_satb_mark_queue_set; } - static G1DirtyCardQueueSet& dirty_card_queue_set() { - return g1_barrier_set()->_dirty_card_queue_set; - } + virtual void print_on(outputStream* st) const; // Callbacks for runtime accesses. template diff --git a/src/hotspot/share/gc/g1/g1BarrierSet.inline.hpp b/src/hotspot/share/gc/g1/g1BarrierSet.inline.hpp index 9678da190af..0888fc58937 100644 --- a/src/hotspot/share/gc/g1/g1BarrierSet.inline.hpp +++ b/src/hotspot/share/gc/g1/g1BarrierSet.inline.hpp @@ -75,9 +75,8 @@ inline void G1BarrierSet::write_region(MemRegion mr) { template inline void G1BarrierSet::write_ref_field_post(T* field) { volatile CardValue* byte = _card_table->byte_for(field); - if (*byte != G1CardTable::g1_young_card_val()) { - // Take a slow path for cards in old - write_ref_field_post_slow(byte); + if (*byte == G1CardTable::clean_card_val()) { + *byte = G1CardTable::dirty_card_val(); } } @@ -127,7 +126,7 @@ inline void G1BarrierSet::AccessBarrier:: oop_store_not_in_heap(T* addr, oop new_value) { // Apply SATB barriers for all non-heap references, to allow // concurrent scanning of such references. 
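With refinement moved to its own table, the inline write_ref_field_post() above shrinks to a conditional card mark, with no StoreLoad fence and no queue enqueue. A rough standalone sketch of that fast path follows; the names, the 512-byte card size, and the clean = 0xFF / dirty = 0x00 encoding are assumptions for illustration, not the exact HotSpot definitions.

#include <cstdint>

using CardValue = uint8_t;
static const CardValue clean_card = 0xFF;  // "all bits set" clean encoding
static const CardValue dirty_card = 0x00;
static const int log_card_size = 9;        // 512-byte cards, a typical default

// byte_map_base is assumed to be biased so that
// byte_map_base[(uintptr_t)addr >> log_card_size] is the card covering addr.
inline void post_write_barrier(CardValue* byte_map_base, const void* field) {
  CardValue* card = byte_map_base + (reinterpret_cast<uintptr_t>(field) >> log_card_size);
  if (*card == clean_card) {   // conditional card mark avoids redundant stores
    *card = dirty_card;
  }
}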
- G1BarrierSet *bs = barrier_set_cast(BarrierSet::barrier_set()); + G1BarrierSet *bs = g1_barrier_set(); bs->write_ref_field_pre(addr); Raw::oop_store(addr, new_value); } diff --git a/src/hotspot/share/gc/g1/g1BarrierSetRuntime.cpp b/src/hotspot/share/gc/g1/g1BarrierSetRuntime.cpp index 205829bba1a..24ade277afe 100644 --- a/src/hotspot/share/gc/g1/g1BarrierSetRuntime.cpp +++ b/src/hotspot/share/gc/g1/g1BarrierSetRuntime.cpp @@ -29,17 +29,17 @@ #include "utilities/macros.hpp" void G1BarrierSetRuntime::write_ref_array_pre_oop_entry(oop* dst, size_t length) { - G1BarrierSet *bs = barrier_set_cast(BarrierSet::barrier_set()); + G1BarrierSet *bs = G1BarrierSet::g1_barrier_set(); bs->write_ref_array_pre(dst, length, false); } void G1BarrierSetRuntime::write_ref_array_pre_narrow_oop_entry(narrowOop* dst, size_t length) { - G1BarrierSet *bs = barrier_set_cast(BarrierSet::barrier_set()); + G1BarrierSet *bs = G1BarrierSet::g1_barrier_set(); bs->write_ref_array_pre(dst, length, false); } void G1BarrierSetRuntime::write_ref_array_post_entry(HeapWord* dst, size_t length) { - G1BarrierSet *bs = barrier_set_cast(BarrierSet::barrier_set()); + G1BarrierSet *bs = G1BarrierSet::g1_barrier_set(); bs->G1BarrierSet::write_ref_array(dst, length); } @@ -53,14 +53,6 @@ JRT_LEAF(void, G1BarrierSetRuntime::write_ref_field_pre_entry(oopDesc* orig, Jav G1BarrierSet::satb_mark_queue_set().enqueue_known_active(queue, orig); JRT_END -// G1 post write barrier slowpath -JRT_LEAF(void, G1BarrierSetRuntime::write_ref_field_post_entry(volatile G1CardTable::CardValue* card_addr, - JavaThread* thread)) - assert(thread == JavaThread::current(), "pre-condition"); - G1DirtyCardQueue& queue = G1ThreadLocalData::dirty_card_queue(thread); - G1BarrierSet::dirty_card_queue_set().enqueue(queue, card_addr); -JRT_END - JRT_LEAF(void, G1BarrierSetRuntime::clone(oopDesc* src, oopDesc* dst, size_t size)) HeapAccess<>::clone(src, dst, size); JRT_END diff --git a/src/hotspot/share/gc/g1/g1BarrierSetRuntime.hpp b/src/hotspot/share/gc/g1/g1BarrierSetRuntime.hpp index 27287a0624b..ba7bc4d90f4 100644 --- a/src/hotspot/share/gc/g1/g1BarrierSetRuntime.hpp +++ b/src/hotspot/share/gc/g1/g1BarrierSetRuntime.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -47,7 +47,6 @@ public: // C2 slow-path runtime calls. 
static void write_ref_field_pre_entry(oopDesc* orig, JavaThread *thread); - static void write_ref_field_post_entry(volatile CardValue* card_addr, JavaThread* thread); static address clone_addr(); }; diff --git a/src/hotspot/share/gc/g1/g1CardTable.cpp b/src/hotspot/share/gc/g1/g1CardTable.cpp index 303b8cda91f..6df178d49c5 100644 --- a/src/hotspot/share/gc/g1/g1CardTable.cpp +++ b/src/hotspot/share/gc/g1/g1CardTable.cpp @@ -28,18 +28,37 @@ #include "logging/log.hpp" #include "runtime/os.hpp" -void G1CardTable::g1_mark_as_young(const MemRegion& mr) { - CardValue *const first = byte_for(mr.start()); - CardValue *const last = byte_after(mr.last()); +void G1CardTable::verify_region(MemRegion mr, CardValue val, bool val_equals) { + if (mr.is_empty()) { + return; + } + CardValue* start = byte_for(mr.start()); + CardValue* end = byte_for(mr.last()); - memset_with_concurrent_readers(first, g1_young_gen, pointer_delta(last, first, sizeof(CardValue))); -} + G1CollectedHeap* g1h = G1CollectedHeap::heap(); + G1HeapRegion* r = g1h->heap_region_containing(mr.start()); -#ifndef PRODUCT -void G1CardTable::verify_g1_young_region(MemRegion mr) { - verify_region(mr, g1_young_gen, true); + assert(r == g1h->heap_region_containing(mr.last()), "MemRegion crosses region"); + + bool failures = false; + for (CardValue* curr = start; curr <= end; ++curr) { + CardValue curr_val = *curr; + bool failed = (val_equals) ? (curr_val != val) : (curr_val == val); + if (failed) { + if (!failures) { + log_error(gc, verify)("== CT verification failed: [" PTR_FORMAT "," PTR_FORMAT "] r: %d (%s) %sexpecting value: %d", + p2i(start), p2i(end), r->hrm_index(), r->get_short_type_str(), + (val_equals) ? "" : "not ", val); + failures = true; + } + log_error(gc, verify)("== card " PTR_FORMAT " [" PTR_FORMAT "," PTR_FORMAT "], val: %d", + p2i(curr), p2i(addr_for(curr)), + p2i((HeapWord*) (((size_t) addr_for(curr)) + _card_size)), + (int) curr_val); + } + } + guarantee(!failures, "there should not have been any failures"); } -#endif void G1CardTableChangedListener::on_commit(uint start_idx, size_t num_regions, bool zero_filled) { // Default value for a clean card on the card table is -1. So we cannot take advantage of the zero_filled parameter. @@ -74,6 +93,5 @@ void G1CardTable::initialize(G1RegionToSpaceMapper* mapper) { } bool G1CardTable::is_in_young(const void* p) const { - volatile CardValue* card = byte_for(p); - return *card == G1CardTable::g1_young_card_val(); + return G1CollectedHeap::heap()->heap_region_containing(p)->is_young(); } diff --git a/src/hotspot/share/gc/g1/g1CardTable.hpp b/src/hotspot/share/gc/g1/g1CardTable.hpp index 16133029a11..060e5459778 100644 --- a/src/hotspot/share/gc/g1/g1CardTable.hpp +++ b/src/hotspot/share/gc/g1/g1CardTable.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -52,8 +52,6 @@ class G1CardTable : public CardTable { public: enum G1CardValues { - g1_young_gen = CT_MR_BS_last_reserved << 1, - // During evacuation we use the card table to consolidate the cards we need to // scan for roots onto the card table from the various sources. 
Further it is // used to record already completely scanned cards to avoid re-scanning them @@ -63,18 +61,43 @@ public: // The merge at the start of each evacuation round simply sets cards to dirty // that are clean; scanned cards are set to 0x1. // - // This means that the LSB determines what to do with the card during evacuation - // given the following possible values: + // This means that the LSB determines whether the card is clean or non-clean + // (LSB is 1 -> clean, LSB is 0 -> non-clean) given the following possible values: // - // 11111111 - clean, do not scan - // 00000001 - already scanned, do not scan + // xxxxxxx1 - clean, already scanned, do not scan again (during GC only). + // 00000100 - dirty, needs to be scanned, dirty from remembered set (during GC only) + // 00000010 - dirty, needs to be scanned, contains reference to collection set. // 00000000 - dirty, needs to be scanned. // - g1_card_already_scanned = 0x1 + // g1_to_cset_card and g1_from_remset_card are both used for optimization and + // needed for more accurate prediction of card generation rate. + // + // g1_to_cset_card allows to separate dirty card generation rate by the mutator + // (which just dirties cards) from cards that will be scanned during next garbage + // collection anyway. + // Further it allows the optimization to not refine them, assuming that their + // references to young gen does not change, and not add this card to any other + // remembered set. + // This color is sticky during mutator time: refinement threads encountering + // this card on the refinement table will just copy it over to the regular card + // table without re-refining this card. This saves on refinement effort spent + // on that card because most of the time already found interesting references + // stay interesting. + // + // g1_from_remset_card allows separation of cards generated by the mutator from + // cards in the remembered set, again to make mutator dirty card generation + // prediction more accurate. + // + // More accurate prediction allow better (less wasteful) refinement control. + g1_dirty_card = dirty_card, + g1_card_already_scanned = 0x1, + g1_to_cset_card = 0x2, + g1_from_remset_card = 0x4 }; static const size_t WordAllClean = SIZE_MAX; static const size_t WordAllDirty = 0; + static const size_t WordAllFromRemset = (SIZE_MAX / 255) * g1_from_remset_card; STATIC_ASSERT(BitsPerByte == 8); static const size_t WordAlreadyScanned = (SIZE_MAX / 255) * g1_card_already_scanned; @@ -83,27 +106,27 @@ public: _listener.set_card_table(this); } - static CardValue g1_young_card_val() { return g1_young_gen; } static CardValue g1_scanned_card_val() { return g1_card_already_scanned; } - void verify_g1_young_region(MemRegion mr) PRODUCT_RETURN; - void g1_mark_as_young(const MemRegion& mr); + void verify_region(MemRegion mr, CardValue val, bool val_equals) override; size_t index_for_cardvalue(CardValue const* p) const { return pointer_delta(p, _byte_map, sizeof(CardValue)); } - // Mark the given card as Dirty if it is Clean. Returns whether the card was + // Mark the given card as From Remset if it is Clean. Returns whether the card was // Clean before this operation. This result may be inaccurate as it does not // perform the dirtying atomically. - inline bool mark_clean_as_dirty(CardValue* card); + inline bool mark_clean_as_from_remset(CardValue* card); - // Change Clean cards in a (large) area on the card table as Dirty, preserving - // already scanned cards. Assumes that most cards in that area are Clean. 
- inline void mark_range_dirty(size_t start_card_index, size_t num_cards); + // Change Clean cards in a (large) area on the card table as From_Remset, preserving + // cards already marked otherwise. Assumes that most cards in that area are Clean. + // Not atomic. + inline size_t mark_clean_range_as_from_remset(size_t start_card_index, size_t num_cards); - // Change the given range of dirty cards to "which". All of these cards must be Dirty. - inline void change_dirty_cards_to(CardValue* start_card, CardValue* end_card, CardValue which); + // Change the given range of dirty cards to "which". All of these cards must be non-clean. + // Returns the number of pending cards found. + inline size_t change_dirty_cards_to(CardValue* start_card, CardValue* end_card, CardValue which); inline uint region_idx_for(CardValue* p); diff --git a/src/hotspot/share/gc/g1/g1CardTable.inline.hpp b/src/hotspot/share/gc/g1/g1CardTable.inline.hpp index 03bce7d50d7..370dc22ded0 100644 --- a/src/hotspot/share/gc/g1/g1CardTable.inline.hpp +++ b/src/hotspot/share/gc/g1/g1CardTable.inline.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -28,25 +28,39 @@ #include "gc/g1/g1CardTable.hpp" #include "gc/g1/g1HeapRegion.hpp" +#include "utilities/population_count.hpp" inline uint G1CardTable::region_idx_for(CardValue* p) { size_t const card_idx = pointer_delta(p, _byte_map, sizeof(CardValue)); return (uint)(card_idx >> G1HeapRegion::LogCardsPerRegion); } -inline bool G1CardTable::mark_clean_as_dirty(CardValue* card) { +inline bool G1CardTable::mark_clean_as_from_remset(CardValue* card) { CardValue value = *card; if (value == clean_card_val()) { - *card = dirty_card_val(); + *card = g1_from_remset_card; return true; } return false; } -inline void G1CardTable::mark_range_dirty(size_t start_card_index, size_t num_cards) { +// Returns bits from a where mask is 0, and bits from b where mask is 1. +// +// Example: +// a = 0xAAAAAAAA +// b = 0xBBBBBBBB +// mask = 0xFF00FF00 +// result = 0xBBAABBAA +inline size_t blend(size_t a, size_t b, size_t mask) { + return (a & ~mask) | (b & mask); +} + +inline size_t G1CardTable::mark_clean_range_as_from_remset(size_t start_card_index, size_t num_cards) { assert(is_aligned(start_card_index, sizeof(size_t)), "Start card index must be aligned."); assert(is_aligned(num_cards, sizeof(size_t)), "Number of cards to change must be evenly divisible."); + size_t result = 0; + size_t const num_chunks = num_cards / sizeof(size_t); size_t* cur_word = (size_t*)&_byte_map[start_card_index]; @@ -54,31 +68,33 @@ inline void G1CardTable::mark_range_dirty(size_t start_card_index, size_t num_ca while (cur_word < end_word_map) { size_t value = *cur_word; if (value == WordAllClean) { - *cur_word = WordAllDirty; - } else if (value == WordAllDirty) { - // do nothing. + *cur_word = WordAllFromRemset; + result += sizeof(size_t); + } else if ((value & WordAlreadyScanned) == 0) { + // Do nothing if there is no "Clean" card in it. } else { - // There is a mix of cards in there. Tread slowly. - CardValue* cur = (CardValue*)cur_word; - for (size_t i = 0; i < sizeof(size_t); i++) { - CardValue value = *cur; - if (value == clean_card_val()) { - *cur = dirty_card_val(); - } - cur++; - } + // There is a mix of cards in there. Tread "slowly". 
+ size_t clean_card_mask = (value & WordAlreadyScanned) * 0xff; // All "Clean" cards have 0xff, all other places 0x00 now. + result += population_count(clean_card_mask) / BitsPerByte; + *cur_word = blend(value, WordAllFromRemset, clean_card_mask); } cur_word++; } + return result; } -inline void G1CardTable::change_dirty_cards_to(CardValue* start_card, CardValue* end_card, CardValue which) { +inline size_t G1CardTable::change_dirty_cards_to(CardValue* start_card, CardValue* end_card, CardValue which) { + size_t result = 0; for (CardValue* i_card = start_card; i_card < end_card; ++i_card) { CardValue value = *i_card; - assert(value == dirty_card_val(), + assert((value & g1_card_already_scanned) == 0, "Must have been dirty %d start " PTR_FORMAT " " PTR_FORMAT, value, p2i(start_card), p2i(end_card)); + if (value == g1_dirty_card) { + result++; + } *i_card = which; } + return result; } #endif /* SHARE_GC_G1_G1CARDTABLE_INLINE_HPP */ diff --git a/src/hotspot/share/gc/g1/g1CardTableClaimTable.cpp b/src/hotspot/share/gc/g1/g1CardTableClaimTable.cpp new file mode 100644 index 00000000000..e0cadbdd907 --- /dev/null +++ b/src/hotspot/share/gc/g1/g1CardTableClaimTable.cpp @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +#include "gc/g1/g1CardTableClaimTable.inline.hpp" +#include "gc/g1/g1CollectedHeap.inline.hpp" +#include "gc/g1/g1HeapRegion.inline.hpp" +#include "gc/shared/workerThread.hpp" +#include "memory/allocation.hpp" +#include "utilities/checkedCast.hpp" +#include "utilities/powerOfTwo.hpp" + +G1CardTableClaimTable::G1CardTableClaimTable(uint chunks_per_region) : + _max_reserved_regions(0), + _card_claims(nullptr), + _cards_per_chunk(checked_cast(G1HeapRegion::CardsPerRegion / chunks_per_region)) +{ + guarantee(chunks_per_region > 0, "%u chunks per region", chunks_per_region); +} + +G1CardTableClaimTable::~G1CardTableClaimTable() { + FREE_C_HEAP_ARRAY(uint, _card_claims); +} + +void G1CardTableClaimTable::initialize(uint max_reserved_regions) { + assert(_card_claims == nullptr, "Must not be initialized twice"); + _card_claims = NEW_C_HEAP_ARRAY(uint, max_reserved_regions, mtGC); + _max_reserved_regions = max_reserved_regions; + reset_all_to_unclaimed(); +} + +void G1CardTableClaimTable::reset_all_to_unclaimed() { + for (uint i = 0; i < _max_reserved_regions; i++) { + _card_claims[i] = 0; + } +} + +void G1CardTableClaimTable::reset_all_to_claimed() { + for (uint i = 0; i < _max_reserved_regions; i++) { + _card_claims[i] = (uint)G1HeapRegion::CardsPerRegion; + } +} + +void G1CardTableClaimTable::heap_region_iterate_from_worker_offset(G1HeapRegionClosure* cl, uint worker_id, uint max_workers) { + // Every worker will actually look at all regions, skipping over regions that + // are completed. + const size_t n_regions = _max_reserved_regions; + const uint start_index = (uint)(worker_id * n_regions / max_workers); + + for (uint count = 0; count < n_regions; count++) { + const uint index = (start_index + count) % n_regions; + assert(index < n_regions, "sanity"); + // Skip over fully processed regions + if (!has_unclaimed_cards(index)) { + continue; + } + G1HeapRegion* r = G1CollectedHeap::heap()->region_at(index); + bool res = cl->do_heap_region(r); + if (res) { + return; + } + } +} + +G1CardTableChunkClaimer::G1CardTableChunkClaimer(G1CardTableClaimTable* scan_state, uint region_idx) : + _claim_values(scan_state), + _region_idx(region_idx), + _cur_claim(0) { + guarantee(size() <= G1HeapRegion::CardsPerRegion, "Should not claim more space than possible."); +} + +G1ChunkScanner::G1ChunkScanner(CardValue* const start_card, CardValue* const end_card) : + _start_card(start_card), + _end_card(end_card) { + assert(is_word_aligned(start_card), "precondition"); + assert(is_word_aligned(end_card), "precondition"); +} diff --git a/src/hotspot/share/gc/g1/g1CardTableClaimTable.hpp b/src/hotspot/share/gc/g1/g1CardTableClaimTable.hpp new file mode 100644 index 00000000000..4f524b83f97 --- /dev/null +++ b/src/hotspot/share/gc/g1/g1CardTableClaimTable.hpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef SHARE_GC_G1_G1CARDTABLECLAIMTABLE_HPP +#define SHARE_GC_G1_G1CARDTABLECLAIMTABLE_HPP + +#include "gc/g1/g1CardTable.hpp" +#include "memory/allocation.hpp" + +class G1HeapRegionClosure; + +// Helper class representing claim values for the cards in the card table corresponding +// to a region. +// I.e. for every region this class stores an atomic counter that represents the +// number of cards from 0 to the number of cards per region already claimed for +// this region. +// If the claimed value is >= the number of cards of a region, the region can be +// considered fully claimed. +// +// Claiming works on full region (all cards in region) or a range of contiguous cards +// (chunk). Chunk size is given at construction time. +class G1CardTableClaimTable : public CHeapObj { + uint _max_reserved_regions; + + // Card table iteration claim values for every heap region, from 0 (completely unclaimed) + // to (>=) G1HeapRegion::CardsPerRegion (completely claimed). + uint volatile* _card_claims; + + uint _cards_per_chunk; // For conversion between card index and chunk index. + + // Claim increment number of cards, returning the previous claim value. + inline uint claim_cards(uint region, uint increment); + +public: + G1CardTableClaimTable(uint chunks_per_region); + ~G1CardTableClaimTable(); + + // Allocates the data structure and initializes the claims to unclaimed. + void initialize(uint max_reserved_regions); + + void reset_all_to_unclaimed(); + void reset_all_to_claimed(); + + inline bool has_unclaimed_cards(uint region); + inline void reset_to_unclaimed(uint region); + + // Claims all cards in that region, returning the previous claim value. + inline uint claim_all_cards(uint region); + + // Claim a single chunk in that region, returning the previous claim value. + inline uint claim_chunk(uint region); + inline uint cards_per_chunk() const; + + size_t max_reserved_regions() { return _max_reserved_regions; } + + void heap_region_iterate_from_worker_offset(G1HeapRegionClosure* cl, uint worker_id, uint max_workers); +}; + +// Helper class to claim dirty chunks within the card table for a given region. +class G1CardTableChunkClaimer { + G1CardTableClaimTable* _claim_values; + + uint _region_idx; + uint _cur_claim; + +public: + G1CardTableChunkClaimer(G1CardTableClaimTable* claim_table, uint region_idx); + + inline bool has_next(); + + inline uint value() const; + inline uint size() const; +}; + +// Helper class to locate consecutive dirty cards inside a range of cards. 
+class G1ChunkScanner { + using Word = size_t; + using CardValue = G1CardTable::CardValue; + + CardValue* const _start_card; + CardValue* const _end_card; + + static const size_t ExpandedToScanMask = G1CardTable::WordAlreadyScanned; + static const size_t ToScanMask = G1CardTable::g1_card_already_scanned; + + inline bool is_card_dirty(const CardValue* const card) const; + + inline bool is_word_aligned(const void* const addr) const; + + inline CardValue* find_first_dirty_card(CardValue* i_card) const; + inline CardValue* find_first_non_dirty_card(CardValue* i_card) const; + +public: + G1ChunkScanner(CardValue* const start_card, CardValue* const end_card); + + template + void on_dirty_cards(Func&& f) { + for (CardValue* cur_card = _start_card; cur_card < _end_card; /* empty */) { + CardValue* dirty_l = find_first_dirty_card(cur_card); + CardValue* dirty_r = find_first_non_dirty_card(dirty_l); + + assert(dirty_l <= dirty_r, "inv"); + + if (dirty_l == dirty_r) { + assert(dirty_r == _end_card, "finished the entire chunk"); + return; + } + + f(dirty_l, dirty_r); + + cur_card = dirty_r + 1; + } + } +}; + +#endif // SHARE_GC_G1_G1CARDTABLECLAIMTABLE_HPP diff --git a/src/hotspot/share/gc/g1/g1CardTableClaimTable.inline.hpp b/src/hotspot/share/gc/g1/g1CardTableClaimTable.inline.hpp new file mode 100644 index 00000000000..d682f0d17ae --- /dev/null +++ b/src/hotspot/share/gc/g1/g1CardTableClaimTable.inline.hpp @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +#ifndef SHARE_GC_G1_G1CARDTABLECLAIMTABLE_INLINE_HPP +#define SHARE_GC_G1_G1CARDTABLECLAIMTABLE_INLINE_HPP + +#include "gc/g1/g1CardTableClaimTable.hpp" + +#include "gc/g1/g1CollectedHeap.inline.hpp" +#include "gc/g1/g1HeapRegion.inline.hpp" +#include "runtime/atomicAccess.hpp" + +bool G1CardTableClaimTable::has_unclaimed_cards(uint region) { + assert(region < _max_reserved_regions, "Tried to access invalid region %u", region); + return AtomicAccess::load(&_card_claims[region]) < G1HeapRegion::CardsPerRegion; +} + +void G1CardTableClaimTable::reset_to_unclaimed(uint region) { + assert(region < _max_reserved_regions, "Tried to access invalid region %u", region); + AtomicAccess::store(&_card_claims[region], 0u); +} + +uint G1CardTableClaimTable::claim_cards(uint region, uint increment) { + assert(region < _max_reserved_regions, "Tried to access invalid region %u", region); + return AtomicAccess::fetch_then_add(&_card_claims[region], increment, memory_order_relaxed); +} + +uint G1CardTableClaimTable::claim_chunk(uint region) { + assert(region < _max_reserved_regions, "Tried to access invalid region %u", region); + return AtomicAccess::fetch_then_add(&_card_claims[region], cards_per_chunk(), memory_order_relaxed); +} + +uint G1CardTableClaimTable::claim_all_cards(uint region) { + return claim_cards(region, (uint)G1HeapRegion::CardsPerRegion); +} + +uint G1CardTableClaimTable::cards_per_chunk() const { return _cards_per_chunk; } + +bool G1CardTableChunkClaimer::has_next() { + _cur_claim = _claim_values->claim_chunk(_region_idx); + return (_cur_claim < G1HeapRegion::CardsPerRegion); +} + +uint G1CardTableChunkClaimer::value() const { return _cur_claim; } +uint G1CardTableChunkClaimer::size() const { return _claim_values->cards_per_chunk(); } + +bool G1ChunkScanner::is_card_dirty(const CardValue* const card) const { + return (*card & ToScanMask) == 0; +} + +bool G1ChunkScanner::is_word_aligned(const void* const addr) const { + return ((uintptr_t)addr) % sizeof(Word) == 0; +} + +G1CardTable::CardValue* G1ChunkScanner::find_first_dirty_card(CardValue* i_card) const { + while (!is_word_aligned(i_card)) { + if (is_card_dirty(i_card)) { + return i_card; + } + i_card++; + } + + for (/* empty */; i_card < _end_card; i_card += sizeof(Word)) { + Word word_value = *reinterpret_cast(i_card); + bool has_dirty_cards_in_word = (~word_value & ExpandedToScanMask) != 0; + + if (has_dirty_cards_in_word) { + for (uint i = 0; i < sizeof(Word); ++i) { + if (is_card_dirty(i_card)) { + return i_card; + } + i_card++; + } + ShouldNotReachHere(); + } + } + + return _end_card; +} + +G1CardTable::CardValue* G1ChunkScanner::find_first_non_dirty_card(CardValue* i_card) const { + while (!is_word_aligned(i_card)) { + if (!is_card_dirty(i_card)) { + return i_card; + } + i_card++; + } + + for (/* empty */; i_card < _end_card; i_card += sizeof(Word)) { + Word word_value = *reinterpret_cast(i_card); + bool all_cards_dirty = (word_value & ExpandedToScanMask) == 0; + + if (!all_cards_dirty) { + for (uint i = 0; i < sizeof(Word); ++i) { + if (!is_card_dirty(i_card)) { + return i_card; + } + i_card++; + } + ShouldNotReachHere(); + } + } + + return _end_card; +} + +#endif // SHARE_GC_G1_G1CARDTABLECLAIMTABLE_INLINE_HPP diff --git a/src/hotspot/share/gc/g1/g1CollectedHeap.cpp b/src/hotspot/share/gc/g1/g1CollectedHeap.cpp index 4a257265931..ed21c9aa370 100644 --- a/src/hotspot/share/gc/g1/g1CollectedHeap.cpp +++ b/src/hotspot/share/gc/g1/g1CollectedHeap.cpp @@ -38,7 +38,6 @@ #include "gc/g1/g1ConcurrentMarkThread.inline.hpp" 
#include "gc/g1/g1ConcurrentRefine.hpp" #include "gc/g1/g1ConcurrentRefineThread.hpp" -#include "gc/g1/g1DirtyCardQueue.hpp" #include "gc/g1/g1EvacStats.inline.hpp" #include "gc/g1/g1FullCollector.hpp" #include "gc/g1/g1GCCounters.hpp" @@ -60,10 +59,10 @@ #include "gc/g1/g1ParScanThreadState.inline.hpp" #include "gc/g1/g1PeriodicGCTask.hpp" #include "gc/g1/g1Policy.hpp" -#include "gc/g1/g1RedirtyCardsQueue.hpp" #include "gc/g1/g1RegionPinCache.inline.hpp" #include "gc/g1/g1RegionToSpaceMapper.hpp" #include "gc/g1/g1RemSet.hpp" +#include "gc/g1/g1ReviseYoungLengthTask.hpp" #include "gc/g1/g1RootClosures.hpp" #include "gc/g1/g1RootProcessor.hpp" #include "gc/g1/g1SATBMarkQueueSet.hpp" @@ -111,6 +110,7 @@ #include "runtime/init.hpp" #include "runtime/java.hpp" #include "runtime/orderAccess.hpp" +#include "runtime/threads.hpp" #include "runtime/threadSMR.hpp" #include "runtime/vmThread.hpp" #include "utilities/align.hpp" @@ -146,7 +146,7 @@ void G1CollectedHeap::run_batch_task(G1BatchedTask* cl) { workers()->run_task(cl, num_workers); } -uint G1CollectedHeap::get_chunks_per_region() { +uint G1CollectedHeap::get_chunks_per_region_for_scan() { uint log_region_size = G1HeapRegion::LogOfHRGrainBytes; // Limit the expected input values to current known possible values of the // (log) region size. Adjust as necessary after testing if changing the permissible @@ -156,6 +156,18 @@ uint G1CollectedHeap::get_chunks_per_region() { return 1u << (log_region_size / 2 - 4); } +uint G1CollectedHeap::get_chunks_per_region_for_merge() { + uint log_region_size = G1HeapRegion::LogOfHRGrainBytes; + // Limit the expected input values to current known possible values of the + // (log) region size. Adjust as necessary after testing if changing the permissible + // values for region size. + assert(log_region_size >= 20 && log_region_size <= 29, + "expected value in [20,29], but got %u", log_region_size); + + uint half_log_region_size = (log_region_size + 1) / 2; + return 1 << (half_log_region_size - 9); +} + G1HeapRegion* G1CollectedHeap::new_heap_region(uint hrs_index, MemRegion mr) { return new G1HeapRegion(hrs_index, bot(), mr, &_card_set_config); @@ -614,7 +626,6 @@ inline HeapWord* G1CollectedHeap::attempt_allocation(size_t min_word_size, assert_heap_not_locked(); if (result != nullptr) { assert(*actual_word_size != 0, "Actual size must have been set here"); - dirty_young_block(result, *actual_word_size); } else { *actual_word_size = 0; } @@ -809,11 +820,27 @@ void G1CollectedHeap::prepare_for_mutator_after_full_collection(size_t allocatio } void G1CollectedHeap::abort_refinement() { - // Discard all remembered set updates and reset refinement statistics. - G1BarrierSet::dirty_card_queue_set().abandon_logs_and_stats(); - assert(G1BarrierSet::dirty_card_queue_set().num_cards() == 0, - "DCQS should be empty"); - concurrent_refine()->get_and_reset_refinement_stats(); + G1ConcurrentRefineSweepState& sweep_state = concurrent_refine()->sweep_state(); + if (sweep_state.is_in_progress()) { + + if (!sweep_state.are_java_threads_synched()) { + // Synchronize Java threads with global card table that has already been swapped. + class SwapThreadCardTableClosure : public ThreadClosure { + public: + + virtual void do_thread(Thread* t) { + G1BarrierSet* bs = G1BarrierSet::g1_barrier_set(); + bs->update_card_table_base(t); + } + } cl; + Threads::java_threads_do(&cl); + } + + // Record any available refinement statistics. 
+ policy()->record_refinement_stats(sweep_state.stats()); + sweep_state.complete_work(false /* concurrent */, false /* print_log */); + } + sweep_state.reset_stats(); } void G1CollectedHeap::verify_after_full_collection() { @@ -825,6 +852,7 @@ void G1CollectedHeap::verify_after_full_collection() { } _hrm.verify_optional(); _verifier->verify_region_sets_optional(); + _verifier->verify_card_tables_clean(true /* both_card_tables */); _verifier->verify_after_gc(); _verifier->verify_bitmap_clear(false /* above_tams_only */); @@ -1168,8 +1196,13 @@ G1CollectedHeap::G1CollectedHeap() : _service_thread(nullptr), _periodic_gc_task(nullptr), _free_arena_memory_task(nullptr), + _revise_young_length_task(nullptr), _workers(nullptr), - _card_table(nullptr), + _refinement_epoch(0), + _last_synchronized_start(0), + _last_refinement_epoch_start(0), + _yield_duration_in_refinement_epoch(0), + _last_safepoint_refinement_epoch(0), _collection_pause_end(Ticks::now()), _old_set("Old Region Set", new OldRegionSetChecker()), _humongous_set("Humongous Region Set", new HumongousRegionSetChecker()), @@ -1289,7 +1322,7 @@ G1RegionToSpaceMapper* G1CollectedHeap::create_aux_memory_mapper(const char* des jint G1CollectedHeap::initialize_concurrent_refinement() { jint ecode = JNI_OK; - _cr = G1ConcurrentRefine::create(policy(), &ecode); + _cr = G1ConcurrentRefine::create(this, &ecode); return ecode; } @@ -1345,18 +1378,12 @@ jint G1CollectedHeap::initialize() { initialize_reserved_region(heap_rs); // Create the barrier set for the entire reserved region. - G1CardTable* ct = new G1CardTable(_reserved); - G1BarrierSet* bs = new G1BarrierSet(ct); + G1CardTable* card_table = new G1CardTable(_reserved); + G1CardTable* refinement_table = new G1CardTable(_reserved); + + G1BarrierSet* bs = new G1BarrierSet(card_table, refinement_table); bs->initialize(); assert(bs->is_a(BarrierSet::G1BarrierSet), "sanity"); - BarrierSet::set_barrier_set(bs); - _card_table = ct; - - { - G1SATBMarkQueueSet& satbqs = bs->satb_mark_queue_set(); - satbqs.set_process_completed_buffers_threshold(G1SATBProcessCompletedThreshold); - satbqs.set_buffer_enqueue_threshold_percentage(G1SATBBufferEnqueueingThresholdPercent); - } // Create space mappers. size_t page_size = heap_rs.page_size(); @@ -1391,12 +1418,26 @@ jint G1CollectedHeap::initialize() { G1CardTable::compute_size(heap_rs.size() / HeapWordSize), G1CardTable::heap_map_factor()); + G1RegionToSpaceMapper* refinement_cards_storage = + create_aux_memory_mapper("Refinement Card Table", + G1CardTable::compute_size(heap_rs.size() / HeapWordSize), + G1CardTable::heap_map_factor()); + size_t bitmap_size = G1CMBitMap::compute_size(heap_rs.size()); G1RegionToSpaceMapper* bitmap_storage = create_aux_memory_mapper("Mark Bitmap", bitmap_size, G1CMBitMap::heap_map_factor()); - _hrm.initialize(heap_storage, bitmap_storage, bot_storage, cardtable_storage); - _card_table->initialize(cardtable_storage); + _hrm.initialize(heap_storage, bitmap_storage, bot_storage, cardtable_storage, refinement_cards_storage); + card_table->initialize(cardtable_storage); + refinement_table->initialize(refinement_cards_storage); + + BarrierSet::set_barrier_set(bs); + + { + G1SATBMarkQueueSet& satbqs = bs->satb_mark_queue_set(); + satbqs.set_process_completed_buffers_threshold(G1SATBProcessCompletedThreshold); + satbqs.set_buffer_enqueue_threshold_percentage(G1SATBBufferEnqueueingThresholdPercent); + } // 6843694 - ensure that the maximum region index can fit // in the remembered set structures. 
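For reference, the claiming scheme of the G1CardTableClaimTable introduced earlier in this patch reduces to one atomic counter per region that workers advance by a chunk of cards at a time. Below is a simplified standalone model under that assumption; ClaimTable is an illustrative name, not the HotSpot class.

#include <atomic>
#include <cstddef>
#include <cstdint>
#include <vector>

class ClaimTable {
  std::vector<std::atomic<uint32_t>> _claims;   // one claim counter per region
  uint32_t _cards_per_region;
  uint32_t _cards_per_chunk;

public:
  ClaimTable(size_t num_regions, uint32_t cards_per_region, uint32_t chunks_per_region)
    : _claims(num_regions),
      _cards_per_region(cards_per_region),
      _cards_per_chunk(cards_per_region / chunks_per_region) {
    for (auto& c : _claims) { c.store(0, std::memory_order_relaxed); }
  }

  // Returns the previous claim value; the caller owns cards
  // [result, result + cards_per_chunk()) if result < cards_per_region.
  uint32_t claim_chunk(size_t region) {
    return _claims[region].fetch_add(_cards_per_chunk, std::memory_order_relaxed);
  }

  bool has_unclaimed_cards(size_t region) const {
    return _claims[region].load(std::memory_order_relaxed) < _cards_per_region;
  }

  uint32_t cards_per_chunk() const { return _cards_per_chunk; }
};

// Typical worker loop over one region, in the spirit of G1CardTableChunkClaimer::has_next():
//   uint32_t claim;
//   while ((claim = table.claim_chunk(region)) < cards_per_region) {
//     process_cards(region, claim, claim + table.cards_per_chunk());
//   }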
@@ -1408,7 +1449,7 @@ jint G1CollectedHeap::initialize() { guarantee((uintptr_t)(heap_rs.base()) >= G1CardTable::card_size(), "Java heap must not start within the first card."); G1FromCardCache::initialize(max_num_regions()); // Also create a G1 rem set. - _rem_set = new G1RemSet(this, _card_table); + _rem_set = new G1RemSet(this); _rem_set->initialize(max_num_regions()); size_t max_cards_per_region = ((size_t)1 << (sizeof(CardIdx_t)*BitsPerByte-1)) - 1; @@ -1467,6 +1508,11 @@ jint G1CollectedHeap::initialize() { _free_arena_memory_task = new G1MonotonicArenaFreeMemoryTask("Card Set Free Memory Task"); _service_thread->register_task(_free_arena_memory_task); + if (policy()->use_adaptive_young_list_length()) { + _revise_young_length_task = new G1ReviseYoungLengthTask("Revise Young Length List Task"); + _service_thread->register_task(_revise_young_length_task); + } + // Here we allocate the dummy G1HeapRegion that is required by the // G1AllocRegion class. G1HeapRegion* dummy_region = _hrm.get_dummy_region(); @@ -1495,6 +1541,7 @@ jint G1CollectedHeap::initialize() { CPUTimeCounters::create_counter(CPUTimeGroups::CPUTimeType::gc_parallel_workers); CPUTimeCounters::create_counter(CPUTimeGroups::CPUTimeType::gc_conc_mark); CPUTimeCounters::create_counter(CPUTimeGroups::CPUTimeType::gc_conc_refine); + CPUTimeCounters::create_counter(CPUTimeGroups::CPUTimeType::gc_conc_refine_control); CPUTimeCounters::create_counter(CPUTimeGroups::CPUTimeType::gc_service); G1InitLogger::print(); @@ -1519,12 +1566,35 @@ void G1CollectedHeap::stop() { void G1CollectedHeap::safepoint_synchronize_begin() { SuspendibleThreadSet::synchronize(); + + _last_synchronized_start = os::elapsed_counter(); } void G1CollectedHeap::safepoint_synchronize_end() { + jlong now = os::elapsed_counter(); + jlong synchronize_duration = now - _last_synchronized_start; + + if (_last_safepoint_refinement_epoch == _refinement_epoch) { + _yield_duration_in_refinement_epoch += synchronize_duration; + } else { + _last_refinement_epoch_start = now; + _last_safepoint_refinement_epoch = _refinement_epoch; + _yield_duration_in_refinement_epoch = 0; + } + SuspendibleThreadSet::desynchronize(); } +void G1CollectedHeap::set_last_refinement_epoch_start(jlong epoch_start, jlong last_yield_duration) { + _last_refinement_epoch_start = epoch_start; + guarantee(_yield_duration_in_refinement_epoch >= last_yield_duration, "should be"); + _yield_duration_in_refinement_epoch -= last_yield_duration; +} + +jlong G1CollectedHeap::yield_duration_in_refinement_epoch() { + return _yield_duration_in_refinement_epoch; +} + void G1CollectedHeap::post_initialize() { CollectedHeap::post_initialize(); ref_processing_init(); @@ -2336,6 +2406,7 @@ void G1CollectedHeap::gc_epilogue(bool full) { &_collection_set_candidates_card_set_stats); update_perf_counter_cpu_time(); + _refinement_epoch++; } uint G1CollectedHeap::uncommit_regions(uint region_limit) { @@ -2468,7 +2539,6 @@ void G1CollectedHeap::verify_before_young_collection(G1HeapVerifier::G1VerifyTyp Ticks start = Ticks::now(); _verifier->prepare_for_verify(); _verifier->verify_region_sets_optional(); - _verifier->verify_dirty_young_regions(); _verifier->verify_before_gc(); verify_numa_regions("GC Start"); phase_times()->record_verify_before_time_ms((Ticks::now() - start).seconds() * MILLIUNITS); @@ -2734,6 +2804,11 @@ void G1CollectedHeap::free_region(G1HeapRegion* hr, G1FreeRegionList* free_list) if (free_list != nullptr) { free_list->add_ordered(hr); } + if (VerifyDuringGC) { + // Card and refinement table must be 
clear for freed regions. + card_table()->verify_region(MemRegion(hr->bottom(), hr->end()), G1CardTable::clean_card_val(), true); + refinement_table()->verify_region(MemRegion(hr->bottom(), hr->end()), G1CardTable::clean_card_val(), true); + } } void G1CollectedHeap::retain_region(G1HeapRegion* hr) { diff --git a/src/hotspot/share/gc/g1/g1CollectedHeap.hpp b/src/hotspot/share/gc/g1/g1CollectedHeap.hpp index 8d26bcb1c0b..43839cc48d5 100644 --- a/src/hotspot/share/gc/g1/g1CollectedHeap.hpp +++ b/src/hotspot/share/gc/g1/g1CollectedHeap.hpp @@ -75,6 +75,7 @@ class G1GCPhaseTimes; class G1HeapSizingPolicy; class G1NewTracer; class G1RemSet; +class G1ReviseYoungLengthTask; class G1ServiceTask; class G1ServiceThread; class GCMemoryManager; @@ -171,9 +172,23 @@ private: G1ServiceThread* _service_thread; G1ServiceTask* _periodic_gc_task; G1MonotonicArenaFreeMemoryTask* _free_arena_memory_task; + G1ReviseYoungLengthTask* _revise_young_length_task; WorkerThreads* _workers; - G1CardTable* _card_table; + + // The current epoch for refinement, i.e. the number of times the card tables + // have been swapped by a garbage collection. + // Used for detecting whether concurrent refinement has been interrupted by a + // garbage collection. + size_t _refinement_epoch; + + // The following members are for tracking safepoint durations between garbage + // collections. + jlong _last_synchronized_start; + + jlong _last_refinement_epoch_start; + jlong _yield_duration_in_refinement_epoch; // Time spent in safepoints since beginning of last refinement epoch. + size_t _last_safepoint_refinement_epoch; // Refinement epoch before last safepoint. Ticks _collection_pause_end; @@ -541,12 +556,17 @@ public: void run_batch_task(G1BatchedTask* cl); // Return "optimal" number of chunks per region we want to use for claiming areas - // within a region to claim. + // within a region to claim during card table scanning. // The returned value is a trade-off between granularity of work distribution and // memory usage and maintenance costs of that table. // Testing showed that 64 for 1M/2M region, 128 for 4M/8M regions, 256 for 16/32M regions, // and so on seems to be such a good trade-off. - static uint get_chunks_per_region(); + static uint get_chunks_per_region_for_scan(); + // Return "optimal" number of chunks per region we want to use for claiming areas + // within a region to claim during card table merging. + // This is much smaller than for scanning as the merge work is much smaller. + // Currently 1 for 1M regions, 2 for 2/4M regions, 4 for 8/16M regions and so on. + static uint get_chunks_per_region_for_merge(); G1Allocator* allocator() { return _allocator; @@ -687,11 +707,6 @@ public: // Add the given region to the retained regions collection set candidates. void retain_region(G1HeapRegion* hr); - // It dirties the cards that cover the block so that the post - // write barrier never queues anything when updating objects on this - // block. It is assumed (and in fact we assert) that the block - // belongs to a young region. - inline void dirty_young_block(HeapWord* start, size_t word_size); // Frees a humongous region by collapsing it into individual regions // and calling free_region() for each of them. 
The freed regions @@ -905,6 +920,10 @@ public: void safepoint_synchronize_begin() override; void safepoint_synchronize_end() override; + jlong last_refinement_epoch_start() const { return _last_refinement_epoch_start; } + void set_last_refinement_epoch_start(jlong epoch_start, jlong last_yield_duration); + jlong yield_duration_in_refinement_epoch(); + // Does operations required after initialization has been done. void post_initialize() override; @@ -1069,7 +1088,16 @@ public: } G1CardTable* card_table() const { - return _card_table; + return static_cast(G1BarrierSet::g1_barrier_set()->card_table()); + } + + G1CardTable* refinement_table() const { + return G1BarrierSet::g1_barrier_set()->refinement_table(); + } + + G1CardTable::CardValue* card_table_base() const { + assert(card_table() != nullptr, "must be"); + return card_table()->byte_map_base(); } // Iteration functions. diff --git a/src/hotspot/share/gc/g1/g1CollectedHeap.inline.hpp b/src/hotspot/share/gc/g1/g1CollectedHeap.inline.hpp index 3370ff9938f..fdc8585dbc0 100644 --- a/src/hotspot/share/gc/g1/g1CollectedHeap.inline.hpp +++ b/src/hotspot/share/gc/g1/g1CollectedHeap.inline.hpp @@ -149,30 +149,6 @@ inline void G1CollectedHeap::old_set_remove(G1HeapRegion* hr) { _old_set.remove(hr); } -// It dirties the cards that cover the block so that the post -// write barrier never queues anything when updating objects on this -// block. It is assumed (and in fact we assert) that the block -// belongs to a young region. -inline void -G1CollectedHeap::dirty_young_block(HeapWord* start, size_t word_size) { - assert_heap_not_locked(); - - // Assign the containing region to containing_hr so that we don't - // have to keep calling heap_region_containing() in the - // asserts below. - DEBUG_ONLY(G1HeapRegion* containing_hr = heap_region_containing(start);) - assert(word_size > 0, "pre-condition"); - assert(containing_hr->is_in(start), "it should contain start"); - assert(containing_hr->is_young(), "it should be young"); - assert(!containing_hr->is_humongous(), "it should not be humongous"); - - HeapWord* end = start + word_size; - assert(containing_hr->is_in(end - 1), "it should also contain end - 1"); - - MemRegion mr(start, end); - card_table()->g1_mark_as_young(mr); -} - inline G1ScannerTasksQueueSet* G1CollectedHeap::task_queues() const { return _task_queues; } diff --git a/src/hotspot/share/gc/g1/g1CollectionSet.cpp b/src/hotspot/share/gc/g1/g1CollectionSet.cpp index d501ee5b47b..abfb620d626 100644 --- a/src/hotspot/share/gc/g1/g1CollectionSet.cpp +++ b/src/hotspot/share/gc/g1/g1CollectionSet.cpp @@ -308,7 +308,8 @@ double G1CollectionSet::finalize_young_part(double target_pause_time_ms, G1Survi guarantee(target_pause_time_ms > 0.0, "target_pause_time_ms = %1.6lf should be positive", target_pause_time_ms); - size_t pending_cards = _policy->pending_cards_at_gc_start(); + bool in_young_only_phase = _policy->collector_state()->in_young_only_phase(); + size_t pending_cards = _policy->analytics()->predict_pending_cards(in_young_only_phase); log_trace(gc, ergo, cset)("Start choosing CSet. 
Pending cards: %zu target pause time: %1.2fms", pending_cards, target_pause_time_ms); @@ -323,10 +324,8 @@ double G1CollectionSet::finalize_young_part(double target_pause_time_ms, G1Survi verify_young_cset_indices(); - size_t num_young_cards = _g1h->young_regions_cardset()->occupied(); - _policy->record_card_rs_length(num_young_cards); - - double predicted_base_time_ms = _policy->predict_base_time_ms(pending_cards, num_young_cards); + size_t card_rs_length = _policy->analytics()->predict_card_rs_length(in_young_only_phase); + double predicted_base_time_ms = _policy->predict_base_time_ms(pending_cards, card_rs_length); // Base time already includes the whole remembered set related time, so do not add that here // again. double predicted_eden_time = _policy->predict_young_region_other_time_ms(eden_region_length) + diff --git a/src/hotspot/share/gc/g1/g1ConcurrentMark.cpp b/src/hotspot/share/gc/g1/g1ConcurrentMark.cpp index e52d380e26b..97386cb9720 100644 --- a/src/hotspot/share/gc/g1/g1ConcurrentMark.cpp +++ b/src/hotspot/share/gc/g1/g1ConcurrentMark.cpp @@ -27,6 +27,7 @@ #include "gc/g1/g1BarrierSet.hpp" #include "gc/g1/g1BatchedTask.hpp" #include "gc/g1/g1CardSetMemory.hpp" +#include "gc/g1/g1CardTableClaimTable.inline.hpp" #include "gc/g1/g1CollectedHeap.inline.hpp" #include "gc/g1/g1CollectionSetChooser.hpp" #include "gc/g1/g1CollectorState.hpp" @@ -34,7 +35,7 @@ #include "gc/g1/g1ConcurrentMarkRemarkTasks.hpp" #include "gc/g1/g1ConcurrentMarkThread.inline.hpp" #include "gc/g1/g1ConcurrentRebuildAndScrub.hpp" -#include "gc/g1/g1DirtyCardQueue.hpp" +#include "gc/g1/g1ConcurrentRefine.hpp" #include "gc/g1/g1HeapRegion.inline.hpp" #include "gc/g1/g1HeapRegionManager.hpp" #include "gc/g1/g1HeapRegionPrinter.hpp" @@ -483,7 +484,7 @@ G1ConcurrentMark::G1ConcurrentMark(G1CollectedHeap* g1h, // _finger set in set_non_marking_state - _worker_id_offset(G1DirtyCardQueueSet::num_par_ids() + G1ConcRefinementThreads), + _worker_id_offset(G1ConcRefinementThreads), // The refinement control thread does not refine cards, so it's just the worker threads. _max_num_tasks(MAX2(ConcGCThreads, ParallelGCThreads)), // _num_active_tasks set in set_non_marking_state() // _tasks set inside the constructor @@ -1141,7 +1142,7 @@ void G1ConcurrentMark::mark_from_roots() { // worker threads may currently exist and more may not be // available. active_workers = _concurrent_workers->set_active_workers(active_workers); - log_info(gc, task)("Using %u workers of %u for marking", active_workers, _concurrent_workers->max_workers()); + log_info(gc, task)("Concurrent Mark Using %u of %u Workers", active_workers, _concurrent_workers->max_workers()); _num_concurrent_workers = active_workers; diff --git a/src/hotspot/share/gc/g1/g1ConcurrentMark.hpp b/src/hotspot/share/gc/g1/g1ConcurrentMark.hpp index 4977da4729d..752082ce629 100644 --- a/src/hotspot/share/gc/g1/g1ConcurrentMark.hpp +++ b/src/hotspot/share/gc/g1/g1ConcurrentMark.hpp @@ -580,6 +580,8 @@ public: // TARS for the given region during remembered set rebuilding. inline HeapWord* top_at_rebuild_start(G1HeapRegion* r) const; + uint worker_id_offset() const { return _worker_id_offset; } + // Clear statistics gathered during the concurrent cycle for the given region after // it has been reclaimed. 
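The get_chunks_per_region_for_scan() / get_chunks_per_region_for_merge() declarations in the g1CollectedHeap.hpp hunk above describe their sizing rule only in comments. The sketch below spells that rule out; the helper names and the region-size-in-megabytes parameter are illustrative, not the actual implementation, which works from G1HeapRegion::GrainBytes.

#include <cstddef>

// Illustrative-only restatement of the chunk sizing comments above:
// scanning uses 64 chunks for 1M/2M regions, 128 for 4M/8M, 256 for 16M/32M;
// merging uses 1 chunk for 1M regions, 2 for 2M/4M, 4 for 8M/16M, and so on.
static unsigned chunks_per_region_for_scan(size_t region_size_mb) {
  unsigned chunks = 64;                       // 1M and 2M regions
  for (size_t sz = 4; sz <= region_size_mb; sz *= 4) {
    chunks *= 2;                              // doubles every second power of two
  }
  return chunks;
}

static unsigned chunks_per_region_for_merge(size_t region_size_mb) {
  unsigned chunks = 1;                        // 1M regions
  for (size_t sz = 2; sz <= region_size_mb; sz *= 4) {
    chunks *= 2;
  }
  return chunks;
}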
void clear_statistics(G1HeapRegion* r); diff --git a/src/hotspot/share/gc/g1/g1ConcurrentMarkRemarkTasks.cpp b/src/hotspot/share/gc/g1/g1ConcurrentMarkRemarkTasks.cpp index 02afc443d68..fdef4214622 100644 --- a/src/hotspot/share/gc/g1/g1ConcurrentMarkRemarkTasks.cpp +++ b/src/hotspot/share/gc/g1/g1ConcurrentMarkRemarkTasks.cpp @@ -25,6 +25,7 @@ #include "gc/g1/g1CollectedHeap.inline.hpp" #include "gc/g1/g1ConcurrentMark.inline.hpp" #include "gc/g1/g1ConcurrentMarkRemarkTasks.hpp" +#include "gc/g1/g1ConcurrentRefine.hpp" #include "gc/g1/g1HeapRegion.inline.hpp" #include "gc/g1/g1HeapRegionPrinter.hpp" #include "gc/g1/g1RemSetTrackingPolicy.hpp" @@ -54,15 +55,16 @@ struct G1UpdateRegionLivenessAndSelectForRebuildTask::G1OnRegionClosure : public _num_humongous_regions_removed(0), _local_cleanup_list(local_cleanup_list) {} - void reclaim_empty_region(G1HeapRegion* hr) { + void reclaim_empty_region_common(G1HeapRegion* hr) { assert(!hr->has_pinned_objects(), "precondition"); assert(hr->used() > 0, "precondition"); _freed_bytes += hr->used(); hr->set_containing_set(nullptr); - hr->clear_cardtable(); + hr->clear_both_card_tables(); _cm->clear_statistics(hr); G1HeapRegionPrinter::mark_reclaim(hr); + _g1h->concurrent_refine()->notify_region_reclaimed(hr); } void reclaim_empty_humongous_region(G1HeapRegion* hr) { @@ -71,8 +73,8 @@ struct G1UpdateRegionLivenessAndSelectForRebuildTask::G1OnRegionClosure : public auto on_humongous_region = [&] (G1HeapRegion* hr) { assert(hr->is_humongous(), "precondition"); - reclaim_empty_region(hr); _num_humongous_regions_removed++; + reclaim_empty_region_common(hr); _g1h->free_humongous_region(hr, _local_cleanup_list); }; @@ -82,8 +84,8 @@ struct G1UpdateRegionLivenessAndSelectForRebuildTask::G1OnRegionClosure : public void reclaim_empty_old_region(G1HeapRegion* hr) { assert(hr->is_old(), "precondition"); - reclaim_empty_region(hr); _num_old_regions_removed++; + reclaim_empty_region_common(hr); _g1h->free_region(hr, _local_cleanup_list); } diff --git a/src/hotspot/share/gc/g1/g1ConcurrentRebuildAndScrub.cpp b/src/hotspot/share/gc/g1/g1ConcurrentRebuildAndScrub.cpp index 0633e18411d..cd560a41333 100644 --- a/src/hotspot/share/gc/g1/g1ConcurrentRebuildAndScrub.cpp +++ b/src/hotspot/share/gc/g1/g1ConcurrentRebuildAndScrub.cpp @@ -245,7 +245,7 @@ class G1RebuildRSAndScrubTask : public WorkerTask { G1RebuildRSAndScrubRegionClosure(G1ConcurrentMark* cm, bool should_rebuild_remset, uint worker_id) : _cm(cm), _bitmap(_cm->mark_bitmap()), - _rebuild_closure(G1CollectedHeap::heap(), worker_id), + _rebuild_closure(G1CollectedHeap::heap(), worker_id + cm->worker_id_offset()), _should_rebuild_remset(should_rebuild_remset), _processed_words(0) { } diff --git a/src/hotspot/share/gc/g1/g1ConcurrentRefine.cpp b/src/hotspot/share/gc/g1/g1ConcurrentRefine.cpp index 84776b7a4b1..ed6a9ad4292 100644 --- a/src/hotspot/share/gc/g1/g1ConcurrentRefine.cpp +++ b/src/hotspot/share/gc/g1/g1ConcurrentRefine.cpp @@ -22,15 +22,20 @@ * */ +#include "gc/g1/g1Analytics.hpp" #include "gc/g1/g1BarrierSet.hpp" +#include "gc/g1/g1CardTableClaimTable.inline.hpp" +#include "gc/g1/g1CollectedHeap.inline.hpp" #include "gc/g1/g1CollectionSet.hpp" #include "gc/g1/g1ConcurrentRefine.hpp" +#include "gc/g1/g1ConcurrentRefineSweepTask.hpp" #include "gc/g1/g1ConcurrentRefineThread.hpp" -#include "gc/g1/g1DirtyCardQueue.hpp" #include "gc/g1/g1HeapRegion.inline.hpp" #include "gc/g1/g1HeapRegionRemSet.inline.hpp" #include "gc/g1/g1Policy.hpp" #include "gc/shared/gc_globals.hpp" +#include 
"gc/shared/gcTraceTime.inline.hpp" +#include "gc/shared/workerThread.hpp" #include "logging/log.hpp" #include "memory/allocation.inline.hpp" #include "memory/iterator.hpp" @@ -38,17 +43,15 @@ #include "runtime/mutexLocker.hpp" #include "utilities/debug.hpp" #include "utilities/globalDefinitions.hpp" +#include "utilities/ticks.hpp" #include -G1ConcurrentRefineThread* G1ConcurrentRefineThreadControl::create_refinement_thread(uint worker_id, bool initializing) { +G1ConcurrentRefineThread* G1ConcurrentRefineThreadControl::create_refinement_thread() { G1ConcurrentRefineThread* result = nullptr; - if (initializing || !InjectGCWorkerCreationFailure) { - result = G1ConcurrentRefineThread::create(_cr, worker_id); - } + result = G1ConcurrentRefineThread::create(_cr); if (result == nullptr || result->osthread() == nullptr) { - log_warning(gc)("Failed to create refinement thread %u, no more %s", - worker_id, + log_warning(gc)("Failed to create refinement control thread, no more %s", result == nullptr ? "memory" : "OS threads"); if (result != nullptr) { delete result; @@ -60,106 +63,392 @@ G1ConcurrentRefineThread* G1ConcurrentRefineThreadControl::create_refinement_thr G1ConcurrentRefineThreadControl::G1ConcurrentRefineThreadControl(uint max_num_threads) : _cr(nullptr), - _threads(max_num_threads) + _control_thread(nullptr), + _workers(nullptr), + _max_num_threads(max_num_threads) {} G1ConcurrentRefineThreadControl::~G1ConcurrentRefineThreadControl() { - while (_threads.is_nonempty()) { - delete _threads.pop(); - } -} - -bool G1ConcurrentRefineThreadControl::ensure_threads_created(uint worker_id, bool initializing) { - assert(worker_id < max_num_threads(), "precondition"); - - while ((uint)_threads.length() <= worker_id) { - G1ConcurrentRefineThread* rt = create_refinement_thread(_threads.length(), initializing); - if (rt == nullptr) { - return false; - } - _threads.push(rt); - } - - return true; + delete _control_thread; + delete _workers; } jint G1ConcurrentRefineThreadControl::initialize(G1ConcurrentRefine* cr) { assert(cr != nullptr, "G1ConcurrentRefine must not be null"); _cr = cr; - if (max_num_threads() > 0) { - _threads.push(create_refinement_thread(0, true)); - if (_threads.at(0) == nullptr) { - vm_shutdown_during_initialization("Could not allocate primary refinement thread"); + if (is_refinement_enabled()) { + _control_thread = create_refinement_thread(); + if (_control_thread == nullptr) { + vm_shutdown_during_initialization("Could not allocate refinement control thread"); return JNI_ENOMEM; } - - if (!UseDynamicNumberOfGCThreads) { - if (!ensure_threads_created(max_num_threads() - 1, true)) { - vm_shutdown_during_initialization("Could not allocate refinement threads"); - return JNI_ENOMEM; - } - } + _workers = new WorkerThreads("G1 Refinement Workers", max_num_threads()); + _workers->initialize_workers(); } - return JNI_OK; } #ifdef ASSERT -void G1ConcurrentRefineThreadControl::assert_current_thread_is_primary_refinement_thread() const { - assert(Thread::current() == _threads.at(0), "Not primary thread"); +void G1ConcurrentRefineThreadControl::assert_current_thread_is_control_refinement_thread() const { + assert(Thread::current() == _control_thread, "Not refinement control thread"); } #endif // ASSERT -bool G1ConcurrentRefineThreadControl::activate(uint worker_id) { - if (ensure_threads_created(worker_id, false)) { - _threads.at(worker_id)->activate(); - return true; - } +void G1ConcurrentRefineThreadControl::activate() { + _control_thread->activate(); +} - return false; +void 
G1ConcurrentRefineThreadControl::run_task(WorkerTask* task, uint num_workers) { + assert(num_workers >= 1, "must be"); + + WithActiveWorkers w(_workers, num_workers); + _workers->run_task(task); +} + +void G1ConcurrentRefineThreadControl::control_thread_do(ThreadClosure* tc) { + if (is_refinement_enabled()) { + tc->do_thread(_control_thread); + } } void G1ConcurrentRefineThreadControl::worker_threads_do(ThreadClosure* tc) { - for (G1ConcurrentRefineThread* t : _threads) { - tc->do_thread(t); + if (is_refinement_enabled()) { + _workers->threads_do(tc); } } void G1ConcurrentRefineThreadControl::stop() { - for (G1ConcurrentRefineThread* t : _threads) { - t->stop(); + if (is_refinement_enabled()) { + _control_thread->stop(); } } +G1ConcurrentRefineSweepState::G1ConcurrentRefineSweepState(uint max_reserved_regions) : + _state(State::Idle), + _sweep_table(new G1CardTableClaimTable(G1CollectedHeap::get_chunks_per_region_for_merge())), + _stats() +{ + _sweep_table->initialize(max_reserved_regions); +} + +G1ConcurrentRefineSweepState::~G1ConcurrentRefineSweepState() { + delete _sweep_table; +} + +void G1ConcurrentRefineSweepState::set_state_start_time() { + _state_start[static_cast(_state)] = Ticks::now(); +} + +Tickspan G1ConcurrentRefineSweepState::get_duration(State start, State end) { + return _state_start[static_cast(end)] - _state_start[static_cast(start)]; +} + +void G1ConcurrentRefineSweepState::reset_stats() { + stats()->reset(); +} + +void G1ConcurrentRefineSweepState::add_yield_during_sweep_duration(jlong duration) { + stats()->inc_yield_during_sweep_duration(duration); +} + +bool G1ConcurrentRefineSweepState::advance_state(State next_state) { + bool result = is_in_progress(); + if (result) { + _state = next_state; + } else { + _state = State::Idle; + } + return result; +} + +void G1ConcurrentRefineSweepState::assert_state(State expected) { + assert(_state == expected, "must be %s but is %s", state_name(expected), state_name(_state)); +} + +void G1ConcurrentRefineSweepState::start_work() { + assert_state(State::Idle); + + set_state_start_time(); + + _stats.reset(); + + _state = State::SwapGlobalCT; +} + +bool G1ConcurrentRefineSweepState::swap_global_card_table() { + assert_state(State::SwapGlobalCT); + + GCTraceTime(Info, gc, refine) tm("Concurrent Refine Global Card Table Swap"); + set_state_start_time(); + + { + // We can't have any new threads being in the process of created while we + // swap the card table because we read the current card table state during + // initialization. + // A safepoint may occur during that time, so leave the STS temporarily. + SuspendibleThreadSetLeaver sts_leave; + + MutexLocker mu(Threads_lock); + // A GC that advanced the epoch might have happened, which already switched + // The global card table. Do nothing. + if (is_in_progress()) { + G1BarrierSet::g1_barrier_set()->swap_global_card_table(); + } + } + + return advance_state(State::SwapJavaThreadsCT); +} + +bool G1ConcurrentRefineSweepState::swap_java_threads_ct() { + assert_state(State::SwapJavaThreadsCT); + + GCTraceTime(Info, gc, refine) tm("Concurrent Refine Java Thread CT swap"); + + set_state_start_time(); + + { + // Need to leave the STS to avoid potential deadlock in the handshake. 
+ SuspendibleThreadSetLeaver sts; + + class G1SwapThreadCardTableClosure : public HandshakeClosure { + public: + G1SwapThreadCardTableClosure() : HandshakeClosure("G1 Java Thread CT swap") { } + + virtual void do_thread(Thread* thread) { + G1BarrierSet* bs = G1BarrierSet::g1_barrier_set(); + bs->update_card_table_base(thread); + } + } cl; + Handshake::execute(&cl); + } + + return advance_state(State::SynchronizeGCThreads); + } + +bool G1ConcurrentRefineSweepState::swap_gc_threads_ct() { + assert_state(State::SynchronizeGCThreads); + + GCTraceTime(Info, gc, refine) tm("Concurrent Refine GC Thread CT swap"); + + set_state_start_time(); + + { + class RendezvousGCThreads: public VM_Operation { + public: + VMOp_Type type() const { return VMOp_G1RendezvousGCThreads; } + + virtual bool evaluate_at_safepoint() const { + // We only care about synchronizing the GC threads. + // Leave the Java threads running. + return false; + } + + virtual bool skip_thread_oop_barriers() const { + fatal("Concurrent VMOps should not call this"); + return true; + } + + void doit() { + // Light weight "handshake" of the GC threads for memory synchronization; + // both changes to the Java heap need to be synchronized as well as the + // previous global card table reference change, so that no GC thread + // accesses the wrong card table. + // For example in the rebuild remset process the marking threads write + // marks into the card table, and that card table reference must be the + // correct one. + SuspendibleThreadSet::synchronize(); + SuspendibleThreadSet::desynchronize(); + }; + } op; + + SuspendibleThreadSetLeaver sts_leave; + VMThread::execute(&op); + } + + return advance_state(State::SnapshotHeap); +} + +void G1ConcurrentRefineSweepState::snapshot_heap(bool concurrent) { + if (concurrent) { + GCTraceTime(Info, gc, refine) tm("Concurrent Refine Snapshot Heap"); + + assert_state(State::SnapshotHeap); + + set_state_start_time(); + + snapshot_heap_inner(); + + advance_state(State::SweepRT); + } else { + assert_state(State::Idle); + assert_at_safepoint(); + + snapshot_heap_inner(); + } +} + +void G1ConcurrentRefineSweepState::sweep_refinement_table_start() { + assert_state(State::SweepRT); + + set_state_start_time(); +} + +bool G1ConcurrentRefineSweepState::sweep_refinement_table_step() { + assert_state(State::SweepRT); + + GCTraceTime(Info, gc, refine) tm("Concurrent Refine Table Step"); + + G1ConcurrentRefine* cr = G1CollectedHeap::heap()->concurrent_refine(); + + G1ConcurrentRefineSweepTask task(_sweep_table, &_stats, cr->num_threads_wanted()); + cr->run_with_refinement_workers(&task); + + if (task.sweep_completed()) { + advance_state(State::CompleteRefineWork); + return true; + } else { + return false; + } +} + +bool G1ConcurrentRefineSweepState::complete_work(bool concurrent, bool print_log) { + if (concurrent) { + assert_state(State::CompleteRefineWork); + } else { + // May have been forced to complete at any other time. 
+ assert(is_in_progress() && _state != State::CompleteRefineWork, "must be but is %s", state_name(_state)); + } + + set_state_start_time(); + + if (print_log) { + G1ConcurrentRefineStats* s = &_stats; + + log_debug(gc, refine)("Refinement took %.2fms (pre-sweep %.2fms card refine %.2f) " + "(scanned %zu clean %zu (%.2f%%) not_clean %zu (%.2f%%) not_parsable %zu " + "refers_to_cset %zu (%.2f%%) still_refers_to_cset %zu (%.2f%%) no_cross_region %zu pending %zu)", + get_duration(State::Idle, _state).seconds() * 1000.0, + get_duration(State::Idle, State::SweepRT).seconds() * 1000.0, + TimeHelper::counter_to_millis(s->refine_duration()), + s->cards_scanned(), + s->cards_clean(), + percent_of(s->cards_clean(), s->cards_scanned()), + s->cards_not_clean(), + percent_of(s->cards_not_clean(), s->cards_scanned()), + s->cards_not_parsable(), + s->cards_refer_to_cset(), + percent_of(s->cards_refer_to_cset(), s->cards_not_clean()), + s->cards_already_refer_to_cset(), + percent_of(s->cards_already_refer_to_cset(), s->cards_not_clean()), + s->cards_no_cross_region(), + s->cards_pending() + ); + } + + bool has_sweep_rt_work = _state == State::SweepRT; + + advance_state(State::Idle); + return has_sweep_rt_work; +} + +void G1ConcurrentRefineSweepState::snapshot_heap_inner() { + // G1CollectedHeap::heap_region_iterate() below will only visit currently committed + // regions. Initialize all entries in the state table here and later in this method + // selectively enable regions that we are interested. This way regions committed + // later will be automatically excluded from iteration. + // Their refinement table must be completely empty anyway. + _sweep_table->reset_all_to_claimed(); + + class SnapshotRegionsClosure : public G1HeapRegionClosure { + G1CardTableClaimTable* _sweep_table; + + public: + SnapshotRegionsClosure(G1CardTableClaimTable* sweep_table) : G1HeapRegionClosure(), _sweep_table(sweep_table) { } + + bool do_heap_region(G1HeapRegion* r) override { + if (!r->is_free()) { + // Need to scan all parts of non-free regions, so reset the claim. + // No need for synchronization: we are only interested in regions + // that were allocated before the handshake; the handshake makes such + // regions' metadata visible to all threads, and we do not care about + // humongous regions that were allocated afterwards. + _sweep_table->reset_to_unclaimed(r->hrm_index()); + } + return false; + } + } cl(_sweep_table); + G1CollectedHeap::heap()->heap_region_iterate(&cl); +} + +bool G1ConcurrentRefineSweepState::is_in_progress() const { + return _state != State::Idle; +} + +bool G1ConcurrentRefineSweepState::are_java_threads_synched() const { + return _state > State::SwapJavaThreadsCT || !is_in_progress(); +} + uint64_t G1ConcurrentRefine::adjust_threads_period_ms() const { // Instead of a fixed value, this could be a command line option. But then // we might also want to allow configuration of adjust_threads_wait_ms(). - return 50; + + // Use a prime number close to 50ms, different to other components that derive + // their wait time from the try_get_available_bytes_estimate() call to minimize + // interference. + return 53; } static size_t minimum_pending_cards_target() { - // One buffer per thread. 
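snapshot_heap_inner() above relies on three claim-table operations of G1CardTableClaimTable: start with every region fully claimed, re-open only the regions that existed and were non-free at snapshot time, and close a region again if it is reclaimed while refinement is running. The sketch below is a schematic model of those operations; the vector-of-cursors layout is an assumption for illustration, not the actual G1CardTableClaimTable implementation.

#include <algorithm>
#include <cstddef>
#include <vector>

// Schematic claim-table model (assumed layout, for illustration only).
class ClaimTableSketch {
  size_t _chunks_per_region;
  std::vector<size_t> _claim;   // claim cursor per region; == _chunks_per_region means fully claimed

public:
  ClaimTableSketch(size_t num_regions, size_t chunks_per_region)
    : _chunks_per_region(chunks_per_region),
      _claim(num_regions, chunks_per_region) {}

  // Mark every region as already claimed; regions committed after the snapshot
  // keep this state and are therefore skipped by the sweep.
  void reset_all_to_claimed() {
    std::fill(_claim.begin(), _claim.end(), _chunks_per_region);
  }

  // Re-open a region that existed (and was not free) at snapshot time.
  void reset_to_unclaimed(size_t region) { _claim[region] = 0; }

  // A region reclaimed during refinement is closed again; see
  // notify_region_reclaimed() earlier in this file.
  void claim_all_cards(size_t region) { _claim[region] = _chunks_per_region; }
};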
- return ParallelGCThreads * G1UpdateBufferSize; + return ParallelGCThreads * G1PerThreadPendingCardThreshold; } -G1ConcurrentRefine::G1ConcurrentRefine(G1Policy* policy) : - _policy(policy), - _threads_wanted(0), +G1ConcurrentRefine::G1ConcurrentRefine(G1CollectedHeap* g1h) : + _policy(g1h->policy()), + _num_threads_wanted(0), _pending_cards_target(PendingCardsTargetUninitialized), _last_adjust(), _needs_adjust(false), - _threads_needed(policy, adjust_threads_period_ms()), + _heap_was_locked(false), + _threads_needed(g1h->policy(), adjust_threads_period_ms()), _thread_control(G1ConcRefinementThreads), - _dcqs(G1BarrierSet::dirty_card_queue_set()) -{} + _sweep_state(g1h->max_num_regions()) +{ } jint G1ConcurrentRefine::initialize() { return _thread_control.initialize(this); } -G1ConcurrentRefine* G1ConcurrentRefine::create(G1Policy* policy, jint* ecode) { - G1ConcurrentRefine* cr = new G1ConcurrentRefine(policy); +G1ConcurrentRefineSweepState& G1ConcurrentRefine::sweep_state_for_merge() { + bool has_sweep_claims = sweep_state().complete_work(false /* concurrent */); + if (has_sweep_claims) { + log_debug(gc, refine)("Continue existing work"); + } else { + // Refinement has been interrupted without having a snapshot. There may + // be a mix of already swapped and not-swapped card tables assigned to threads, + // so they might have already dirtied the swapped card tables. + // Conservatively scan all (non-free, non-committed) region's card tables, + // creating the snapshot right now. + log_debug(gc, refine)("Create work from scratch"); + + sweep_state().snapshot_heap(false /* concurrent */); + } + return sweep_state(); +} + +void G1ConcurrentRefine::run_with_refinement_workers(WorkerTask* task) { + _thread_control.run_task(task, num_threads_wanted()); +} + +void G1ConcurrentRefine::notify_region_reclaimed(G1HeapRegion* r) { + assert_at_safepoint(); + if (_sweep_state.is_in_progress()) { + _sweep_state.sweep_table()->claim_all_cards(r->hrm_index()); + } +} + +G1ConcurrentRefine* G1ConcurrentRefine::create(G1CollectedHeap* g1h, jint* ecode) { + G1ConcurrentRefine* cr = new G1ConcurrentRefine(g1h); *ecode = cr->initialize(); if (*ecode != 0) { delete cr; @@ -176,25 +465,31 @@ G1ConcurrentRefine::~G1ConcurrentRefine() { } void G1ConcurrentRefine::threads_do(ThreadClosure *tc) { + worker_threads_do(tc); + control_thread_do(tc); +} + +void G1ConcurrentRefine::worker_threads_do(ThreadClosure *tc) { _thread_control.worker_threads_do(tc); } -void G1ConcurrentRefine::update_pending_cards_target(double logged_cards_time_ms, - size_t processed_logged_cards, - size_t predicted_thread_buffer_cards, +void G1ConcurrentRefine::control_thread_do(ThreadClosure *tc) { + _thread_control.control_thread_do(tc); +} + +void G1ConcurrentRefine::update_pending_cards_target(double pending_cards_time_ms, + size_t processed_pending_cards, double goal_ms) { size_t minimum = minimum_pending_cards_target(); - if ((processed_logged_cards < minimum) || (logged_cards_time_ms == 0.0)) { - log_debug(gc, ergo, refine)("Unchanged pending cards target: %zu", - _pending_cards_target); + if ((processed_pending_cards < minimum) || (pending_cards_time_ms == 0.0)) { + log_debug(gc, ergo, refine)("Unchanged pending cards target: %zu (processed %zu minimum %zu time %1.2f)", + _pending_cards_target, processed_pending_cards, minimum, pending_cards_time_ms); return; } // Base the pending cards budget on the measured rate. 
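// Worked example for the computation below (numbers are invented; only the
// arithmetic mirrors this hunk): suppose the last cycle refined
// processed_pending_cards = 20000 cards in pending_cards_time_ms = 10.0 ms,
// so rate = 20000 / 10.0 = 2000 cards/ms. With goal_ms = 5.0 ms the new
// budget is 5.0 * 2000 = 10000 cards; with a previous target of 8000 cards
// the hysteresis below averages to (10000 + 8000) / 2 = 9000 pending cards.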
- double rate = processed_logged_cards / logged_cards_time_ms; - size_t budget = static_cast(goal_ms * rate); - // Deduct predicted cards in thread buffers to get target. - size_t new_target = budget - MIN2(budget, predicted_thread_buffer_cards); + double rate = processed_pending_cards / pending_cards_time_ms; + size_t new_target = static_cast(goal_ms * rate); // Add some hysteresis with previous values. if (is_pending_cards_target_initialized()) { new_target = (new_target + _pending_cards_target) / 2; @@ -205,46 +500,36 @@ void G1ConcurrentRefine::update_pending_cards_target(double logged_cards_time_ms log_debug(gc, ergo, refine)("New pending cards target: %zu", new_target); } -void G1ConcurrentRefine::adjust_after_gc(double logged_cards_time_ms, - size_t processed_logged_cards, - size_t predicted_thread_buffer_cards, +void G1ConcurrentRefine::adjust_after_gc(double pending_cards_time_ms, + size_t processed_pending_cards, double goal_ms) { - if (!G1UseConcRefinement) return; + if (!G1UseConcRefinement) { + return; + } - update_pending_cards_target(logged_cards_time_ms, - processed_logged_cards, - predicted_thread_buffer_cards, + update_pending_cards_target(pending_cards_time_ms, + processed_pending_cards, goal_ms); - if (_thread_control.max_num_threads() == 0) { - // If no refinement threads then the mutator threshold is the target. - _dcqs.set_mutator_refinement_threshold(_pending_cards_target); - } else { - // Provisionally make the mutator threshold unlimited, to be updated by - // the next periodic adjustment. Because card state may have changed - // drastically, record that adjustment is needed and kick the primary - // thread, in case it is waiting. - _dcqs.set_mutator_refinement_threshold(SIZE_MAX); + if (_thread_control.is_refinement_enabled()) { _needs_adjust = true; if (is_pending_cards_target_initialized()) { - _thread_control.activate(0); + _thread_control.activate(); } } } -// Wake up the primary thread less frequently when the time available until -// the next GC is longer. But don't increase the wait time too rapidly. -// This reduces the number of primary thread wakeups that just immediately -// go back to waiting, while still being responsive to behavior changes. -static uint64_t compute_adjust_wait_time_ms(double available_ms) { - return static_cast(sqrt(available_ms) * 4.0); -} - uint64_t G1ConcurrentRefine::adjust_threads_wait_ms() const { - assert_current_thread_is_primary_refinement_thread(); + assert_current_thread_is_control_refinement_thread(); if (is_pending_cards_target_initialized()) { - double available_ms = _threads_needed.predicted_time_until_next_gc_ms(); - uint64_t wait_time_ms = compute_adjust_wait_time_ms(available_ms); - return MAX2(wait_time_ms, adjust_threads_period_ms()); + // Retry asap when the cause for not getting a prediction was that we temporarily + // did not get the heap lock. Otherwise we might wait for too long until we get + // back here. + if (_heap_was_locked) { + return 1; + } + double available_time_ms = _threads_needed.predicted_time_until_next_gc_ms(); + + return _policy->adjust_wait_time_ms(available_time_ms, adjust_threads_period_ms()); } else { // If target not yet initialized then wait forever (until explicitly // activated). 
This happens during startup, when we don't bother with @@ -253,185 +538,74 @@ uint64_t G1ConcurrentRefine::adjust_threads_wait_ms() const { } } -class G1ConcurrentRefine::RemSetSamplingClosure : public G1HeapRegionClosure { - size_t _sampled_code_root_rs_length; +bool G1ConcurrentRefine::adjust_num_threads_periodically() { + assert_current_thread_is_control_refinement_thread(); -public: - RemSetSamplingClosure() : - _sampled_code_root_rs_length(0) {} - - bool do_heap_region(G1HeapRegion* r) override { - G1HeapRegionRemSet* rem_set = r->rem_set(); - _sampled_code_root_rs_length += rem_set->code_roots_list_length(); - return false; - } - - size_t sampled_code_root_rs_length() const { return _sampled_code_root_rs_length; } -}; - -// Adjust the target length (in regions) of the young gen, based on the -// current length of the remembered sets. -// -// At the end of the GC G1 determines the length of the young gen based on -// how much time the next GC can take, and when the next GC may occur -// according to the MMU. -// -// The assumption is that a significant part of the GC is spent on scanning -// the remembered sets (and many other components), so this thread constantly -// reevaluates the prediction for the remembered set scanning costs, and potentially -// resizes the young gen. This may do a premature GC or even increase the young -// gen size to keep pause time length goal. -void G1ConcurrentRefine::adjust_young_list_target_length() { - if (_policy->use_adaptive_young_list_length()) { - G1CollectedHeap* g1h = G1CollectedHeap::heap(); - G1CollectionSet* cset = g1h->collection_set(); - RemSetSamplingClosure cl; - cset->iterate(&cl); - - size_t card_rs_length = g1h->young_regions_cardset()->occupied(); - - size_t sampled_code_root_rs_length = cl.sampled_code_root_rs_length(); - _policy->revise_young_list_target_length(card_rs_length, sampled_code_root_rs_length); - } -} - -bool G1ConcurrentRefine::adjust_threads_periodically() { - assert_current_thread_is_primary_refinement_thread(); - - // Check whether it's time to do a periodic adjustment. + _heap_was_locked = false; + // Check whether it's time to do a periodic adjustment if there is no explicit + // request pending. We might have spuriously woken up. if (!_needs_adjust) { Tickspan since_adjust = Ticks::now() - _last_adjust; - if (since_adjust.milliseconds() >= adjust_threads_period_ms()) { - _needs_adjust = true; + if (since_adjust.milliseconds() < adjust_threads_period_ms()) { + _num_threads_wanted = 0; + return false; } } - // If needed, try to adjust threads wanted. - if (_needs_adjust) { - // Getting used young bytes requires holding Heap_lock. But we can't use - // normal lock and block until available. Blocking on the lock could - // deadlock with a GC VMOp that is holding the lock and requesting a - // safepoint. Instead try to lock, and if fail then skip adjustment for - // this iteration of the thread, do some refinement work, and retry the - // adjustment later. - if (Heap_lock->try_lock()) { - size_t used_bytes = _policy->estimate_used_young_bytes_locked(); - Heap_lock->unlock(); - adjust_young_list_target_length(); - size_t young_bytes = _policy->young_list_target_length() * G1HeapRegion::GrainBytes; - size_t available_bytes = young_bytes - MIN2(young_bytes, used_bytes); - adjust_threads_wanted(available_bytes); - _needs_adjust = false; - _last_adjust = Ticks::now(); - return true; - } + // Reset pending request. 
+ _needs_adjust = false; + size_t available_bytes = 0; + if (_policy->try_get_available_bytes_estimate(available_bytes)) { + adjust_threads_wanted(available_bytes); + _last_adjust = Ticks::now(); + } else { + _heap_was_locked = true; + // Defer adjustment to next time. + _needs_adjust = true; } - return false; -} - -bool G1ConcurrentRefine::is_in_last_adjustment_period() const { - return _threads_needed.predicted_time_until_next_gc_ms() <= adjust_threads_period_ms(); + return (_num_threads_wanted > 0) && !heap_was_locked(); } void G1ConcurrentRefine::adjust_threads_wanted(size_t available_bytes) { - assert_current_thread_is_primary_refinement_thread(); - size_t num_cards = _dcqs.num_cards(); - size_t mutator_threshold = SIZE_MAX; - uint old_wanted = AtomicAccess::load(&_threads_wanted); + assert_current_thread_is_control_refinement_thread(); - _threads_needed.update(old_wanted, + G1Policy* policy = G1CollectedHeap::heap()->policy(); + const G1Analytics* analytics = policy->analytics(); + + size_t num_cards = policy->current_pending_cards(); + + _threads_needed.update(_num_threads_wanted, available_bytes, num_cards, _pending_cards_target); uint new_wanted = _threads_needed.threads_needed(); if (new_wanted > _thread_control.max_num_threads()) { - // If running all the threads can't reach goal, turn on refinement by - // mutator threads. Using target as the threshold may be stronger - // than required, but will do the most to get us under goal, and we'll - // reevaluate with the next adjustment. - mutator_threshold = _pending_cards_target; + // Bound the wanted threads by maximum available. new_wanted = _thread_control.max_num_threads(); - } else if (is_in_last_adjustment_period()) { - // If very little time remains until GC, enable mutator refinement. If - // the target has been reached, this keeps the number of pending cards on - // target even if refinement threads deactivate in the meantime. And if - // the target hasn't been reached, this prevents things from getting - // worse. - mutator_threshold = _pending_cards_target; } - AtomicAccess::store(&_threads_wanted, new_wanted); - _dcqs.set_mutator_refinement_threshold(mutator_threshold); - log_debug(gc, refine)("Concurrent refinement: wanted %u, cards: %zu, " - "predicted: %zu, time: %1.2fms", + + _num_threads_wanted = new_wanted; + + log_debug(gc, refine)("Concurrent refinement: wanted %u, pending cards: %zu (pending-from-gc %zu), " + "predicted: %zu, goal %zu, time-until-next-gc: %1.2fms pred-refine-rate %1.2fc/ms log-rate %1.2fc/ms", new_wanted, num_cards, + G1CollectedHeap::heap()->policy()->pending_cards_from_gc(), _threads_needed.predicted_cards_at_next_gc(), - _threads_needed.predicted_time_until_next_gc_ms()); - // Activate newly wanted threads. The current thread is the primary - // refinement thread, so is already active. - for (uint i = MAX2(old_wanted, 1u); i < new_wanted; ++i) { - if (!_thread_control.activate(i)) { - // Failed to allocate and activate thread. Stop trying to activate, and - // instead use mutator threads to make up the gap. - AtomicAccess::store(&_threads_wanted, i); - _dcqs.set_mutator_refinement_threshold(_pending_cards_target); - break; - } - } -} - -void G1ConcurrentRefine::reduce_threads_wanted() { - assert_current_thread_is_primary_refinement_thread(); - if (!_needs_adjust) { // Defer if adjustment request is active. 
- uint wanted = AtomicAccess::load(&_threads_wanted); - if (wanted > 0) { - AtomicAccess::store(&_threads_wanted, --wanted); - } - // If very little time remains until GC, enable mutator refinement. If - // the target has been reached, this keeps the number of pending cards on - // target even as refinement threads deactivate in the meantime. - if (is_in_last_adjustment_period()) { - _dcqs.set_mutator_refinement_threshold(_pending_cards_target); - } - } -} - -bool G1ConcurrentRefine::is_thread_wanted(uint worker_id) const { - return worker_id < AtomicAccess::load(&_threads_wanted); + _pending_cards_target, + _threads_needed.predicted_time_until_next_gc_ms(), + analytics->predict_concurrent_refine_rate_ms(), + analytics->predict_dirtied_cards_rate_ms() + ); } bool G1ConcurrentRefine::is_thread_adjustment_needed() const { - assert_current_thread_is_primary_refinement_thread(); + assert_current_thread_is_control_refinement_thread(); return _needs_adjust; } void G1ConcurrentRefine::record_thread_adjustment_needed() { - assert_current_thread_is_primary_refinement_thread(); + assert_current_thread_is_control_refinement_thread(); _needs_adjust = true; } - -G1ConcurrentRefineStats G1ConcurrentRefine::get_and_reset_refinement_stats() { - struct CollectStats : public ThreadClosure { - G1ConcurrentRefineStats _total_stats; - virtual void do_thread(Thread* t) { - G1ConcurrentRefineThread* crt = static_cast(t); - G1ConcurrentRefineStats& stats = *crt->refinement_stats(); - _total_stats += stats; - stats.reset(); - } - } collector; - threads_do(&collector); - return collector._total_stats; -} - -uint G1ConcurrentRefine::worker_id_offset() { - return G1DirtyCardQueueSet::num_par_ids(); -} - -bool G1ConcurrentRefine::try_refinement_step(uint worker_id, - size_t stop_at, - G1ConcurrentRefineStats* stats) { - uint adjusted_id = worker_id + worker_id_offset(); - return _dcqs.refine_completed_buffer_concurrently(adjusted_id, stop_at, stats); -} diff --git a/src/hotspot/share/gc/g1/g1ConcurrentRefine.hpp b/src/hotspot/share/gc/g1/g1ConcurrentRefine.hpp index dd0b62a22ea..5e96ed738fd 100644 --- a/src/hotspot/share/gc/g1/g1ConcurrentRefine.hpp +++ b/src/hotspot/share/gc/g1/g1ConcurrentRefine.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2023, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -34,23 +34,28 @@ #include "utilities/macros.hpp" // Forward decl +class G1CardTableClaimTable; +class G1CollectedHeap; class G1ConcurrentRefine; class G1ConcurrentRefineThread; -class G1DirtyCardQueueSet; +class G1HeapRegion; class G1Policy; class ThreadClosure; +class WorkerTask; +class WorkerThreads; // Helper class for refinement thread management. Used to start, stop and // iterate over them. class G1ConcurrentRefineThreadControl { G1ConcurrentRefine* _cr; - GrowableArrayCHeap _threads; + G1ConcurrentRefineThread* _control_thread; + + WorkerThreads* _workers; + uint _max_num_threads; // Create the refinement thread for the given worker id. // If initializing is true, ignore InjectGCWorkerCreationFailure. 
- G1ConcurrentRefineThread* create_refinement_thread(uint worker_id, bool initializing); - - bool ensure_threads_created(uint worker_id, bool initializing); + G1ConcurrentRefineThread* create_refinement_thread(); NONCOPYABLE(G1ConcurrentRefineThreadControl); @@ -60,21 +65,119 @@ public: jint initialize(G1ConcurrentRefine* cr); - void assert_current_thread_is_primary_refinement_thread() const NOT_DEBUG_RETURN; + void assert_current_thread_is_control_refinement_thread() const NOT_DEBUG_RETURN; - uint max_num_threads() const { return _threads.capacity(); } + uint max_num_threads() const { return _max_num_threads; } + bool is_refinement_enabled() const { return _max_num_threads > 0; } - // Activate the indicated thread. If the thread has not yet been allocated, - // allocate and then activate. If allocation is needed and fails, return - // false. Otherwise return true. - // precondition: worker_id < max_num_threads(). - // precondition: current thread is not the designated worker. - bool activate(uint worker_id); + // Activate the control thread. + void activate(); + void run_task(WorkerTask* task, uint num_workers); + + void control_thread_do(ThreadClosure* tc); void worker_threads_do(ThreadClosure* tc); void stop(); }; +// Tracks the current state of re-examining the dirty cards from idle to completion +// (and reset back to idle). +// +// The process steps are as follows: +// +// 1) Swap global card table pointers +// +// 2) Swap Java Thread's card table pointers +// +// 3) Synchronize GC Threads +// Ensures memory visibility +// +// After this point mutator threads should not mark the refinement table. +// +// 4) Snapshot the heap +// Determines which regions need to be swept. +// +// 5) Sweep Refinement table +// Examines non-Clean cards on the refinement table. +// +// 6) Completion Work +// Calculates statistics about the process to be used in various parts of +// the garbage collection. +// +// All but step 4 are interruptible by safepoints. In case of a garbage collection, +// the garbage collection will interrupt this process, and go to Idle state. +// +class G1ConcurrentRefineSweepState { + + enum class State : uint { + Idle, // Refinement is doing nothing. + SwapGlobalCT, // Swap global card table. + SwapJavaThreadsCT, // Swap java thread's card tables. + SynchronizeGCThreads, // Synchronize GC thread's memory view. + SnapshotHeap, // Take a snapshot of the region's top() values. + SweepRT, // Sweep the refinement table for pending (dirty) cards. + CompleteRefineWork, // Cleanup of refinement work, reset to idle. + Last + } _state; + + static const char* state_name(State state) { + static const char* _state_names[] = { + "Idle", + "Swap Global Card Table", + "Swap JavaThread Card Table", + "Synchronize GC Threads", + "Snapshot Heap", + "Sweep Refinement Table", + "Complete Sweep Work" + }; + + return _state_names[static_cast(state)]; + } + + // Current heap snapshot. + G1CardTableClaimTable* _sweep_table; + + // Start times for all states. + Ticks _state_start[static_cast(State::Last)]; + + void set_state_start_time(); + Tickspan get_duration(State start, State end); + + G1ConcurrentRefineStats _stats; + + // Advances the state to next_state if not interrupted by a changed epoch. Returns + // to Idle otherwise. 
+ bool advance_state(State next_state); + + void assert_state(State expected); + + void snapshot_heap_inner(); + +public: + G1ConcurrentRefineSweepState(uint max_reserved_regions); + ~G1ConcurrentRefineSweepState(); + + void start_work(); + + bool swap_global_card_table(); + bool swap_java_threads_ct(); + bool swap_gc_threads_ct(); + void snapshot_heap(bool concurrent = true); + void sweep_refinement_table_start(); + bool sweep_refinement_table_step(); + + bool complete_work(bool concurrent, bool print_log = true); + + G1CardTableClaimTable* sweep_table() { return _sweep_table; } + G1ConcurrentRefineStats* stats() { return &_stats; } + void reset_stats(); + + void add_yield_during_sweep_duration(jlong duration); + + bool is_in_progress() const; + bool are_java_threads_synched() const; +}; + // Controls concurrent refinement. // // Mutator threads produce dirty cards, which need to be examined for updates @@ -84,49 +187,43 @@ public: // pending dirty cards at the start of a GC can be processed within that time // budget. // -// Concurrent refinement is performed by a combination of dedicated threads -// and by mutator threads as they produce dirty cards. If configured to not -// have any dedicated threads (-XX:G1ConcRefinementThreads=0) then all -// concurrent refinement work is performed by mutator threads. When there are -// dedicated threads, they generally do most of the concurrent refinement -// work, to minimize throughput impact of refinement work on mutator threads. +// Concurrent refinement is performed by a set of dedicated threads. If configured +// to not have any dedicated threads (-XX:G1ConcRefinementThreads=0) then no +// refinement work is performed at all. // // This class determines the target number of dirty cards pending for the next // GC. It also owns the dedicated refinement threads and controls their // activation in order to achieve that target. // -// There are two kinds of dedicated refinement threads, a single primary -// thread and some number of secondary threads. When active, all refinement -// threads take buffers of dirty cards from the dirty card queue and process -// them. Between buffers they query this owning object to find out whether -// they should continue running, deactivating themselves if not. +// There are two kinds of dedicated refinement threads, a single control +// thread and some number of refinement worker threads. +// The control thread determines whether there is need to do work, and then starts +// an appropriate number of refinement worker threads to get back to the target +// number of pending dirty cards. +// +// The control wakes up periodically whether there is need to do refinement +// work, starting the refinement process as necessary. // -// The primary thread drives the control system that determines how many -// refinement threads should be active. If inactive, it wakes up periodically -// to recalculate the number of active threads needed, and activates -// additional threads as necessary. While active it also periodically -// recalculates the number wanted and activates more threads if needed. It -// also reduces the number of wanted threads when the target has been reached, -// triggering deactivations. 
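The block comment above lists the refinement sweep as a sequence of states, but the loop that drives it lives in the refinement threads, outside this hunk. The following sketch shows how a driver could walk the G1ConcurrentRefineSweepState API declared above; the function name and loop structure are illustrative, not the actual G1ConcurrentRefineThread code, which additionally interleaves the thread-count adjustment and wait logic.

// Illustrative driver for the state machine declared above. Each swap_* step
// returns false when a garbage collection has meanwhile reset the state back
// to Idle, in which case the sweep is abandoned and restarted later.
static void run_one_refinement_sweep(G1ConcurrentRefineSweepState& state) {
  state.start_work();                                   // Idle -> SwapGlobalCT

  if (!state.swap_global_card_table()) return;          // -> SwapJavaThreadsCT
  if (!state.swap_java_threads_ct())   return;          // -> SynchronizeGCThreads
  if (!state.swap_gc_threads_ct())     return;          // -> SnapshotHeap

  state.snapshot_heap(true /* concurrent */);           // -> SweepRT
  state.sweep_refinement_table_start();

  // Sweep in steps so safepoints can interleave; each step runs a
  // G1ConcurrentRefineSweepTask on the refinement workers and returns true
  // once the whole refinement table has been swept.
  while (state.is_in_progress() && !state.sweep_refinement_table_step()) {
    // not done yet, keep sweeping
  }

  if (state.is_in_progress()) {
    state.complete_work(true /* concurrent */);         // -> Idle, logs stats
  }
}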
class G1ConcurrentRefine : public CHeapObj { G1Policy* _policy; - volatile uint _threads_wanted; + volatile uint _num_threads_wanted; size_t _pending_cards_target; Ticks _last_adjust; Ticks _last_deactivate; bool _needs_adjust; + bool _heap_was_locked; // The heap has been locked the last time we tried to adjust the number of refinement threads. + G1ConcurrentRefineThreadsNeeded _threads_needed; G1ConcurrentRefineThreadControl _thread_control; - G1DirtyCardQueueSet& _dcqs; - G1ConcurrentRefine(G1Policy* policy); + G1ConcurrentRefineSweepState _sweep_state; - static uint worker_id_offset(); + G1ConcurrentRefine(G1CollectedHeap* g1h); jint initialize(); - void assert_current_thread_is_primary_refinement_thread() const { - _thread_control.assert_current_thread_is_primary_refinement_thread(); + void assert_current_thread_is_control_refinement_thread() const { + _thread_control.assert_current_thread_is_control_refinement_thread(); } // For the first few collection cycles we don't have a target (and so don't @@ -138,16 +235,11 @@ class G1ConcurrentRefine : public CHeapObj { return _pending_cards_target != PendingCardsTargetUninitialized; } - void update_pending_cards_target(double logged_cards_scan_time_ms, - size_t processed_logged_cards, - size_t predicted_thread_buffer_cards, + void update_pending_cards_target(double pending_cards_scan_time_ms, + size_t processed_pending_cards, double goal_ms); uint64_t adjust_threads_period_ms() const; - bool is_in_last_adjustment_period() const; - - class RemSetSamplingClosure; // Helper class for adjusting young length. - void adjust_young_list_target_length(); void adjust_threads_wanted(size_t available_bytes); @@ -156,67 +248,66 @@ class G1ConcurrentRefine : public CHeapObj { public: ~G1ConcurrentRefine(); + G1ConcurrentRefineSweepState& sweep_state() { return _sweep_state; } + + G1ConcurrentRefineSweepState& sweep_state_for_merge(); + + void run_with_refinement_workers(WorkerTask* task); + + void notify_region_reclaimed(G1HeapRegion* r); + // Returns a G1ConcurrentRefine instance if succeeded to create/initialize the // G1ConcurrentRefine instance. Otherwise, returns null with error code. - static G1ConcurrentRefine* create(G1Policy* policy, jint* ecode); + static G1ConcurrentRefine* create(G1CollectedHeap* g1h, jint* ecode); // Stop all the refinement threads. void stop(); // Called at the end of a GC to prepare for refinement during the next // concurrent phase. Updates the target for the number of pending dirty - // cards. Updates the mutator refinement threshold. Ensures the primary - // refinement thread (if it exists) is active, so it will adjust the number + // cards. Updates the mutator refinement threshold. Ensures the refinement + // control thread (if it exists) is active, so it will adjust the number // of running threads. - void adjust_after_gc(double logged_cards_scan_time_ms, - size_t processed_logged_cards, - size_t predicted_thread_buffer_cards, + void adjust_after_gc(double pending_cards_scan_time_ms, + size_t processed_pending_cards, double goal_ms); // Target number of pending dirty cards at the start of the next GC. size_t pending_cards_target() const { return _pending_cards_target; } - // May recalculate the number of refinement threads that should be active in - // order to meet the pending cards target. Returns true if adjustment was - // performed, and clears any pending request. 
Returns false if the - // adjustment period has not expired, or because a timed or requested - // adjustment could not be performed immediately and so was deferred. - // precondition: current thread is the primary refinement thread. - bool adjust_threads_periodically(); + // Recalculates the number of refinement threads that should be active in + // order to meet the pending cards target. + // Returns true if it could recalculate the number of threads and + // refinement threads should be started. + // Returns false if the adjustment period has not expired, or because a timed + // or requested adjustment could not be performed immediately and so was deferred. + bool adjust_num_threads_periodically(); - // The amount of time (in ms) the primary refinement thread should sleep + // The amount of time (in ms) the refinement control thread should sleep // when it is inactive. It requests adjustment whenever it is reactivated. - // precondition: current thread is the primary refinement thread. + // precondition: current thread is the refinement control thread. uint64_t adjust_threads_wait_ms() const; // Record a request for thread adjustment as soon as possible. - // precondition: current thread is the primary refinement thread. + // precondition: current thread is the refinement control thread. void record_thread_adjustment_needed(); // Test whether there is a pending request for thread adjustment. - // precondition: current thread is the primary refinement thread. + // precondition: current thread is the refinement control thread. bool is_thread_adjustment_needed() const; - // Reduce the number of active threads wanted. - // precondition: current thread is the primary refinement thread. - void reduce_threads_wanted(); + // Indicate that last refinement adjustment had been deferred due to not + // obtaining the heap lock. + bool heap_was_locked() const { return _heap_was_locked; } - // Test whether the thread designated by worker_id should be active. - bool is_thread_wanted(uint worker_id) const; - - // Return total of concurrent refinement stats for the - // ConcurrentRefineThreads. Also reset the stats for the threads. - G1ConcurrentRefineStats get_and_reset_refinement_stats(); - - // Perform a single refinement step; called by the refinement - // threads. Returns true if there was refinement work available. - // Updates stats. - bool try_refinement_step(uint worker_id, - size_t stop_at, - G1ConcurrentRefineStats* stats); + uint num_threads_wanted() const { return _num_threads_wanted; } + uint max_num_threads() const { return _thread_control.max_num_threads(); } // Iterate over all concurrent refinement threads applying the given closure. void threads_do(ThreadClosure *tc); + // Iterate over specific refinement threads applying the given closure. 
+ void worker_threads_do(ThreadClosure *tc); + void control_thread_do(ThreadClosure *tc); }; #endif // SHARE_GC_G1_G1CONCURRENTREFINE_HPP diff --git a/src/hotspot/share/gc/g1/g1ConcurrentRefineStats.cpp b/src/hotspot/share/gc/g1/g1ConcurrentRefineStats.cpp index 7f0bcc5b50f..83a09c55a3f 100644 --- a/src/hotspot/share/gc/g1/g1ConcurrentRefineStats.cpp +++ b/src/hotspot/share/gc/g1/g1ConcurrentRefineStats.cpp @@ -23,41 +23,33 @@ */ #include "gc/g1/g1ConcurrentRefineStats.hpp" +#include "runtime/atomicAccess.hpp" +#include "runtime/timer.hpp" G1ConcurrentRefineStats::G1ConcurrentRefineStats() : - _refinement_time(), - _refined_cards(0), - _precleaned_cards(0), - _dirtied_cards(0) + _sweep_duration(0), + _yield_during_sweep_duration(0), + _cards_scanned(0), + _cards_clean(0), + _cards_not_parsable(0), + _cards_already_refer_to_cset(0), + _cards_refer_to_cset(0), + _cards_no_cross_region(0), + _refine_duration(0) {} -double G1ConcurrentRefineStats::refinement_rate_ms() const { - // Report 0 when no time recorded because no refinement performed. - double secs = refinement_time().seconds(); - return (secs > 0) ? (refined_cards() / (secs * MILLIUNITS)) : 0.0; -} +void G1ConcurrentRefineStats::add_atomic(G1ConcurrentRefineStats* other) { + AtomicAccess::add(&_sweep_duration, other->_sweep_duration, memory_order_relaxed); + AtomicAccess::add(&_yield_during_sweep_duration, other->_yield_during_sweep_duration, memory_order_relaxed); -G1ConcurrentRefineStats& -G1ConcurrentRefineStats::operator+=(const G1ConcurrentRefineStats& other) { - _refinement_time += other._refinement_time; - _refined_cards += other._refined_cards; - _precleaned_cards += other._precleaned_cards; - _dirtied_cards += other._dirtied_cards; - return *this; -} + AtomicAccess::add(&_cards_scanned, other->_cards_scanned, memory_order_relaxed); + AtomicAccess::add(&_cards_clean, other->_cards_clean, memory_order_relaxed); + AtomicAccess::add(&_cards_not_parsable, other->_cards_not_parsable, memory_order_relaxed); + AtomicAccess::add(&_cards_already_refer_to_cset, other->_cards_already_refer_to_cset, memory_order_relaxed); + AtomicAccess::add(&_cards_refer_to_cset, other->_cards_refer_to_cset, memory_order_relaxed); + AtomicAccess::add(&_cards_no_cross_region, other->_cards_no_cross_region, memory_order_relaxed); -template -static T clipped_sub(T x, T y) { - return (x < y) ? T() : (x - y); -} - -G1ConcurrentRefineStats& -G1ConcurrentRefineStats::operator-=(const G1ConcurrentRefineStats& other) { - _refinement_time = clipped_sub(_refinement_time, other._refinement_time); - _refined_cards = clipped_sub(_refined_cards, other._refined_cards); - _precleaned_cards = clipped_sub(_precleaned_cards, other._precleaned_cards); - _dirtied_cards = clipped_sub(_dirtied_cards, other._dirtied_cards); - return *this; + AtomicAccess::add(&_refine_duration, other->_refine_duration, memory_order_relaxed); } void G1ConcurrentRefineStats::reset() { diff --git a/src/hotspot/share/gc/g1/g1ConcurrentRefineStats.hpp b/src/hotspot/share/gc/g1/g1ConcurrentRefineStats.hpp index ae576778a07..ce22f4317df 100644 --- a/src/hotspot/share/gc/g1/g1ConcurrentRefineStats.hpp +++ b/src/hotspot/share/gc/g1/g1ConcurrentRefineStats.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -33,47 +33,56 @@ // Used for collecting per-thread statistics and for summaries over a // collection of threads. class G1ConcurrentRefineStats : public CHeapObj<mtGC> { - Tickspan _refinement_time; - size_t _refined_cards; - size_t _precleaned_cards; - size_t _dirtied_cards; + jlong _sweep_duration; // Time spent sweeping the table finding non-clean cards + // and refining them. + jlong _yield_during_sweep_duration; // Time spent yielding during the sweep (not doing the sweep). + + size_t _cards_scanned; // Total number of cards scanned. + size_t _cards_clean; // Number of cards found clean. + size_t _cards_not_parsable; // Number of cards we could not parse and left unrefined. + size_t _cards_already_refer_to_cset; // Number of cards found to already refer to the collection set. + size_t _cards_refer_to_cset; // Number of dirty cards newly found to contain a to-cset reference. + size_t _cards_no_cross_region; // Number of dirty cards that did not contain a cross-region reference. + + jlong _refine_duration; // Time spent during actual refinement. public: G1ConcurrentRefineStats(); - // Time spent performing concurrent refinement. - Tickspan refinement_time() const { return _refinement_time; } + // Time spent sweeping the refinement table (includes actual refinement, + // but not yield time). + jlong sweep_duration() const { return _sweep_duration - _yield_during_sweep_duration; } + jlong yield_during_sweep_duration() const { return _yield_during_sweep_duration; } + jlong refine_duration() const { return _refine_duration; } // Number of refined cards. - size_t refined_cards() const { return _refined_cards; } + size_t refined_cards() const { return cards_not_clean(); } - // Refinement rate, in cards per ms. - double refinement_rate_ms() const; + size_t cards_scanned() const { return _cards_scanned; } + size_t cards_clean() const { return _cards_clean; } + size_t cards_not_clean() const { return _cards_scanned - _cards_clean; } + size_t cards_not_parsable() const { return _cards_not_parsable; } + size_t cards_already_refer_to_cset() const { return _cards_already_refer_to_cset; } + size_t cards_refer_to_cset() const { return _cards_refer_to_cset; } + size_t cards_no_cross_region() const { return _cards_no_cross_region; } + // Number of cards that were marked dirty and in need of refinement. This includes cards newly + // found to refer to the collection set, since they originally were dirty. + size_t cards_pending() const { return cards_not_clean() - _cards_already_refer_to_cset; } - // Number of cards for which refinement was skipped because some other - // thread had already refined them. - size_t precleaned_cards() const { return _precleaned_cards; } + size_t cards_to_cset() const { return _cards_already_refer_to_cset + _cards_refer_to_cset; } - // Number of cards marked dirty and in need of refinement.
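For illustration only, with made-up numbers rather than measured ones: the derived getters above are simple sums and differences of the raw counters. If a sweep scans 1000 cards of which 900 are clean, then cards_not_clean() and refined_cards() are 1000 - 900 = 100; with 10 of those cards found to already refer to the collection set, cards_pending() is 100 - 10 = 90; and with 25 further cards newly found to refer to the collection set, cards_to_cset() is 10 + 25 = 35.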
- size_t dirtied_cards() const { return _dirtied_cards; } + void inc_sweep_time(jlong t) { _sweep_duration += t; } + void inc_yield_during_sweep_duration(jlong t) { _yield_during_sweep_duration += t; } + void inc_refine_duration(jlong t) { _refine_duration += t; } - void inc_refinement_time(Tickspan t) { _refinement_time += t; } - void inc_refined_cards(size_t cards) { _refined_cards += cards; } - void inc_precleaned_cards(size_t cards) { _precleaned_cards += cards; } - void inc_dirtied_cards(size_t cards) { _dirtied_cards += cards; } + void inc_cards_scanned(size_t increment) { _cards_scanned += increment; } + void inc_cards_clean(size_t increment) { _cards_clean += increment; } + void inc_cards_not_parsable() { _cards_not_parsable++; } + void inc_cards_already_refer_to_cset() { _cards_already_refer_to_cset++; } + void inc_cards_refer_to_cset() { _cards_refer_to_cset++; } + void inc_cards_no_cross_region() { _cards_no_cross_region++; } - G1ConcurrentRefineStats& operator+=(const G1ConcurrentRefineStats& other); - G1ConcurrentRefineStats& operator-=(const G1ConcurrentRefineStats& other); - - friend G1ConcurrentRefineStats operator+(G1ConcurrentRefineStats x, - const G1ConcurrentRefineStats& y) { - return x += y; - } - - friend G1ConcurrentRefineStats operator-(G1ConcurrentRefineStats x, - const G1ConcurrentRefineStats& y) { - return x -= y; - } + void add_atomic(G1ConcurrentRefineStats* other); void reset(); }; diff --git a/src/hotspot/share/gc/g1/g1ConcurrentRefineSweepTask.cpp b/src/hotspot/share/gc/g1/g1ConcurrentRefineSweepTask.cpp new file mode 100644 index 00000000000..ca5bc9ebe5f --- /dev/null +++ b/src/hotspot/share/gc/g1/g1ConcurrentRefineSweepTask.cpp @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +#include "gc/g1/g1CardTableClaimTable.inline.hpp" +#include "gc/g1/g1CollectedHeap.inline.hpp" +#include "gc/g1/g1ConcurrentRefineSweepTask.hpp" + +class G1RefineRegionClosure : public G1HeapRegionClosure { + using CardValue = G1CardTable::CardValue; + + G1RemSet* _rem_set; + G1CardTableClaimTable* _scan_state; + + uint _worker_id; + + size_t _num_collections_at_start; + + bool has_work(G1HeapRegion* r) { + return _scan_state->has_unclaimed_cards(r->hrm_index()); + } + + void verify_card_pair_refers_to_same_card(CardValue* source_card, CardValue* dest_card) { +#ifdef ASSERT + G1CollectedHeap* g1h = G1CollectedHeap::heap(); + G1HeapRegion* refinement_r = g1h->heap_region_containing(g1h->refinement_table()->addr_for(source_card)); + G1HeapRegion* card_r = g1h->heap_region_containing(g1h->card_table()->addr_for(dest_card)); + size_t refinement_i = g1h->refinement_table()->index_for_cardvalue(source_card); + size_t card_i = g1h->card_table()->index_for_cardvalue(dest_card); + + assert(refinement_r == card_r, "not same region source %u (%zu) dest %u (%zu) ", refinement_r->hrm_index(), refinement_i, card_r->hrm_index(), card_i); + assert(refinement_i == card_i, "indexes are not same %zu %zu", refinement_i, card_i); +#endif + } + + void do_dirty_card(CardValue* source_card, CardValue* dest_card) { + verify_card_pair_refers_to_same_card(source_card, dest_card); + + G1RemSet::RefineResult res = _rem_set->refine_card_concurrently(source_card, _worker_id); + // Gather statistics based on the result. + switch (res) { + case G1RemSet::HasRefToCSet: { + *dest_card = G1CardTable::g1_to_cset_card; + _refine_stats.inc_cards_refer_to_cset(); + break; + } + case G1RemSet::AlreadyToCSet: { + *dest_card = G1CardTable::g1_to_cset_card; + _refine_stats.inc_cards_already_refer_to_cset(); + break; + } + case G1RemSet::NoCrossRegion: { + _refine_stats.inc_cards_no_cross_region(); + break; + } + case G1RemSet::CouldNotParse: { + // Could not refine - redirty with the original value. + *dest_card = *source_card; + _refine_stats.inc_cards_not_parsable(); + break; + } + case G1RemSet::HasRefToOld : break; // Nothing special to do. + } + // Clean card on source card table. + *source_card = G1CardTable::clean_card_val(); + } + + void do_claimed_block(CardValue* dirty_l, CardValue* dirty_r, CardValue* dest_card) { + for (CardValue* source = dirty_l; source < dirty_r; ++source, ++dest_card) { + do_dirty_card(source, dest_card); + } + } + +public: + bool _completed; + G1ConcurrentRefineStats _refine_stats; + + G1RefineRegionClosure(uint worker_id, G1CardTableClaimTable* scan_state) : + G1HeapRegionClosure(), + _rem_set(G1CollectedHeap::heap()->rem_set()), + _scan_state(scan_state), + _worker_id(worker_id), + _completed(true), + _refine_stats() { } + + bool do_heap_region(G1HeapRegion* r) override { + + if (!has_work(r)) { + return false; + } + + G1CollectedHeap* g1h = G1CollectedHeap::heap(); + + if (r->is_young()) { + if (_scan_state->claim_all_cards(r->hrm_index()) == 0) { + // Clear the pre-dirtying information. 
+ r->clear_refinement_table(); + } + return false; + } + + G1CardTable* card_table = g1h->card_table(); + G1CardTable* refinement_table = g1h->refinement_table(); + + G1CardTableChunkClaimer claim(_scan_state, r->hrm_index()); + + size_t const region_card_base_idx = (size_t)r->hrm_index() << G1HeapRegion::LogCardsPerRegion; + + while (claim.has_next()) { + size_t const start_idx = region_card_base_idx + claim.value(); + CardValue* const start_card = refinement_table->byte_for_index(start_idx); + CardValue* const end_card = start_card + claim.size(); + + CardValue* dest_card = card_table->byte_for_index(start_idx); + + G1ChunkScanner scanner{start_card, end_card}; + + size_t num_dirty_cards = 0; + scanner.on_dirty_cards([&] (CardValue* dirty_l, CardValue* dirty_r) { + jlong refine_start = os::elapsed_counter(); + + do_claimed_block(dirty_l, dirty_r, dest_card + pointer_delta(dirty_l, start_card, sizeof(CardValue))); + num_dirty_cards += pointer_delta(dirty_r, dirty_l, sizeof(CardValue)); + + _refine_stats.inc_refine_duration(os::elapsed_counter() - refine_start); + }); + + if (VerifyDuringGC) { + for (CardValue* i = start_card; i < end_card; ++i) { + guarantee(*i == G1CardTable::clean_card_val(), "must be"); + } + } + + _refine_stats.inc_cards_scanned(claim.size()); + _refine_stats.inc_cards_clean(claim.size() - num_dirty_cards); + + if (SuspendibleThreadSet::should_yield()) { + _completed = false; + break; + } + } + + return !_completed; + } +}; + +G1ConcurrentRefineSweepTask::G1ConcurrentRefineSweepTask(G1CardTableClaimTable* scan_state, + G1ConcurrentRefineStats* stats, + uint max_workers) : + WorkerTask("G1 Refine Task"), + _scan_state(scan_state), + _stats(stats), + _max_workers(max_workers), + _sweep_completed(true) +{ } + +void G1ConcurrentRefineSweepTask::work(uint worker_id) { + jlong start = os::elapsed_counter(); + + G1RefineRegionClosure sweep_cl(worker_id, _scan_state); + _scan_state->heap_region_iterate_from_worker_offset(&sweep_cl, worker_id, _max_workers); + + if (!sweep_cl._completed) { + _sweep_completed = false; + } + + sweep_cl._refine_stats.inc_sweep_time(os::elapsed_counter() - start); + _stats->add_atomic(&sweep_cl._refine_stats); +} + +bool G1ConcurrentRefineSweepTask::sweep_completed() const { return _sweep_completed; } \ No newline at end of file diff --git a/src/hotspot/share/gc/shared/bufferNodeList.hpp b/src/hotspot/share/gc/g1/g1ConcurrentRefineSweepTask.hpp similarity index 57% rename from src/hotspot/share/gc/shared/bufferNodeList.hpp rename to src/hotspot/share/gc/g1/g1ConcurrentRefineSweepTask.hpp index 55905ec071a..bf24c5ae850 100644 --- a/src/hotspot/share/gc/shared/bufferNodeList.hpp +++ b/src/hotspot/share/gc/g1/g1ConcurrentRefineSweepTask.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, 2023, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -22,20 +22,27 @@ * */ -#ifndef SHARE_GC_SHARED_BUFFERNODELIST_HPP -#define SHARE_GC_SHARED_BUFFERNODELIST_HPP +#ifndef SHARE_GC_G1_G1CONCURRENTREFINESWEEPTASK_HPP +#define SHARE_GC_G1_G1CONCURRENTREFINESWEEPTASK_HPP -#include "utilities/globalDefinitions.hpp" +#include "gc/g1/g1ConcurrentRefineStats.hpp" +#include "gc/shared/workerThread.hpp" -class BufferNode; +class G1CardTableClaimTable; -struct BufferNodeList { - BufferNode* _head; // First node in list or null if empty. 
- BufferNode* _tail; // Last node in list or null if empty. - size_t _entry_count; // Sum of entries in nodes in list. +class G1ConcurrentRefineSweepTask : public WorkerTask { + G1CardTableClaimTable* _scan_state; + G1ConcurrentRefineStats* _stats; + uint _max_workers; + bool _sweep_completed; - BufferNodeList(); - BufferNodeList(BufferNode* head, BufferNode* tail, size_t entry_count); +public: + + G1ConcurrentRefineSweepTask(G1CardTableClaimTable* scan_state, G1ConcurrentRefineStats* stats, uint max_workers); + + void work(uint worker_id) override; + + bool sweep_completed() const; }; -#endif // SHARE_GC_SHARED_BUFFERNODELIST_HPP +#endif /* SHARE_GC_G1_G1CONCURRENTREFINESWEEPTASK_HPP */ diff --git a/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.cpp b/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.cpp index 2fa19d46093..eccfe466d48 100644 --- a/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.cpp +++ b/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.cpp @@ -23,10 +23,13 @@ */ #include "gc/g1/g1BarrierSet.hpp" +#include "gc/g1/g1CardTableClaimTable.inline.hpp" +#include "gc/g1/g1CollectedHeap.inline.hpp" #include "gc/g1/g1ConcurrentRefine.hpp" #include "gc/g1/g1ConcurrentRefineStats.hpp" +#include "gc/g1/g1ConcurrentRefineSweepTask.hpp" #include "gc/g1/g1ConcurrentRefineThread.hpp" -#include "gc/g1/g1DirtyCardQueue.hpp" +#include "gc/shared/gcTraceTime.inline.hpp" #include "gc/shared/suspendibleThreadSet.hpp" #include "logging/log.hpp" #include "runtime/cpuTimeCounters.hpp" @@ -38,60 +41,61 @@ #include "utilities/globalDefinitions.hpp" #include "utilities/ticks.hpp" -G1ConcurrentRefineThread::G1ConcurrentRefineThread(G1ConcurrentRefine* cr, uint worker_id) : +G1ConcurrentRefineThread::G1ConcurrentRefineThread(G1ConcurrentRefine* cr) : ConcurrentGCThread(), - _notifier(Mutex::nosafepoint, FormatBuffer<>("G1 Refine#%d", worker_id), true), + _notifier(Mutex::nosafepoint, "G1 Refine Control", true), _requested_active(false), - _refinement_stats(), - _worker_id(worker_id), _cr(cr) { - // set name - set_name("G1 Refine#%d", worker_id); + set_name("G1 Refine Control"); } void G1ConcurrentRefineThread::run_service() { - while (wait_for_completed_buffers()) { + while (wait_for_work()) { SuspendibleThreadSetJoiner sts_join; - G1ConcurrentRefineStats active_stats_start = _refinement_stats; report_active("Activated"); while (!should_terminate()) { if (sts_join.should_yield()) { - report_inactive("Paused", _refinement_stats - active_stats_start); + report_inactive("Paused"); sts_join.yield(); // Reset after yield rather than accumulating across yields, else a // very long running thread could overflow. - active_stats_start = _refinement_stats; report_active("Resumed"); - } else if (maybe_deactivate()) { - break; + } + // Look if we want to do refinement. If we don't, then don't do any refinement + // this round. This thread may have just woken up but no threads are currently + // needed, which is common. In this case we want to just go back to + // waiting, with a minimum of fuss; in particular, don't do any "premature" + // refinement. However, adjustment may be pending but temporarily + // blocked. In that case we wait for adjustment to succeed.
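For orientation, the reworked run_service() loop amounts to roughly the following control flow (a simplified sketch using the names introduced by this patch, not the literal code):

    while (wait_for_work()) {
      SuspendibleThreadSetJoiner sts_join;
      while (!should_terminate()) {
        if (sts_join.should_yield()) {
          sts_join.yield();                          // pause for the safepoint, then resume
        }
        if (cr()->adjust_num_threads_periodically()) {
          do_refinement();                           // swap card tables, then sweep the refinement table
        } else {
          deactivate();                              // no refinement threads wanted; go back to waiting
          break;
        }
      }
    }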
+ Ticks adjust_start = Ticks::now(); + if (cr()->adjust_num_threads_periodically()) { + GCTraceTime(Info, gc, refine) tm("Concurrent Refine Cycle"); + do_refinement(); } else { - do_refinement_step(); + log_debug(gc, refine)("Concurrent Refine Adjust Only (#threads wanted: %u adjustment_needed: %s wait_for_heap_lock: %s) %.2fms", + cr()->num_threads_wanted(), + BOOL_TO_STR(cr()->is_thread_adjustment_needed()), + BOOL_TO_STR(cr()->heap_was_locked()), + (Ticks::now() - adjust_start).seconds() * MILLIUNITS); + + deactivate(); + break; } } - report_inactive("Deactivated", _refinement_stats - active_stats_start); + report_inactive("Deactivated"); update_perf_counter_cpu_time(); } - log_debug(gc, refine)("Stopping %d", _worker_id); + log_debug(gc, refine)("Stopping %s", name()); } void G1ConcurrentRefineThread::report_active(const char* reason) const { - log_trace(gc, refine)("%s worker %u, current: %zu", - reason, - _worker_id, - G1BarrierSet::dirty_card_queue_set().num_cards()); + log_trace(gc, refine)("%s active (%s)", name(), reason); } -void G1ConcurrentRefineThread::report_inactive(const char* reason, - const G1ConcurrentRefineStats& stats) const { - log_trace(gc, refine) - ("%s worker %u, cards: %zu, refined %zu, rate %1.2fc/ms", - reason, - _worker_id, - G1BarrierSet::dirty_card_queue_set().num_cards(), - stats.refined_cards(), - stats.refinement_rate_ms()); +void G1ConcurrentRefineThread::report_inactive(const char* reason) const { + log_trace(gc, refine)("%s inactive (%s)", name(), reason); } void G1ConcurrentRefineThread::activate() { @@ -103,21 +107,12 @@ void G1ConcurrentRefineThread::activate() { } } -bool G1ConcurrentRefineThread::maybe_deactivate() { +bool G1ConcurrentRefineThread::deactivate() { assert(this == Thread::current(), "precondition"); - if (cr()->is_thread_wanted(_worker_id)) { - return false; - } else { - MutexLocker ml(&_notifier, Mutex::_no_safepoint_check_flag); - bool requested = _requested_active; - _requested_active = false; - return !requested; // Deactivate only if not recently requested active. - } -} - -bool G1ConcurrentRefineThread::try_refinement_step(size_t stop_at) { - assert(this == Thread::current(), "precondition"); - return _cr->try_refinement_step(_worker_id, stop_at, &_refinement_stats); + MutexLocker ml(&_notifier, Mutex::_no_safepoint_check_flag); + bool requested = _requested_active; + _requested_active = false; + return !requested; // Deactivate only if not recently requested active. } void G1ConcurrentRefineThread::stop_service() { @@ -128,23 +123,9 @@ jlong G1ConcurrentRefineThread::cpu_time() { return os::thread_cpu_time(this); } -// The (single) primary thread drives the controller for the refinement threads. -class G1PrimaryConcurrentRefineThread final : public G1ConcurrentRefineThread { - bool wait_for_completed_buffers() override; - bool maybe_deactivate() override; - void do_refinement_step() override; - // Updates jstat cpu usage for all refinement threads. - void update_perf_counter_cpu_time() override; - -public: - G1PrimaryConcurrentRefineThread(G1ConcurrentRefine* cr) : - G1ConcurrentRefineThread(cr, 0) - {} -}; - -// When inactive, the primary thread periodically wakes up and requests -// adjustment of the number of active refinement threads. -bool G1PrimaryConcurrentRefineThread::wait_for_completed_buffers() { +// When inactive, the control thread periodically wakes up to check if there is +// refinement work pending. 
+bool G1ConcurrentRefineThread::wait_for_work() { assert(this == Thread::current(), "precondition"); MonitorLocker ml(notifier(), Mutex::_no_safepoint_check_flag); if (!requested_active() && !should_terminate()) { @@ -157,78 +138,115 @@ return !should_terminate(); } -bool G1PrimaryConcurrentRefineThread::maybe_deactivate() { - // Don't deactivate while needing to adjust the number of active threads. - return !cr()->is_thread_adjustment_needed() && - G1ConcurrentRefineThread::maybe_deactivate(); +void G1ConcurrentRefineThread::do_refinement() { + G1ConcurrentRefineSweepState& state = _cr->sweep_state(); + + state.start_work(); + + // Swap card tables. + + // 1. Global card table + if (!state.swap_global_card_table()) { + log_debug(gc, refine)("GC pause after Global Card Table Swap"); + return; + } + + // 2. Java threads + if (!state.swap_java_threads_ct()) { + log_debug(gc, refine)("GC pause after Java Thread CT swap"); + return; + } + + // 3. GC threads + if (!state.swap_gc_threads_ct()) { + log_debug(gc, refine)("GC pause after GC Thread CT swap"); + return; + } + + G1CollectedHeap* g1h = G1CollectedHeap::heap(); + jlong epoch_yield_duration = g1h->yield_duration_in_refinement_epoch(); + jlong next_epoch_start = os::elapsed_counter(); + + jlong total_yield_during_sweep_duration = 0; + + // 4. Snapshot heap. + state.snapshot_heap(); + + // 5. Sweep refinement table until done. + bool interrupted_by_gc = false; + + log_info(gc, task)("Concurrent Refine Sweep Using %u of %u Workers", _cr->num_threads_wanted(), _cr->max_num_threads()); + + state.sweep_refinement_table_start(); + while (true) { + bool completed = state.sweep_refinement_table_step(); + + if (completed) { + break; + } + + if (SuspendibleThreadSet::should_yield()) { + jlong yield_during_sweep_start = os::elapsed_counter(); + SuspendibleThreadSet::yield(); + + // The yielding may have completed the task, check. + if (!state.is_in_progress()) { + log_debug(gc, refine)("GC completed sweeping, aborting concurrent operation"); + interrupted_by_gc = true; + break; + } else { + jlong yield_during_sweep_duration = os::elapsed_counter() - yield_during_sweep_start; + log_debug(gc, refine)("Yielded from card table sweeping for %.2fms, no GC in between, continue", + TimeHelper::counter_to_millis(yield_during_sweep_duration)); + total_yield_during_sweep_duration += yield_during_sweep_duration; + } + } + } + + if (!interrupted_by_gc) { + GCTraceTime(Info, gc, refine) tm("Concurrent Refine Complete Work"); + + state.add_yield_during_sweep_duration(total_yield_during_sweep_duration); + + state.complete_work(true); + + G1CollectedHeap* g1h = G1CollectedHeap::heap(); + G1Policy* policy = g1h->policy(); + G1ConcurrentRefineStats* stats = state.stats(); + policy->record_refinement_stats(stats); + + { + // The young gen revising mechanism reads the predictor and the values set + // here. Avoid inconsistencies by locking.
+ MutexLocker x(G1ReviseYoungLength_lock, Mutex::_no_safepoint_check_flag); + policy->record_dirtying_stats(TimeHelper::counter_to_millis(G1CollectedHeap::heap()->last_refinement_epoch_start()), + TimeHelper::counter_to_millis(next_epoch_start), + stats->cards_pending(), + TimeHelper::counter_to_millis(epoch_yield_duration), + 0 /* pending_cards_from_gc */, + stats->cards_to_cset()); + G1CollectedHeap::heap()->set_last_refinement_epoch_start(next_epoch_start, epoch_yield_duration); + } + stats->reset(); + } } -void G1PrimaryConcurrentRefineThread::do_refinement_step() { - // Try adjustment first. If it succeeds then don't do any refinement this - // round. This thread may have just woken up but no threads are currently - // needed, which is common. In this case we want to just go back to - // waiting, with a minimum of fuss; in particular, don't do any "premature" - // refinement. However, adjustment may be pending but temporarily - // blocked. In that case we *do* try refinement, rather than possibly - // uselessly spinning while waiting for adjustment to succeed. - if (!cr()->adjust_threads_periodically()) { - // No adjustment, so try refinement, with the target as a cuttoff. - if (!try_refinement_step(cr()->pending_cards_target())) { - // Refinement was cut off, so proceed with fewer threads. - cr()->reduce_threads_wanted(); +void G1ConcurrentRefineThread::update_perf_counter_cpu_time() { + // The control thread is responsible for updating the CPU time for all workers. + if (UsePerfData) { + { + ThreadTotalCPUTimeClosure tttc(CPUTimeGroups::CPUTimeType::gc_conc_refine); + cr()->worker_threads_do(&tttc); + } + { + ThreadTotalCPUTimeClosure tttc(CPUTimeGroups::CPUTimeType::gc_conc_refine_control); + cr()->control_thread_do(&tttc); } } } -void G1PrimaryConcurrentRefineThread::update_perf_counter_cpu_time() { - if (UsePerfData) { - ThreadTotalCPUTimeClosure tttc(CPUTimeGroups::CPUTimeType::gc_conc_refine); - cr()->threads_do(&tttc); - } -} - -class G1SecondaryConcurrentRefineThread final : public G1ConcurrentRefineThread { - bool wait_for_completed_buffers() override; - void do_refinement_step() override; - void update_perf_counter_cpu_time() override { /* Nothing to do. The primary thread does all the work. */ } - -public: - G1SecondaryConcurrentRefineThread(G1ConcurrentRefine* cr, uint worker_id) : - G1ConcurrentRefineThread(cr, worker_id) - { - assert(worker_id > 0, "precondition"); - } -}; - -bool G1SecondaryConcurrentRefineThread::wait_for_completed_buffers() { - assert(this == Thread::current(), "precondition"); - MonitorLocker ml(notifier(), Mutex::_no_safepoint_check_flag); - while (!requested_active() && !should_terminate()) { - ml.wait(); - } - return !should_terminate(); -} - -void G1SecondaryConcurrentRefineThread::do_refinement_step() { - assert(this == Thread::current(), "precondition"); - // Secondary threads ignore the target and just drive the number of pending - // dirty cards down. The primary thread is responsible for noticing the - // target has been reached and reducing the number of wanted threads. This - // makes the control of wanted threads all under the primary, while avoiding - // useless spinning by secondary threads until the primary thread notices. - // (Useless spinning is still possible if there are no pending cards, but - // that should rarely happen.) 
- try_refinement_step(0); -} - -G1ConcurrentRefineThread* -G1ConcurrentRefineThread::create(G1ConcurrentRefine* cr, uint worker_id) { - G1ConcurrentRefineThread* crt; - if (worker_id == 0) { - crt = new (std::nothrow) G1PrimaryConcurrentRefineThread(cr); - } else { - crt = new (std::nothrow) G1SecondaryConcurrentRefineThread(cr, worker_id); - } +G1ConcurrentRefineThread* G1ConcurrentRefineThread::create(G1ConcurrentRefine* cr) { + G1ConcurrentRefineThread* crt = new (std::nothrow) G1ConcurrentRefineThread(cr); if (crt != nullptr) { crt->create_and_start(); } diff --git a/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.hpp b/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.hpp index b1e34e4b78d..8e635247cd3 100644 --- a/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.hpp +++ b/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.hpp @@ -33,8 +33,8 @@ // Forward Decl. class G1ConcurrentRefine; -// One or more G1 Concurrent Refinement Threads may be active if concurrent -// refinement is in progress. +// Concurrent refinement control thread watching card mark accrual on the card table +// and starting refinement work. class G1ConcurrentRefineThread: public ConcurrentGCThread { friend class VMStructs; friend class G1CollectedHeap; @@ -42,43 +42,34 @@ class G1ConcurrentRefineThread: public ConcurrentGCThread { Monitor _notifier; bool _requested_active; - G1ConcurrentRefineStats _refinement_stats; - uint _worker_id; G1ConcurrentRefine* _cr; NONCOPYABLE(G1ConcurrentRefineThread); -protected: - G1ConcurrentRefineThread(G1ConcurrentRefine* cr, uint worker_id); + G1ConcurrentRefineThread(G1ConcurrentRefine* cr); Monitor* notifier() { return &_notifier; } bool requested_active() const { return _requested_active; } // Returns !should_terminate(). // precondition: this is the current thread. - virtual bool wait_for_completed_buffers() = 0; + bool wait_for_work(); // Deactivate if appropriate. Returns true if deactivated. // precondition: this is the current thread. - virtual bool maybe_deactivate(); + bool deactivate(); - // Attempt to do some refinement work. - // precondition: this is the current thread. - virtual void do_refinement_step() = 0; + // Swap card table and do a complete re-examination/refinement pass over the + // refinement table. + void do_refinement(); // Update concurrent refine threads cpu time stats. - virtual void update_perf_counter_cpu_time() = 0; - - // Helper for do_refinement_step implementations. Try to perform some - // refinement work, limited by stop_at. Returns true if any refinement work - // was performed, false if no work available per stop_at. - // precondition: this is the current thread. - bool try_refinement_step(size_t stop_at); + void update_perf_counter_cpu_time(); void report_active(const char* reason) const; - void report_inactive(const char* reason, const G1ConcurrentRefineStats& stats) const; + void report_inactive(const char* reason) const; G1ConcurrentRefine* cr() const { return _cr; } @@ -86,23 +77,12 @@ protected: void stop_service() override; public: - static G1ConcurrentRefineThread* create(G1ConcurrentRefine* cr, uint worker_id); - virtual ~G1ConcurrentRefineThread() = default; - - uint worker_id() const { return _worker_id; } + static G1ConcurrentRefineThread* create(G1ConcurrentRefine* cr); // Activate this thread. // precondition: this is not the current thread. 
void activate(); - G1ConcurrentRefineStats* refinement_stats() { - return &_refinement_stats; - } - - const G1ConcurrentRefineStats* refinement_stats() const { - return &_refinement_stats; - } - // Total cpu time spent in this thread so far. jlong cpu_time(); }; diff --git a/src/hotspot/share/gc/g1/g1ConcurrentRefineThreadsNeeded.cpp b/src/hotspot/share/gc/g1/g1ConcurrentRefineThreadsNeeded.cpp index d34229bd359..3ab26bd72af 100644 --- a/src/hotspot/share/gc/g1/g1ConcurrentRefineThreadsNeeded.cpp +++ b/src/hotspot/share/gc/g1/g1ConcurrentRefineThreadsNeeded.cpp @@ -45,48 +45,22 @@ G1ConcurrentRefineThreadsNeeded::G1ConcurrentRefineThreadsNeeded(G1Policy* polic // // 1. Minimize the number of refinement threads running at once. // -// 2. Minimize the number of activations and deactivations for the -// refinement threads that run. -// -// 3. Delay performing refinement work. Having more dirty cards waiting to +// 2. Delay performing refinement work. Having more dirty cards waiting to // be refined can be beneficial, as further writes to the same card don't // create more work. void G1ConcurrentRefineThreadsNeeded::update(uint active_threads, size_t available_bytes, size_t num_cards, size_t target_num_cards) { + _predicted_time_until_next_gc_ms = _policy->predict_time_to_next_gc_ms(available_bytes); + + // Estimate number of cards that need to be processed before next GC. const G1Analytics* analytics = _policy->analytics(); - // Estimate time until next GC, based on remaining bytes available for - // allocation and the allocation rate. - double alloc_region_rate = analytics->predict_alloc_rate_ms(); - double alloc_bytes_rate = alloc_region_rate * G1HeapRegion::GrainBytes; - if (alloc_bytes_rate == 0.0) { - // A zero rate indicates we don't yet have data to use for predictions. - // Since we don't have any idea how long until the next GC, use a time of - // zero. - _predicted_time_until_next_gc_ms = 0.0; - } else { - // If the heap size is large and the allocation rate is small, we can get - // a predicted time until next GC that is so large it can cause problems - // (such as overflow) in other calculations. Limit the prediction to one - // hour, which is still large in this context. - const double one_hour_ms = 60.0 * 60.0 * MILLIUNITS; - double raw_time_ms = available_bytes / alloc_bytes_rate; - _predicted_time_until_next_gc_ms = MIN2(raw_time_ms, one_hour_ms); - } + double incoming_rate = analytics->predict_dirtied_cards_rate_ms(); + double raw_cards = incoming_rate * _predicted_time_until_next_gc_ms; + size_t incoming_cards = static_cast<size_t>(raw_cards); - // Estimate number of cards that need to be processed before next GC. There - // are no incoming cards when time is short, because in that case the - // controller activates refinement by mutator threads to stay on target even - // if threads deactivate in the meantime. This also covers the case of not - // having a real prediction of time until GC.
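As a concrete example of the estimate above (with made-up numbers, purely for illustration): if predict_dirtied_cards_rate_ms() returns 50 cards/ms and the predicted time until the next GC is 200 ms, then incoming_cards is 50 * 200 = 10000; together with, say, num_cards = 4000 cards currently pending, total_cards = num_cards + incoming_cards = 14000 becomes the value recorded in _predicted_cards_at_next_gc.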
- size_t incoming_cards = 0; - if (_predicted_time_until_next_gc_ms > _update_period_ms) { - double incoming_rate = analytics->predict_dirtied_cards_rate_ms(); - double raw_cards = incoming_rate * _predicted_time_until_next_gc_ms; - incoming_cards = static_cast<size_t>(raw_cards); - } size_t total_cards = num_cards + incoming_cards; _predicted_cards_at_next_gc = total_cards; @@ -100,9 +74,8 @@ void G1ConcurrentRefineThreadsNeeded::update(uint active_threads, // The calculation of the number of threads needed isn't very stable when // time is short, and can lead to starting up lots of threads for not much // profit. If we're in the last update period, don't change the number of - // threads running, other than to treat the current thread as running. That - // might not be sufficient, but hopefully we were already reasonably close. - // We won't accumulate more because mutator refinement will be activated. + // threads needed. That might not be sufficient, but hopefully we were + // already reasonably close. if (_predicted_time_until_next_gc_ms <= _update_period_ms) { _threads_needed = MAX2(active_threads, 1u); return; @@ -133,11 +106,12 @@ void G1ConcurrentRefineThreadsNeeded::update(uint active_threads, // close to the next GC we want to drive toward the target, so round up // then. The rest of the time we round to nearest, trying to remain near // the middle of the range. + double rthreads = nthreads; if (_predicted_time_until_next_gc_ms <= _update_period_ms * 5.0) { - nthreads = ::ceil(nthreads); + rthreads = ::ceil(nthreads); } else { - nthreads = ::round(nthreads); + rthreads = ::round(nthreads); } - _threads_needed = static_cast<uint>(MIN2(nthreads, UINT_MAX)); + _threads_needed = static_cast<uint>(MIN2(rthreads, UINT_MAX)); } diff --git a/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp b/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp deleted file mode 100644 index ec9d68af3bb..00000000000 --- a/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp +++ /dev/null @@ -1,599 +0,0 @@ -/* - * Copyright (c) 2001, 2025, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions.
- * - */ - -#include "gc/g1/g1BarrierSet.inline.hpp" -#include "gc/g1/g1CardTableEntryClosure.hpp" -#include "gc/g1/g1CollectedHeap.inline.hpp" -#include "gc/g1/g1ConcurrentRefineStats.hpp" -#include "gc/g1/g1ConcurrentRefineThread.hpp" -#include "gc/g1/g1DirtyCardQueue.hpp" -#include "gc/g1/g1FreeIdSet.hpp" -#include "gc/g1/g1HeapRegionRemSet.inline.hpp" -#include "gc/g1/g1RedirtyCardsQueue.hpp" -#include "gc/g1/g1RemSet.hpp" -#include "gc/g1/g1ThreadLocalData.hpp" -#include "gc/shared/bufferNode.hpp" -#include "gc/shared/bufferNodeList.hpp" -#include "gc/shared/suspendibleThreadSet.hpp" -#include "memory/iterator.hpp" -#include "runtime/atomicAccess.hpp" -#include "runtime/javaThread.hpp" -#include "runtime/mutex.hpp" -#include "runtime/mutexLocker.hpp" -#include "runtime/os.hpp" -#include "runtime/safepoint.hpp" -#include "runtime/threads.hpp" -#include "runtime/threadSMR.hpp" -#include "utilities/globalCounter.inline.hpp" -#include "utilities/macros.hpp" -#include "utilities/nonblockingQueue.inline.hpp" -#include "utilities/pair.hpp" -#include "utilities/quickSort.hpp" -#include "utilities/ticks.hpp" - -G1DirtyCardQueue::G1DirtyCardQueue(G1DirtyCardQueueSet* qset) : - PtrQueue(qset), - _refinement_stats(new G1ConcurrentRefineStats()) -{ } - -G1DirtyCardQueue::~G1DirtyCardQueue() { - delete _refinement_stats; -} - -// Assumed to be zero by concurrent threads. -static uint par_ids_start() { return 0; } - -G1DirtyCardQueueSet::G1DirtyCardQueueSet(BufferNode::Allocator* allocator) : - PtrQueueSet(allocator), - _num_cards(0), - _mutator_refinement_threshold(SIZE_MAX), - _completed(), - _paused(), - _free_ids(par_ids_start(), num_par_ids()), - _detached_refinement_stats() -{} - -G1DirtyCardQueueSet::~G1DirtyCardQueueSet() { - abandon_completed_buffers(); -} - -// Determines how many mutator threads can process the buffers in parallel. -uint G1DirtyCardQueueSet::num_par_ids() { - return (uint)os::initial_active_processor_count(); -} - -void G1DirtyCardQueueSet::flush_queue(G1DirtyCardQueue& queue) { - if (queue.buffer() != nullptr) { - G1ConcurrentRefineStats* stats = queue.refinement_stats(); - stats->inc_dirtied_cards(queue.size()); - } - PtrQueueSet::flush_queue(queue); -} - -void G1DirtyCardQueueSet::enqueue(G1DirtyCardQueue& queue, - volatile CardValue* card_ptr) { - CardValue* value = const_cast(card_ptr); - if (!try_enqueue(queue, value)) { - handle_zero_index(queue); - retry_enqueue(queue, value); - } -} - -void G1DirtyCardQueueSet::handle_zero_index(G1DirtyCardQueue& queue) { - assert(queue.index() == 0, "precondition"); - BufferNode* old_node = exchange_buffer_with_new(queue); - if (old_node != nullptr) { - assert(old_node->index() == 0, "invariant"); - G1ConcurrentRefineStats* stats = queue.refinement_stats(); - stats->inc_dirtied_cards(old_node->capacity()); - handle_completed_buffer(old_node, stats); - } -} - -void G1DirtyCardQueueSet::handle_zero_index_for_thread(Thread* t) { - G1DirtyCardQueue& queue = G1ThreadLocalData::dirty_card_queue(t); - G1BarrierSet::dirty_card_queue_set().handle_zero_index(queue); -} - -size_t G1DirtyCardQueueSet::num_cards() const { - return AtomicAccess::load(&_num_cards); -} - -void G1DirtyCardQueueSet::enqueue_completed_buffer(BufferNode* cbn) { - assert(cbn != nullptr, "precondition"); - // Increment _num_cards before adding to queue, so queue removal doesn't - // need to deal with _num_cards possibly going negative. - AtomicAccess::add(&_num_cards, cbn->size()); - // Perform push in CS. 
The old tail may be popped while the push is - // observing it (attaching it to the new buffer). We need to ensure it - // can't be reused until the push completes, to avoid ABA problems. - GlobalCounter::CriticalSection cs(Thread::current()); - _completed.push(*cbn); -} - -// Thread-safe attempt to remove and return the first buffer from -// the _completed queue, using the NonblockingQueue::try_pop() underneath. -// It has a limitation that it may return null when there are objects -// in the queue if there is a concurrent push/append operation. -BufferNode* G1DirtyCardQueueSet::dequeue_completed_buffer() { - Thread* current_thread = Thread::current(); - BufferNode* result = nullptr; - while (true) { - // Use GlobalCounter critical section to avoid ABA problem. - // The release of a buffer to its allocator's free list uses - // GlobalCounter::write_synchronize() to coordinate with this - // dequeuing operation. - // We use a CS per iteration, rather than over the whole loop, - // because we're not guaranteed to make progress. Lingering in - // one CS could defer releasing buffer to the free list for reuse, - // leading to excessive allocations. - GlobalCounter::CriticalSection cs(current_thread); - if (_completed.try_pop(&result)) return result; - } -} - -BufferNode* G1DirtyCardQueueSet::get_completed_buffer() { - BufferNode* result = dequeue_completed_buffer(); - if (result == nullptr) { // Unlikely if no paused buffers. - enqueue_previous_paused_buffers(); - result = dequeue_completed_buffer(); - if (result == nullptr) return nullptr; - } - AtomicAccess::sub(&_num_cards, result->size()); - return result; -} - -#ifdef ASSERT -void G1DirtyCardQueueSet::verify_num_cards() const { - size_t actual = 0; - for (BufferNode* cur = _completed.first(); - !_completed.is_end(cur); - cur = cur->next()) { - actual += cur->size(); - } - assert(actual == AtomicAccess::load(&_num_cards), - "Num entries in completed buffers should be %zu but are %zu", - AtomicAccess::load(&_num_cards), actual); -} -#endif // ASSERT - -G1DirtyCardQueueSet::PausedBuffers::PausedList::PausedList() : - _head(nullptr), _tail(nullptr), - _safepoint_id(SafepointSynchronize::safepoint_id()) -{} - -#ifdef ASSERT -G1DirtyCardQueueSet::PausedBuffers::PausedList::~PausedList() { - assert(AtomicAccess::load(&_head) == nullptr, "precondition"); - assert(_tail == nullptr, "precondition"); -} -#endif // ASSERT - -bool G1DirtyCardQueueSet::PausedBuffers::PausedList::is_next() const { - assert_not_at_safepoint(); - return _safepoint_id == SafepointSynchronize::safepoint_id(); -} - -void G1DirtyCardQueueSet::PausedBuffers::PausedList::add(BufferNode* node) { - assert_not_at_safepoint(); - assert(is_next(), "precondition"); - BufferNode* old_head = AtomicAccess::xchg(&_head, node); - if (old_head == nullptr) { - assert(_tail == nullptr, "invariant"); - _tail = node; - } else { - node->set_next(old_head); - } -} - -G1DirtyCardQueueSet::HeadTail G1DirtyCardQueueSet::PausedBuffers::PausedList::take() { - BufferNode* head = AtomicAccess::load(&_head); - BufferNode* tail = _tail; - AtomicAccess::store(&_head, (BufferNode*)nullptr); - _tail = nullptr; - return HeadTail(head, tail); -} - -G1DirtyCardQueueSet::PausedBuffers::PausedBuffers() : _plist(nullptr) {} - -#ifdef ASSERT -G1DirtyCardQueueSet::PausedBuffers::~PausedBuffers() { - assert(AtomicAccess::load(&_plist) == nullptr, "invariant"); -} -#endif // ASSERT - -void G1DirtyCardQueueSet::PausedBuffers::add(BufferNode* node) { - assert_not_at_safepoint(); - PausedList* plist = 
AtomicAccess::load_acquire(&_plist); - if (plist == nullptr) { - // Try to install a new next list. - plist = new PausedList(); - PausedList* old_plist = AtomicAccess::cmpxchg(&_plist, (PausedList*)nullptr, plist); - if (old_plist != nullptr) { - // Some other thread installed a new next list. Use it instead. - delete plist; - plist = old_plist; - } - } - assert(plist->is_next(), "invariant"); - plist->add(node); -} - -G1DirtyCardQueueSet::HeadTail G1DirtyCardQueueSet::PausedBuffers::take_previous() { - assert_not_at_safepoint(); - PausedList* previous; - { - // Deal with plist in a critical section, to prevent it from being - // deleted out from under us by a concurrent take_previous(). - GlobalCounter::CriticalSection cs(Thread::current()); - previous = AtomicAccess::load_acquire(&_plist); - if ((previous == nullptr) || // Nothing to take. - previous->is_next() || // Not from a previous safepoint. - // Some other thread stole it. - (AtomicAccess::cmpxchg(&_plist, previous, (PausedList*)nullptr) != previous)) { - return HeadTail(); - } - } - // We now own previous. - HeadTail result = previous->take(); - // There might be other threads examining previous (in concurrent - // take_previous()). Synchronize to wait until any such threads are - // done with such examination before deleting. - GlobalCounter::write_synchronize(); - delete previous; - return result; -} - -G1DirtyCardQueueSet::HeadTail G1DirtyCardQueueSet::PausedBuffers::take_all() { - assert_at_safepoint(); - HeadTail result; - PausedList* plist = AtomicAccess::load(&_plist); - if (plist != nullptr) { - AtomicAccess::store(&_plist, (PausedList*)nullptr); - result = plist->take(); - delete plist; - } - return result; -} - -void G1DirtyCardQueueSet::record_paused_buffer(BufferNode* node) { - assert_not_at_safepoint(); - assert(node->next() == nullptr, "precondition"); - // Ensure there aren't any paused buffers from a previous safepoint. - enqueue_previous_paused_buffers(); - // Cards for paused buffers are included in count, to contribute to - // notification checking after the coming safepoint if it doesn't GC. - // Note that this means the queue's _num_cards differs from the number - // of cards in the queued buffers when there are paused buffers. - AtomicAccess::add(&_num_cards, node->size()); - _paused.add(node); -} - -void G1DirtyCardQueueSet::enqueue_paused_buffers_aux(const HeadTail& paused) { - if (paused._head != nullptr) { - assert(paused._tail != nullptr, "invariant"); - // Cards from paused buffers are already recorded in the queue count. - _completed.append(*paused._head, *paused._tail); - } -} - -void G1DirtyCardQueueSet::enqueue_previous_paused_buffers() { - assert_not_at_safepoint(); - enqueue_paused_buffers_aux(_paused.take_previous()); -} - -void G1DirtyCardQueueSet::enqueue_all_paused_buffers() { - assert_at_safepoint(); - enqueue_paused_buffers_aux(_paused.take_all()); -} - -void G1DirtyCardQueueSet::abandon_completed_buffers() { - BufferNodeList list = take_all_completed_buffers(); - BufferNode* buffers_to_delete = list._head; - while (buffers_to_delete != nullptr) { - BufferNode* bn = buffers_to_delete; - buffers_to_delete = bn->next(); - bn->set_next(nullptr); - deallocate_buffer(bn); - } -} - -// Merge lists of buffers. The source queue set is emptied as a -// result. The queue sets must share the same allocator. 
-void G1DirtyCardQueueSet::merge_bufferlists(G1RedirtyCardsQueueSet* src) { - assert(allocator() == src->allocator(), "precondition"); - const BufferNodeList from = src->take_all_completed_buffers(); - if (from._head != nullptr) { - AtomicAccess::add(&_num_cards, from._entry_count); - _completed.append(*from._head, *from._tail); - } -} - -BufferNodeList G1DirtyCardQueueSet::take_all_completed_buffers() { - enqueue_all_paused_buffers(); - verify_num_cards(); - Pair pair = _completed.take_all(); - size_t num_cards = AtomicAccess::load(&_num_cards); - AtomicAccess::store(&_num_cards, size_t(0)); - return BufferNodeList(pair.first, pair.second, num_cards); -} - -class G1RefineBufferedCards : public StackObj { - BufferNode* const _node; - CardTable::CardValue** const _node_buffer; - const size_t _node_buffer_capacity; - const uint _worker_id; - G1ConcurrentRefineStats* _stats; - G1RemSet* const _g1rs; - - static inline ptrdiff_t compare_cards(const CardTable::CardValue* p1, - const CardTable::CardValue* p2) { - return p2 - p1; - } - - // Sorts the cards from start_index to _node_buffer_capacity in *decreasing* - // address order. Tests showed that this order is preferable to not sorting - // or increasing address order. - void sort_cards(size_t start_index) { - QuickSort::sort(&_node_buffer[start_index], - _node_buffer_capacity - start_index, - compare_cards); - } - - // Returns the index to the first clean card in the buffer. - size_t clean_cards() { - const size_t start = _node->index(); - assert(start <= _node_buffer_capacity, "invariant"); - - // Two-fingered compaction algorithm similar to the filtering mechanism in - // SATBMarkQueue. The main difference is that clean_card_before_refine() - // could change the buffer element in-place. - // We don't check for SuspendibleThreadSet::should_yield(), because - // cleaning and redirtying the cards is fast. - CardTable::CardValue** src = &_node_buffer[start]; - CardTable::CardValue** dst = &_node_buffer[_node_buffer_capacity]; - assert(src <= dst, "invariant"); - for ( ; src < dst; ++src) { - // Search low to high for a card to keep. - if (_g1rs->clean_card_before_refine(src)) { - // Found keeper. Search high to low for a card to discard. - while (src < --dst) { - if (!_g1rs->clean_card_before_refine(dst)) { - *dst = *src; // Replace discard with keeper. - break; - } - } - // If discard search failed (src == dst), the outer loop will also end. - } - } - - // dst points to the first retained clean card, or the end of the buffer - // if all the cards were discarded. - const size_t first_clean = dst - _node_buffer; - assert(first_clean >= start && first_clean <= _node_buffer_capacity, "invariant"); - // Discarded cards are considered as refined. 
- _stats->inc_refined_cards(first_clean - start); - _stats->inc_precleaned_cards(first_clean - start); - return first_clean; - } - - bool refine_cleaned_cards(size_t start_index) { - bool result = true; - size_t i = start_index; - for ( ; i < _node_buffer_capacity; ++i) { - if (SuspendibleThreadSet::should_yield()) { - redirty_unrefined_cards(i); - result = false; - break; - } - _g1rs->refine_card_concurrently(_node_buffer[i], _worker_id); - } - _node->set_index(i); - _stats->inc_refined_cards(i - start_index); - return result; - } - - void redirty_unrefined_cards(size_t start) { - for ( ; start < _node_buffer_capacity; ++start) { - *_node_buffer[start] = G1CardTable::dirty_card_val(); - } - } - -public: - G1RefineBufferedCards(BufferNode* node, - uint worker_id, - G1ConcurrentRefineStats* stats) : - _node(node), - _node_buffer(reinterpret_cast(BufferNode::make_buffer_from_node(node))), - _node_buffer_capacity(node->capacity()), - _worker_id(worker_id), - _stats(stats), - _g1rs(G1CollectedHeap::heap()->rem_set()) {} - - bool refine() { - size_t first_clean_index = clean_cards(); - if (first_clean_index == _node_buffer_capacity) { - _node->set_index(first_clean_index); - return true; - } - // This fence serves two purposes. First, the cards must be cleaned - // before processing the contents. Second, we can't proceed with - // processing a region until after the read of the region's top in - // collect_and_clean_cards(), for synchronization with possibly concurrent - // humongous object allocation (see comment at the StoreStore fence before - // setting the regions' tops in humongous allocation path). - // It's okay that reading region's top and reading region's type were racy - // wrto each other. We need both set, in any order, to proceed. - OrderAccess::fence(); - sort_cards(first_clean_index); - return refine_cleaned_cards(first_clean_index); - } -}; - -bool G1DirtyCardQueueSet::refine_buffer(BufferNode* node, - uint worker_id, - G1ConcurrentRefineStats* stats) { - Ticks start_time = Ticks::now(); - G1RefineBufferedCards buffered_cards(node, worker_id, stats); - bool result = buffered_cards.refine(); - stats->inc_refinement_time(Ticks::now() - start_time); - return result; -} - -void G1DirtyCardQueueSet::handle_refined_buffer(BufferNode* node, - bool fully_processed) { - if (fully_processed) { - assert(node->is_empty(), "Buffer not fully consumed: index: %zu, size: %zu", - node->index(), node->capacity()); - deallocate_buffer(node); - } else { - assert(!node->is_empty(), "Buffer fully consumed."); - // Buffer incompletely processed because there is a pending safepoint. - // Record partially processed buffer, to be finished later. - record_paused_buffer(node); - } -} - -void G1DirtyCardQueueSet::handle_completed_buffer(BufferNode* new_node, - G1ConcurrentRefineStats* stats) { - enqueue_completed_buffer(new_node); - - // No need for mutator refinement if number of cards is below limit. - if (AtomicAccess::load(&_num_cards) <= AtomicAccess::load(&_mutator_refinement_threshold)) { - return; - } - - // Don't try to process a buffer that will just get immediately paused. - // When going into a safepoint it's just a waste of effort. - // When coming out of a safepoint, Java threads may be running before the - // yield request (for non-Java threads) has been cleared. - if (SuspendibleThreadSet::should_yield()) { - return; - } - - // Only Java threads perform mutator refinement. 
- if (!Thread::current()->is_Java_thread()) { - return; - } - - BufferNode* node = get_completed_buffer(); - if (node == nullptr) return; // Didn't get a buffer to process. - - // Refine cards in buffer. - - uint worker_id = _free_ids.claim_par_id(); // temporarily claim an id - bool fully_processed = refine_buffer(node, worker_id, stats); - _free_ids.release_par_id(worker_id); // release the id - - // Deal with buffer after releasing id, to let another thread use id. - handle_refined_buffer(node, fully_processed); -} - -bool G1DirtyCardQueueSet::refine_completed_buffer_concurrently(uint worker_id, - size_t stop_at, - G1ConcurrentRefineStats* stats) { - // Not enough cards to trigger processing. - if (AtomicAccess::load(&_num_cards) <= stop_at) return false; - - BufferNode* node = get_completed_buffer(); - if (node == nullptr) return false; // Didn't get a buffer to process. - - bool fully_processed = refine_buffer(node, worker_id, stats); - handle_refined_buffer(node, fully_processed); - return true; -} - -void G1DirtyCardQueueSet::abandon_logs_and_stats() { - assert_at_safepoint(); - - // Disable mutator refinement until concurrent refinement decides otherwise. - set_mutator_refinement_threshold(SIZE_MAX); - - // Iterate over all the threads, resetting per-thread queues and stats. - struct AbandonThreadLogClosure : public ThreadClosure { - G1DirtyCardQueueSet& _qset; - AbandonThreadLogClosure(G1DirtyCardQueueSet& qset) : _qset(qset) {} - virtual void do_thread(Thread* t) { - G1DirtyCardQueue& queue = G1ThreadLocalData::dirty_card_queue(t); - _qset.reset_queue(queue); - queue.refinement_stats()->reset(); - } - } closure(*this); - Threads::threads_do(&closure); - - enqueue_all_paused_buffers(); - abandon_completed_buffers(); - - // Reset stats from detached threads. - MutexLocker ml(G1DetachedRefinementStats_lock, Mutex::_no_safepoint_check_flag); - _detached_refinement_stats.reset(); -} - -void G1DirtyCardQueueSet::update_refinement_stats(G1ConcurrentRefineStats& stats) { - assert_at_safepoint(); - - _concatenated_refinement_stats = stats; - - enqueue_all_paused_buffers(); - verify_num_cards(); - - // Collect and reset stats from detached threads. - MutexLocker ml(G1DetachedRefinementStats_lock, Mutex::_no_safepoint_check_flag); - _concatenated_refinement_stats += _detached_refinement_stats; - _detached_refinement_stats.reset(); -} - -G1ConcurrentRefineStats G1DirtyCardQueueSet::concatenate_log_and_stats(Thread* thread) { - assert_at_safepoint(); - - G1DirtyCardQueue& queue = G1ThreadLocalData::dirty_card_queue(thread); - // Flush the buffer if non-empty. Flush before accumulating and - // resetting stats, since flushing may modify the stats. 
- if (!queue.is_empty()) { - flush_queue(queue); - } - - G1ConcurrentRefineStats result = *queue.refinement_stats(); - queue.refinement_stats()->reset(); - return result; -} - -G1ConcurrentRefineStats G1DirtyCardQueueSet::concatenated_refinement_stats() const { - assert_at_safepoint(); - return _concatenated_refinement_stats; -} - -void G1DirtyCardQueueSet::record_detached_refinement_stats(G1ConcurrentRefineStats* stats) { - MutexLocker ml(G1DetachedRefinementStats_lock, Mutex::_no_safepoint_check_flag); - _detached_refinement_stats += *stats; - stats->reset(); -} - -size_t G1DirtyCardQueueSet::mutator_refinement_threshold() const { - return AtomicAccess::load(&_mutator_refinement_threshold); -} - -void G1DirtyCardQueueSet::set_mutator_refinement_threshold(size_t value) { - AtomicAccess::store(&_mutator_refinement_threshold, value); -} diff --git a/src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp b/src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp deleted file mode 100644 index 6beb536df87..00000000000 --- a/src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp +++ /dev/null @@ -1,302 +0,0 @@ -/* - * Copyright (c) 2001, 2024, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - * - */ - -#ifndef SHARE_GC_G1_G1DIRTYCARDQUEUE_HPP -#define SHARE_GC_G1_G1DIRTYCARDQUEUE_HPP - -#include "gc/g1/g1CardTable.hpp" -#include "gc/g1/g1ConcurrentRefineStats.hpp" -#include "gc/g1/g1FreeIdSet.hpp" -#include "gc/shared/bufferNode.hpp" -#include "gc/shared/bufferNodeList.hpp" -#include "gc/shared/ptrQueue.hpp" -#include "memory/allocation.hpp" -#include "memory/padded.hpp" -#include "utilities/nonblockingQueue.hpp" - -class G1PrimaryConcurrentRefineThread; -class G1DirtyCardQueueSet; -class G1RedirtyCardsQueueSet; -class Thread; - -// A ptrQueue whose elements are "oops", pointers to object heads. -class G1DirtyCardQueue: public PtrQueue { - G1ConcurrentRefineStats* _refinement_stats; - -public: - G1DirtyCardQueue(G1DirtyCardQueueSet* qset); - - // Flush before destroying; queue may be used to capture pending work while - // doing something else, with auto-flush on completion. - ~G1DirtyCardQueue(); - - G1ConcurrentRefineStats* refinement_stats() const { - return _refinement_stats; - } - - // Compiler support. 
- static ByteSize byte_offset_of_index() { - return PtrQueue::byte_offset_of_index(); - } - using PtrQueue::byte_width_of_index; - - static ByteSize byte_offset_of_buf() { - return PtrQueue::byte_offset_of_buf(); - } - using PtrQueue::byte_width_of_buf; - -}; - -class G1DirtyCardQueueSet: public PtrQueueSet { - // Head and tail of a list of BufferNodes, linked through their next() - // fields. Similar to BufferNodeList, but without the _entry_count. - struct HeadTail { - BufferNode* _head; - BufferNode* _tail; - HeadTail() : _head(nullptr), _tail(nullptr) {} - HeadTail(BufferNode* head, BufferNode* tail) : _head(head), _tail(tail) {} - }; - - // Concurrent refinement may stop processing in the middle of a buffer if - // there is a pending safepoint, to avoid long delays to safepoint. A - // partially processed buffer needs to be recorded for processing by the - // safepoint if it's a GC safepoint; otherwise it needs to be recorded for - // further concurrent refinement work after the safepoint. But if the - // buffer was obtained from the completed buffer queue then it can't simply - // be added back to the queue, as that would introduce a new source of ABA - // for the queue. - // - // The PausedBuffer object is used to record such buffers for the upcoming - // safepoint, and provides access to the buffers recorded for previous - // safepoints. Before obtaining a buffer from the completed buffers queue, - // we first transfer any buffers from previous safepoints to the queue. - // This is ABA-safe because threads cannot be in the midst of a queue pop - // across a safepoint. - // - // The paused buffers are conceptually an extension of the completed buffers - // queue, and operations which need to deal with all of the queued buffers - // (such as concatenating or abandoning logs) also need to deal with any - // paused buffers. In general, if a safepoint performs a GC then the paused - // buffers will be processed as part of it, and there won't be any paused - // buffers after a GC safepoint. - class PausedBuffers { - class PausedList : public CHeapObj { - BufferNode* volatile _head; - BufferNode* _tail; - size_t _safepoint_id; - - NONCOPYABLE(PausedList); - - public: - PausedList(); - DEBUG_ONLY(~PausedList();) - - // Return true if this list was created to hold buffers for the - // next safepoint. - // precondition: not at safepoint. - bool is_next() const; - - // Thread-safe add the buffer to the list. - // precondition: not at safepoint. - // precondition: is_next(). - void add(BufferNode* node); - - // Take all the buffers from the list. Not thread-safe. - HeadTail take(); - }; - - // The most recently created list, which might be for either the next or - // a previous safepoint, or might be null if the next list hasn't been - // created yet. We only need one list because of the requirement that - // threads calling add() must first ensure there are no paused buffers - // from a previous safepoint. There might be many list instances existing - // at the same time though; there can be many threads competing to create - // and install the next list, and meanwhile there can be a thread dealing - // with the previous list. - PausedList* volatile _plist; - DEFINE_PAD_MINUS_SIZE(1, DEFAULT_PADDING_SIZE, sizeof(PausedList*)); - - NONCOPYABLE(PausedBuffers); - - public: - PausedBuffers(); - DEBUG_ONLY(~PausedBuffers();) - - // Thread-safe add the buffer to paused list for next safepoint. - // precondition: not at safepoint. 
- // precondition: does not have paused buffers from a previous safepoint. - void add(BufferNode* node); - - // Thread-safe take all paused buffers for previous safepoints. - // precondition: not at safepoint. - HeadTail take_previous(); - - // Take all the paused buffers. - // precondition: at safepoint. - HeadTail take_all(); - }; - - DEFINE_PAD_MINUS_SIZE(0, DEFAULT_PADDING_SIZE, 0); - // Upper bound on the number of cards in the completed and paused buffers. - volatile size_t _num_cards; - DEFINE_PAD_MINUS_SIZE(1, DEFAULT_PADDING_SIZE, sizeof(size_t)); - // If the queue contains more cards than configured here, the - // mutator must start doing some of the concurrent refinement work. - volatile size_t _mutator_refinement_threshold; - DEFINE_PAD_MINUS_SIZE(2, DEFAULT_PADDING_SIZE, sizeof(size_t)); - // Buffers ready for refinement. - // NonblockingQueue has inner padding of one cache line. - NonblockingQueue _completed; - // Add a trailer padding after NonblockingQueue. - DEFINE_PAD_MINUS_SIZE(3, DEFAULT_PADDING_SIZE, sizeof(BufferNode*)); - // Buffers for which refinement is temporarily paused. - // PausedBuffers has inner padding, including trailer. - PausedBuffers _paused; - - G1FreeIdSet _free_ids; - - G1ConcurrentRefineStats _concatenated_refinement_stats; - G1ConcurrentRefineStats _detached_refinement_stats; - - // Verify _num_cards == sum of cards in the completed queue. - void verify_num_cards() const NOT_DEBUG_RETURN; - - // Thread-safe add a buffer to paused list for next safepoint. - // precondition: not at safepoint. - void record_paused_buffer(BufferNode* node); - void enqueue_paused_buffers_aux(const HeadTail& paused); - // Thread-safe transfer paused buffers for previous safepoints to the queue. - // precondition: not at safepoint. - void enqueue_previous_paused_buffers(); - // Transfer all paused buffers to the queue. - // precondition: at safepoint. - void enqueue_all_paused_buffers(); - - void abandon_completed_buffers(); - - // Refine the cards in "node" from its index to buffer_capacity. - // Stops processing if SuspendibleThreadSet::should_yield() is true. - // Returns true if the entire buffer was processed, false if there - // is a pending yield request. The node's index is updated to exclude - // the processed elements, e.g. up to the element before processing - // stopped, or one past the last element if the entire buffer was - // processed. Updates stats. - bool refine_buffer(BufferNode* node, - uint worker_id, - G1ConcurrentRefineStats* stats); - - // Deal with buffer after a call to refine_buffer. If fully processed, - // deallocate the buffer. Otherwise, record it as paused. - void handle_refined_buffer(BufferNode* node, bool fully_processed); - - // Thread-safe attempt to remove and return the first buffer from - // the _completed queue. - // Returns null if the queue is empty, or if a concurrent push/append - // interferes. It uses GlobalCounter critical section to avoid ABA problem. - BufferNode* dequeue_completed_buffer(); - // Remove and return a completed buffer from the list, or return null - // if none available. - BufferNode* get_completed_buffer(); - - // Called when queue is full or has no buffer. - void handle_zero_index(G1DirtyCardQueue& queue); - - // Enqueue the buffer, and optionally perform refinement by the mutator. - // Mutator refinement is only done by Java threads, and only if there - // are more than mutator_refinement_threshold cards in the completed buffers. - // Updates stats. 
- // - // Mutator refinement, if performed, stops processing a buffer if - // SuspendibleThreadSet::should_yield(), recording the incompletely - // processed buffer for later processing of the remainder. - void handle_completed_buffer(BufferNode* node, G1ConcurrentRefineStats* stats); - -public: - G1DirtyCardQueueSet(BufferNode::Allocator* allocator); - ~G1DirtyCardQueueSet(); - - // The number of parallel ids that can be claimed to allow collector or - // mutator threads to do card-processing work. - static uint num_par_ids(); - - static void handle_zero_index_for_thread(Thread* t); - - virtual void enqueue_completed_buffer(BufferNode* node); - - // Upper bound on the number of cards currently in this queue set. - // Read without synchronization. The value may be high because there - // is a concurrent modification of the set of buffers. - size_t num_cards() const; - - void merge_bufferlists(G1RedirtyCardsQueueSet* src); - - BufferNodeList take_all_completed_buffers(); - - void flush_queue(G1DirtyCardQueue& queue); - - using CardValue = G1CardTable::CardValue; - void enqueue(G1DirtyCardQueue& queue, volatile CardValue* card_ptr); - - // If there are more than stop_at cards in the completed buffers, pop - // a buffer, refine its contents, and return true. Otherwise return - // false. Updates stats. - // - // Stops processing a buffer if SuspendibleThreadSet::should_yield(), - // recording the incompletely processed buffer for later processing of - // the remainder. - bool refine_completed_buffer_concurrently(uint worker_id, - size_t stop_at, - G1ConcurrentRefineStats* stats); - - // If a full collection is happening, reset per-thread refinement stats and - // partial logs, and release completed logs. The full collection will make - // them all irrelevant. - // precondition: at safepoint. - void abandon_logs_and_stats(); - - // Update global refinement statistics with the ones given and the ones from - // detached threads. - // precondition: at safepoint. - void update_refinement_stats(G1ConcurrentRefineStats& stats); - // Add the given thread's partial logs to the global list and return and reset - // its refinement stats. - // precondition: at safepoint. - G1ConcurrentRefineStats concatenate_log_and_stats(Thread* thread); - - // Return the total of mutator refinement stats for all threads. - // precondition: at safepoint. - // precondition: only call after concatenate_logs_and_stats. - G1ConcurrentRefineStats concatenated_refinement_stats() const; - - // Accumulate refinement stats from threads that are detaching. - void record_detached_refinement_stats(G1ConcurrentRefineStats* stats); - - // Number of cards above which mutator threads should do refinement. - size_t mutator_refinement_threshold() const; - - // Set number of cards above which mutator threads should do refinement. 
- void set_mutator_refinement_threshold(size_t value); -}; - -#endif // SHARE_GC_G1_G1DIRTYCARDQUEUE_HPP diff --git a/src/hotspot/share/gc/g1/g1FromCardCache.cpp b/src/hotspot/share/gc/g1/g1FromCardCache.cpp index 4a29bcbc6dc..8f5c84da0e3 100644 --- a/src/hotspot/share/gc/g1/g1FromCardCache.cpp +++ b/src/hotspot/share/gc/g1/g1FromCardCache.cpp @@ -22,8 +22,6 @@ * */ -#include "gc/g1/g1ConcurrentRefine.hpp" -#include "gc/g1/g1DirtyCardQueue.hpp" #include "gc/g1/g1FromCardCache.hpp" #include "gc/shared/gc_globals.hpp" #include "memory/padded.inline.hpp" @@ -80,7 +78,7 @@ void G1FromCardCache::print(outputStream* out) { #endif uint G1FromCardCache::num_par_rem_sets() { - return G1DirtyCardQueueSet::num_par_ids() + G1ConcRefinementThreads + MAX2(ConcGCThreads, ParallelGCThreads); + return G1ConcRefinementThreads + ConcGCThreads; } void G1FromCardCache::clear(uint region_idx) { diff --git a/src/hotspot/share/gc/g1/g1FullGCCompactTask.cpp b/src/hotspot/share/gc/g1/g1FullGCCompactTask.cpp index cc71cf86172..5dbf70f36b3 100644 --- a/src/hotspot/share/gc/g1/g1FullGCCompactTask.cpp +++ b/src/hotspot/share/gc/g1/g1FullGCCompactTask.cpp @@ -147,6 +147,10 @@ void G1FullGCCompactTask::free_non_overlapping_regions(uint src_start_idx, uint for (uint i = non_overlapping_start; i <= src_end_idx; ++i) { G1HeapRegion* hr = _g1h->region_at(i); + if (VerifyDuringGC) { + // Satisfy some asserts in free_..._region + hr->clear_both_card_tables(); + } _g1h->free_humongous_region(hr, nullptr); } } diff --git a/src/hotspot/share/gc/g1/g1FullGCPrepareTask.inline.hpp b/src/hotspot/share/gc/g1/g1FullGCPrepareTask.inline.hpp index f9868bba678..64d85660ca7 100644 --- a/src/hotspot/share/gc/g1/g1FullGCPrepareTask.inline.hpp +++ b/src/hotspot/share/gc/g1/g1FullGCPrepareTask.inline.hpp @@ -35,6 +35,10 @@ #include "gc/shared/fullGCForwarding.inline.hpp" void G1DetermineCompactionQueueClosure::free_empty_humongous_region(G1HeapRegion* hr) { + if (VerifyDuringGC) { + // Satisfy some asserts in free_..._region. 
+ hr->clear_both_card_tables(); + } _g1h->free_humongous_region(hr, nullptr); _collector->set_free(hr->hrm_index()); add_to_compaction_queue(hr); diff --git a/src/hotspot/share/gc/g1/g1FullGCResetMetadataTask.cpp b/src/hotspot/share/gc/g1/g1FullGCResetMetadataTask.cpp index ae9a78a9cdf..02397392a6e 100644 --- a/src/hotspot/share/gc/g1/g1FullGCResetMetadataTask.cpp +++ b/src/hotspot/share/gc/g1/g1FullGCResetMetadataTask.cpp @@ -32,7 +32,7 @@ G1FullGCResetMetadataTask::G1ResetMetadataClosure::G1ResetMetadataClosure(G1Full void G1FullGCResetMetadataTask::G1ResetMetadataClosure::reset_region_metadata(G1HeapRegion* hr) { hr->rem_set()->clear(); - hr->clear_cardtable(); + hr->clear_both_card_tables(); } bool G1FullGCResetMetadataTask::G1ResetMetadataClosure::do_heap_region(G1HeapRegion* hr) { diff --git a/src/hotspot/share/gc/g1/g1GCPhaseTimes.cpp b/src/hotspot/share/gc/g1/g1GCPhaseTimes.cpp index 15fb65c5700..b211b1e32fb 100644 --- a/src/hotspot/share/gc/g1/g1GCPhaseTimes.cpp +++ b/src/hotspot/share/gc/g1/g1GCPhaseTimes.cpp @@ -50,8 +50,7 @@ G1GCPhaseTimes::G1GCPhaseTimes(STWGCTimer* gc_timer, uint max_gc_threads) : { assert(max_gc_threads > 0, "Must have some GC threads"); - _gc_par_phases[RetireTLABsAndFlushLogs] = new WorkerDataArray("RetireTLABsAndFlushLogs", "JT Retire TLABs And Flush Logs (ms):", max_gc_threads); - _gc_par_phases[NonJavaThreadFlushLogs] = new WorkerDataArray("NonJavaThreadFlushLogs", "Non-JT Flush Logs (ms):", max_gc_threads); + _gc_par_phases[RetireTLABs] = new WorkerDataArray("RetireTLABs", "JavaThread Retire TLABs (ms):", max_gc_threads); _gc_par_phases[GCWorkerStart] = new WorkerDataArray("GCWorkerStart", "GC Worker Start (ms):", max_gc_threads); _gc_par_phases[ExtRootScan] = new WorkerDataArray("ExtRootScan", "Ext Root Scanning (ms):", max_gc_threads); @@ -83,7 +82,7 @@ G1GCPhaseTimes::G1GCPhaseTimes(STWGCTimer* gc_timer, uint max_gc_threads) : _gc_par_phases[OptMergeRS]->create_thread_work_items(GCMergeRSWorkItemsStrings[i], i); } - _gc_par_phases[MergeLB] = new WorkerDataArray("MergeLB", "Log Buffers (ms):", max_gc_threads); + _gc_par_phases[SweepRT] = new WorkerDataArray("SweepRT", "Sweep (ms):", max_gc_threads); _gc_par_phases[ScanHR] = new WorkerDataArray("ScanHR", "Scan Heap Roots (ms):", max_gc_threads); _gc_par_phases[OptScanHR] = new WorkerDataArray("OptScanHR", "Optional Scan Heap Roots (ms):", max_gc_threads); _gc_par_phases[CodeRoots] = new WorkerDataArray("CodeRoots", "Code Root Scan (ms):", max_gc_threads); @@ -98,7 +97,7 @@ G1GCPhaseTimes::G1GCPhaseTimes(STWGCTimer* gc_timer, uint max_gc_threads) : _gc_par_phases[MergePSS] = new WorkerDataArray("MergePSS", "Merge Per-Thread State (ms):", max_gc_threads); _gc_par_phases[RestoreEvacuationFailedRegions] = new WorkerDataArray("RestoreEvacuationFailedRegions", "Restore Evacuation Failed Regions (ms):", max_gc_threads); _gc_par_phases[RemoveSelfForwards] = new WorkerDataArray("RemoveSelfForwards", "Remove Self Forwards (ms):", max_gc_threads); - _gc_par_phases[ClearCardTable] = new WorkerDataArray("ClearLoggedCards", "Clear Logged Cards (ms):", max_gc_threads); + _gc_par_phases[ClearCardTable] = new WorkerDataArray("ClearPendingCards", "Clear Pending Cards (ms):", max_gc_threads); _gc_par_phases[RecalculateUsed] = new WorkerDataArray("RecalculateUsed", "Recalculate Used Memory (ms):", max_gc_threads); #if COMPILER2_OR_JVMCI _gc_par_phases[UpdateDerivedPointers] = new WorkerDataArray("UpdateDerivedPointers", "Update Derived Pointers (ms):", max_gc_threads); @@ -107,11 +106,15 @@ 
G1GCPhaseTimes::G1GCPhaseTimes(STWGCTimer* gc_timer, uint max_gc_threads) : _gc_par_phases[ResetPartialArrayStateManager] = new WorkerDataArray("ResetPartialArrayStateManager", "Reset Partial Array State Manager (ms):", max_gc_threads); _gc_par_phases[ProcessEvacuationFailedRegions] = new WorkerDataArray("ProcessEvacuationFailedRegions", "Process Evacuation Failed Regions (ms):", max_gc_threads); + _gc_par_phases[ScanHR]->create_thread_work_items("Pending Cards:", ScanHRPendingCards); + _gc_par_phases[ScanHR]->create_thread_work_items("Scanned Empty:", ScanHRScannedEmptyCards); _gc_par_phases[ScanHR]->create_thread_work_items("Scanned Cards:", ScanHRScannedCards); _gc_par_phases[ScanHR]->create_thread_work_items("Scanned Blocks:", ScanHRScannedBlocks); _gc_par_phases[ScanHR]->create_thread_work_items("Claimed Chunks:", ScanHRClaimedChunks); _gc_par_phases[ScanHR]->create_thread_work_items("Found Roots:", ScanHRFoundRoots); + _gc_par_phases[OptScanHR]->create_thread_work_items("Pending Cards:", ScanHRPendingCards); + _gc_par_phases[OptScanHR]->create_thread_work_items("Scanned Empty:", ScanHRScannedEmptyCards); _gc_par_phases[OptScanHR]->create_thread_work_items("Scanned Cards:", ScanHRScannedCards); _gc_par_phases[OptScanHR]->create_thread_work_items("Scanned Blocks:", ScanHRScannedBlocks); _gc_par_phases[OptScanHR]->create_thread_work_items("Claimed Chunks:", ScanHRClaimedChunks); @@ -119,9 +122,6 @@ G1GCPhaseTimes::G1GCPhaseTimes(STWGCTimer* gc_timer, uint max_gc_threads) : _gc_par_phases[OptScanHR]->create_thread_work_items("Scanned Refs:", ScanHRScannedOptRefs); _gc_par_phases[OptScanHR]->create_thread_work_items("Used Memory:", ScanHRUsedMemory); - _gc_par_phases[MergeLB]->create_thread_work_items("Dirty Cards:", MergeLBDirtyCards); - _gc_par_phases[MergeLB]->create_thread_work_items("Skipped Cards:", MergeLBSkippedCards); - _gc_par_phases[CodeRoots]->create_thread_work_items("Scanned Nmethods:", CodeRootsScannedNMethods); _gc_par_phases[OptCodeRoots]->create_thread_work_items("Scanned Nmethods:", CodeRootsScannedNMethods); @@ -129,7 +129,10 @@ G1GCPhaseTimes::G1GCPhaseTimes(STWGCTimer* gc_timer, uint max_gc_threads) : _gc_par_phases[MergePSS]->create_thread_work_items("Copied Bytes:", MergePSSCopiedBytes); _gc_par_phases[MergePSS]->create_thread_work_items("LAB Waste:", MergePSSLABWasteBytes); _gc_par_phases[MergePSS]->create_thread_work_items("LAB Undo Waste:", MergePSSLABUndoWasteBytes); - _gc_par_phases[MergePSS]->create_thread_work_items("Evac Fail Extra Cards:", MergePSSEvacFailExtra); + _gc_par_phases[MergePSS]->create_thread_work_items("Pending Cards:", MergePSSPendingCards); + _gc_par_phases[MergePSS]->create_thread_work_items("To-Young-Gen Cards:", MergePSSToYoungGenCards); + _gc_par_phases[MergePSS]->create_thread_work_items("Evac-Fail Cards:", MergePSSEvacFail); + _gc_par_phases[MergePSS]->create_thread_work_items("Marked Cards:", MergePSSMarked); _gc_par_phases[RestoreEvacuationFailedRegions]->create_thread_work_items("Evacuation Failed Regions:", RestoreEvacFailureRegionsEvacFailedNum); _gc_par_phases[RestoreEvacuationFailedRegions]->create_thread_work_items("Pinned Regions:", RestoreEvacFailureRegionsPinnedNum); @@ -150,9 +153,6 @@ G1GCPhaseTimes::G1GCPhaseTimes(STWGCTimer* gc_timer, uint max_gc_threads) : _gc_par_phases[OptTermination]->create_thread_work_items("Optional Termination Attempts:"); - _gc_par_phases[RedirtyCards] = new WorkerDataArray("RedirtyCards", "Redirty Logged Cards (ms):", max_gc_threads); - 
_gc_par_phases[RedirtyCards]->create_thread_work_items("Redirtied Cards:"); - _gc_par_phases[ResizeThreadLABs] = new WorkerDataArray("ResizeTLABs", "Resize TLABs (ms):", max_gc_threads); _gc_par_phases[FreeCollectionSet] = new WorkerDataArray("FreeCSet", "Free Collection Set (ms):", max_gc_threads); @@ -171,9 +171,9 @@ void G1GCPhaseTimes::reset() { _cur_optional_evac_time_ms = 0.0; _cur_collection_nmethod_list_cleanup_time_ms = 0.0; _cur_merge_heap_roots_time_ms = 0.0; + _cur_merge_refinement_table_time_ms = 0.0; _cur_optional_merge_heap_roots_time_ms = 0.0; _cur_prepare_merge_heap_roots_time_ms = 0.0; - _cur_distribute_log_buffers_time_ms = 0.0; _cur_optional_prepare_merge_heap_roots_time_ms = 0.0; _cur_pre_evacuate_prepare_time_ms = 0.0; _cur_post_evacuate_cleanup_1_time_ms = 0.0; @@ -249,7 +249,7 @@ void G1GCPhaseTimes::record_gc_pause_end() { ASSERT_PHASE_UNINITIALIZED(MergeER); ASSERT_PHASE_UNINITIALIZED(MergeRS); ASSERT_PHASE_UNINITIALIZED(OptMergeRS); - ASSERT_PHASE_UNINITIALIZED(MergeLB); + ASSERT_PHASE_UNINITIALIZED(SweepRT); ASSERT_PHASE_UNINITIALIZED(ScanHR); ASSERT_PHASE_UNINITIALIZED(CodeRoots); ASSERT_PHASE_UNINITIALIZED(OptCodeRoots); @@ -425,8 +425,7 @@ double G1GCPhaseTimes::print_pre_evacuate_collection_set() const { } debug_time("Pre Evacuate Prepare", _cur_pre_evacuate_prepare_time_ms); - debug_phase(_gc_par_phases[RetireTLABsAndFlushLogs], 1); - debug_phase(_gc_par_phases[NonJavaThreadFlushLogs], 1); + debug_phase(_gc_par_phases[RetireTLABs], 1); debug_time("Choose Collection Set", (_recorded_young_cset_choice_time_ms + _recorded_non_young_cset_choice_time_ms)); debug_time("Region Register", _cur_region_register_time); @@ -458,8 +457,8 @@ double G1GCPhaseTimes::print_evacuate_initial_collection_set() const { debug_time("Prepare Merge Heap Roots", _cur_prepare_merge_heap_roots_time_ms); debug_phase_merge_remset(); - debug_time("Distribute Log Buffers", _cur_distribute_log_buffers_time_ms); - debug_phase(_gc_par_phases[MergeLB]); + debug_time("Merge Refinement Table", _cur_merge_refinement_table_time_ms); + debug_phase(_gc_par_phases[SweepRT], 1); info_time("Evacuate Collection Set", _cur_collection_initial_evac_time_ms); @@ -521,7 +520,6 @@ double G1GCPhaseTimes::print_post_evacuate_collection_set(bool evacuation_failed if (G1CollectedHeap::heap()->should_sample_collection_set_candidates()) { debug_phase(_gc_par_phases[SampleCollectionSetCandidates], 1); } - debug_phase(_gc_par_phases[RedirtyCards], 1); if (UseTLAB && ResizeTLAB) { debug_phase(_gc_par_phases[ResizeThreadLABs], 1); } diff --git a/src/hotspot/share/gc/g1/g1GCPhaseTimes.hpp b/src/hotspot/share/gc/g1/g1GCPhaseTimes.hpp index 045160a6162..8223148b791 100644 --- a/src/hotspot/share/gc/g1/g1GCPhaseTimes.hpp +++ b/src/hotspot/share/gc/g1/g1GCPhaseTimes.hpp @@ -46,8 +46,7 @@ class G1GCPhaseTimes : public CHeapObj { public: enum GCParPhases { - RetireTLABsAndFlushLogs, - NonJavaThreadFlushLogs, + RetireTLABs, GCWorkerStart, ExtRootScan, ThreadRoots, @@ -59,7 +58,7 @@ class G1GCPhaseTimes : public CHeapObj { MergeER = StrongOopStorageSetRoots + EnumRange().size(), MergeRS, OptMergeRS, - MergeLB, + SweepRT, ScanHR, OptScanHR, CodeRoots, @@ -71,7 +70,6 @@ class G1GCPhaseTimes : public CHeapObj { Other, GCWorkerTotal, GCWorkerEnd, - RedirtyCards, FreeCollectionSet, YoungFreeCSet, NonYoungFreeCSet, @@ -111,16 +109,19 @@ class G1GCPhaseTimes : public CHeapObj { MergeRSHowlArrayOfCards, MergeRSHowlBitmap, MergeRSHowlFull, - MergeRSCards, + MergeRSFromRemSetCards, + MergeRSTotalCards, MergeRSContainersSentinel }; static 
constexpr const char* GCMergeRSWorkItemsStrings[MergeRSContainersSentinel] = { "Merged Inline:", "Merged ArrayOfCards:", "Merged Howl:", "Merged Full:", "Merged Howl Inline:", "Merged Howl ArrayOfCards:", "Merged Howl BitMap:", "Merged Howl Full:", - "Merged Cards:" }; + "Merged From RS Cards:", "Total Cards:" }; enum GCScanHRWorkItems { + ScanHRPendingCards, + ScanHRScannedEmptyCards, ScanHRScannedCards, ScanHRScannedBlocks, ScanHRClaimedChunks, @@ -129,11 +130,6 @@ class G1GCPhaseTimes : public CHeapObj { ScanHRUsedMemory }; - enum GCMergeLBWorkItems { - MergeLBDirtyCards, - MergeLBSkippedCards - }; - enum GCCodeRootsWorkItems { CodeRootsScannedNMethods }; @@ -143,7 +139,10 @@ class G1GCPhaseTimes : public CHeapObj { MergePSSLABSize, MergePSSLABWasteBytes, MergePSSLABUndoWasteBytes, - MergePSSEvacFailExtra + MergePSSPendingCards, // To be scanned cards generated by GC (from cross-references and evacuation failure). + MergePSSToYoungGenCards, // To-young-gen cards generated by GC. + MergePSSEvacFail, // Evacuation failure generated dirty cards by GC. + MergePSSMarked, // Total newly marked cards. }; enum RestoreEvacFailureRegionsWorkItems { @@ -176,9 +175,9 @@ class G1GCPhaseTimes : public CHeapObj { double _cur_collection_nmethod_list_cleanup_time_ms; double _cur_merge_heap_roots_time_ms; + // Merge refinement table time. Note that this time is included in _cur_merge_heap_roots_time_ms. + double _cur_merge_refinement_table_time_ms; double _cur_optional_merge_heap_roots_time_ms; - // Included in above merge and optional-merge time. - double _cur_distribute_log_buffers_time_ms; double _cur_prepare_merge_heap_roots_time_ms; double _cur_optional_prepare_merge_heap_roots_time_ms; @@ -302,6 +301,10 @@ class G1GCPhaseTimes : public CHeapObj { _cur_merge_heap_roots_time_ms += ms; } + void record_merge_refinement_table_time(double ms) { + _cur_merge_refinement_table_time_ms = ms; + } + void record_or_add_optional_merge_heap_roots_time(double ms) { _cur_optional_merge_heap_roots_time_ms += ms; } @@ -310,10 +313,6 @@ class G1GCPhaseTimes : public CHeapObj { _cur_prepare_merge_heap_roots_time_ms += ms; } - void record_distribute_log_buffers_time_ms(double ms) { - _cur_distribute_log_buffers_time_ms += ms; - } - void record_or_add_optional_prepare_merge_heap_roots_time(double ms) { _cur_optional_prepare_merge_heap_roots_time_ms += ms; } @@ -382,10 +381,6 @@ class G1GCPhaseTimes : public CHeapObj { _recorded_prepare_heap_roots_time_ms = recorded_prepare_heap_roots_time_ms; } - double cur_distribute_log_buffers_time_ms() { - return _cur_distribute_log_buffers_time_ms; - } - double cur_collection_par_time_ms() { return _cur_collection_initial_evac_time_ms + _cur_optional_evac_time_ms + @@ -396,6 +391,10 @@ class G1GCPhaseTimes : public CHeapObj { _cur_collection_nmethod_list_cleanup_time_ms; } + double cur_merge_refinement_table_time() const { + return _cur_merge_refinement_table_time_ms; + } + double cur_resize_heap_time_ms() { return _cur_resize_heap_time_ms; } diff --git a/src/hotspot/share/gc/g1/g1HeapRegion.cpp b/src/hotspot/share/gc/g1/g1HeapRegion.cpp index 09bdfefccb7..ca4359dcc24 100644 --- a/src/hotspot/share/gc/g1/g1HeapRegion.cpp +++ b/src/hotspot/share/gc/g1/g1HeapRegion.cpp @@ -39,6 +39,7 @@ #include "logging/log.hpp" #include "logging/logStream.hpp" #include "memory/iterator.inline.hpp" +#include "memory/memRegion.hpp" #include "memory/resourceArea.hpp" #include "oops/access.inline.hpp" #include "oops/compressedOops.inline.hpp" @@ -137,11 +138,21 @@ void G1HeapRegion::hr_clear(bool 
clear_space) { if (clear_space) clear(SpaceDecorator::Mangle); } -void G1HeapRegion::clear_cardtable() { +void G1HeapRegion::clear_card_table() { G1CardTable* ct = G1CollectedHeap::heap()->card_table(); ct->clear_MemRegion(MemRegion(bottom(), end())); } +void G1HeapRegion::clear_refinement_table() { + G1CardTable* ct = G1CollectedHeap::heap()->refinement_table(); + ct->clear_MemRegion(MemRegion(bottom(), end())); +} + +void G1HeapRegion::clear_both_card_tables() { + clear_card_table(); + clear_refinement_table(); +} + void G1HeapRegion::set_free() { if (!is_free()) { report_region_type_change(G1HeapRegionTraceType::Free); @@ -591,8 +602,12 @@ class G1VerifyLiveAndRemSetClosure : public BasicOopIterateClosure { G1HeapRegion* _from; G1HeapRegion* _to; - CardValue _cv_obj; - CardValue _cv_field; + + CardValue _cv_obj_ct; // In card table. + CardValue _cv_field_ct; + + CardValue _cv_obj_rt; // In refinement table. + CardValue _cv_field_rt; RemSetChecker(G1VerifyFailureCounter* failures, oop containing_obj, T* p, oop obj) : Checker(failures, containing_obj, p, obj) { @@ -600,19 +615,23 @@ class G1VerifyLiveAndRemSetClosure : public BasicOopIterateClosure { _to = this->_g1h->heap_region_containing(obj); CardTable* ct = this->_g1h->card_table(); - _cv_obj = *ct->byte_for_const(this->_containing_obj); - _cv_field = *ct->byte_for_const(p); + _cv_obj_ct = *ct->byte_for_const(this->_containing_obj); + _cv_field_ct = *ct->byte_for_const(p); + + ct = this->_g1h->refinement_table(); + _cv_obj_rt = *ct->byte_for_const(this->_containing_obj); + _cv_field_rt = *ct->byte_for_const(p); } bool failed() const { if (_from != _to && !_from->is_young() && _to->rem_set()->is_complete() && _from->rem_set()->cset_group() != _to->rem_set()->cset_group()) { - const CardValue dirty = G1CardTable::dirty_card_val(); + const CardValue clean = G1CardTable::clean_card_val(); return !(_to->rem_set()->contains_reference(this->_p) || (this->_containing_obj->is_objArray() ? - _cv_field == dirty : - _cv_obj == dirty || _cv_field == dirty)); + (_cv_field_ct != clean || _cv_field_rt != clean) : + (_cv_obj_ct != clean || _cv_field_ct != clean || _cv_obj_rt != clean || _cv_field_rt != clean))); } return false; } @@ -630,7 +649,8 @@ class G1VerifyLiveAndRemSetClosure : public BasicOopIterateClosure { log.error("Missing rem set entry:"); this->print_containing_obj(&ls, _from); this->print_referenced_obj(&ls, _to, ""); - log.error("Obj head CV = %d, field CV = %d.", _cv_obj, _cv_field); + log.error("CT obj head CV = %d, field CV = %d.", _cv_obj_ct, _cv_field_ct); + log.error("RT Obj head CV = %d, field CV = %d.", _cv_obj_rt, _cv_field_rt); log.error("----------"); } }; diff --git a/src/hotspot/share/gc/g1/g1HeapRegion.hpp b/src/hotspot/share/gc/g1/g1HeapRegion.hpp index 71584ffb24d..17ec3055b52 100644 --- a/src/hotspot/share/gc/g1/g1HeapRegion.hpp +++ b/src/hotspot/share/gc/g1/g1HeapRegion.hpp @@ -42,7 +42,6 @@ class G1CollectedHeap; class G1CMBitMap; class G1CSetCandidateGroup; class G1Predictions; -class G1HeapRegion; class G1HeapRegionRemSet; class G1HeapRegionSetBase; class nmethod; @@ -478,7 +477,10 @@ public: // Callers must ensure this is not called by multiple threads at the same time. void hr_clear(bool clear_space); // Clear the card table corresponding to this region. - void clear_cardtable(); + void clear_card_table(); + void clear_refinement_table(); + + void clear_both_card_tables(); // Notify the region that an evacuation failure occurred for an object within this // region. 
diff --git a/src/hotspot/share/gc/g1/g1HeapRegionManager.cpp b/src/hotspot/share/gc/g1/g1HeapRegionManager.cpp index d4286a1caeb..795b6543bae 100644 --- a/src/hotspot/share/gc/g1/g1HeapRegionManager.cpp +++ b/src/hotspot/share/gc/g1/g1HeapRegionManager.cpp @@ -63,7 +63,8 @@ public: G1HeapRegionManager::G1HeapRegionManager() : _bot_mapper(nullptr), - _cardtable_mapper(nullptr), + _card_table_mapper(nullptr), + _refinement_table_mapper(nullptr), _committed_map(), _next_highest_used_hrm_index(0), _regions(), _heap_mapper(nullptr), @@ -74,7 +75,8 @@ G1HeapRegionManager::G1HeapRegionManager() : void G1HeapRegionManager::initialize(G1RegionToSpaceMapper* heap_storage, G1RegionToSpaceMapper* bitmap, G1RegionToSpaceMapper* bot, - G1RegionToSpaceMapper* cardtable) { + G1RegionToSpaceMapper* card_table, + G1RegionToSpaceMapper* refinement_table) { _next_highest_used_hrm_index = 0; _heap_mapper = heap_storage; @@ -82,7 +84,8 @@ void G1HeapRegionManager::initialize(G1RegionToSpaceMapper* heap_storage, _bitmap_mapper = bitmap; _bot_mapper = bot; - _cardtable_mapper = cardtable; + _card_table_mapper = card_table; + _refinement_table_mapper = refinement_table; _regions.initialize(heap_storage->reserved(), G1HeapRegion::GrainBytes); @@ -186,7 +189,8 @@ void G1HeapRegionManager::commit_regions(uint index, size_t num_regions, WorkerT _bitmap_mapper->commit_regions(index, num_regions, pretouch_workers); _bot_mapper->commit_regions(index, num_regions, pretouch_workers); - _cardtable_mapper->commit_regions(index, num_regions, pretouch_workers); + _card_table_mapper->commit_regions(index, num_regions, pretouch_workers); + _refinement_table_mapper->commit_regions(index, num_regions, pretouch_workers); } void G1HeapRegionManager::uncommit_regions(uint start, uint num_regions) { @@ -209,7 +213,8 @@ void G1HeapRegionManager::uncommit_regions(uint start, uint num_regions) { _bitmap_mapper->uncommit_regions(start, num_regions); _bot_mapper->uncommit_regions(start, num_regions); - _cardtable_mapper->uncommit_regions(start, num_regions); + _card_table_mapper->uncommit_regions(start, num_regions); + _refinement_table_mapper->uncommit_regions(start, num_regions); _committed_map.uncommit(start, end); } @@ -261,19 +266,23 @@ void G1HeapRegionManager::clear_auxiliary_data_structures(uint start, uint num_r // Signal G1BlockOffsetTable to clear the given regions. _bot_mapper->signal_mapping_changed(start, num_regions); // Signal G1CardTable to clear the given regions. - _cardtable_mapper->signal_mapping_changed(start, num_regions); + _card_table_mapper->signal_mapping_changed(start, num_regions); + // Signal refinement table to clear the given regions. 
+ _refinement_table_mapper->signal_mapping_changed(start, num_regions); } MemoryUsage G1HeapRegionManager::get_auxiliary_data_memory_usage() const { size_t used_sz = _bitmap_mapper->committed_size() + _bot_mapper->committed_size() + - _cardtable_mapper->committed_size(); + _card_table_mapper->committed_size() + + _refinement_table_mapper->committed_size(); size_t committed_sz = _bitmap_mapper->reserved_size() + _bot_mapper->reserved_size() + - _cardtable_mapper->reserved_size(); + _card_table_mapper->reserved_size() + + _refinement_table_mapper->reserved_size(); return MemoryUsage(0, used_sz, committed_sz, committed_sz); } diff --git a/src/hotspot/share/gc/g1/g1HeapRegionManager.hpp b/src/hotspot/share/gc/g1/g1HeapRegionManager.hpp index 19ae9887e94..b4ce3b0a8be 100644 --- a/src/hotspot/share/gc/g1/g1HeapRegionManager.hpp +++ b/src/hotspot/share/gc/g1/g1HeapRegionManager.hpp @@ -74,7 +74,8 @@ class G1HeapRegionManager: public CHeapObj { friend class G1HeapRegionClaimer; G1RegionToSpaceMapper* _bot_mapper; - G1RegionToSpaceMapper* _cardtable_mapper; + G1RegionToSpaceMapper* _card_table_mapper; + G1RegionToSpaceMapper* _refinement_table_mapper; // Keeps track of the currently committed regions in the heap. The committed regions // can either be active (ready for use) or inactive (ready for uncommit). @@ -161,7 +162,8 @@ public: void initialize(G1RegionToSpaceMapper* heap_storage, G1RegionToSpaceMapper* bitmap, G1RegionToSpaceMapper* bot, - G1RegionToSpaceMapper* cardtable); + G1RegionToSpaceMapper* card_table, + G1RegionToSpaceMapper* refinement_table); // Return the "dummy" region used for G1AllocRegion. This is currently a hardwired // new G1HeapRegion that owns G1HeapRegion at index 0. Since at the moment we commit diff --git a/src/hotspot/share/gc/g1/g1HeapVerifier.cpp b/src/hotspot/share/gc/g1/g1HeapVerifier.cpp index c5af7e34dd9..21b3545f7e0 100644 --- a/src/hotspot/share/gc/g1/g1HeapVerifier.cpp +++ b/src/hotspot/share/gc/g1/g1HeapVerifier.cpp @@ -42,6 +42,7 @@ #include "oops/compressedOops.inline.hpp" #include "oops/oop.inline.hpp" #include "runtime/handles.inline.hpp" +#include "runtime/threads.hpp" int G1HeapVerifier::_enabled_verification_types = G1HeapVerifier::G1VerifyAll; @@ -528,6 +529,7 @@ void G1HeapVerifier::verify_before_gc() { void G1HeapVerifier::verify_after_gc() { verify(VerifyOption::G1UseConcMarking, "After GC"); + verify_card_tables_in_sync(); } void G1HeapVerifier::verify_bitmap_clear(bool from_tams) { @@ -556,17 +558,17 @@ void G1HeapVerifier::verify_bitmap_clear(bool from_tams) { G1CollectedHeap::heap()->heap_region_iterate(&cl); } -#ifndef PRODUCT class G1VerifyCardTableCleanup: public G1HeapRegionClosure { G1HeapVerifier* _verifier; public: G1VerifyCardTableCleanup(G1HeapVerifier* verifier) : _verifier(verifier) { } virtual bool do_heap_region(G1HeapRegion* r) { + _verifier->verify_ct_clean_region(r); if (r->is_survivor()) { - _verifier->verify_dirty_region(r); + _verifier->verify_rt_clean_region(r); } else { - _verifier->verify_not_dirty_region(r); + _verifier->verify_rt_clean_from_top(r); } return false; } @@ -579,14 +581,35 @@ void G1HeapVerifier::verify_card_table_cleanup() { } } -void G1HeapVerifier::verify_not_dirty_region(G1HeapRegion* hr) { - // All of the region should be clean. 
- G1CardTable* ct = _g1h->card_table(); - MemRegion mr(hr->bottom(), hr->end()); - ct->verify_not_dirty_region(mr); +class G1VerifyCardTablesClean: public G1HeapRegionClosure { + G1HeapVerifier* _verifier; + bool _both_card_tables; + +public: + G1VerifyCardTablesClean(G1HeapVerifier* verifier, bool both_card_tables = true) + : _verifier(verifier), _both_card_tables(both_card_tables) { } + + virtual bool do_heap_region(G1HeapRegion* r) { + _verifier->verify_rt_clean_region(r); // Must be all Clean from bottom -> end. + if (_both_card_tables) { + _verifier->verify_ct_clean_region(r); + } + return false; + } +}; + +void G1HeapVerifier::verify_card_tables_clean(bool both_card_tables) { + G1VerifyCardTablesClean cl(this, both_card_tables); + _g1h->heap_region_iterate(&cl); } -void G1HeapVerifier::verify_dirty_region(G1HeapRegion* hr) { +void G1HeapVerifier::verify_rt_clean_from_top(G1HeapRegion* hr) { + G1CardTable* ct = _g1h->refinement_table(); + MemRegion mr(align_up(hr->top(), G1CardTable::card_size()), hr->end()); + ct->verify_region(mr, G1CardTable::clean_card_val(), true); +} + +void G1HeapVerifier::verify_rt_dirty_to_dummy_top(G1HeapRegion* hr) { // We cannot guarantee that [bottom(),end()] is dirty. Threads // dirty allocated blocks as they allocate them. The thread that // retires each region and replaces it with a new one will do a @@ -594,29 +617,56 @@ void G1HeapVerifier::verify_dirty_region(G1HeapRegion* hr) { // not dirty that area (one less thing to have to do while holding // a lock). So we can only verify that [bottom(),pre_dummy_top()] // is dirty. - G1CardTable* ct = _g1h->card_table(); + G1CardTable* ct = _g1h->refinement_table(); MemRegion mr(hr->bottom(), hr->pre_dummy_top()); - if (hr->is_young()) { - ct->verify_g1_young_region(mr); - } else { - ct->verify_dirty_region(mr); - } + ct->verify_dirty_region(mr); } -class G1VerifyDirtyYoungListClosure : public G1HeapRegionClosure { -private: - G1HeapVerifier* _verifier; -public: - G1VerifyDirtyYoungListClosure(G1HeapVerifier* verifier) : G1HeapRegionClosure(), _verifier(verifier) { } - virtual bool do_heap_region(G1HeapRegion* r) { - _verifier->verify_dirty_region(r); - return false; - } -}; +void G1HeapVerifier::verify_ct_clean_region(G1HeapRegion* hr) { + G1CardTable* ct = _g1h->card_table(); + MemRegion mr(hr->bottom(), hr->end()); + ct->verify_region(mr, G1CardTable::clean_card_val(), true); +} -void G1HeapVerifier::verify_dirty_young_regions() { - G1VerifyDirtyYoungListClosure cl(this); - _g1h->collection_set()->iterate(&cl); +void G1HeapVerifier::verify_rt_clean_region(G1HeapRegion* hr) { + G1CardTable* ct = _g1h->refinement_table(); + MemRegion mr(hr->bottom(), hr->end()); + ct->verify_region(mr, G1CardTable::clean_card_val(), true); +} + +#ifndef PRODUCT + +void G1HeapVerifier::verify_card_tables_in_sync() { + + // Non-Java thread card tables must be null. + class AssertCardTableBaseNull : public ThreadClosure { + public: + + void do_thread(Thread* thread) { + ResourceMark rm; + assert(G1ThreadLocalData::get_byte_map_base(thread) == nullptr, "thread " PTR_FORMAT " (%s) has non-null card table base", + p2i(thread), thread->name()); + } + } check_null_cl; + + Threads::non_java_threads_do(&check_null_cl); + + // Java thread card tables must be the same as the global card table. 
+ class AssertSameCardTableClosure : public ThreadClosure { + public: + + void do_thread(Thread* thread) { + G1CardTable::CardValue* global_ct_base = G1CollectedHeap::heap()->card_table_base(); + G1CardTable::CardValue* cur_ct_base = G1ThreadLocalData::get_byte_map_base(thread); + + ResourceMark rm; + assert(cur_ct_base == global_ct_base, + "thread " PTR_FORMAT " (%s) has wrong card table base, should be " PTR_FORMAT " is " PTR_FORMAT, + p2i(thread), thread->name(), p2i(global_ct_base), p2i(cur_ct_base)); + } + } check_same_cl; + + Threads::java_threads_do(&check_same_cl); } class G1CheckRegionAttrTableClosure : public G1HeapRegionClosure { diff --git a/src/hotspot/share/gc/g1/g1HeapVerifier.hpp b/src/hotspot/share/gc/g1/g1HeapVerifier.hpp index d4ab4c60214..6a26c77ec0d 100644 --- a/src/hotspot/share/gc/g1/g1HeapVerifier.hpp +++ b/src/hotspot/share/gc/g1/g1HeapVerifier.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2016, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -78,11 +78,16 @@ public: // Do sanity check on the contents of the in-cset fast test table. bool check_region_attr_table() PRODUCT_RETURN_( return true; ); - void verify_card_table_cleanup() PRODUCT_RETURN; + void verify_card_table_cleanup(); + void verify_card_tables_clean(bool both_card_tables); - void verify_not_dirty_region(G1HeapRegion* hr) PRODUCT_RETURN; - void verify_dirty_region(G1HeapRegion* hr) PRODUCT_RETURN; - void verify_dirty_young_regions() PRODUCT_RETURN; + void verify_ct_clean_region(G1HeapRegion* hr); + void verify_rt_dirty_to_dummy_top(G1HeapRegion* hr); + void verify_rt_clean_from_top(G1HeapRegion* hr); + void verify_rt_clean_region(G1HeapRegion* hr); + + // Verify that the global card table and the thread's card tables are in sync. + void verify_card_tables_in_sync() PRODUCT_RETURN; }; #endif // SHARE_GC_G1_G1HEAPVERIFIER_HPP diff --git a/src/hotspot/share/gc/g1/g1OopClosures.hpp b/src/hotspot/share/gc/g1/g1OopClosures.hpp index 3bff668bcec..a61c9d17f70 100644 --- a/src/hotspot/share/gc/g1/g1OopClosures.hpp +++ b/src/hotspot/share/gc/g1/g1OopClosures.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -86,19 +86,19 @@ public: // This closure is applied to the fields of the objects that have just been copied during evacuation. 
class G1ScanEvacuatedObjClosure : public G1ScanClosureBase { - friend class G1SkipCardEnqueueSetter; + friend class G1SkipCardMarkSetter; - enum SkipCardEnqueueTristate { + enum SkipCardMarkTristate { False = 0, True, Uninitialized }; - SkipCardEnqueueTristate _skip_card_enqueue; + SkipCardMarkTristate _skip_card_mark; public: G1ScanEvacuatedObjClosure(G1CollectedHeap* g1h, G1ParScanThreadState* par_scan_state) : - G1ScanClosureBase(g1h, par_scan_state), _skip_card_enqueue(Uninitialized) { } + G1ScanClosureBase(g1h, par_scan_state), _skip_card_mark(Uninitialized) { } template void do_oop_work(T* p); virtual void do_oop(oop* p) { do_oop_work(p); } @@ -109,22 +109,22 @@ public: } #ifdef ASSERT - bool skip_card_enqueue_set() const { return _skip_card_enqueue != Uninitialized; } + bool skip_card_mark_set() const { return _skip_card_mark != Uninitialized; } #endif }; -// RAII object to properly set the _skip_card_enqueue field in G1ScanEvacuatedObjClosure. -class G1SkipCardEnqueueSetter : public StackObj { +// RAII object to properly set the _skip_card_mark field in G1ScanEvacuatedObjClosure. +class G1SkipCardMarkSetter : public StackObj { G1ScanEvacuatedObjClosure* _closure; public: - G1SkipCardEnqueueSetter(G1ScanEvacuatedObjClosure* closure, bool skip_card_enqueue) : _closure(closure) { - assert(_closure->_skip_card_enqueue == G1ScanEvacuatedObjClosure::Uninitialized, "Must not be set"); - _closure->_skip_card_enqueue = skip_card_enqueue ? G1ScanEvacuatedObjClosure::True : G1ScanEvacuatedObjClosure::False; + G1SkipCardMarkSetter(G1ScanEvacuatedObjClosure* closure, bool skip_card_mark) : _closure(closure) { + assert(_closure->_skip_card_mark == G1ScanEvacuatedObjClosure::Uninitialized, "Must not be set"); + _closure->_skip_card_mark = skip_card_mark ? 
G1ScanEvacuatedObjClosure::True : G1ScanEvacuatedObjClosure::False; } - ~G1SkipCardEnqueueSetter() { - DEBUG_ONLY(_closure->_skip_card_enqueue = G1ScanEvacuatedObjClosure::Uninitialized;) + ~G1SkipCardMarkSetter() { + DEBUG_ONLY(_closure->_skip_card_mark = G1ScanEvacuatedObjClosure::Uninitialized;) } }; @@ -206,13 +206,20 @@ public: class G1ConcurrentRefineOopClosure: public BasicOopIterateClosure { G1CollectedHeap* _g1h; uint _worker_id; + bool _has_ref_to_cset; + bool _has_ref_to_old; public: G1ConcurrentRefineOopClosure(G1CollectedHeap* g1h, uint worker_id) : _g1h(g1h), - _worker_id(worker_id) { + _worker_id(worker_id), + _has_ref_to_cset(false), + _has_ref_to_old(false) { } + bool has_ref_to_cset() const { return _has_ref_to_cset; } + bool has_ref_to_old() const { return _has_ref_to_old; } + virtual ReferenceIterationMode reference_iteration_mode() { return DO_FIELDS; } template void do_oop_work(T* p); @@ -223,6 +230,7 @@ public: class G1RebuildRemSetClosure : public BasicOopIterateClosure { G1CollectedHeap* _g1h; uint _worker_id; + public: G1RebuildRemSetClosure(G1CollectedHeap* g1h, uint worker_id) : _g1h(g1h), _worker_id(worker_id) { } diff --git a/src/hotspot/share/gc/g1/g1OopClosures.inline.hpp b/src/hotspot/share/gc/g1/g1OopClosures.inline.hpp index c0c67fda949..87e3a1cc7c4 100644 --- a/src/hotspot/share/gc/g1/g1OopClosures.inline.hpp +++ b/src/hotspot/share/gc/g1/g1OopClosures.inline.hpp @@ -90,11 +90,11 @@ inline void G1ScanEvacuatedObjClosure::do_oop_work(T* p) { prefetch_and_push(p, obj); } else if (!G1HeapRegion::is_in_same_region(p, obj)) { handle_non_cset_obj_common(region_attr, p, obj); - assert(_skip_card_enqueue != Uninitialized, "Scan location has not been initialized."); - if (_skip_card_enqueue == True) { + assert(_skip_card_mark != Uninitialized, "Scan location has not been initialized."); + if (_skip_card_mark == True) { return; } - _par_scan_state->enqueue_card_if_tracked(region_attr, p, obj); + _par_scan_state->mark_card_if_tracked(region_attr, p, obj); } } @@ -127,6 +127,11 @@ inline static void check_obj_during_refinement(T* p, oop const obj) { template inline void G1ConcurrentRefineOopClosure::do_oop_work(T* p) { + // Early out if we already found a to-young reference. 
+ if (_has_ref_to_cset) { + return; + } + T o = RawAccess::oop_load(p); if (CompressedOops::is_null(o)) { return; @@ -146,7 +151,12 @@ inline void G1ConcurrentRefineOopClosure::do_oop_work(T* p) { return; } - G1HeapRegionRemSet* to_rem_set = _g1h->heap_region_containing(obj)->rem_set(); + G1HeapRegion* to_region = _g1h->heap_region_containing(obj); + if (to_region->is_young()) { + _has_ref_to_cset = true; + return; + } + G1HeapRegionRemSet* to_rem_set = to_region->rem_set(); assert(to_rem_set != nullptr, "Need per-region 'into' remsets."); if (to_rem_set->is_tracked()) { @@ -154,6 +164,7 @@ inline void G1ConcurrentRefineOopClosure::do_oop_work(T* p) { if (from->rem_set()->cset_group() != to_rem_set->cset_group()) { to_rem_set->add_reference(p, _worker_id); + _has_ref_to_old = true; } } } @@ -180,7 +191,7 @@ inline void G1ScanCardClosure::do_oop_work(T* p) { _heap_roots_found++; } else if (!G1HeapRegion::is_in_same_region(p, obj)) { handle_non_cset_obj_common(region_attr, p, obj); - _par_scan_state->enqueue_card_if_tracked(region_attr, p, obj); + _par_scan_state->mark_card_if_tracked(region_attr, p, obj); } } @@ -272,10 +283,14 @@ template void G1RebuildRemSetClosure::do_oop_work(T* p) { G1HeapRegion* to = _g1h->heap_region_containing(obj); G1HeapRegionRemSet* rem_set = to->rem_set(); if (rem_set->is_tracked()) { - G1HeapRegion* from = _g1h->heap_region_containing(p); + if (to->is_young()) { + G1BarrierSet::g1_barrier_set()->write_ref_field_post(p); + } else { + G1HeapRegion* from = _g1h->heap_region_containing(p); - if (from->rem_set()->cset_group() != rem_set->cset_group()) { - rem_set->add_reference(p, _worker_id); + if (from->rem_set()->cset_group() != rem_set->cset_group()) { + rem_set->add_reference(p, _worker_id); + } } } } diff --git a/src/hotspot/share/gc/g1/g1ParScanThreadState.cpp b/src/hotspot/share/gc/g1/g1ParScanThreadState.cpp index 42c3a872e6b..80e5fd44fcd 100644 --- a/src/hotspot/share/gc/g1/g1ParScanThreadState.cpp +++ b/src/hotspot/share/gc/g1/g1ParScanThreadState.cpp @@ -57,22 +57,21 @@ #define MAYBE_INLINE_EVACUATION NOT_DEBUG(inline) DEBUG_ONLY(NOINLINE) G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, - G1RedirtyCardsQueueSet* rdcqs, uint worker_id, uint num_workers, G1CollectionSet* collection_set, G1EvacFailureRegions* evac_failure_regions) : _g1h(g1h), _task_queue(g1h->task_queue(worker_id)), - _rdc_local_qset(rdcqs), - _ct(g1h->card_table()), + _ct(g1h->refinement_table()), _closures(nullptr), _plab_allocator(nullptr), _age_table(false), _tenuring_threshold(g1h->policy()->tenuring_threshold()), _scanner(g1h, this), _worker_id(worker_id), - _last_enqueued_card(SIZE_MAX), + _num_cards_marked_dirty(0), + _num_cards_marked_to_cset(0), _stack_trim_upper_threshold(GCDrainStackTargetSize * 2 + 1), _stack_trim_lower_threshold(GCDrainStackTargetSize), _trim_ticks(), @@ -88,7 +87,7 @@ G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, ALLOCATION_FAILURE_INJECTOR_ONLY(_allocation_failure_inject_counter(0) COMMA) _evacuation_failed_info(), _evac_failure_regions(evac_failure_regions), - _evac_failure_enqueued_cards(0) + _num_cards_from_evac_failure(0) { // We allocate number of young gen regions in the collection set plus one // entries, since entry 0 keeps track of surviving bytes for non-young regions. 
@@ -112,8 +111,7 @@ G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, initialize_numa_stats(); } -size_t G1ParScanThreadState::flush_stats(size_t* surviving_young_words, uint num_workers, BufferNodeList* rdc_buffers) { - *rdc_buffers = _rdc_local_qset.flush(); +size_t G1ParScanThreadState::flush_stats(size_t* surviving_young_words, uint num_workers) { flush_numa_stats(); // Update allocation statistics. _plab_allocator->flush_and_retire_stats(num_workers); @@ -147,8 +145,16 @@ size_t G1ParScanThreadState::lab_undo_waste_words() const { return _plab_allocator->undo_waste(); } -size_t G1ParScanThreadState::evac_failure_enqueued_cards() const { - return _evac_failure_enqueued_cards; +size_t G1ParScanThreadState::num_cards_pending() const { + return _num_cards_marked_dirty + _num_cards_from_evac_failure; +} + +size_t G1ParScanThreadState::num_cards_marked() const { + return num_cards_pending() + _num_cards_marked_to_cset; +} + +size_t G1ParScanThreadState::num_cards_from_evac_failure() const { + return _num_cards_from_evac_failure; } #ifdef ASSERT @@ -230,7 +236,7 @@ void G1ParScanThreadState::do_partial_array(PartialArrayState* state, bool stole PartialArraySplitter::Claim claim = _partial_array_splitter.claim(state, _task_queue, stolen); G1HeapRegionAttr dest_attr = _g1h->region_attr(to_array); - G1SkipCardEnqueueSetter x(&_scanner, dest_attr.is_new_survivor()); + G1SkipCardMarkSetter x(&_scanner, dest_attr.is_new_survivor()); // Process claimed task. to_array->oop_iterate_range(&_scanner, checked_cast(claim._start), @@ -250,7 +256,7 @@ void G1ParScanThreadState::start_partial_objarray(oop from_obj, // The source array is unused when processing states. _partial_array_splitter.start(_task_queue, nullptr, to_array, array_length); - assert(_scanner.skip_card_enqueue_set(), "must be"); + assert(_scanner.skip_card_mark_set(), "must be"); // Process the initial chunk. No need to process the type in the // klass, as it will already be handled by processing the built-in // module. @@ -451,7 +457,7 @@ void G1ParScanThreadState::do_iterate_object(oop const obj, _string_dedup_requests.add(old); } - assert(_scanner.skip_card_enqueue_set(), "must be"); + assert(_scanner.skip_card_mark_set(), "must be"); obj->oop_iterate_backwards(&_scanner, klass); } @@ -546,7 +552,7 @@ oop G1ParScanThreadState::do_copy_to_survivor_space(G1HeapRegionAttr const regio // Instead, we use dest_attr.is_young() because the two values are always // equal: successfully allocated young regions must be survivor regions. assert(dest_attr.is_young() == _g1h->heap_region_containing(obj)->is_survivor(), "must be"); - G1SkipCardEnqueueSetter x(&_scanner, dest_attr.is_young()); + G1SkipCardMarkSetter x(&_scanner, dest_attr.is_young()); do_iterate_object(obj, old, klass, region_attr, dest_attr, age); } @@ -569,7 +575,7 @@ G1ParScanThreadState* G1ParScanThreadStateSet::state_for_worker(uint worker_id) assert(worker_id < _num_workers, "out of bounds access"); if (_states[worker_id] == nullptr) { _states[worker_id] = - new G1ParScanThreadState(_g1h, rdcqs(), + new G1ParScanThreadState(_g1h, worker_id, _num_workers, _collection_set, @@ -595,22 +601,24 @@ void G1ParScanThreadStateSet::flush_stats() { // because it resets the PLAB allocator where we get this info from. 
size_t lab_waste_bytes = pss->lab_waste_words() * HeapWordSize; size_t lab_undo_waste_bytes = pss->lab_undo_waste_words() * HeapWordSize; - size_t copied_bytes = pss->flush_stats(_surviving_young_words_total, _num_workers, &_rdc_buffers[worker_id]) * HeapWordSize; - size_t evac_fail_enqueued_cards = pss->evac_failure_enqueued_cards(); + size_t copied_bytes = pss->flush_stats(_surviving_young_words_total, _num_workers) * HeapWordSize; + size_t pending_cards = pss->num_cards_pending(); + size_t to_young_gen_cards = pss->num_cards_marked() - pss->num_cards_pending(); + size_t evac_failure_cards = pss->num_cards_from_evac_failure(); + size_t marked_cards = pss->num_cards_marked(); p->record_or_add_thread_work_item(G1GCPhaseTimes::MergePSS, worker_id, copied_bytes, G1GCPhaseTimes::MergePSSCopiedBytes); p->record_or_add_thread_work_item(G1GCPhaseTimes::MergePSS, worker_id, lab_waste_bytes, G1GCPhaseTimes::MergePSSLABWasteBytes); p->record_or_add_thread_work_item(G1GCPhaseTimes::MergePSS, worker_id, lab_undo_waste_bytes, G1GCPhaseTimes::MergePSSLABUndoWasteBytes); - p->record_or_add_thread_work_item(G1GCPhaseTimes::MergePSS, worker_id, evac_fail_enqueued_cards, G1GCPhaseTimes::MergePSSEvacFailExtra); + p->record_or_add_thread_work_item(G1GCPhaseTimes::MergePSS, worker_id, pending_cards, G1GCPhaseTimes::MergePSSPendingCards); + p->record_or_add_thread_work_item(G1GCPhaseTimes::MergePSS, worker_id, to_young_gen_cards, G1GCPhaseTimes::MergePSSToYoungGenCards); + p->record_or_add_thread_work_item(G1GCPhaseTimes::MergePSS, worker_id, evac_failure_cards, G1GCPhaseTimes::MergePSSEvacFail); + p->record_or_add_thread_work_item(G1GCPhaseTimes::MergePSS, worker_id, marked_cards, G1GCPhaseTimes::MergePSSMarked); delete pss; _states[worker_id] = nullptr; } - G1DirtyCardQueueSet& dcq = G1BarrierSet::dirty_card_queue_set(); - dcq.merge_bufferlists(rdcqs()); - rdcqs()->verify_empty(); - _flushed = true; } @@ -652,7 +660,7 @@ oop G1ParScanThreadState::handle_evacuation_failure_par(oop old, markWord m, Kla // existing closure to scan evacuated objects; since we are iterating from a // collection set region (i.e. never a Survivor region), we always need to // gather cards for this case. 
- G1SkipCardEnqueueSetter x(&_scanner, false /* skip_card_enqueue */); + G1SkipCardMarkSetter x(&_scanner, false /* skip_card_mark */); do_iterate_object(old, old, klass, attr, attr, m.age()); } @@ -709,9 +717,7 @@ G1ParScanThreadStateSet::G1ParScanThreadStateSet(G1CollectedHeap* g1h, G1EvacFailureRegions* evac_failure_regions) : _g1h(g1h), _collection_set(collection_set), - _rdcqs(G1BarrierSet::dirty_card_queue_set().allocator()), _states(NEW_C_HEAP_ARRAY(G1ParScanThreadState*, num_workers, mtGC)), - _rdc_buffers(NEW_C_HEAP_ARRAY(BufferNodeList, num_workers, mtGC)), _surviving_young_words_total(NEW_C_HEAP_ARRAY(size_t, collection_set->young_region_length() + 1, mtGC)), _num_workers(num_workers), _flushed(false), @@ -719,7 +725,6 @@ G1ParScanThreadStateSet::G1ParScanThreadStateSet(G1CollectedHeap* g1h, { for (uint i = 0; i < num_workers; ++i) { _states[i] = nullptr; - _rdc_buffers[i] = BufferNodeList(); } memset(_surviving_young_words_total, 0, (collection_set->young_region_length() + 1) * sizeof(size_t)); } @@ -728,7 +733,6 @@ G1ParScanThreadStateSet::~G1ParScanThreadStateSet() { assert(_flushed, "thread local state from the per thread states should have been flushed"); FREE_C_HEAP_ARRAY(G1ParScanThreadState*, _states); FREE_C_HEAP_ARRAY(size_t, _surviving_young_words_total); - FREE_C_HEAP_ARRAY(BufferNodeList, _rdc_buffers); } #if TASKQUEUE_STATS diff --git a/src/hotspot/share/gc/g1/g1ParScanThreadState.hpp b/src/hotspot/share/gc/g1/g1ParScanThreadState.hpp index 4d569622238..3fb080d40be 100644 --- a/src/hotspot/share/gc/g1/g1ParScanThreadState.hpp +++ b/src/hotspot/share/gc/g1/g1ParScanThreadState.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -27,7 +27,6 @@ #include "gc/g1/g1CollectedHeap.hpp" #include "gc/g1/g1OopClosures.hpp" -#include "gc/g1/g1RedirtyCardsQueue.hpp" #include "gc/g1/g1YoungGCAllocationFailureInjector.hpp" #include "gc/shared/ageTable.hpp" #include "gc/shared/copyFailedInfo.hpp" @@ -52,7 +51,6 @@ class outputStream; class G1ParScanThreadState : public CHeapObj { G1CollectedHeap* _g1h; G1ScannerTasksQueue* _task_queue; - G1RedirtyCardsLocalQueueSet _rdc_local_qset; G1CardTable* _ct; G1EvacuationRootClosures* _closures; @@ -65,9 +63,8 @@ class G1ParScanThreadState : public CHeapObj { uint _worker_id; - // Remember the last enqueued card to avoid enqueuing the same card over and over; - // since we only ever scan a card once, this is sufficient. - size_t _last_enqueued_card; + size_t _num_cards_marked_dirty; + size_t _num_cards_marked_to_cset; // Upper and lower threshold to start and end work queue draining. uint const _stack_trim_upper_threshold; @@ -104,22 +101,19 @@ class G1ParScanThreadState : public CHeapObj { EvacuationFailedInfo _evacuation_failed_info; G1EvacFailureRegions* _evac_failure_regions; - // Number of additional cards into evacuation failed regions enqueued into - // the local DCQS. This is an approximation, as cards that would be added later - // outside of evacuation failure will not be subtracted again. - size_t _evac_failure_enqueued_cards; + // Number of additional cards into evacuation failed regions. 
+ size_t _num_cards_from_evac_failure; - // Enqueue the card if not already in the set; this is a best-effort attempt on + // Mark the card if not already in the set; this is a best-effort attempt on // detecting duplicates. - template bool enqueue_if_new(T* p); - // Enqueue the card of p into the (evacuation failed) region. - template void enqueue_card_into_evac_fail_region(T* p, oop obj); + template bool mark_if_new(T* p, bool into_survivor); + // Mark the card of p into the (evacuation failed) region. + template void mark_card_into_evac_fail_region(T* p, oop obj); bool inject_allocation_failure(uint region_idx) ALLOCATION_FAILURE_INJECTOR_RETURN_( return false; ); public: G1ParScanThreadState(G1CollectedHeap* g1h, - G1RedirtyCardsQueueSet* rdcqs, uint worker_id, uint num_workers, G1CollectionSet* collection_set, @@ -139,16 +133,16 @@ public: void push_on_queue(ScannerTask task); - // Apply the post barrier to the given reference field. Enqueues the card of p + // Apply the post barrier to the given reference field. Marks the card of p // if the barrier does not filter out the reference for some reason (e.g. // p and q are in the same region, p is in survivor, p is in collection set) // To be called during GC if nothing particular about p and obj are known. template void write_ref_field_post(T* p, oop obj); - // Enqueue the card if the reference's target region's remembered set is tracked. + // Mark the card if the reference's target region's remembered set is tracked. // Assumes that a significant amount of pre-filtering (like done by // write_ref_field_post() above) has already been performed. - template void enqueue_card_if_tracked(G1HeapRegionAttr region_attr, T* p, oop o); + template void mark_card_if_tracked(G1HeapRegionAttr region_attr, T* p, oop o); G1EvacuationRootClosures* closures() { return _closures; } uint worker_id() { return _worker_id; } @@ -156,11 +150,22 @@ public: size_t lab_waste_words() const; size_t lab_undo_waste_words() const; - size_t evac_failure_enqueued_cards() const; + // Newly marked cards during this garbage collection, to be refined concurrently + // later. Contains both marks generated by new cross-region references as well + // as cards generated from regions into evacuation failed regions. + // Does not contain cards into the next collection set (e.g. survivors) - they will not + // be refined concurrently. Calculation is done on a best-effort basis. + size_t num_cards_pending() const; + // Number of cards newly generated by references into evacuation failed regions. + // Calculation is done on a best-effort basis. + size_t num_cards_from_evac_failure() const; + // Sum of cards marked by evacuation. Contains both pending cards as well as cards + // into the next collection set (e.g. survivors). + size_t num_cards_marked() const; // Pass locally gathered statistics to global state. Returns the total number of // HeapWords copied. 
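[Editorial illustration] The accessor comments above describe the counter relationships only in prose; the arithmetic they imply, and that the MergePSS hunk earlier uses to derive the to-young-gen share, can be stated directly. A minimal standalone sketch, with all counter values hypothetical (only the relationships come from the comments above):

  #include <cassert>
  #include <cstddef>

  int main() {
    // Hypothetical per-thread totals gathered during one evacuation.
    size_t cards_marked_dirty   = 120;  // cross-region refs into tracked old regions
    size_t cards_from_evac_fail = 30;   // refs into evacuation-failed regions
    size_t cards_to_young_gen   = 50;   // refs into survivors / next collection set

    // num_cards_pending(): cards left for concurrent refinement after the pause.
    size_t pending = cards_marked_dirty + cards_from_evac_fail;
    // num_cards_marked(): everything evacuation wrote to the card table.
    size_t marked = pending + cards_to_young_gen;

    // The MergePSS accounting derives the to-young-gen share by subtraction,
    // exactly as in "num_cards_marked() - num_cards_pending()" earlier.
    assert(marked - pending == cards_to_young_gen);
    return 0;
  }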
- size_t flush_stats(size_t* surviving_young_words, uint num_workers, BufferNodeList* buffer_log); + size_t flush_stats(size_t* surviving_young_words, uint num_workers); #if TASKQUEUE_STATS PartialArrayTaskStats* partial_array_task_stats(); @@ -249,9 +254,7 @@ public: class G1ParScanThreadStateSet : public StackObj { G1CollectedHeap* _g1h; G1CollectionSet* _collection_set; - G1RedirtyCardsQueueSet _rdcqs; G1ParScanThreadState** _states; - BufferNodeList* _rdc_buffers; size_t* _surviving_young_words_total; uint _num_workers; bool _flushed; @@ -264,9 +267,6 @@ class G1ParScanThreadStateSet : public StackObj { G1EvacFailureRegions* evac_failure_regions); ~G1ParScanThreadStateSet(); - G1RedirtyCardsQueueSet* rdcqs() { return &_rdcqs; } - BufferNodeList* rdc_buffers() { return _rdc_buffers; } - void flush_stats(); void record_unused_optional_region(G1HeapRegion* hr); #if TASKQUEUE_STATS diff --git a/src/hotspot/share/gc/g1/g1ParScanThreadState.inline.hpp b/src/hotspot/share/gc/g1/g1ParScanThreadState.inline.hpp index 148284e7ef7..ee5bc93290e 100644 --- a/src/hotspot/share/gc/g1/g1ParScanThreadState.inline.hpp +++ b/src/hotspot/share/gc/g1/g1ParScanThreadState.inline.hpp @@ -96,25 +96,24 @@ G1OopStarChunkedList* G1ParScanThreadState::oops_into_optional_region(const G1He return &_oops_into_optional_regions[hr->index_in_opt_cset()]; } -template bool G1ParScanThreadState::enqueue_if_new(T* p) { - size_t card_index = ct()->index_for(p); - // If the card hasn't been added to the buffer, do it. - if (_last_enqueued_card != card_index) { - _rdc_local_qset.enqueue(ct()->byte_for_index(card_index)); - _last_enqueued_card = card_index; +template bool G1ParScanThreadState::mark_if_new(T* p, bool into_new_survivor) { + G1CardTable::CardValue* card = ct()->byte_for(p); + G1CardTable::CardValue value = *card; + if (value == G1CardTable::clean_card_val()) { + *card = into_new_survivor ? G1CardTable::g1_to_cset_card : G1CardTable::g1_dirty_card; return true; } else { return false; } } -template void G1ParScanThreadState::enqueue_card_into_evac_fail_region(T* p, oop obj) { +template void G1ParScanThreadState::mark_card_into_evac_fail_region(T* p, oop obj) { assert(!G1HeapRegion::is_in_same_region(p, obj), "Should have filtered out cross-region references already."); assert(!_g1h->heap_region_containing(p)->is_survivor(), "Should have filtered out from-newly allocated survivor references already."); assert(_g1h->heap_region_containing(obj)->in_collection_set(), "Only for enqeueing reference into collection set region"); - if (enqueue_if_new(p)) { - _evac_failure_enqueued_cards++; + if (mark_if_new(p, false /* into_new_survivor */)) { // The reference is never into survivor regions. 
+ _num_cards_from_evac_failure++; } } @@ -137,18 +136,18 @@ template void G1ParScanThreadState::write_ref_field_post(T* p, oop obj if (dest_attr.is_in_cset()) { assert(obj->is_forwarded(), "evac-failed but not forwarded: " PTR_FORMAT, p2i(obj)); assert(obj->forwardee() == obj, "evac-failed but not self-forwarded: " PTR_FORMAT, p2i(obj)); - enqueue_card_into_evac_fail_region(p, obj); + mark_card_into_evac_fail_region(p, obj); return; } - enqueue_card_if_tracked(dest_attr, p, obj); + mark_card_if_tracked(dest_attr, p, obj); } -template void G1ParScanThreadState::enqueue_card_if_tracked(G1HeapRegionAttr region_attr, T* p, oop o) { +template void G1ParScanThreadState::mark_card_if_tracked(G1HeapRegionAttr region_attr, T* p, oop o) { assert(!G1HeapRegion::is_in_same_region(p, o), "Should have filtered out cross-region references already."); assert(!_g1h->heap_region_containing(p)->is_survivor(), "Should have filtered out from-newly allocated survivor references already."); // We relabel all regions that failed evacuation as old gen without remembered, // and so pre-filter them out in the caller. - assert(!_g1h->heap_region_containing(o)->in_collection_set(), "Should not try to enqueue reference into collection set region"); + assert(!_g1h->heap_region_containing(o)->in_collection_set(), "Should not try to mark reference into collection set region"); #ifdef ASSERT G1HeapRegion* const hr_obj = _g1h->heap_region_containing(o); @@ -161,7 +160,14 @@ template void G1ParScanThreadState::enqueue_card_if_tracked(G1HeapRegi if (!region_attr.remset_is_tracked()) { return; } - enqueue_if_new(p); + bool into_survivor = region_attr.is_new_survivor(); + if (mark_if_new(p, into_survivor)) { + if (into_survivor) { + _num_cards_marked_to_cset++; + } else { + _num_cards_marked_dirty++; + } + } } #endif // SHARE_GC_G1_G1PARSCANTHREADSTATE_INLINE_HPP diff --git a/src/hotspot/share/gc/g1/g1Policy.cpp b/src/hotspot/share/gc/g1/g1Policy.cpp index 9f872aa6ccd..754cc502031 100644 --- a/src/hotspot/share/gc/g1/g1Policy.cpp +++ b/src/hotspot/share/gc/g1/g1Policy.cpp @@ -67,8 +67,7 @@ G1Policy::G1Policy(STWGCTimer* gc_timer) : _reserve_regions(0), _young_gen_sizer(), _free_regions_at_end_of_collection(0), - _card_rs_length(0), - _pending_cards_at_gc_start(0), + _pending_cards_from_gc(0), _concurrent_start_to_mixed(), _collection_set(nullptr), _g1h(nullptr), @@ -553,12 +552,9 @@ G1GCPhaseTimes* G1Policy::phase_times() const { return _phase_times; } -void G1Policy::revise_young_list_target_length(size_t card_rs_length, size_t code_root_rs_length) { +void G1Policy::revise_young_list_target_length(size_t pending_cards, size_t card_rs_length, size_t code_root_rs_length) { guarantee(use_adaptive_young_list_length(), "should not call this otherwise" ); - size_t thread_buffer_cards = _analytics->predict_dirtied_cards_in_thread_buffers(); - G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set(); - size_t pending_cards = dcqs.num_cards() + thread_buffer_cards; update_young_length_bounds(pending_cards, card_rs_length, code_root_rs_length); } @@ -567,7 +563,7 @@ void G1Policy::record_full_collection_start() { // Release the future to-space so that it is available for compaction into. 
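[Editorial illustration] The g1ParScanThreadState.inline.hpp hunks above replace the old enqueue path with a best-effort "write only if still clean" card transition that uses two distinct mark values. A standalone sketch of that idea using plain bytes in place of the real card table; the values and constant names here are invented for illustration and are not the actual G1CardTable constants:

  #include <cstdint>
  #include <cstdio>

  // Illustrative card values; the real constants live in G1CardTable.
  enum : uint8_t { clean_card = 0xff, dirty_card = 0x01, to_cset_card = 0x02 };

  // Best-effort duplicate filter: only the first marker of a clean card "wins";
  // later writes to the same card see a non-clean value and do nothing.
  static bool mark_if_new(uint8_t* card, bool into_new_survivor) {
    if (*card != clean_card) {
      return false;                    // already marked, treat as duplicate
    }
    *card = into_new_survivor ? to_cset_card : dirty_card;
    return true;
  }

  int main() {
    uint8_t card_table[4] = { clean_card, clean_card, clean_card, clean_card };
    bool first  = mark_if_new(&card_table[1], false);  // marks the card dirty
    bool second = mark_if_new(&card_table[1], false);  // duplicate, filtered out
    printf("first=%d second=%d value=0x%02x\n", first, second, (unsigned)card_table[1]);
    return 0;
  }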
collector_state()->set_in_young_only_phase(false); collector_state()->set_in_full_gc(true); - _pending_cards_at_gc_start = 0; + _collection_set->abandon_all_candidates(); } void G1Policy::record_full_collection_end() { @@ -600,59 +596,70 @@ void G1Policy::record_full_collection_end() { record_pause(G1GCPauseType::FullGC, start_time_sec, end_sec); } -static void log_refinement_stats(const char* kind, const G1ConcurrentRefineStats& stats) { +static void log_refinement_stats(const G1ConcurrentRefineStats& stats) { log_debug(gc, refine, stats) - ("%s refinement: %.2fms, refined: %zu" - ", precleaned: %zu, dirtied: %zu", - kind, - stats.refinement_time().seconds() * MILLIUNITS, + ("Refinement: sweep: %.2fms, yield: %.2fms refined: %zu, dirtied: %zu", + TimeHelper::counter_to_millis(stats.sweep_duration()), + TimeHelper::counter_to_millis(stats.yield_during_sweep_duration()), stats.refined_cards(), - stats.precleaned_cards(), - stats.dirtied_cards()); + stats.cards_pending()); } -void G1Policy::record_concurrent_refinement_stats(size_t pending_cards, - size_t thread_buffer_cards) { - _pending_cards_at_gc_start = pending_cards; - _analytics->report_dirtied_cards_in_thread_buffers(thread_buffer_cards); - - // Collect per-thread stats, mostly from mutator activity. - G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set(); - G1ConcurrentRefineStats mut_stats = dcqs.concatenated_refinement_stats(); - - // Collect specialized concurrent refinement thread stats. - G1ConcurrentRefine* cr = _g1h->concurrent_refine(); - G1ConcurrentRefineStats cr_stats = cr->get_and_reset_refinement_stats(); - - G1ConcurrentRefineStats total_stats = mut_stats + cr_stats; - - log_refinement_stats("Mutator", mut_stats); - log_refinement_stats("Concurrent", cr_stats); - log_refinement_stats("Total", total_stats); +void G1Policy::record_refinement_stats(G1ConcurrentRefineStats* refine_stats) { + log_refinement_stats(*refine_stats); // Record the rate at which cards were refined. - // Don't update the rate if the current sample is empty or time is zero. - Tickspan refinement_time = total_stats.refinement_time(); - size_t refined_cards = total_stats.refined_cards(); - if ((refined_cards > 0) && (refinement_time > Tickspan())) { - double rate = refined_cards / (refinement_time.seconds() * MILLIUNITS); + // Don't update the rate if the current sample is empty or time is zero (which is + // the case during GC). + double refinement_time = TimeHelper::counter_to_millis(refine_stats->sweep_duration()); + size_t refined_cards = refine_stats->refined_cards(); + if ((refined_cards > 0) && (refinement_time > 0)) { + double rate = refined_cards / refinement_time; _analytics->report_concurrent_refine_rate_ms(rate); - log_debug(gc, refine, stats)("Concurrent refinement rate: %.2f cards/ms", rate); + log_debug(gc, refine, stats)("Concurrent refinement rate: %.2f cards/ms predicted: %.2f cards/ms", rate, _analytics->predict_concurrent_refine_rate_ms()); } +} +template +static T saturated_sub(T x, T y) { + return (x < y) ? 
T() : (x - y); +} + +void G1Policy::record_dirtying_stats(double last_mutator_start_dirty_ms, + double last_mutator_end_dirty_ms, + size_t pending_cards, + double yield_duration_ms, + size_t next_pending_cards_from_gc, + size_t next_to_collection_set_cards) { + assert(SafepointSynchronize::is_at_safepoint() || G1ReviseYoungLength_lock->is_locked(), + "must be (at safepoint %s locked %s)", + BOOL_TO_STR(SafepointSynchronize::is_at_safepoint()), BOOL_TO_STR(G1ReviseYoungLength_lock->is_locked())); // Record mutator's card logging rate. - double mut_start_time = _analytics->prev_collection_pause_end_ms(); - double mut_end_time = cur_pause_start_sec() * MILLIUNITS; - double mut_time = mut_end_time - mut_start_time; + // Unlike above for conc-refine rate, here we should not require a // non-empty sample, since an application could go some time with only // young-gen or filtered out writes. But we'll ignore unusually short // sample periods, as they may just pollute the predictions. - if (mut_time > 1.0) { // Require > 1ms sample time. - double dirtied_rate = total_stats.dirtied_cards() / mut_time; + double const mutator_dirty_time_ms = (last_mutator_end_dirty_ms - last_mutator_start_dirty_ms) - yield_duration_ms; + assert(mutator_dirty_time_ms >= 0.0, + "must be (start: %.2f end: %.2f yield: %.2f)", + last_mutator_start_dirty_ms, last_mutator_end_dirty_ms, yield_duration_ms); + + if (mutator_dirty_time_ms > 1.0) { // Require > 1ms sample time. + // The subtractive term is pending_cards_from_gc() which includes both dirtied and dirty-as-young cards, + // which can be larger than what is actually considered as "pending" (dirty cards only). + size_t dirtied_cards = saturated_sub(pending_cards, pending_cards_from_gc()); + double dirtied_rate = dirtied_cards / mutator_dirty_time_ms; _analytics->report_dirtied_cards_rate_ms(dirtied_rate); - log_debug(gc, refine, stats)("Generate dirty cards rate: %.2f cards/ms", dirtied_rate); + log_debug(gc, refine, stats)("Generate dirty cards rate: %.2f cards/ms dirtying time %.2f (start %.2f end %.2f yield %.2f) dirtied %zu (pending %zu during_gc %zu)", + dirtied_rate, + mutator_dirty_time_ms, + last_mutator_start_dirty_ms, last_mutator_end_dirty_ms, yield_duration_ms, + dirtied_cards, pending_cards, pending_cards_from_gc()); } + + _pending_cards_from_gc = next_pending_cards_from_gc; + _to_collection_set_cards = next_to_collection_set_cards; } bool G1Policy::should_retain_evac_failed_region(uint index) const { @@ -761,27 +768,27 @@ bool G1Policy::concurrent_operation_is_full_mark(const char* msg) { ((_g1h->gc_cause() != GCCause::_g1_humongous_allocation) || need_to_start_conc_mark(msg)); } -double G1Policy::logged_cards_processing_time() const { +double G1Policy::pending_cards_processing_time() const { double all_cards_processing_time = average_time_ms(G1GCPhaseTimes::ScanHR) + average_time_ms(G1GCPhaseTimes::OptScanHR); - size_t logged_dirty_cards = phase_times()->sum_thread_work_items(G1GCPhaseTimes::MergeLB, G1GCPhaseTimes::MergeLBDirtyCards); + size_t pending_cards = phase_times()->sum_thread_work_items(G1GCPhaseTimes::ScanHR, G1GCPhaseTimes::ScanHRPendingCards) + + phase_times()->sum_thread_work_items(G1GCPhaseTimes::OptScanHR, G1GCPhaseTimes::ScanHRPendingCards); size_t scan_heap_roots_cards = phase_times()->sum_thread_work_items(G1GCPhaseTimes::ScanHR, G1GCPhaseTimes::ScanHRScannedCards) + phase_times()->sum_thread_work_items(G1GCPhaseTimes::OptScanHR, G1GCPhaseTimes::ScanHRScannedCards); - double merge_logged_cards_time = 
average_time_ms(G1GCPhaseTimes::MergeLB) + - phase_times()->cur_distribute_log_buffers_time_ms(); + double merge_pending_cards_time = phase_times()->cur_merge_refinement_table_time(); - // Approximate the time spent processing cards from log buffers by scaling - // the total processing time by the ratio of logged cards to total cards + // Approximate the time spent processing cards from pending cards by scaling + // the total processing time by the ratio of pending cards to total cards // processed. There might be duplicate cards in different log buffers, // leading to an overestimate. That effect should be relatively small // unless there are few cards to process, because cards in buffers are // dirtied to limit duplication. Also need to avoid scaling when both // counts are zero, which happens especially during early GCs. So ascribe - // all of the time to the logged cards unless there are more total cards. - if (logged_dirty_cards >= scan_heap_roots_cards) { - return all_cards_processing_time + merge_logged_cards_time; + // all of the time to the pending cards unless there are more total cards. + if (pending_cards >= scan_heap_roots_cards) { + return all_cards_processing_time + merge_pending_cards_time; } - return (all_cards_processing_time * logged_dirty_cards / scan_heap_roots_cards) + merge_logged_cards_time; + return (all_cards_processing_time * pending_cards / scan_heap_roots_cards) + merge_pending_cards_time; } // Anything below that is considered to be zero @@ -815,6 +822,22 @@ void G1Policy::record_young_collection_end(bool concurrent_operation_is_full_mar // We make the assumption that these are rare. bool update_stats = !allocation_failure; + size_t const total_cards_scanned = p->sum_thread_work_items(G1GCPhaseTimes::ScanHR, G1GCPhaseTimes::ScanHRScannedCards) + + p->sum_thread_work_items(G1GCPhaseTimes::OptScanHR, G1GCPhaseTimes::ScanHRScannedCards); + + // Number of scanned cards with "Dirty" value (and nothing else). + size_t const pending_cards_from_refinement_table = p->sum_thread_work_items(G1GCPhaseTimes::ScanHR, G1GCPhaseTimes::ScanHRPendingCards) + + p->sum_thread_work_items(G1GCPhaseTimes::OptScanHR, G1GCPhaseTimes::ScanHRPendingCards); + // Number of cards actually merged in the Merge RS phase. MergeRSCards below includes the cards from the Eager Reclaim phase. + size_t const merged_cards_from_card_rs = p->sum_thread_work_items(G1GCPhaseTimes::MergeRS, G1GCPhaseTimes::MergeRSFromRemSetCards) + + p->sum_thread_work_items(G1GCPhaseTimes::OptMergeRS, G1GCPhaseTimes::MergeRSFromRemSetCards); + // Number of cards attempted to merge in the Merge RS phase. + size_t const total_cards_from_rs = p->sum_thread_work_items(G1GCPhaseTimes::MergeRS, G1GCPhaseTimes::MergeRSTotalCards) + + p->sum_thread_work_items(G1GCPhaseTimes::OptMergeRS, G1GCPhaseTimes::MergeRSTotalCards); + + // Cards marked as being to collection set. May be inaccurate due to races. + size_t const total_non_young_rs_cards = MIN2(pending_cards_from_refinement_table + merged_cards_from_card_rs, total_cards_scanned); + if (update_stats) { // We maintain the invariant that all objects allocated by mutator // threads will be allocated out of eden regions. 
So, we can use @@ -827,6 +850,98 @@ void G1Policy::record_young_collection_end(bool concurrent_operation_is_full_mar uint regions_allocated = _collection_set->eden_region_length(); double alloc_rate_ms = (double) regions_allocated / app_time_ms; _analytics->report_alloc_rate_ms(alloc_rate_ms); + + double merge_refinement_table_time = p->cur_merge_refinement_table_time(); + if (merge_refinement_table_time != 0.0) { + _analytics->report_merge_refinement_table_time_ms(merge_refinement_table_time); + } + if (merged_cards_from_card_rs >= G1NumCardsCostSampleThreshold) { + double avg_time_merge_cards = average_time_ms(G1GCPhaseTimes::MergeER) + + average_time_ms(G1GCPhaseTimes::MergeRS) + + average_time_ms(G1GCPhaseTimes::OptMergeRS); + _analytics->report_cost_per_card_merge_ms(avg_time_merge_cards / merged_cards_from_card_rs, is_young_only_pause); + log_debug(gc, ergo, cset)("cost per card merge (young %s): avg time %.2f merged cards %zu cost(1m) %.2f pred_cost(1m-yo) %.2f pred_cost(1m-old) %.2f", + BOOL_TO_STR(is_young_only_pause), + avg_time_merge_cards, merged_cards_from_card_rs, 1e6 * avg_time_merge_cards / merged_cards_from_card_rs, _analytics->predict_card_merge_time_ms(1e6, true), _analytics->predict_card_merge_time_ms(1e6, false)); + } else { + log_debug(gc, ergo, cset)("cost per card merge (young: %s): skipped, total cards %zu", BOOL_TO_STR(is_young_only_pause), total_non_young_rs_cards); + } + + // Update prediction for card scan + + if (total_cards_scanned >= G1NumCardsCostSampleThreshold) { + double avg_card_scan_time = average_time_ms(G1GCPhaseTimes::ScanHR) + + average_time_ms(G1GCPhaseTimes::OptScanHR); + + _analytics->report_cost_per_card_scan_ms(avg_card_scan_time / total_cards_scanned, is_young_only_pause); + + log_debug(gc, ergo, cset)("cost per card scan (young: %s): avg time %.2f total cards %zu cost(1m) %.2f pred_cost(1m-yo) %.2f pred_cost(1m-old) %.2f", + BOOL_TO_STR(is_young_only_pause), + avg_card_scan_time, total_cards_scanned, 1e6 * avg_card_scan_time / total_cards_scanned, _analytics->predict_card_scan_time_ms(1e6, true), _analytics->predict_card_scan_time_ms(1e6, false)); + } else { + log_debug(gc, ergo, cset)("cost per card scan (young: %s): skipped, total cards %zu", BOOL_TO_STR(is_young_only_pause), total_cards_scanned); + } + + // Update prediction for the ratio between cards actually merged onto the card + // table from the remembered sets and the total number of cards attempted to + // merge. 
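[Editorial illustration] The cost sampling in the hunk above follows the usual G1 pattern: only take a per-card cost sample when enough cards were processed, then let the analytics predictor extrapolate to arbitrary card counts (the debug log lines scale by 1e6 to show a per-million-cards cost). A sketch of that arithmetic with made-up numbers, independent of G1Analytics; the threshold value is a stand-in for G1NumCardsCostSampleThreshold:

  #include <cstddef>
  #include <cstdio>

  int main() {
    // Hypothetical pause measurements.
    double merge_phase_time_ms = 2.4;     // MergeER + MergeRS + OptMergeRS
    size_t merged_cards        = 48000;
    size_t sample_threshold    = 1000;    // stand-in for G1NumCardsCostSampleThreshold

    if (merged_cards >= sample_threshold) {
      double cost_per_card_ms = merge_phase_time_ms / merged_cards;
      // What the "cost(1m)" value in the debug log corresponds to.
      printf("cost per 1m cards: %.2f ms\n", 1e6 * cost_per_card_ms);
      // A predictor seeded with such samples estimates future merge times as
      // predicted_cards * cost_per_card_ms (modulo decaying averages).
      size_t predicted_cards = 120000;
      printf("predicted merge time: %.2f ms\n", predicted_cards * cost_per_card_ms);
    }
    return 0;
  }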
+ double merge_to_scan_ratio = 1.0; + if (total_cards_from_rs > 0) { + merge_to_scan_ratio = (double)merged_cards_from_card_rs / total_cards_from_rs; + } + _analytics->report_card_merge_to_scan_ratio(merge_to_scan_ratio, is_young_only_pause); + + // Update prediction for code root scan + size_t const total_code_roots_scanned = p->sum_thread_work_items(G1GCPhaseTimes::CodeRoots, G1GCPhaseTimes::CodeRootsScannedNMethods) + + p->sum_thread_work_items(G1GCPhaseTimes::OptCodeRoots, G1GCPhaseTimes::CodeRootsScannedNMethods); + + if (total_code_roots_scanned >= G1NumCodeRootsCostSampleThreshold) { + double avg_time_code_root_scan = average_time_ms(G1GCPhaseTimes::CodeRoots) + + average_time_ms(G1GCPhaseTimes::OptCodeRoots); + + _analytics->report_cost_per_code_root_scan_ms(avg_time_code_root_scan / total_code_roots_scanned, is_young_only_pause); + } + + // Update prediction for copy cost per byte + size_t copied_bytes = p->sum_thread_work_items(G1GCPhaseTimes::MergePSS, G1GCPhaseTimes::MergePSSCopiedBytes); + + if (copied_bytes > 0) { + double avg_copy_time = average_time_ms(G1GCPhaseTimes::ObjCopy) + average_time_ms(G1GCPhaseTimes::OptObjCopy); + double cost_per_byte_ms = avg_copy_time / copied_bytes; + _analytics->report_cost_per_byte_ms(cost_per_byte_ms, is_young_only_pause); + } + + if (_collection_set->young_region_length() > 0) { + _analytics->report_young_other_cost_per_region_ms(young_other_time_ms() / + _collection_set->young_region_length()); + } + + if (_collection_set->initial_old_region_length() > 0) { + _analytics->report_non_young_other_cost_per_region_ms(non_young_other_time_ms() / + _collection_set->initial_old_region_length()); + } + + _analytics->report_constant_other_time_ms(constant_other_time_ms(pause_time_ms)); + + _analytics->report_pending_cards(pending_cards_from_refinement_table, is_young_only_pause); + + _analytics->report_card_rs_length(total_cards_scanned - total_non_young_rs_cards, is_young_only_pause); + _analytics->report_code_root_rs_length((double)total_code_roots_scanned, is_young_only_pause); + } + + { + double mutator_end_time = cur_pause_start_sec() * MILLIUNITS; + G1ConcurrentRefineStats* stats = _g1h->concurrent_refine()->sweep_state().stats(); + // Record any available refinement statistics. + record_refinement_stats(stats); + + double yield_duration_ms = TimeHelper::counter_to_millis(_g1h->yield_duration_in_refinement_epoch()); + record_dirtying_stats(TimeHelper::counter_to_millis(_g1h->last_refinement_epoch_start()), + mutator_end_time, + pending_cards_from_refinement_table, + yield_duration_ms, + phase_times()->sum_thread_work_items(G1GCPhaseTimes::MergePSS, G1GCPhaseTimes::MergePSSPendingCards), + phase_times()->sum_thread_work_items(G1GCPhaseTimes::MergePSS, G1GCPhaseTimes::MergePSSToYoungGenCards)); } record_pause(this_pause, start_time_sec, end_time_sec, allocation_failure); @@ -857,82 +972,6 @@ void G1Policy::record_young_collection_end(bool concurrent_operation_is_full_mar _eden_surv_rate_group->start_adding_regions(); - if (update_stats) { - // Update prediction for card merge. - size_t const merged_cards_from_log_buffers = p->sum_thread_work_items(G1GCPhaseTimes::MergeLB, G1GCPhaseTimes::MergeLBDirtyCards); - // MergeRSCards includes the cards from the Eager Reclaim phase. 
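[Editorial illustration] The pause-end block above feeds record_dirtying_stats() (defined earlier in this file) with the refinement-epoch start and end times, the yield time and the card counts; the rate it derives is the mutator-generated cards divided by the mutator dirtying time, with a saturating subtraction so that GC-generated cards cannot drive the count negative. A small worked sketch with hypothetical values; the helper mirrors the saturated_sub template added above:

  #include <cstddef>
  #include <cstdio>

  template <typename T>
  static T saturated_sub(T x, T y) {        // same idea as the helper added above
    return (x < y) ? T() : (x - y);
  }

  int main() {
    double epoch_start_ms        = 1000.0;  // last_mutator_start_dirty_ms
    double epoch_end_ms          = 1500.0;  // last_mutator_end_dirty_ms (pause start)
    double yield_ms              = 20.0;    // time refinement yielded to safepoints
    size_t pending_cards         = 90000;   // dirty cards found on the card table
    size_t pending_cards_from_gc = 15000;   // cards the previous GC itself dirtied

    double mutator_dirty_time_ms = (epoch_end_ms - epoch_start_ms) - yield_ms;
    if (mutator_dirty_time_ms > 1.0) {      // ignore unusually short samples
      size_t dirtied_cards = saturated_sub(pending_cards, pending_cards_from_gc);
      printf("dirtying rate: %.2f cards/ms\n", dirtied_cards / mutator_dirty_time_ms);
    }
    return 0;
  }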
- size_t const merged_cards_from_rs = p->sum_thread_work_items(G1GCPhaseTimes::MergeRS, G1GCPhaseTimes::MergeRSCards) + - p->sum_thread_work_items(G1GCPhaseTimes::OptMergeRS, G1GCPhaseTimes::MergeRSCards); - size_t const total_cards_merged = merged_cards_from_rs + - merged_cards_from_log_buffers; - - if (total_cards_merged >= G1NumCardsCostSampleThreshold) { - double avg_time_merge_cards = average_time_ms(G1GCPhaseTimes::MergeER) + - average_time_ms(G1GCPhaseTimes::MergeRS) + - average_time_ms(G1GCPhaseTimes::MergeLB) + - p->cur_distribute_log_buffers_time_ms() + - average_time_ms(G1GCPhaseTimes::OptMergeRS); - _analytics->report_cost_per_card_merge_ms(avg_time_merge_cards / total_cards_merged, is_young_only_pause); - } - - // Update prediction for card scan - size_t const total_cards_scanned = p->sum_thread_work_items(G1GCPhaseTimes::ScanHR, G1GCPhaseTimes::ScanHRScannedCards) + - p->sum_thread_work_items(G1GCPhaseTimes::OptScanHR, G1GCPhaseTimes::ScanHRScannedCards); - - if (total_cards_scanned >= G1NumCardsCostSampleThreshold) { - double avg_time_dirty_card_scan = average_time_ms(G1GCPhaseTimes::ScanHR) + - average_time_ms(G1GCPhaseTimes::OptScanHR); - - _analytics->report_cost_per_card_scan_ms(avg_time_dirty_card_scan / total_cards_scanned, is_young_only_pause); - } - - // Update prediction for the ratio between cards from the remembered - // sets and actually scanned cards from the remembered sets. - // Due to duplicates in the log buffers, the number of scanned cards - // can be smaller than the cards in the log buffers. - const size_t scanned_cards_from_rs = (total_cards_scanned > merged_cards_from_log_buffers) ? total_cards_scanned - merged_cards_from_log_buffers : 0; - double scan_to_merge_ratio = 0.0; - if (merged_cards_from_rs > 0) { - scan_to_merge_ratio = (double)scanned_cards_from_rs / merged_cards_from_rs; - } - _analytics->report_card_scan_to_merge_ratio(scan_to_merge_ratio, is_young_only_pause); - - // Update prediction for code root scan - size_t const total_code_roots_scanned = p->sum_thread_work_items(G1GCPhaseTimes::CodeRoots, G1GCPhaseTimes::CodeRootsScannedNMethods) + - p->sum_thread_work_items(G1GCPhaseTimes::OptCodeRoots, G1GCPhaseTimes::CodeRootsScannedNMethods); - - if (total_code_roots_scanned >= G1NumCodeRootsCostSampleThreshold) { - double avg_time_code_root_scan = average_time_ms(G1GCPhaseTimes::CodeRoots) + - average_time_ms(G1GCPhaseTimes::OptCodeRoots); - - _analytics->report_cost_per_code_root_scan_ms(avg_time_code_root_scan / total_code_roots_scanned, is_young_only_pause); - } - - // Update prediction for copy cost per byte - size_t copied_bytes = p->sum_thread_work_items(G1GCPhaseTimes::MergePSS, G1GCPhaseTimes::MergePSSCopiedBytes); - - if (copied_bytes > 0) { - double cost_per_byte_ms = (average_time_ms(G1GCPhaseTimes::ObjCopy) + average_time_ms(G1GCPhaseTimes::OptObjCopy)) / copied_bytes; - _analytics->report_cost_per_byte_ms(cost_per_byte_ms, is_young_only_pause); - } - - if (_collection_set->young_region_length() > 0) { - _analytics->report_young_other_cost_per_region_ms(young_other_time_ms() / - _collection_set->young_region_length()); - } - - if (_collection_set->initial_old_region_length() > 0) { - _analytics->report_non_young_other_cost_per_region_ms(non_young_other_time_ms() / - _collection_set->initial_old_region_length()); - } - - _analytics->report_constant_other_time_ms(constant_other_time_ms(pause_time_ms)); - - _analytics->report_pending_cards((double)pending_cards_at_gc_start(), is_young_only_pause); - 
_analytics->report_card_rs_length((double)_card_rs_length, is_young_only_pause); - _analytics->report_code_root_rs_length((double)total_code_roots_scanned, is_young_only_pause); - } - assert(!(G1GCPauseTypeHelper::is_concurrent_start_pause(this_pause) && collector_state()->mark_or_rebuild_in_progress()), "If the last pause has been concurrent start, we should not have been in the marking window"); if (G1GCPauseTypeHelper::is_concurrent_start_pause(this_pause)) { @@ -963,29 +1002,26 @@ void G1Policy::record_young_collection_end(bool concurrent_operation_is_full_mar } // Note that _mmu_tracker->max_gc_time() returns the time in seconds. - double logged_cards_time_goal_ms = _mmu_tracker->max_gc_time() * MILLIUNITS * G1RSetUpdatingPauseTimePercent / 100.0; + double pending_cards_time_goal_ms = _mmu_tracker->max_gc_time() * MILLIUNITS * G1RSetUpdatingPauseTimePercent / 100.0; - double const logged_cards_time_ms = logged_cards_processing_time(); - size_t logged_cards = - phase_times()->sum_thread_work_items(G1GCPhaseTimes::MergeLB, - G1GCPhaseTimes::MergeLBDirtyCards); - bool exceeded_goal = logged_cards_time_goal_ms < logged_cards_time_ms; - size_t predicted_thread_buffer_cards = _analytics->predict_dirtied_cards_in_thread_buffers(); + double const pending_cards_time_ms = pending_cards_processing_time(); + size_t pending_cards = phase_times()->sum_thread_work_items(G1GCPhaseTimes::ScanHR, G1GCPhaseTimes::ScanHRPendingCards) + + phase_times()->sum_thread_work_items(G1GCPhaseTimes::OptScanHR, G1GCPhaseTimes::ScanHRPendingCards); + + bool exceeded_goal = pending_cards_time_goal_ms < pending_cards_time_ms; G1ConcurrentRefine* cr = _g1h->concurrent_refine(); log_debug(gc, ergo, refine) - ("GC refinement: goal: %zu + %zu / %1.2fms, actual: %zu / %1.2fms, %s", + ("GC refinement: goal: %zu / %1.2fms, actual: %zu / %1.2fms, %s", cr->pending_cards_target(), - predicted_thread_buffer_cards, - logged_cards_time_goal_ms, - logged_cards, - logged_cards_time_ms, + pending_cards_time_goal_ms, + pending_cards, + pending_cards_time_ms, (exceeded_goal ? " (exceeded goal)" : "")); - cr->adjust_after_gc(logged_cards_time_ms, - logged_cards, - predicted_thread_buffer_cards, - logged_cards_time_goal_ms); + cr->adjust_after_gc(pending_cards_time_ms, + pending_cards, + pending_cards_time_goal_ms); } G1IHOPControl* G1Policy::create_ihop_control(const G1OldGenAllocationTracker* old_gen_alloc_tracker, @@ -1057,34 +1093,27 @@ double G1Policy::predict_base_time_ms(size_t pending_cards, size_t code_root_rs_length) const { bool in_young_only_phase = collector_state()->in_young_only_phase(); - size_t unique_cards_from_rs = _analytics->predict_scan_card_num(card_rs_length, in_young_only_phase); - // Assume that all cards from the log buffers will be scanned, i.e. there are no - // duplicates in that set. - size_t effective_scanned_cards = unique_cards_from_rs + pending_cards; + // Cards from the refinement table and the cards from the young gen remset are + // unique to each other as they are located on the card table. 
+ size_t effective_scanned_cards = card_rs_length + pending_cards; - double card_merge_time = _analytics->predict_card_merge_time_ms(pending_cards + card_rs_length, in_young_only_phase); + double refinement_table_merge_time = _analytics->predict_merge_refinement_table_time_ms(); double card_scan_time = _analytics->predict_card_scan_time_ms(effective_scanned_cards, in_young_only_phase); double code_root_scan_time = _analytics->predict_code_root_scan_time_ms(code_root_rs_length, in_young_only_phase); double constant_other_time = _analytics->predict_constant_other_time_ms(); double survivor_evac_time = predict_survivor_regions_evac_time(); - double total_time = card_merge_time + card_scan_time + code_root_scan_time + constant_other_time + survivor_evac_time; + double total_time = refinement_table_merge_time + card_scan_time + code_root_scan_time + constant_other_time + survivor_evac_time; log_trace(gc, ergo, heap)("Predicted base time: total %f lb_cards %zu card_rs_length %zu effective_scanned_cards %zu " - "card_merge_time %f card_scan_time %f code_root_rs_length %zu code_root_scan_time %f " + "refinement_table_merge_time %f card_scan_time %f code_root_rs_length %zu code_root_scan_time %f " "constant_other_time %f survivor_evac_time %f", total_time, pending_cards, card_rs_length, effective_scanned_cards, - card_merge_time, card_scan_time, code_root_rs_length, code_root_scan_time, + refinement_table_merge_time, card_scan_time, code_root_rs_length, code_root_scan_time, constant_other_time, survivor_evac_time); return total_time; } -double G1Policy::predict_base_time_ms(size_t pending_cards) const { - bool for_young_only_phase = collector_state()->in_young_only_phase(); - size_t card_rs_length = _analytics->predict_card_rs_length(for_young_only_phase); - return predict_base_time_ms(pending_cards, card_rs_length); -} - double G1Policy::predict_base_time_ms(size_t pending_cards, size_t card_rs_length) const { bool for_young_only_phase = collector_state()->in_young_only_phase(); size_t code_root_rs_length = _analytics->predict_code_root_rs_length(for_young_only_phase); @@ -1428,6 +1457,64 @@ size_t G1Policy::allowed_waste_in_collection_set() const { return G1HeapWastePercent * _g1h->capacity() / 100; } +bool G1Policy::try_get_available_bytes_estimate(size_t& available_bytes) const { + // Getting used young bytes requires holding Heap_lock. But we can't use + // normal lock and block until available. Blocking on the lock could + // deadlock with a GC VMOp that is holding the lock and requesting a + // safepoint. Instead try to lock, and return the result of that attempt, + // and the estimate if successful. + if (Heap_lock->try_lock()) { + size_t used_bytes = estimate_used_young_bytes_locked(); + Heap_lock->unlock(); + + size_t young_bytes = young_list_target_length() * G1HeapRegion::GrainBytes; + available_bytes = young_bytes - MIN2(young_bytes, used_bytes); + return true; + } else { + available_bytes = 0; + return false; + } +} + +double G1Policy::predict_time_to_next_gc_ms(size_t available_bytes) const { + double alloc_region_rate = _analytics->predict_alloc_rate_ms(); + double alloc_bytes_rate = alloc_region_rate * G1HeapRegion::GrainBytes; + if (alloc_bytes_rate == 0.0) { + // A zero rate indicates we don't yet have data to use for predictions. + // Since we don't have any idea how long until the next GC, use a time of + // zero. 
+ return 0.0; + } else { + // If the heap size is large and the allocation rate is small, we can get + // a predicted time until next GC that is so large it can cause problems + // (such as overflow) in other calculations. Limit the prediction to one + // hour, which is still large in this context. + const double one_hour_ms = 60.0 * 60.0 * MILLIUNITS; + double raw_time_ms = available_bytes / alloc_bytes_rate; + return MIN2(raw_time_ms, one_hour_ms); + } +} + +uint64_t G1Policy::adjust_wait_time_ms(double wait_time_ms, uint64_t min_time_ms) { + return MAX2(static_cast(sqrt(wait_time_ms) * 4.0), min_time_ms); +} + +double G1Policy::last_mutator_dirty_start_time_ms() { + return TimeHelper::counter_to_millis(_g1h->last_refinement_epoch_start()); +} + +size_t G1Policy::current_pending_cards() { + double now = os::elapsedTime() * MILLIUNITS; + return _pending_cards_from_gc + _analytics->predict_dirtied_cards_rate_ms() * (now - last_mutator_dirty_start_time_ms()); +} + +size_t G1Policy::current_to_collection_set_cards() { + // The incremental part is covered by the dirtied_cards_rate, i.e. pending cards + // cover both to collection set cards and other interesting cards because we do not + // know which is which until we look. + return _to_collection_set_cards; +} + uint G1Policy::min_retained_old_cset_length() const { // Guarantee some progress with retained regions regardless of available time by // taking at least one region. diff --git a/src/hotspot/share/gc/g1/g1Policy.hpp b/src/hotspot/share/gc/g1/g1Policy.hpp index e9f7529e509..01bad97ab84 100644 --- a/src/hotspot/share/gc/g1/g1Policy.hpp +++ b/src/hotspot/share/gc/g1/g1Policy.hpp @@ -48,6 +48,7 @@ class G1HeapRegion; class G1CollectionSet; class G1CollectionSetCandidates; class G1CollectionSetChooser; +class G1ConcurrentRefineStats; class G1IHOPControl; class G1Analytics; class G1SurvivorRegions; @@ -101,9 +102,18 @@ class G1Policy: public CHeapObj { uint _free_regions_at_end_of_collection; - size_t _card_rs_length; - - size_t _pending_cards_at_gc_start; + // Tracks the number of cards marked as dirty (only) during garbage collection + // (evacuation) on the card table. + // This is needed to properly account for those cards in the heuristics to start + // refinement at the correct time which needs to know how many cards are currently + // approximately on the card table. + // After the first completed refinement sweep of the refinement table between two + // garbage collections this value is reset to zero as that refinement processed all + // those cards. + size_t _pending_cards_from_gc; + // Tracks the approximate number of cards found as to-collection-set by either the + // garbage collection or the most recent refinement sweep. 
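[Editorial illustration] Two of the helpers added to g1Policy.cpp above are pure arithmetic and easy to check in isolation: the wait-time damping (square root of the predicted time until the next GC, scaled by 4 and floored at a minimum) and the running pending-card estimate (cards left over from the last GC plus the predicted dirtying rate times the elapsed mutator time). A standalone sketch with invented inputs; only the formulas are taken from the patch:

  #include <algorithm>
  #include <cmath>
  #include <cstddef>
  #include <cstdint>
  #include <cstdio>

  // Same shape as G1Policy::adjust_wait_time_ms(): grow wait times sub-linearly
  // with the predicted distance to the next GC, but never below min_time_ms.
  static uint64_t adjust_wait_time_ms(double wait_time_ms, uint64_t min_time_ms) {
    return std::max(static_cast<uint64_t>(std::sqrt(wait_time_ms) * 4.0), min_time_ms);
  }

  int main() {
    // Hypothetical inputs.
    double time_to_next_gc_ms = 40000.0;    // predicted from the allocation rate
    uint64_t min_wait_ms      = 10;
    printf("wait: %llu ms\n",
           (unsigned long long)adjust_wait_time_ms(time_to_next_gc_ms, min_wait_ms));

    // Running pending-card estimate, as in current_pending_cards().
    size_t pending_cards_from_gc = 15000;
    double dirtied_rate_per_ms   = 150.0;   // predicted mutator dirtying rate
    double elapsed_mutator_ms    = 800.0;   // since the refinement epoch started
    double estimate = pending_cards_from_gc + dirtied_rate_per_ms * elapsed_mutator_ms;
    printf("estimated pending cards: %.0f\n", estimate);
    return 0;
  }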
+ size_t _to_collection_set_cards; G1ConcurrentStartToMixedTimeTracker _concurrent_start_to_mixed; @@ -111,7 +121,7 @@ class G1Policy: public CHeapObj { return collector_state()->in_young_only_phase() && !collector_state()->mark_or_rebuild_in_progress(); } - double logged_cards_processing_time() const; + double pending_cards_processing_time() const; public: const G1Predictions& predictor() const { return _predictor; } const G1Analytics* analytics() const { return const_cast(_analytics); } @@ -129,16 +139,10 @@ public: hr->install_surv_rate_group(_survivor_surv_rate_group); } - void record_card_rs_length(size_t num_cards) { - _card_rs_length = num_cards; - } - double cur_pause_start_sec() const { return _cur_pause_start_sec; } - double predict_base_time_ms(size_t pending_cards) const; - double predict_base_time_ms(size_t pending_cards, size_t card_rs_length) const; // Base time contains handling remembered sets and constant other time of the @@ -239,7 +243,13 @@ private: public: size_t predict_bytes_to_copy(G1HeapRegion* hr) const; - size_t pending_cards_at_gc_start() const { return _pending_cards_at_gc_start; } + + double last_mutator_dirty_start_time_ms(); + size_t pending_cards_from_gc() const { return _pending_cards_from_gc; } + + size_t current_pending_cards(); + + size_t current_to_collection_set_cards(); // GC efficiency for collecting the region based on the time estimate for // merging and scanning incoming references. @@ -286,7 +296,7 @@ public: // Check the current value of the young list RSet length and // compare it against the last prediction. If the current value is // higher, recalculate the young list target length prediction. - void revise_young_list_target_length(size_t card_rs_length, size_t code_root_rs_length); + void revise_young_list_target_length(size_t pending_cards, size_t card_rs_length, size_t code_root_rs_length); // This should be called after the heap is resized. void record_new_heap_size(uint new_number_of_regions); @@ -325,7 +335,6 @@ public: // Amount of allowed waste in bytes in the collection set. size_t allowed_waste_in_collection_set() const; - private: // Predict the number of bytes of surviving objects from survivor and old @@ -359,17 +368,39 @@ public: bool use_adaptive_young_list_length() const; + // Try to get an estimate of the currently available bytes in the young gen. This + // operation considers itself low-priority: if other threads need the resources + // required to get the information, return false to indicate that the caller + // should retry "soon". + bool try_get_available_bytes_estimate(size_t& bytes) const; + // Estimate time until next GC, based on remaining bytes available for + // allocation and the allocation rate. + double predict_time_to_next_gc_ms(size_t available_bytes) const; + + // Adjust wait times to make them less frequent the longer the next GC is away. + // But don't increase the wait time too rapidly, further bound it by min_time_ms. + // This reduces the number of thread wakeups that just immediately + // go back to waiting, while still being responsive to behavior changes. + uint64_t adjust_wait_time_ms(double wait_time_ms, uint64_t min_time_ms); + +private: // Return an estimate of the number of bytes used in young gen. // precondition: holding Heap_lock size_t estimate_used_young_bytes_locked() const; +public: + void transfer_survivors_to_cset(const G1SurvivorRegions* survivors); - // Record and log stats and pending cards before not-full collection. 
- // thread_buffer_cards is the number of cards that were in per-thread - // buffers. pending_cards includes thread_buffer_cards. - void record_concurrent_refinement_stats(size_t pending_cards, - size_t thread_buffer_cards); + // Record and log stats and pending cards to update predictors. + void record_refinement_stats(G1ConcurrentRefineStats* stats); + + void record_dirtying_stats(double last_mutator_start_dirty_ms, + double last_mutator_end_dirty_ms, + size_t pending_cards, + double yield_duration, + size_t next_pending_cards_from_gc, + size_t next_to_collection_set_cards); bool should_retain_evac_failed_region(G1HeapRegion* r) const { return should_retain_evac_failed_region(r->hrm_index()); diff --git a/src/hotspot/share/gc/g1/g1RedirtyCardsQueue.cpp b/src/hotspot/share/gc/g1/g1RedirtyCardsQueue.cpp deleted file mode 100644 index 45e262c440a..00000000000 --- a/src/hotspot/share/gc/g1/g1RedirtyCardsQueue.cpp +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) 2019, 2025, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. 
- * - */ - -#include "gc/g1/g1RedirtyCardsQueue.hpp" -#include "gc/shared/bufferNode.hpp" -#include "runtime/atomicAccess.hpp" -#include "utilities/debug.hpp" -#include "utilities/macros.hpp" - -// G1RedirtyCardsLocalQueueSet - -G1RedirtyCardsLocalQueueSet::G1RedirtyCardsLocalQueueSet(G1RedirtyCardsQueueSet* shared_qset) : - PtrQueueSet(shared_qset->allocator()), - _shared_qset(shared_qset), - _buffers(), - _queue(this) -{} - -#ifdef ASSERT -G1RedirtyCardsLocalQueueSet::~G1RedirtyCardsLocalQueueSet() { - assert(_buffers._head == nullptr, "unflushed qset"); - assert(_buffers._tail == nullptr, "invariant"); - assert(_buffers._entry_count == 0, "invariant"); -} -#endif // ASSERT - -void G1RedirtyCardsLocalQueueSet::enqueue_completed_buffer(BufferNode* node) { - _buffers._entry_count += node->size(); - node->set_next(_buffers._head); - _buffers._head = node; - if (_buffers._tail == nullptr) { - _buffers._tail = node; - } -} - -void G1RedirtyCardsLocalQueueSet::enqueue(void* value) { - if (!try_enqueue(_queue, value)) { - BufferNode* old_node = exchange_buffer_with_new(_queue); - if (old_node != nullptr) { - enqueue_completed_buffer(old_node); - } - retry_enqueue(_queue, value); - } -} - -BufferNodeList G1RedirtyCardsLocalQueueSet::flush() { - flush_queue(_queue); - BufferNodeList cur_buffers = _buffers; - _shared_qset->add_bufferlist(_buffers); - _buffers = BufferNodeList(); - return cur_buffers; -} - -// G1RedirtyCardsLocalQueueSet::Queue - -G1RedirtyCardsLocalQueueSet::Queue::Queue(G1RedirtyCardsLocalQueueSet* qset) : - PtrQueue(qset) -{} - -#ifdef ASSERT -G1RedirtyCardsLocalQueueSet::Queue::~Queue() { - assert(buffer() == nullptr, "unflushed queue"); -} -#endif // ASSERT - -// G1RedirtyCardsQueueSet - -G1RedirtyCardsQueueSet::G1RedirtyCardsQueueSet(BufferNode::Allocator* allocator) : - PtrQueueSet(allocator), - _list(), - _entry_count(0), - _tail(nullptr) - DEBUG_ONLY(COMMA _collecting(true)) -{} - -G1RedirtyCardsQueueSet::~G1RedirtyCardsQueueSet() { - verify_empty(); -} - -#ifdef ASSERT -void G1RedirtyCardsQueueSet::verify_empty() const { - assert(_list.empty(), "precondition"); - assert(_tail == nullptr, "invariant"); - assert(_entry_count == 0, "invariant"); -} -#endif // ASSERT - -BufferNode* G1RedirtyCardsQueueSet::all_completed_buffers() const { - DEBUG_ONLY(_collecting = false;) - return _list.top(); -} - -BufferNodeList G1RedirtyCardsQueueSet::take_all_completed_buffers() { - DEBUG_ONLY(_collecting = false;) - BufferNodeList result(_list.pop_all(), _tail, _entry_count); - _tail = nullptr; - _entry_count = 0; - DEBUG_ONLY(_collecting = true;) - return result; -} - -void G1RedirtyCardsQueueSet::update_tail(BufferNode* node) { - // Node is the tail of a (possibly single element) list just prepended to - // _list. If, after that prepend, node's follower is null, then node is - // also the tail of _list, so record it as such. 
- if (node->next() == nullptr) { - assert(_tail == nullptr, "invariant"); - _tail = node; - } -} - -void G1RedirtyCardsQueueSet::enqueue_completed_buffer(BufferNode* node) { - assert(_collecting, "precondition"); - AtomicAccess::add(&_entry_count, node->size()); - _list.push(*node); - update_tail(node); -} - -void G1RedirtyCardsQueueSet::add_bufferlist(const BufferNodeList& buffers) { - assert(_collecting, "precondition"); - if (buffers._head != nullptr) { - assert(buffers._tail != nullptr, "invariant"); - AtomicAccess::add(&_entry_count, buffers._entry_count); - _list.prepend(*buffers._head, *buffers._tail); - update_tail(buffers._tail); - } -} diff --git a/src/hotspot/share/gc/g1/g1RedirtyCardsQueue.hpp b/src/hotspot/share/gc/g1/g1RedirtyCardsQueue.hpp deleted file mode 100644 index add66f24cca..00000000000 --- a/src/hotspot/share/gc/g1/g1RedirtyCardsQueue.hpp +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (c) 2019, 2024, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - * - */ - -#ifndef SHARE_GC_G1_G1REDIRTYCARDSQUEUE_HPP -#define SHARE_GC_G1_G1REDIRTYCARDSQUEUE_HPP - -#include "gc/shared/bufferNode.hpp" -#include "gc/shared/bufferNodeList.hpp" -#include "gc/shared/ptrQueue.hpp" -#include "memory/padded.hpp" -#include "utilities/macros.hpp" - -class G1RedirtyCardsQueueSet; - -// A thread-local qset and queue. It provides an uncontended staging -// area for completed buffers, to be flushed to the shared qset en masse. -class G1RedirtyCardsLocalQueueSet : private PtrQueueSet { - class Queue : public PtrQueue { - public: - Queue(G1RedirtyCardsLocalQueueSet* qset); - ~Queue() NOT_DEBUG(= default); - }; - - G1RedirtyCardsQueueSet* _shared_qset; - BufferNodeList _buffers; - Queue _queue; - - // Add the buffer to the local list. - virtual void enqueue_completed_buffer(BufferNode* node); - -public: - G1RedirtyCardsLocalQueueSet(G1RedirtyCardsQueueSet* shared_qset); - ~G1RedirtyCardsLocalQueueSet() NOT_DEBUG(= default); - - void enqueue(void* value); - - // Transfer all completed buffers to the shared qset. - // Returns the flushed BufferNodeList which is later used - // as a shortcut into the shared qset. - BufferNodeList flush(); -}; - -// Card table entries to be redirtied and the cards reprocessed later. -// Has two phases, collecting and processing. During the collecting -// phase buffers are added to the set. Once collecting is complete and -// processing starts, buffers can no longer be added. 
Taking all the -// collected (and processed) buffers reverts back to collecting, allowing -// the set to be reused for another round of redirtying. -class G1RedirtyCardsQueueSet : public PtrQueueSet { - DEFINE_PAD_MINUS_SIZE(1, DEFAULT_PADDING_SIZE, 0); - BufferNode::Stack _list; - DEFINE_PAD_MINUS_SIZE(2, DEFAULT_PADDING_SIZE, sizeof(size_t)); - volatile size_t _entry_count; - DEFINE_PAD_MINUS_SIZE(3, DEFAULT_PADDING_SIZE, sizeof(BufferNode*)); - BufferNode* _tail; - DEBUG_ONLY(mutable bool _collecting;) - - void update_tail(BufferNode* node); - -public: - G1RedirtyCardsQueueSet(BufferNode::Allocator* allocator); - ~G1RedirtyCardsQueueSet(); - - void verify_empty() const NOT_DEBUG_RETURN; - - // Collect buffers. These functions are thread-safe. - // precondition: Must not be concurrent with buffer processing. - virtual void enqueue_completed_buffer(BufferNode* node); - void add_bufferlist(const BufferNodeList& buffers); - - // Processing phase operations. - // precondition: Must not be concurrent with buffer collection. - BufferNode* all_completed_buffers() const; - BufferNodeList take_all_completed_buffers(); -}; - -#endif // SHARE_GC_G1_G1REDIRTYCARDSQUEUE_HPP diff --git a/src/hotspot/share/gc/g1/g1RemSet.cpp b/src/hotspot/share/gc/g1/g1RemSet.cpp index 2a09512730c..d2df416edc2 100644 --- a/src/hotspot/share/gc/g1/g1RemSet.cpp +++ b/src/hotspot/share/gc/g1/g1RemSet.cpp @@ -27,11 +27,12 @@ #include "gc/g1/g1BlockOffsetTable.inline.hpp" #include "gc/g1/g1CardSet.inline.hpp" #include "gc/g1/g1CardTable.inline.hpp" +#include "gc/g1/g1CardTableClaimTable.inline.hpp" #include "gc/g1/g1CardTableEntryClosure.hpp" #include "gc/g1/g1CollectedHeap.inline.hpp" #include "gc/g1/g1CollectionSet.inline.hpp" #include "gc/g1/g1ConcurrentRefine.hpp" -#include "gc/g1/g1DirtyCardQueue.hpp" +#include "gc/g1/g1ConcurrentRefineSweepTask.hpp" #include "gc/g1/g1FromCardCache.hpp" #include "gc/g1/g1GCParPhaseTimesTracker.hpp" #include "gc/g1/g1GCPhaseTimes.hpp" @@ -42,8 +43,6 @@ #include "gc/g1/g1Policy.hpp" #include "gc/g1/g1RemSet.hpp" #include "gc/g1/g1RootClosures.hpp" -#include "gc/shared/bufferNode.hpp" -#include "gc/shared/bufferNodeList.hpp" #include "gc/shared/gc_globals.hpp" #include "gc/shared/gcTraceTime.inline.hpp" #include "jfr/jfrEvents.hpp" @@ -63,7 +62,7 @@ // Collects information about the overall heap root scan progress during an evacuation. // // Scanning the remembered sets works by first merging all sources of cards to be -// scanned (log buffers, remembered sets) into a single data structure to remove +// scanned (refinement table, remembered sets) into a single data structure to remove // duplicates and simplify work distribution. // // During the following card scanning we not only scan this combined set of cards, but @@ -89,37 +88,13 @@ class G1RemSetScanState : public CHeapObj { class G1DirtyRegions; - size_t _max_reserved_regions; - - // Card table iteration claim for each heap region, from 0 (completely unscanned) - // to (>=) G1HeapRegion::CardsPerRegion (completely scanned). - uint volatile* _card_table_scan_state; - - uint _scan_chunks_per_region; // Number of chunks per region. - uint8_t _log_scan_chunks_per_region; // Log of number of chunks per region. - bool* _region_scan_chunks; - size_t _num_total_scan_chunks; // Total number of elements in _region_scan_chunks. - uint8_t _scan_chunks_shift; // For conversion between card index and chunk index. 
-public: - uint scan_chunk_size_in_cards() const { return (uint)1 << _scan_chunks_shift; } - - // Returns whether the chunk corresponding to the given region/card in region contain a - // dirty card, i.e. actually needs scanning. - bool chunk_needs_scan(uint const region_idx, uint const card_in_region) const { - size_t const idx = ((size_t)region_idx << _log_scan_chunks_per_region) + (card_in_region >> _scan_chunks_shift); - assert(idx < _num_total_scan_chunks, "Index %zu out of bounds %zu", - idx, _num_total_scan_chunks); - return _region_scan_chunks[idx]; - } - -private: + G1CardTableClaimTable _card_claim_table; // The complete set of regions which card table needs to be cleared at the end - // of GC because we scribbled over these card tables. + // of GC because we scribbled over these card table entries. // // Regions may be added for two reasons: - // - they were part of the collection set: they may contain g1_young_card_val - // or regular card marks that we never scan so we must always clear their card - // table + // - they were part of the collection set: they may contain regular card marks + // that we never scan so we must always clear their card table. // - or in case g1 does an optional evacuation pass, g1 marks the cards in there // as g1_scanned_card_val. If G1 only did an initial evacuation pass, the // scanning already cleared these cards. In that case they are not in this set @@ -129,7 +104,7 @@ private: // in the current evacuation pass. G1DirtyRegions* _next_dirty_regions; - // Set of (unique) regions that can be added to concurrently. +// Set of (unique) regions that can be added to concurrently. class G1DirtyRegions : public CHeapObj { uint* _buffer; uint _cur_idx; @@ -147,8 +122,6 @@ private: reset(); } - static size_t chunk_size() { return M; } - ~G1DirtyRegions() { FREE_C_HEAP_ARRAY(uint, _buffer); FREE_C_HEAP_ARRAY(bool, _contains); @@ -197,7 +170,7 @@ private: // entries from free regions. HeapWord** _scan_top; - class G1ClearCardTableTask : public G1AbstractSubTask { +class G1ClearCardTableTask : public G1AbstractSubTask { G1CollectedHeap* _g1h; G1DirtyRegions* _regions; uint volatile _cur_dirty_regions; @@ -229,9 +202,9 @@ private: virtual ~G1ClearCardTableTask() { _scan_state->cleanup(); -#ifndef PRODUCT - G1CollectedHeap::heap()->verifier()->verify_card_table_cleanup(); -#endif + if (VerifyDuringGC) { + G1CollectedHeap::heap()->verifier()->verify_card_table_cleanup(); + } } void do_work(uint worker_id) override { @@ -243,7 +216,15 @@ private: for (uint i = next; i < max; i++) { G1HeapRegion* r = _g1h->region_at(_regions->at(i)); - r->clear_cardtable(); + // The card table contains "dirty" card marks. Clear unconditionally. + // + // Humongous reclaim candidates are not in the dirty set. This is fine because + // their card and refinement table should always be clear as they are typeArrays. + r->clear_card_table(); + // There is no need to clear the refinement table here: at the start of the collection + // we had to clear the refinement card table for collection set regions already, and any + // old regions use it for old->collection set candidates, so they should not be cleared + // either. 
} } } @@ -251,56 +232,41 @@ private: public: G1RemSetScanState() : - _max_reserved_regions(0), - _card_table_scan_state(nullptr), - _scan_chunks_per_region(G1CollectedHeap::get_chunks_per_region()), - _log_scan_chunks_per_region(log2i(_scan_chunks_per_region)), - _region_scan_chunks(nullptr), - _num_total_scan_chunks(0), - _scan_chunks_shift(0), + _card_claim_table(G1CollectedHeap::get_chunks_per_region_for_scan()), _all_dirty_regions(nullptr), _next_dirty_regions(nullptr), - _scan_top(nullptr) { - } + _scan_top(nullptr) { } ~G1RemSetScanState() { - FREE_C_HEAP_ARRAY(uint, _card_table_scan_state); - FREE_C_HEAP_ARRAY(bool, _region_scan_chunks); FREE_C_HEAP_ARRAY(HeapWord*, _scan_top); } - void initialize(size_t max_reserved_regions) { - assert(_card_table_scan_state == nullptr, "Must not be initialized twice"); - _max_reserved_regions = max_reserved_regions; - _card_table_scan_state = NEW_C_HEAP_ARRAY(uint, max_reserved_regions, mtGC); - _num_total_scan_chunks = max_reserved_regions * _scan_chunks_per_region; - _region_scan_chunks = NEW_C_HEAP_ARRAY(bool, _num_total_scan_chunks, mtGC); - - _scan_chunks_shift = (uint8_t)log2i(G1HeapRegion::CardsPerRegion / _scan_chunks_per_region); + void initialize(uint max_reserved_regions) { + _card_claim_table.initialize(max_reserved_regions); _scan_top = NEW_C_HEAP_ARRAY(HeapWord*, max_reserved_regions, mtGC); } + // Reset the claim and clear scan top for all regions, including + // regions currently not available or free. Since regions might + // become used during the collection these values must be valid + // for those regions as well. void prepare() { - // Reset the claim and clear scan top for all regions, including - // regions currently not available or free. Since regions might - // become used during the collection these values must be valid - // for those regions as well. - for (size_t i = 0; i < _max_reserved_regions; i++) { + size_t max_reserved_regions = _card_claim_table.max_reserved_regions(); + + for (size_t i = 0; i < max_reserved_regions; i++) { clear_scan_top((uint)i); } - _all_dirty_regions = new G1DirtyRegions(_max_reserved_regions); - _next_dirty_regions = new G1DirtyRegions(_max_reserved_regions); + _all_dirty_regions = new G1DirtyRegions(max_reserved_regions); + _next_dirty_regions = new G1DirtyRegions(max_reserved_regions); } void prepare_for_merge_heap_roots() { - assert(_next_dirty_regions->size() == 0, "next dirty regions must be empty"); + // We populate the next dirty regions at the start of GC with all old/humongous + // regions. 
+ //assert(_next_dirty_regions->size() == 0, "next dirty regions must be empty"); - for (size_t i = 0; i < _max_reserved_regions; i++) { - _card_table_scan_state[i] = 0; - } - - ::memset(_region_scan_chunks, false, _num_total_scan_chunks * sizeof(*_region_scan_chunks)); + _card_claim_table.reset_all_to_unclaimed(); } void complete_evac_phase(bool merge_dirty_regions) { @@ -321,38 +287,10 @@ public: return (hr != nullptr && !hr->in_collection_set() && hr->is_old_or_humongous()); } - size_t num_visited_cards() const { - size_t result = 0; - for (uint i = 0; i < _num_total_scan_chunks; i++) { - if (_region_scan_chunks[i]) { - result++; - } - } - return result * (G1HeapRegion::CardsPerRegion / _scan_chunks_per_region); - } - size_t num_cards_in_dirty_regions() const { return _next_dirty_regions->size() * G1HeapRegion::CardsPerRegion; } - void set_chunk_range_dirty(size_t const region_card_idx, size_t const card_length) { - size_t chunk_idx = region_card_idx >> _scan_chunks_shift; - // Make sure that all chunks that contain the range are marked. Calculate the - // chunk of the last card that is actually marked. - size_t const end_chunk = (region_card_idx + card_length - 1) >> _scan_chunks_shift; - for (; chunk_idx <= end_chunk; chunk_idx++) { - _region_scan_chunks[chunk_idx] = true; - } - } - - void set_chunk_dirty(size_t const card_idx) { - assert((card_idx >> _scan_chunks_shift) < _num_total_scan_chunks, - "Trying to access index %zu out of bounds %zu", - card_idx >> _scan_chunks_shift, _num_total_scan_chunks); - size_t const chunk_idx = card_idx >> _scan_chunks_shift; - _region_scan_chunks[chunk_idx] = true; - } - G1AbstractSubTask* create_cleanup_after_scan_heap_roots_task() { return new G1ClearCardTableTask(G1CollectedHeap::heap(), _all_dirty_regions, this); } @@ -391,22 +329,16 @@ public: } bool has_cards_to_scan(uint region) { - assert(region < _max_reserved_regions, "Tried to access invalid region %u", region); - return _card_table_scan_state[region] < G1HeapRegion::CardsPerRegion; - } - - uint claim_cards_to_scan(uint region, uint increment) { - assert(region < _max_reserved_regions, "Tried to access invalid region %u", region); - return AtomicAccess::fetch_then_add(&_card_table_scan_state[region], increment, memory_order_relaxed); + return _card_claim_table.has_unclaimed_cards(region); } void add_dirty_region(uint const region) { -#ifdef ASSERT + #ifdef ASSERT G1HeapRegion* hr = G1CollectedHeap::heap()->region_at(region); assert(!hr->in_collection_set() && hr->is_old_or_humongous(), "Region %u is not suitable for scanning, is %sin collection set or %s", hr->hrm_index(), hr->in_collection_set() ? "" : "not ", hr->get_short_type_str()); -#endif + #endif _next_dirty_regions->add_dirty_region(region); } @@ -431,14 +363,16 @@ public: void clear_scan_top(uint region_idx) { set_scan_top(region_idx, nullptr); } + + G1CardTableChunkClaimer claimer(uint region_idx) { + return G1CardTableChunkClaimer(&_card_claim_table, region_idx); + } }; -G1RemSet::G1RemSet(G1CollectedHeap* g1h, - G1CardTable* ct) : +G1RemSet::G1RemSet(G1CollectedHeap* g1h) : _scan_state(new G1RemSetScanState()), _prev_period_summary(false), _g1h(g1h), - _ct(ct), _g1p(_g1h->policy()) { } @@ -450,36 +384,6 @@ void G1RemSet::initialize(uint max_reserved_regions) { _scan_state->initialize(max_reserved_regions); } -// Helper class to claim dirty chunks within the card table. 
-class G1CardTableChunkClaimer { - G1RemSetScanState* _scan_state; - uint _region_idx; - uint _cur_claim; - -public: - G1CardTableChunkClaimer(G1RemSetScanState* scan_state, uint region_idx) : - _scan_state(scan_state), - _region_idx(region_idx), - _cur_claim(0) { - guarantee(size() <= G1HeapRegion::CardsPerRegion, "Should not claim more space than possible."); - } - - bool has_next() { - while (true) { - _cur_claim = _scan_state->claim_cards_to_scan(_region_idx, size()); - if (_cur_claim >= G1HeapRegion::CardsPerRegion) { - return false; - } - if (_scan_state->chunk_needs_scan(_region_idx, _cur_claim)) { - return true; - } - } - } - - uint value() const { return _cur_claim; } - uint size() const { return _scan_state->scan_chunk_size_in_cards(); } -}; - // Scans a heap region for dirty cards. class G1ScanHRForRegionClosure : public G1HeapRegionClosure { using CardValue = CardTable::CardValue; @@ -495,6 +399,8 @@ class G1ScanHRForRegionClosure : public G1HeapRegionClosure { uint _worker_id; + size_t _cards_pending; + size_t _cards_empty; size_t _cards_scanned; size_t _blocks_scanned; size_t _chunks_claimed; @@ -508,9 +414,9 @@ class G1ScanHRForRegionClosure : public G1HeapRegionClosure { HeapWord* _scanned_to; CardValue _scanned_card_value; - HeapWord* scan_memregion(uint region_idx_for_card, MemRegion mr) { + HeapWord* scan_memregion(uint region_idx_for_card, MemRegion mr, size_t &roots_found) { G1HeapRegion* const card_region = _g1h->region_at(region_idx_for_card); - G1ScanCardClosure card_cl(_g1h, _pss, _heap_roots_found); + G1ScanCardClosure card_cl(_g1h, _pss, roots_found); HeapWord* const scanned_to = card_region->oops_on_memregion_seq_iterate_careful(mr, &card_cl); assert(scanned_to != nullptr, "Should be able to scan range"); @@ -520,8 +426,8 @@ class G1ScanHRForRegionClosure : public G1HeapRegionClosure { return scanned_to; } - void do_claimed_block(uint const region_idx, CardValue* const dirty_l, CardValue* const dirty_r) { - _ct->change_dirty_cards_to(dirty_l, dirty_r, _scanned_card_value); + void do_claimed_block(uint const region_idx, CardValue* const dirty_l, CardValue* const dirty_r, size_t& pending_cards) { + pending_cards += _ct->change_dirty_cards_to(dirty_l, dirty_r, _scanned_card_value); size_t num_cards = pointer_delta(dirty_r, dirty_l, sizeof(CardValue)); _blocks_scanned++; @@ -536,115 +442,22 @@ class G1ScanHRForRegionClosure : public G1HeapRegionClosure { return; } MemRegion mr(MAX2(card_start, _scanned_to), scan_end); - _scanned_to = scan_memregion(region_idx, mr); + size_t roots_found = 0; + _scanned_to = scan_memregion(region_idx, mr, roots_found); + if (roots_found == 0) { + _cards_empty += num_cards; + } _cards_scanned += num_cards; + _heap_roots_found += roots_found; } - // To locate consecutive dirty cards inside a chunk. 
- class ChunkScanner { - using Word = size_t; - - CardValue* const _start_card; - CardValue* const _end_card; - - static const size_t ExpandedToScanMask = G1CardTable::WordAlreadyScanned; - static const size_t ToScanMask = G1CardTable::g1_card_already_scanned; - - static bool is_card_dirty(const CardValue* const card) { - return (*card & ToScanMask) == 0; - } - - static bool is_word_aligned(const void* const addr) { - return ((uintptr_t)addr) % sizeof(Word) == 0; - } - - CardValue* find_first_dirty_card(CardValue* i_card) const { - while (!is_word_aligned(i_card)) { - if (is_card_dirty(i_card)) { - return i_card; - } - i_card++; - } - - for (/* empty */; i_card < _end_card; i_card += sizeof(Word)) { - Word word_value = *reinterpret_cast(i_card); - bool has_dirty_cards_in_word = (~word_value & ExpandedToScanMask) != 0; - - if (has_dirty_cards_in_word) { - for (uint i = 0; i < sizeof(Word); ++i) { - if (is_card_dirty(i_card)) { - return i_card; - } - i_card++; - } - assert(false, "should have early-returned"); - } - } - - return _end_card; - } - - CardValue* find_first_non_dirty_card(CardValue* i_card) const { - while (!is_word_aligned(i_card)) { - if (!is_card_dirty(i_card)) { - return i_card; - } - i_card++; - } - - for (/* empty */; i_card < _end_card; i_card += sizeof(Word)) { - Word word_value = *reinterpret_cast(i_card); - bool all_cards_dirty = (word_value == G1CardTable::WordAllDirty); - - if (!all_cards_dirty) { - for (uint i = 0; i < sizeof(Word); ++i) { - if (!is_card_dirty(i_card)) { - return i_card; - } - i_card++; - } - assert(false, "should have early-returned"); - } - } - - return _end_card; - } - - public: - ChunkScanner(CardValue* const start_card, CardValue* const end_card) : - _start_card(start_card), - _end_card(end_card) { - assert(is_word_aligned(start_card), "precondition"); - assert(is_word_aligned(end_card), "precondition"); - } - - template - void on_dirty_cards(Func&& f) { - for (CardValue* cur_card = _start_card; cur_card < _end_card; /* empty */) { - CardValue* dirty_l = find_first_dirty_card(cur_card); - CardValue* dirty_r = find_first_non_dirty_card(dirty_l); - - assert(dirty_l <= dirty_r, "inv"); - - if (dirty_l == dirty_r) { - assert(dirty_r == _end_card, "finished the entire chunk"); - return; - } - - f(dirty_l, dirty_r); - - cur_card = dirty_r + 1; - } - } - }; - void scan_heap_roots(G1HeapRegion* r) { uint const region_idx = r->hrm_index(); ResourceMark rm; - G1CardTableChunkClaimer claim(_scan_state, region_idx); + G1CardTableChunkClaimer claim = _scan_state->claimer(region_idx); // Set the current scan "finger" to null for every heap region to scan. Since // the claim value is monotonically increasing, the check to not scan below this @@ -652,6 +465,8 @@ class G1ScanHRForRegionClosure : public G1HeapRegionClosure { // to resetting this value for every claim. 
_scanned_to = nullptr; + size_t pending_cards = 0; + while (claim.has_next()) { _chunks_claimed++; @@ -660,11 +475,12 @@ class G1ScanHRForRegionClosure : public G1HeapRegionClosure { CardValue* const start_card = _ct->byte_for_index(region_card_base_idx); CardValue* const end_card = start_card + claim.size(); - ChunkScanner chunk_scanner{start_card, end_card}; + G1ChunkScanner chunk_scanner{start_card, end_card}; chunk_scanner.on_dirty_cards([&] (CardValue* dirty_l, CardValue* dirty_r) { - do_claimed_block(region_idx, dirty_l, dirty_r); + do_claimed_block(region_idx, dirty_l, dirty_r, pending_cards); }); } + _cards_pending += pending_cards; } public: @@ -679,6 +495,8 @@ public: _scan_state(scan_state), _phase(phase), _worker_id(worker_id), + _cards_pending(0), + _cards_empty(0), _cards_scanned(0), _blocks_scanned(0), _chunks_claimed(0), @@ -706,6 +524,8 @@ public: Tickspan rem_set_root_scan_time() const { return _rem_set_root_scan_time; } Tickspan rem_set_trim_partially_time() const { return _rem_set_trim_partially_time; } + size_t cards_pending() const { return _cards_pending; } + size_t cards_scanned_empty() const { return _cards_empty; } size_t cards_scanned() const { return _cards_scanned; } size_t blocks_scanned() const { return _blocks_scanned; } size_t chunks_claimed() const { return _chunks_claimed; } @@ -728,6 +548,9 @@ void G1RemSet::scan_heap_roots(G1ParScanThreadState* pss, p->record_or_add_time_secs(objcopy_phase, worker_id, cl.rem_set_trim_partially_time().seconds()); p->record_or_add_time_secs(scan_phase, worker_id, cl.rem_set_root_scan_time().seconds()); + + p->record_or_add_thread_work_item(scan_phase, worker_id, cl.cards_pending(), G1GCPhaseTimes::ScanHRPendingCards); + p->record_or_add_thread_work_item(scan_phase, worker_id, cl.cards_scanned_empty(), G1GCPhaseTimes::ScanHRScannedEmptyCards); p->record_or_add_thread_work_item(scan_phase, worker_id, cl.cards_scanned(), G1GCPhaseTimes::ScanHRScannedCards); p->record_or_add_thread_work_item(scan_phase, worker_id, cl.blocks_scanned(), G1GCPhaseTimes::ScanHRScannedBlocks); p->record_or_add_thread_work_item(scan_phase, worker_id, cl.chunks_claimed(), G1GCPhaseTimes::ScanHRClaimedChunks); @@ -901,6 +724,7 @@ void G1RemSet::prepare_region_for_scan(G1HeapRegion* r) { assert_scan_top_is_null(hrm_index); } else if (r->is_old_or_humongous()) { _scan_state->set_scan_top(hrm_index, r->top()); + _scan_state->add_dirty_region(hrm_index); } else { assert_scan_top_is_null(hrm_index); assert(r->is_free(), @@ -956,6 +780,90 @@ public: } }; +// Task to merge a non-dirty refinement table into the (primary) card table. +class MergeRefinementTableTask : public WorkerTask { + + G1CardTableClaimTable* _scan_state; + uint _max_workers; + + class G1MergeRefinementTableRegionClosure : public G1HeapRegionClosure { + G1CardTableClaimTable* _scan_state; + + bool do_heap_region(G1HeapRegion* r) override { + if (!_scan_state->has_unclaimed_cards(r->hrm_index())) { + return false; + } + + // We can blindly clear all collection set region's refinement tables: these + // regions will be evacuated and need their refinement table reset in case + // of evacuation failure. + // Young regions contain random marks, which are obvious to just clear. The + // card marks of other collection set region's refinement tables are also + // uninteresting. 
+ if (r->in_collection_set()) { + uint claim = _scan_state->claim_all_cards(r->hrm_index()); + // Concurrent refinement may have started merging this region (we also + // get here for non-young regions), the claim may be non-zero for those. + // We could get away here with just clearing the area from the current + // claim to the last card in the region, but for now just do it all. + if (claim < G1HeapRegion::CardsPerRegion) { + r->clear_refinement_table(); + } + return false; + } + + assert(r->is_old_or_humongous(), "must be"); + + G1CollectedHeap* g1h = G1CollectedHeap::heap(); + G1CardTable* card_table = g1h->card_table(); + G1CardTable* refinement_table = g1h->refinement_table(); + + size_t const region_card_base_idx = (size_t)r->hrm_index() << G1HeapRegion::LogCardsPerRegion; + + G1CardTableChunkClaimer claim(_scan_state, r->hrm_index()); + + while (claim.has_next()) { + size_t const start_idx = region_card_base_idx + claim.value(); + + size_t* card_cur_word = (size_t*)card_table->byte_for_index(start_idx); + + size_t* refinement_cur_word = (size_t*)refinement_table->byte_for_index(start_idx); + size_t* const refinement_end_word = refinement_cur_word + claim.size() / (sizeof(size_t) / sizeof(G1CardTable::CardValue)); + + for (; refinement_cur_word < refinement_end_word; ++refinement_cur_word, ++card_cur_word) { + size_t value = *refinement_cur_word; + *refinement_cur_word = G1CardTable::WordAllClean; + // Dirty is "0", so we need to logically-and here. This is also safe + // for all other possible values in the card table; at this point this + // can be either g1_dirty_card or g1_to_cset_card which will both be + // scanned. + size_t new_value = *card_cur_word & value; + *card_cur_word = new_value; + } + } + + return false; + } + + public: + G1MergeRefinementTableRegionClosure(G1CardTableClaimTable* scan_state) : G1HeapRegionClosure(), _scan_state(scan_state) { + } + }; + +public: + MergeRefinementTableTask(G1CardTableClaimTable* scan_state, uint max_workers) : + WorkerTask("Merge Refinement Table"), _scan_state(scan_state), _max_workers(max_workers) { guarantee(_scan_state != nullptr, "must be"); } + + void work(uint worker_id) override { + G1CollectedHeap* g1h = G1CollectedHeap::heap(); + + G1GCParPhaseTimesTracker x(g1h->phase_times(), G1GCPhaseTimes::SweepRT, worker_id, false /* allow multiple invocation */); + + G1MergeRefinementTableRegionClosure cl(_scan_state); + _scan_state->heap_region_iterate_from_worker_offset(&cl, worker_id, _max_workers); + } +}; + class G1MergeHeapRootsTask : public WorkerTask { class G1MergeCardSetStats { @@ -973,12 +881,16 @@ class G1MergeHeapRootsTask : public WorkerTask { _merged[tag]++; } - void inc_remset_cards(size_t increment = 1) { - _merged[G1GCPhaseTimes::MergeRSCards] += increment; + void inc_merged_cards(size_t increment = 1) { + _merged[G1GCPhaseTimes::MergeRSFromRemSetCards] += increment; + } + + void inc_total_cards(size_t increment = 1) { + _merged[G1GCPhaseTimes::MergeRSTotalCards] += increment; } void dec_remset_cards(size_t decrement) { - _merged[G1GCPhaseTimes::MergeRSCards] -= decrement; + _merged[G1GCPhaseTimes::MergeRSTotalCards] -= decrement; } size_t merged(uint i) const { return _merged[i]; } @@ -1031,10 +943,10 @@ class G1MergeHeapRootsTask : public WorkerTask { } void mark_card(G1CardTable::CardValue* value) { - if (_ct->mark_clean_as_dirty(value)) { - _scan_state->set_chunk_dirty(_ct->index_for_cardvalue(value)); + if (_ct->mark_clean_as_from_remset(value)) { + _stats.inc_merged_cards(); } - _stats.inc_remset_cards(); + 
_stats.inc_total_cards(); } public: @@ -1054,7 +966,7 @@ class G1MergeHeapRootsTask : public WorkerTask { // Returns whether the given region actually needs iteration. bool start_iterate(uint const tag, uint const region_idx) { - assert(tag < G1GCPhaseTimes::MergeRSCards, "invalid tag %u", tag); + assert(tag < G1GCPhaseTimes::MergeRSFromRemSetCards, "invalid tag %u", tag); if (remember_if_interesting(region_idx)) { _region_base_idx = (size_t)region_idx << G1HeapRegion::LogCardsPerRegion; _stats.inc_card_set_merged(tag); @@ -1064,9 +976,9 @@ class G1MergeHeapRootsTask : public WorkerTask { } void do_card_range(uint const start_card_idx, uint const length) { - _ct->mark_range_dirty(_region_base_idx + start_card_idx, length); - _stats.inc_remset_cards(length); - _scan_state->set_chunk_range_dirty(_region_base_idx + start_card_idx, length); + size_t cards_changed = _ct->mark_clean_range_as_from_remset(_region_base_idx + start_card_idx, length); + _stats.inc_merged_cards(cards_changed); + _stats.inc_total_cards(length); } G1MergeCardSetStats stats() { @@ -1086,12 +998,19 @@ class G1MergeHeapRootsTask : public WorkerTask { class G1ClearBitmapClosure : public G1HeapRegionClosure { G1CollectedHeap* _g1h; G1RemSetScanState* _scan_state; + bool _initial_evacuation; void assert_bitmap_clear(G1HeapRegion* hr, const G1CMBitMap* bitmap) { assert(bitmap->get_next_marked_addr(hr->bottom(), hr->end()) == hr->end(), "Bitmap should have no mark for region %u (%s)", hr->hrm_index(), hr->get_short_type_str()); } + void assert_refinement_table_clear(G1HeapRegion* hr) { +#ifdef ASSERT + _g1h->refinement_table()->verify_region(MemRegion(hr->bottom(), hr->end()), G1CardTable::clean_card_val(), true); +#endif + } + bool should_clear_region(G1HeapRegion* hr) const { // The bitmap for young regions must obviously be clear as we never mark through them; // old regions that are currently being marked through are only in the collection set @@ -1110,14 +1029,31 @@ class G1MergeHeapRootsTask : public WorkerTask { } public: - G1ClearBitmapClosure(G1CollectedHeap* g1h, G1RemSetScanState* scan_state) : + G1ClearBitmapClosure(G1CollectedHeap* g1h, G1RemSetScanState* scan_state, bool initial_evacuation) : _g1h(g1h), - _scan_state(scan_state) + _scan_state(scan_state), + _initial_evacuation(initial_evacuation) { } bool do_heap_region(G1HeapRegion* hr) { assert(_g1h->is_in_cset(hr), "Should only be used iterating the collection set"); + // Collection set regions after the initial evacuation need their refinement + // table cleared because + // * we use the refinement table for recording references to other regions + // during evacuation failure handling + // * during previous passes we used the refinement table to contain marks for + // cross-region references. Now that we evacuate the region, they need to be + // cleared. + // + // We do not need to do this extra work for initial evacuation because we + // make sure the refinement table is clean for all regions either in + // concurrent refinement or in the merge refinement table phase earlier. + if (!_initial_evacuation) { + hr->clear_refinement_table(); + } else { + assert_refinement_table_clear(hr); + } // Evacuation failure uses the bitmap to record evacuation failed objects, // so the bitmap for the regions in the collection set must be cleared if not already. if (should_clear_region(hr)) { @@ -1177,145 +1113,23 @@ class G1MergeHeapRootsTask : public WorkerTask { } }; - // Visitor for the log buffer entries to merge them into the card table. 
- class G1MergeLogBufferCardsClosure : public G1CardTableEntryClosure { - - G1RemSetScanState* _scan_state; - G1CardTable* _ct; - - size_t _cards_dirty; - size_t _cards_skipped; - - void process_card(CardValue* card_ptr) { - if (*card_ptr == G1CardTable::dirty_card_val()) { - uint const region_idx = _ct->region_idx_for(card_ptr); - _scan_state->add_dirty_region(region_idx); - _scan_state->set_chunk_dirty(_ct->index_for_cardvalue(card_ptr)); - _cards_dirty++; - } - } - - public: - G1MergeLogBufferCardsClosure(G1CollectedHeap* g1h, G1RemSetScanState* scan_state) : - _scan_state(scan_state), - _ct(g1h->card_table()), - _cards_dirty(0), - _cards_skipped(0) - {} - - void do_card_ptr(CardValue* card_ptr) override { - // The only time we care about recording cards that - // contain references that point into the collection set - // is during RSet updating within an evacuation pause. - assert(SafepointSynchronize::is_at_safepoint(), "not during an evacuation pause"); - - uint const region_idx = _ct->region_idx_for(card_ptr); - - // The second clause must come after - the log buffers might contain cards to uncommitted - // regions. - // This code may count duplicate entries in the log buffers (even if rare) multiple - // times. - if (_scan_state->contains_cards_to_process(region_idx)) { - process_card(card_ptr); - } else { - // We may have had dirty cards in the (initial) collection set (or the - // young regions which are always in the initial collection set). We do - // not fix their cards here: we already added these regions to the set of - // regions to clear the card table at the end during the prepare() phase. - _cards_skipped++; - } - } - - size_t cards_dirty() const { return _cards_dirty; } - size_t cards_skipped() const { return _cards_skipped; } - }; - uint _num_workers; G1HeapRegionClaimer _hr_claimer; G1RemSetScanState* _scan_state; - // To mitigate contention due multiple threads accessing and popping BufferNodes from a shared - // G1DirtyCardQueueSet, we implement a sequential distribution phase. Here, BufferNodes are - // distributed to worker threads in a sequential manner utilizing the _dirty_card_buffers. By doing - // so, we effectively alleviate the bottleneck encountered during pop operations on the - // G1DirtyCardQueueSet. Importantly, this approach preserves the helping aspect among worker - // threads, allowing them to assist one another in case of imbalances in work distribution. 
- BufferNode::Stack* _dirty_card_buffers; - bool _initial_evacuation; volatile bool _fast_reclaim_handled; - void apply_closure_to_dirty_card_buffers(G1MergeLogBufferCardsClosure* cl, uint worker_id) { - G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set(); - for (uint i = 0; i < _num_workers; i++) { - uint index = (worker_id + i) % _num_workers; - while (BufferNode* node = _dirty_card_buffers[index].pop()) { - cl->apply_to_buffer(node, worker_id); - dcqs.deallocate_buffer(node); - } - } - } - public: G1MergeHeapRootsTask(G1RemSetScanState* scan_state, uint num_workers, bool initial_evacuation) : WorkerTask("G1 Merge Heap Roots"), _num_workers(num_workers), _hr_claimer(num_workers), _scan_state(scan_state), - _dirty_card_buffers(nullptr), _initial_evacuation(initial_evacuation), _fast_reclaim_handled(false) - { - if (initial_evacuation) { - Ticks start = Ticks::now(); - - _dirty_card_buffers = NEW_C_HEAP_ARRAY(BufferNode::Stack, num_workers, mtGC); - for (uint i = 0; i < num_workers; i++) { - new (&_dirty_card_buffers[i]) BufferNode::Stack(); - } - - G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set(); - BufferNodeList buffers = dcqs.take_all_completed_buffers(); - - size_t entries_per_thread = ceil(buffers._entry_count / (double)num_workers); - - BufferNode* head = buffers._head; - BufferNode* tail = head; - - uint worker = 0; - while (tail != nullptr) { - size_t count = tail->size(); - BufferNode* cur = tail->next(); - - while (count < entries_per_thread && cur != nullptr) { - tail = cur; - count += tail->size(); - cur = tail->next(); - } - - tail->set_next(nullptr); - _dirty_card_buffers[worker++ % num_workers].prepend(*head, *tail); - - assert(cur != nullptr || tail == buffers._tail, "Must be"); - head = cur; - tail = cur; - } - - Tickspan total = Ticks::now() - start; - G1CollectedHeap::heap()->phase_times()->record_distribute_log_buffers_time_ms(total.seconds() * 1000.0); - } - } - - ~G1MergeHeapRootsTask() { - if (_dirty_card_buffers != nullptr) { - using Stack = BufferNode::Stack; - for (uint i = 0; i < _num_workers; i++) { - _dirty_card_buffers[i].~Stack(); - } - FREE_C_HEAP_ARRAY(Stack, _dirty_card_buffers); - } - } + { } virtual void work(uint worker_id) { G1CollectedHeap* g1h = G1CollectedHeap::heap(); @@ -1368,50 +1182,28 @@ public: // Preparation for evacuation failure handling. { - G1ClearBitmapClosure clear(g1h, _scan_state); + G1ClearBitmapClosure clear(g1h, _scan_state, _initial_evacuation); g1h->collection_set_iterate_increment_from(&clear, &_hr_claimer, worker_id); } - - // Now apply the closure to all remaining log entries. 
- if (_initial_evacuation) { - assert(merge_remset_phase == G1GCPhaseTimes::MergeRS, "Wrong merge phase"); - G1GCParPhaseTimesTracker x(p, G1GCPhaseTimes::MergeLB, worker_id); - - G1MergeLogBufferCardsClosure cl(g1h, _scan_state); - apply_closure_to_dirty_card_buffers(&cl, worker_id); - - p->record_thread_work_item(G1GCPhaseTimes::MergeLB, worker_id, cl.cards_dirty(), G1GCPhaseTimes::MergeLBDirtyCards); - p->record_thread_work_item(G1GCPhaseTimes::MergeLB, worker_id, cl.cards_skipped(), G1GCPhaseTimes::MergeLBSkippedCards); - } } }; -void G1RemSet::print_merge_heap_roots_stats() { - LogTarget(Debug, gc, remset) lt; - if (lt.is_enabled()) { - LogStream ls(lt); +static void merge_refinement_table() { + G1CollectedHeap* g1h = G1CollectedHeap::heap(); - size_t num_visited_cards = _scan_state->num_visited_cards(); + G1ConcurrentRefineSweepState& state = g1h->concurrent_refine()->sweep_state_for_merge(); + WorkerThreads* workers = g1h->workers(); - size_t total_dirty_region_cards = _scan_state->num_cards_in_dirty_regions(); - - G1CollectedHeap* g1h = G1CollectedHeap::heap(); - size_t total_old_region_cards = - (g1h->num_committed_regions() - (g1h->num_free_regions() - g1h->collection_set()->cur_length())) * G1HeapRegion::CardsPerRegion; - - ls.print_cr("Visited cards %zu Total dirty %zu (%.2lf%%) Total old %zu (%.2lf%%)", - num_visited_cards, - total_dirty_region_cards, - percent_of(num_visited_cards, total_dirty_region_cards), - total_old_region_cards, - percent_of(num_visited_cards, total_old_region_cards)); - } + MergeRefinementTableTask cl(state.sweep_table(), workers->active_workers()); + log_debug(gc, ergo)("Running %s using %u workers", cl.name(), workers->active_workers()); + workers->run_task(&cl); } void G1RemSet::merge_heap_roots(bool initial_evacuation) { G1CollectedHeap* g1h = G1CollectedHeap::heap(); G1GCPhaseTimes* pt = g1h->phase_times(); + // 1. Prepare the merging process { Ticks start = Ticks::now(); @@ -1425,28 +1217,42 @@ void G1RemSet::merge_heap_roots(bool initial_evacuation) { } } - WorkerThreads* workers = g1h->workers(); - size_t const increment_length = g1h->collection_set()->regions_cur_length(); + // 2. (Optionally) Merge the refinement table into the card table (if needed). + G1ConcurrentRefineSweepState& state = g1h->concurrent_refine()->sweep_state(); + if (initial_evacuation && state.is_in_progress()) { + Ticks start = Ticks::now(); - uint const num_workers = initial_evacuation ? workers->active_workers() : - MIN2(workers->active_workers(), (uint)increment_length); + merge_refinement_table(); + g1h->phase_times()->record_merge_refinement_table_time((Ticks::now() - start).seconds() * MILLIUNITS); + } + + // 3. Merge other heap roots. Ticks start = Ticks::now(); { + WorkerThreads* workers = g1h->workers(); + + size_t const increment_length = g1h->collection_set()->groups_increment_length(); + + uint const num_workers = initial_evacuation ? 
workers->active_workers() : + MIN2(workers->active_workers(), (uint)increment_length); + G1MergeHeapRootsTask cl(_scan_state, num_workers, initial_evacuation); log_debug(gc, ergo)("Running %s using %u workers for %zu regions", cl.name(), num_workers, increment_length); workers->run_task(&cl, num_workers); } - print_merge_heap_roots_stats(); - if (initial_evacuation) { pt->record_merge_heap_roots_time((Ticks::now() - start).seconds() * 1000.0); } else { pt->record_or_add_optional_merge_heap_roots_time((Ticks::now() - start).seconds() * 1000.0); } + + if (VerifyDuringGC && initial_evacuation) { + g1h->verifier()->verify_card_tables_clean(false /* both_card_tables */); + } } void G1RemSet::complete_evac_phase(bool has_more_than_one_evacuation_phase) { @@ -1482,86 +1288,20 @@ inline void check_card_ptr(CardTable::CardValue* card_ptr, G1CardTable* ct) { #endif } -bool G1RemSet::clean_card_before_refine(CardValue** const card_ptr_addr) { - assert(!SafepointSynchronize::is_at_safepoint(), "Only call concurrently"); - - CardValue* card_ptr = *card_ptr_addr; - // Find the start address represented by the card. - HeapWord* start = _ct->addr_for(card_ptr); - // And find the region containing it. - G1HeapRegion* r = _g1h->heap_region_containing_or_null(start); - - // If this is a (stale) card into an uncommitted region, exit. - if (r == nullptr) { - return false; - } - - check_card_ptr(card_ptr, _ct); - - // If the card is no longer dirty, nothing to do. - // We cannot load the card value before the "r == nullptr" check above, because G1 - // could uncommit parts of the card table covering uncommitted regions. - if (*card_ptr != G1CardTable::dirty_card_val()) { - return false; - } - - // This check is needed for some uncommon cases where we should - // ignore the card. - // - // The region could be young. Cards for young regions are - // distinctly marked (set to g1_young_gen), so the post-barrier will - // filter them out. However, that marking is performed - // concurrently. A write to a young object could occur before the - // card has been marked young, slipping past the filter. - // - // The card could be stale, because the region has been freed since - // the card was recorded. In this case the region type could be - // anything. If (still) free or (reallocated) young, just ignore - // it. If (reallocated) old or humongous, the later card trimming - // and additional checks in iteration may detect staleness. At - // worst, we end up processing a stale card unnecessarily. - // - // In the normal (non-stale) case, the synchronization between the - // enqueueing of the card and processing it here will have ensured - // we see the up-to-date region type here. - if (!r->is_old_or_humongous()) { - return false; - } - - // Trim the region designated by the card to what's been allocated - // in the region. The card could be stale, or the card could cover - // (part of) an object at the end of the allocated space and extend - // beyond the end of allocation. - - // Non-humongous objects are either allocated in the old regions during GC. - // So if region is old then top is stable. - // Humongous object allocation sets top last; if top has not yet been set, - // this is a stale card and we'll end up with an empty intersection. - // If this is not a stale card, the synchronization between the - // enqueuing of the card and processing it here will have ensured - // we see the up-to-date top here. 
- HeapWord* scan_limit = r->top(); - - if (scan_limit <= start) { - // If the trimmed region is empty, the card must be stale. - return false; - } - - // Okay to clean and process the card now. There are still some - // stale card cases that may be detected by iteration and dealt with - // as iteration failure. - *const_cast(card_ptr) = G1CardTable::clean_card_val(); - - return true; -} - -void G1RemSet::refine_card_concurrently(CardValue* const card_ptr, - const uint worker_id) { +G1RemSet::RefineResult G1RemSet::refine_card_concurrently(CardValue* const card_ptr, + const uint worker_id) { assert(!_g1h->is_stw_gc_active(), "Only call concurrently"); - check_card_ptr(card_ptr, _ct); + G1CardTable* ct = _g1h->refinement_table(); + check_card_ptr(card_ptr, ct); + + // That card is already known to contain a reference to the collection set. Skip + // further processing. + if (*card_ptr == G1CardTable::g1_to_cset_card) { + return AlreadyToCSet; + } // Construct the MemRegion representing the card. - HeapWord* start = _ct->addr_for(card_ptr); + HeapWord* start = ct->addr_for(card_ptr); // And find the region containing it. G1HeapRegion* r = _g1h->heap_region_containing(start); // This reload of the top is safe even though it happens after the full @@ -1571,7 +1311,7 @@ void G1RemSet::refine_card_concurrently(CardValue* const card_ptr, // cannot span across safepoint, so we don't need to worry about top being // changed during safepoint. HeapWord* scan_limit = r->top(); - assert(scan_limit > start, "sanity"); + assert(scan_limit > start, "sanity region %u (%s) scan_limit " PTR_FORMAT " start " PTR_FORMAT, r->hrm_index(), r->get_short_type_str(), p2i(scan_limit), p2i(start)); // Don't use addr_for(card_ptr + 1) which can ask for // a card beyond the heap. @@ -1581,43 +1321,21 @@ void G1RemSet::refine_card_concurrently(CardValue* const card_ptr, G1ConcurrentRefineOopClosure conc_refine_cl(_g1h, worker_id); if (r->oops_on_memregion_seq_iterate_careful(dirty_region, &conc_refine_cl) != nullptr) { - return; + if (conc_refine_cl.has_ref_to_cset()) { + return HasRefToCSet; + } else if (conc_refine_cl.has_ref_to_old()) { + return HasRefToOld; + } else { + return NoCrossRegion; + } } - // If unable to process the card then we encountered an unparsable // part of the heap (e.g. a partially allocated object, so only // temporarily a problem) while processing a stale card. Despite // the card being stale, we can't simply ignore it, because we've - // already marked the card cleaned, so taken responsibility for + // already marked the card as cleaned, so taken responsibility for // ensuring the card gets scanned. - // - // However, the card might have gotten re-dirtied and re-enqueued - // while we worked. (In fact, it's pretty likely.) - if (*card_ptr == G1CardTable::dirty_card_val()) { - return; - } - - enqueue_for_reprocessing(card_ptr); -} - -// Re-dirty and re-enqueue the card to retry refinement later. -// This is used to deal with a rare race condition in concurrent refinement. -void G1RemSet::enqueue_for_reprocessing(CardValue* card_ptr) { - // We can't use the thread-local queue, because that might be the queue - // that is being processed by us; we could be a Java thread conscripted to - // perform refinement on our queue's current buffer. This situation only - // arises from rare race condition, so it's not worth any significant - // development effort or clever lock-free queue implementation. Instead - // we use brute force, allocating and enqueuing an entire buffer for just - // this card. 
Since buffers are processed in FIFO order and we try to - // keep some in the queue, it is likely that the racing state will have - // resolved by the time this card comes up for reprocessing. - *card_ptr = G1CardTable::dirty_card_val(); - G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set(); - void** buffer = dcqs.allocate_buffer(); - size_t index = dcqs.buffer_capacity() - 1; - buffer[index] = card_ptr; - dcqs.enqueue_completed_buffer(BufferNode::make_node_from_buffer(buffer, index)); + return CouldNotParse; } void G1RemSet::print_periodic_summary_info(const char* header, uint period_count, bool show_thread_times) { diff --git a/src/hotspot/share/gc/g1/g1RemSet.hpp b/src/hotspot/share/gc/g1/g1RemSet.hpp index 50cc029a9a1..8b2353cdbb3 100644 --- a/src/hotspot/share/gc/g1/g1RemSet.hpp +++ b/src/hotspot/share/gc/g1/g1RemSet.hpp @@ -26,6 +26,7 @@ #define SHARE_GC_G1_G1REMSET_HPP #include "gc/g1/g1CardTable.hpp" +#include "gc/g1/g1CardTableClaimTable.hpp" #include "gc/g1/g1GCPhaseTimes.hpp" #include "gc/g1/g1HeapRegion.hpp" #include "gc/g1/g1OopClosures.hpp" @@ -65,20 +66,15 @@ private: G1CollectedHeap* _g1h; - G1CardTable* _ct; - G1Policy* _g1p; - - void print_merge_heap_roots_stats(); + G1Policy* _g1p; void assert_scan_top_is_null(uint hrm_index) NOT_DEBUG_RETURN; - void enqueue_for_reprocessing(CardValue* card_ptr); - public: // Initialize data that depends on the heap size being known. void initialize(uint max_num_regions); - G1RemSet(G1CollectedHeap* g1h, G1CardTable* ct); + G1RemSet(G1CollectedHeap* g1h); ~G1RemSet(); // Scan all cards in the non-collection set regions that potentially contain @@ -101,7 +97,7 @@ public: // Print coarsening stats. void print_coarsen_stats(); - // Creates a task for cleaining up temporary data structures and the + // Creates a task for cleaning up temporary data structures and the // card table, removing temporary duplicate detection information. G1AbstractSubTask* create_cleanup_after_scan_heap_roots_task(); // Excludes the given region from heap root scanning. @@ -122,16 +118,19 @@ public: G1GCPhaseTimes::GCParPhases scan_phase, G1GCPhaseTimes::GCParPhases objcopy_phase); - // Two methods for concurrent refinement support, executed concurrently to - // the mutator: - // Cleans the card at "*card_ptr_addr" before refinement, returns true iff the - // card needs later refinement. - bool clean_card_before_refine(CardValue** const card_ptr_addr); + enum RefineResult { + HasRefToCSet, // The (dirty) card has a reference to the collection set. + AlreadyToCSet, // The card is already one marked as having a reference to the collection set. + HasRefToOld, // The dirty card contains references to other old regions (not the collection set). + NoCrossRegion, // There is no interesting reference in the card any more. The mutator changed all + // references to such after dirtying the card. + CouldNotParse // The card is unparsable, need to retry later. + }; // Refine the region corresponding to "card_ptr". Must be called after // being filtered by clean_card_before_refine(), and after proper // fence/synchronization. - void refine_card_concurrently(CardValue* const card_ptr, - const uint worker_id); + RefineResult refine_card_concurrently(CardValue* const card_ptr, + const uint worker_id); // Print accumulated summary info from the start of the VM. 
void print_summary_info(); diff --git a/src/hotspot/share/gc/g1/g1RemSetSummary.cpp b/src/hotspot/share/gc/g1/g1RemSetSummary.cpp index 49cc993dac2..3e9cf938097 100644 --- a/src/hotspot/share/gc/g1/g1RemSetSummary.cpp +++ b/src/hotspot/share/gc/g1/g1RemSetSummary.cpp @@ -27,7 +27,6 @@ #include "gc/g1/g1CollectedHeap.inline.hpp" #include "gc/g1/g1ConcurrentRefine.hpp" #include "gc/g1/g1ConcurrentRefineThread.hpp" -#include "gc/g1/g1DirtyCardQueue.hpp" #include "gc/g1/g1HeapRegion.hpp" #include "gc/g1/g1HeapRegionRemSet.inline.hpp" #include "gc/g1/g1RemSet.hpp" @@ -37,39 +36,61 @@ #include "runtime/javaThread.hpp" void G1RemSetSummary::update() { - class CollectData : public ThreadClosure { + G1ConcurrentRefine* refine = G1CollectedHeap::heap()->concurrent_refine(); + + class CollectWorkerData : public ThreadClosure { G1RemSetSummary* _summary; uint _counter; public: - CollectData(G1RemSetSummary * summary) : _summary(summary), _counter(0) {} + CollectWorkerData(G1RemSetSummary* summary) : _summary(summary), _counter(0) {} virtual void do_thread(Thread* t) { G1ConcurrentRefineThread* crt = static_cast(t); - _summary->set_refine_thread_cpu_time(_counter, crt->cpu_time()); + _summary->set_worker_thread_cpu_time(_counter, crt->cpu_time()); _counter++; } } collector(this); - G1CollectedHeap* g1h = G1CollectedHeap::heap(); - g1h->concurrent_refine()->threads_do(&collector); + refine->worker_threads_do(&collector); + + class CollectControlData : public ThreadClosure { + G1RemSetSummary* _summary; + public: + CollectControlData(G1RemSetSummary* summary) : _summary(summary) {} + virtual void do_thread(Thread* t) { + G1ConcurrentRefineThread* crt = static_cast(t); + _summary->set_control_thread_cpu_time(crt->cpu_time()); + } + } control(this); + + refine->control_thread_do(&control); } -void G1RemSetSummary::set_refine_thread_cpu_time(uint thread, jlong value) { - assert(_refine_threads_cpu_times != nullptr, "just checking"); - assert(thread < _num_refine_threads, "just checking"); - _refine_threads_cpu_times[thread] = value; +void G1RemSetSummary::set_worker_thread_cpu_time(uint thread, jlong value) { + assert(_worker_threads_cpu_times != nullptr, "just checking"); + assert(thread < _num_worker_threads, "just checking"); + _worker_threads_cpu_times[thread] = value; } -jlong G1RemSetSummary::refine_thread_cpu_time(uint thread) const { - assert(_refine_threads_cpu_times != nullptr, "just checking"); - assert(thread < _num_refine_threads, "just checking"); - return _refine_threads_cpu_times[thread]; +void G1RemSetSummary::set_control_thread_cpu_time(jlong value) { + _control_thread_cpu_time = value; +} + +jlong G1RemSetSummary::worker_thread_cpu_time(uint thread) const { + assert(_worker_threads_cpu_times != nullptr, "just checking"); + assert(thread < _num_worker_threads, "just checking"); + return _worker_threads_cpu_times[thread]; +} + +jlong G1RemSetSummary::control_thread_cpu_time() const { + return _control_thread_cpu_time; } G1RemSetSummary::G1RemSetSummary(bool should_update) : - _num_refine_threads(G1ConcRefinementThreads), - _refine_threads_cpu_times(NEW_C_HEAP_ARRAY(jlong, _num_refine_threads, mtGC)) { + _num_worker_threads(G1ConcRefinementThreads), + _worker_threads_cpu_times(NEW_C_HEAP_ARRAY(jlong, _num_worker_threads, mtGC)), + _control_thread_cpu_time(0) { - memset(_refine_threads_cpu_times, 0, sizeof(jlong) * _num_refine_threads); + memset(_worker_threads_cpu_times, 0, sizeof(jlong) * _num_worker_threads); if (should_update) { update(); @@ -77,23 +98,25 @@ 
G1RemSetSummary::G1RemSetSummary(bool should_update) : } G1RemSetSummary::~G1RemSetSummary() { - FREE_C_HEAP_ARRAY(jlong, _refine_threads_cpu_times); + FREE_C_HEAP_ARRAY(jlong, _worker_threads_cpu_times); } void G1RemSetSummary::set(G1RemSetSummary* other) { assert(other != nullptr, "just checking"); - assert(_num_refine_threads == other->_num_refine_threads, "just checking"); + assert(_num_worker_threads == other->_num_worker_threads, "just checking"); - memcpy(_refine_threads_cpu_times, other->_refine_threads_cpu_times, sizeof(jlong) * _num_refine_threads); + memcpy(_worker_threads_cpu_times, other->_worker_threads_cpu_times, sizeof(jlong) * _num_worker_threads); + _control_thread_cpu_time = other->_control_thread_cpu_time; } void G1RemSetSummary::subtract_from(G1RemSetSummary* other) { assert(other != nullptr, "just checking"); - assert(_num_refine_threads == other->_num_refine_threads, "just checking"); + assert(_num_worker_threads == other->_num_worker_threads, "just checking"); - for (uint i = 0; i < _num_refine_threads; i++) { - set_refine_thread_cpu_time(i, other->refine_thread_cpu_time(i) - refine_thread_cpu_time(i)); + for (uint i = 0; i < _num_worker_threads; i++) { + set_worker_thread_cpu_time(i, other->worker_thread_cpu_time(i) - worker_thread_cpu_time(i)); } + _control_thread_cpu_time = other->_control_thread_cpu_time - _control_thread_cpu_time; } class G1PerRegionTypeRemSetCounters { @@ -376,9 +399,10 @@ public: void G1RemSetSummary::print_on(outputStream* out, bool show_thread_times) { if (show_thread_times) { out->print_cr(" Concurrent refinement threads times (s)"); + out->print_cr(" Control %5.2f Workers", (double)control_thread_cpu_time() / NANOSECS_PER_SEC); out->print(" "); - for (uint i = 0; i < _num_refine_threads; i++) { - out->print(" %5.2f", (double)refine_thread_cpu_time(i) / NANOSECS_PER_SEC); + for (uint i = 0; i < _num_worker_threads; i++) { + out->print(" %5.2f", (double)worker_thread_cpu_time(i) / NANOSECS_PER_SEC); } out->cr(); } diff --git a/src/hotspot/share/gc/g1/g1RemSetSummary.hpp b/src/hotspot/share/gc/g1/g1RemSetSummary.hpp index 373f38952c8..dd7d55d5a2e 100644 --- a/src/hotspot/share/gc/g1/g1RemSetSummary.hpp +++ b/src/hotspot/share/gc/g1/g1RemSetSummary.hpp @@ -33,10 +33,12 @@ class G1RemSet; // A G1RemSetSummary manages statistical information about the remembered set. class G1RemSetSummary { - size_t _num_refine_threads; - jlong* _refine_threads_cpu_times; + size_t _num_worker_threads; + jlong* _worker_threads_cpu_times; + jlong _control_thread_cpu_time; - void set_refine_thread_cpu_time(uint thread, jlong value); + void set_worker_thread_cpu_time(uint thread, jlong value); + void set_control_thread_cpu_time(jlong value); // Update this summary with current data from various places. void update(); @@ -53,7 +55,8 @@ public: void print_on(outputStream* out, bool show_thread_times); - jlong refine_thread_cpu_time(uint thread) const; + jlong worker_thread_cpu_time(uint thread) const; + jlong control_thread_cpu_time() const; }; #endif // SHARE_GC_G1_G1REMSETSUMMARY_HPP diff --git a/src/hotspot/share/gc/g1/g1ReviseYoungLengthTask.cpp b/src/hotspot/share/gc/g1/g1ReviseYoungLengthTask.cpp new file mode 100644 index 00000000000..2f7acd9b710 --- /dev/null +++ b/src/hotspot/share/gc/g1/g1ReviseYoungLengthTask.cpp @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "gc/g1/g1CollectedHeap.hpp" +#include "gc/g1/g1Policy.hpp" +#include "gc/g1/g1ReviseYoungLengthTask.hpp" +#include "gc/g1/g1ServiceThread.hpp" +#include "gc/shared/suspendibleThreadSet.hpp" + + +jlong G1ReviseYoungLengthTask::reschedule_delay_ms() const { + G1Policy* policy = G1CollectedHeap::heap()->policy(); + size_t available_bytes; + if (policy->try_get_available_bytes_estimate(available_bytes)) { + double predicted_time_to_next_gc_ms = policy->predict_time_to_next_gc_ms(available_bytes); + + // Use a prime number close to 50ms as minimum time, different to other components + // that derive their wait time from the try_get_available_bytes_estimate() call + // to minimize interference. + uint64_t const min_wait_time_ms = 47; + + return policy->adjust_wait_time_ms(predicted_time_to_next_gc_ms, min_wait_time_ms); + } else { + // Failed to get estimate of available bytes. Try again asap. 
+ return 1; + } +} + +class G1ReviseYoungLengthTask::RemSetSamplingClosure : public G1HeapRegionClosure { + size_t _sampled_code_root_rs_length; + +public: + RemSetSamplingClosure() : _sampled_code_root_rs_length(0) { } + + bool do_heap_region(G1HeapRegion* r) override { + G1HeapRegionRemSet* rem_set = r->rem_set(); + _sampled_code_root_rs_length += rem_set->code_roots_list_length(); + return false; + } + + size_t sampled_code_root_rs_length() const { return _sampled_code_root_rs_length; } +}; + +void G1ReviseYoungLengthTask::adjust_young_list_target_length() { + G1CollectedHeap* g1h = G1CollectedHeap::heap(); + G1Policy* policy = g1h->policy(); + + assert(policy->use_adaptive_young_list_length(), "should not call otherwise"); + + size_t pending_cards; + size_t current_to_collection_set_cards; + { + MutexLocker x(G1ReviseYoungLength_lock, Mutex::_no_safepoint_check_flag); + pending_cards = policy->current_pending_cards(); + current_to_collection_set_cards = policy->current_to_collection_set_cards(); + } + + RemSetSamplingClosure cl; + g1h->collection_set()->iterate(&cl); + + policy->revise_young_list_target_length(pending_cards, + current_to_collection_set_cards, + cl.sampled_code_root_rs_length()); +} + +G1ReviseYoungLengthTask::G1ReviseYoungLengthTask(const char* name) : + G1ServiceTask(name) { } + +void G1ReviseYoungLengthTask::execute() { + SuspendibleThreadSetJoiner sts; + + adjust_young_list_target_length(); + + schedule(reschedule_delay_ms()); +} diff --git a/src/hotspot/share/gc/g1/g1ReviseYoungLengthTask.hpp b/src/hotspot/share/gc/g1/g1ReviseYoungLengthTask.hpp new file mode 100644 index 00000000000..baa8af75fb7 --- /dev/null +++ b/src/hotspot/share/gc/g1/g1ReviseYoungLengthTask.hpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef SHARE_GC_G1_G1REVISEYOUNGLENGTHTASK_HPP +#define SHARE_GC_G1_G1REVISEYOUNGLENGTHTASK_HPP + +#include "gc/g1/g1CardSetMemory.hpp" +#include "gc/g1/g1HeapRegionRemSet.hpp" +#include "gc/g1/g1MonotonicArenaFreePool.hpp" +#include "gc/g1/g1ServiceThread.hpp" +#include "utilities/growableArray.hpp" +#include "utilities/ticks.hpp" + +// ServiceTask to revise the young generation target length. +class G1ReviseYoungLengthTask : public G1ServiceTask { + + // The delay used to reschedule this task. + jlong reschedule_delay_ms() const; + + class RemSetSamplingClosure; // Helper class for calculating remembered set summary. 
+ + // Adjust the target length (in regions) of the young gen, based on the + // current length of the remembered sets. + // + // At the end of the GC G1 determines the length of the young gen based on + // how much time the next GC can take, and when the next GC may occur + // according to the MMU. + // + // The assumption is that a significant part of the GC is spent on scanning + // the remembered sets (and many other components), so this thread constantly + // reevaluates the prediction for the remembered set scanning costs, and potentially + // resizes the young gen. This may do a premature GC or even increase the young + // gen size to keep pause time length goal. + void adjust_young_list_target_length(); + +public: + explicit G1ReviseYoungLengthTask(const char* name); + + void execute() override; +}; + +#endif // SHARE_GC_G1_G1REVISEYOUNGLENGTHTASK_HPP \ No newline at end of file diff --git a/src/hotspot/share/gc/g1/g1ThreadLocalData.hpp b/src/hotspot/share/gc/g1/g1ThreadLocalData.hpp index d0dcb59d7f0..858081b0581 100644 --- a/src/hotspot/share/gc/g1/g1ThreadLocalData.hpp +++ b/src/hotspot/share/gc/g1/g1ThreadLocalData.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -25,7 +25,7 @@ #define SHARE_GC_G1_G1THREADLOCALDATA_HPP #include "gc/g1/g1BarrierSet.hpp" -#include "gc/g1/g1DirtyCardQueue.hpp" +#include "gc/g1/g1CardTable.hpp" #include "gc/g1/g1RegionPinCache.hpp" #include "gc/shared/gc_globals.hpp" #include "gc/shared/satbMarkQueue.hpp" @@ -36,7 +36,7 @@ class G1ThreadLocalData { private: SATBMarkQueue _satb_mark_queue; - G1DirtyCardQueue _dirty_card_queue; + G1CardTable::CardValue* _byte_map_base; // Per-thread cache of pinned object count to reduce atomic operation traffic // due to region pinning. 
Holds the last region where the mutator pinned an @@ -45,8 +45,8 @@ private: G1ThreadLocalData() : _satb_mark_queue(&G1BarrierSet::satb_mark_queue_set()), - _dirty_card_queue(&G1BarrierSet::dirty_card_queue_set()), - _pin_cache() {} + _byte_map_base(nullptr), + _pin_cache() { } static G1ThreadLocalData* data(Thread* thread) { assert(UseG1GC, "Sanity"); @@ -57,10 +57,6 @@ private: return Thread::gc_data_offset() + byte_offset_of(G1ThreadLocalData, _satb_mark_queue); } - static ByteSize dirty_card_queue_offset() { - return Thread::gc_data_offset() + byte_offset_of(G1ThreadLocalData, _dirty_card_queue); - } - public: static void create(Thread* thread) { new (data(thread)) G1ThreadLocalData(); @@ -74,10 +70,6 @@ public: return data(thread)->_satb_mark_queue; } - static G1DirtyCardQueue& dirty_card_queue(Thread* thread) { - return data(thread)->_dirty_card_queue; - } - static ByteSize satb_mark_queue_active_offset() { return satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active(); } @@ -90,14 +82,20 @@ public: return satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_buf(); } - static ByteSize dirty_card_queue_index_offset() { - return dirty_card_queue_offset() + G1DirtyCardQueue::byte_offset_of_index(); + static ByteSize card_table_base_offset() { + return Thread::gc_data_offset() + byte_offset_of(G1ThreadLocalData, _byte_map_base); } - static ByteSize dirty_card_queue_buffer_offset() { - return dirty_card_queue_offset() + G1DirtyCardQueue::byte_offset_of_buf(); + static void set_byte_map_base(Thread* thread, G1CardTable::CardValue* new_byte_map_base) { + data(thread)->_byte_map_base = new_byte_map_base; } +#ifndef PRODUCT + static G1CardTable::CardValue* get_byte_map_base(Thread* thread) { + return data(thread)->_byte_map_base; + } +#endif + static G1RegionPinCache& pin_count_cache(Thread* thread) { return data(thread)->_pin_cache; } diff --git a/src/hotspot/share/gc/g1/g1YoungCollector.cpp b/src/hotspot/share/gc/g1/g1YoungCollector.cpp index ee25e5fc028..e97e59575e3 100644 --- a/src/hotspot/share/gc/g1/g1YoungCollector.cpp +++ b/src/hotspot/share/gc/g1/g1YoungCollector.cpp @@ -39,7 +39,6 @@ #include "gc/g1/g1MonitoringSupport.hpp" #include "gc/g1/g1ParScanThreadState.inline.hpp" #include "gc/g1/g1Policy.hpp" -#include "gc/g1/g1RedirtyCardsQueue.hpp" #include "gc/g1/g1RegionPinCache.inline.hpp" #include "gc/g1/g1RemSet.hpp" #include "gc/g1/g1RootProcessor.hpp" @@ -914,13 +913,8 @@ class G1STWRefProcProxyTask : public RefProcProxyTask { TaskTerminator _terminator; G1ScannerTasksQueueSet& _task_queues; - // Special closure for enqueuing discovered fields: during enqueue the card table - // may not be in shape to properly handle normal barrier calls (e.g. card marks - // in regions that failed evacuation, scribbling of various values by card table - // scan code). Additionally the regular barrier enqueues into the "global" - // DCQS, but during GC we need these to-be-refined entries in the GC local queue - // so that after clearing the card table, the redirty cards phase will properly - // mark all dirty cards to be picked up by refinement. + // G1 specific closure for marking discovered fields. Need to mark the card in the + // refinement table as the card table is in use by garbage collection. 
class G1EnqueueDiscoveredFieldClosure : public EnqueueDiscoveredFieldClosure { G1CollectedHeap* _g1h; G1ParScanThreadState* _pss; diff --git a/src/hotspot/share/gc/g1/g1YoungCollector.hpp b/src/hotspot/share/gc/g1/g1YoungCollector.hpp index 2c4929958fe..76d443b1a9f 100644 --- a/src/hotspot/share/gc/g1/g1YoungCollector.hpp +++ b/src/hotspot/share/gc/g1/g1YoungCollector.hpp @@ -45,7 +45,6 @@ class G1MonotonicArenaMemoryStats; class G1NewTracer; class G1ParScanThreadStateSet; class G1Policy; -class G1RedirtyCardsQueueSet; class G1RemSet; class G1SurvivorRegions; class G1YoungGCAllocationFailureInjector; diff --git a/src/hotspot/share/gc/g1/g1YoungGCPostEvacuateTasks.cpp b/src/hotspot/share/gc/g1/g1YoungGCPostEvacuateTasks.cpp index 5b13e8fc206..2737def7e84 100644 --- a/src/hotspot/share/gc/g1/g1YoungGCPostEvacuateTasks.cpp +++ b/src/hotspot/share/gc/g1/g1YoungGCPostEvacuateTasks.cpp @@ -287,7 +287,7 @@ public: _chunk_bitmap(mtGC) { _num_evac_fail_regions = _evac_failure_regions->num_regions_evac_failed(); - _num_chunks_per_region = G1CollectedHeap::get_chunks_per_region(); + _num_chunks_per_region = G1CollectedHeap::get_chunks_per_region_for_scan(); _chunk_size = static_cast(G1HeapRegion::GrainWords / _num_chunks_per_region); @@ -300,7 +300,7 @@ public: double worker_cost() const override { assert(_evac_failure_regions->has_regions_evac_failed(), "Should not call this if there were no evacuation failures"); - double workers_per_region = (double)G1CollectedHeap::get_chunks_per_region() / G1RestoreRetainedRegionChunksPerWorker; + double workers_per_region = (double)G1CollectedHeap::get_chunks_per_region_for_scan() / G1RestoreRetainedRegionChunksPerWorker; return workers_per_region * _evac_failure_regions->num_regions_evac_failed(); } @@ -480,43 +480,6 @@ public: } }; -class RedirtyLoggedCardTableEntryClosure : public G1CardTableEntryClosure { - size_t _num_dirtied; - G1CollectedHeap* _g1h; - G1CardTable* _g1_ct; - G1EvacFailureRegions* _evac_failure_regions; - - G1HeapRegion* region_for_card(CardValue* card_ptr) const { - return _g1h->heap_region_containing(_g1_ct->addr_for(card_ptr)); - } - - bool will_become_free(G1HeapRegion* hr) const { - // A region will be freed by during the FreeCollectionSet phase if the region is in the - // collection set and has not had an evacuation failure. - return _g1h->is_in_cset(hr) && !_evac_failure_regions->contains(hr->hrm_index()); - } - -public: - RedirtyLoggedCardTableEntryClosure(G1CollectedHeap* g1h, G1EvacFailureRegions* evac_failure_regions) : - G1CardTableEntryClosure(), - _num_dirtied(0), - _g1h(g1h), - _g1_ct(g1h->card_table()), - _evac_failure_regions(evac_failure_regions) { } - - void do_card_ptr(CardValue* card_ptr) override { - G1HeapRegion* hr = region_for_card(card_ptr); - - // Should only dirty cards in regions that won't be freed. 
- if (!will_become_free(hr)) { - *card_ptr = G1CardTable::dirty_card_val(); - _num_dirtied++; - } - } - - size_t num_dirtied() const { return _num_dirtied; } -}; - class G1PostEvacuateCollectionSetCleanupTask2::ProcessEvacuationFailedRegionsTask : public G1AbstractSubTask { G1EvacFailureRegions* _evac_failure_regions; G1HeapRegionClaimer _claimer; @@ -572,48 +535,6 @@ public: } }; -class G1PostEvacuateCollectionSetCleanupTask2::RedirtyLoggedCardsTask : public G1AbstractSubTask { - BufferNodeList* _rdc_buffers; - uint _num_buffer_lists; - G1EvacFailureRegions* _evac_failure_regions; - -public: - RedirtyLoggedCardsTask(G1EvacFailureRegions* evac_failure_regions, BufferNodeList* rdc_buffers, uint num_buffer_lists) : - G1AbstractSubTask(G1GCPhaseTimes::RedirtyCards), - _rdc_buffers(rdc_buffers), - _num_buffer_lists(num_buffer_lists), - _evac_failure_regions(evac_failure_regions) { } - - double worker_cost() const override { - // Needs more investigation. - return G1CollectedHeap::heap()->workers()->active_workers(); - } - - void do_work(uint worker_id) override { - RedirtyLoggedCardTableEntryClosure cl(G1CollectedHeap::heap(), _evac_failure_regions); - - uint start = worker_id; - for (uint i = 0; i < _num_buffer_lists; i++) { - uint index = (start + i) % _num_buffer_lists; - - BufferNode* next = AtomicAccess::load(&_rdc_buffers[index]._head); - BufferNode* tail = AtomicAccess::load(&_rdc_buffers[index]._tail); - - while (next != nullptr) { - BufferNode* node = next; - next = AtomicAccess::cmpxchg(&_rdc_buffers[index]._head, node, (node != tail ) ? node->next() : nullptr); - if (next == node) { - cl.apply_to_buffer(node, worker_id); - next = (node != tail ) ? node->next() : nullptr; - } else { - break; // If there is contention, move to the next BufferNodeList - } - } - } - record_work_item(worker_id, 0, cl.num_dirtied()); - } -}; - // Helper class to keep statistics for the collection set freeing class FreeCSetStats { size_t _before_used_bytes; // Usage in regions successfully evacuate @@ -797,7 +718,6 @@ public: JFREventForRegion event(r, _worker_id); TimerForRegion timer(timer_for_region(r)); - if (r->is_young()) { assert_tracks_surviving_words(r); r->record_surv_words_in_group(_surviving_young_words[r->young_index_in_cset()]); @@ -908,24 +828,34 @@ public: } }; -class G1PostEvacuateCollectionSetCleanupTask2::ResizeTLABsTask : public G1AbstractSubTask { +class G1PostEvacuateCollectionSetCleanupTask2::ResizeTLABsAndSwapCardTableTask : public G1AbstractSubTask { G1JavaThreadsListClaimer _claimer; // There is not much work per thread so the number of threads per worker is high. 
static const uint ThreadsPerWorker = 250; public: - ResizeTLABsTask() : G1AbstractSubTask(G1GCPhaseTimes::ResizeThreadLABs), _claimer(ThreadsPerWorker) { } + ResizeTLABsAndSwapCardTableTask() + : G1AbstractSubTask(G1GCPhaseTimes::ResizeThreadLABs), _claimer(ThreadsPerWorker) + { + G1BarrierSet::g1_barrier_set()->swap_global_card_table(); + } void do_work(uint worker_id) override { - class ResizeClosure : public ThreadClosure { + + class ResizeAndSwapCardTableClosure : public ThreadClosure { public: void do_thread(Thread* thread) { - static_cast<JavaThread*>(thread)->tlab().resize(); + if (UseTLAB && ResizeTLAB) { + static_cast<JavaThread*>(thread)->tlab().resize(); + } + + G1BarrierSet::g1_barrier_set()->update_card_table_base(thread); } - } cl; - _claimer.apply(&cl); + } resize_and_swap_cl; + + _claimer.apply(&resize_and_swap_cl); } double worker_cost() const override { @@ -968,13 +898,8 @@ G1PostEvacuateCollectionSetCleanupTask2::G1PostEvacuateCollectionSetCleanupTask2 if (evac_failure_regions->has_regions_evac_failed()) { add_parallel_task(new ProcessEvacuationFailedRegionsTask(evac_failure_regions)); } - add_parallel_task(new RedirtyLoggedCardsTask(evac_failure_regions, - per_thread_states->rdc_buffers(), - per_thread_states->num_workers())); - if (UseTLAB && ResizeTLAB) { - add_parallel_task(new ResizeTLABsTask()); - } + add_parallel_task(new ResizeTLABsAndSwapCardTableTask()); add_parallel_task(new FreeCollectionSetTask(evacuation_info, per_thread_states->surviving_young_words(), evac_failure_regions)); diff --git a/src/hotspot/share/gc/g1/g1YoungGCPostEvacuateTasks.hpp b/src/hotspot/share/gc/g1/g1YoungGCPostEvacuateTasks.hpp index ad850af2eac..bc3a08e2080 100644 --- a/src/hotspot/share/gc/g1/g1YoungGCPostEvacuateTasks.hpp +++ b/src/hotspot/share/gc/g1/g1YoungGCPostEvacuateTasks.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2021, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -55,9 +55,8 @@ public: // - Eagerly Reclaim Humongous Objects (s) // - Update Derived Pointers (s) // - Clear Retained Region Data (on evacuation failure) -// - Redirty Logged Cards // - Free Collection Set -// - Resize TLABs +// - Resize TLABs and Swap Card Table // - Reset the reusable PartialArrayStateManager.
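The combined ResizeTLABsAndSwapCardTableTask above concentrates the table exchange at one point in the pause: its constructor runs once, before the parallel workers start, and calls swap_global_card_table(); each worker then walks its claimed slice of Java threads, optionally resizing the TLAB and re-pointing the thread's card table base via update_card_table_base(). A rough, self-contained sketch of that two-step shape, with placeholder types (CardTablePtrHolder, MutatorThread) and guessed bodies rather than the HotSpot implementations:

    #include <atomic>

    struct CardTablePtrHolder { std::atomic<void*> current; void* refinement; };
    struct MutatorThread      { void* card_table_base; };

    // Step 1 (serial, in the task constructor): the two global tables trade roles.
    static void swap_global_card_table(CardTablePtrHolder& tables) {
      void* old_current = tables.current.exchange(tables.refinement);
      tables.refinement = old_current;
    }

    // Step 2 (parallel, once per claimed Java thread): publish the new base so the
    // thread's write barrier marks into the other table after the pause.
    static void update_card_table_base(CardTablePtrHolder& tables, MutatorThread& t) {
      t.card_table_base = tables.current.load();
    }

Folding the base update into the existing TLAB-resize walk avoids a second pass over all Java threads at the end of the pause.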
class G1PostEvacuateCollectionSetCleanupTask2 : public G1BatchedTask { class EagerlyReclaimHumongousObjectsTask; @@ -66,9 +65,8 @@ class G1PostEvacuateCollectionSetCleanupTask2 : public G1BatchedTask { #endif class ProcessEvacuationFailedRegionsTask; - class RedirtyLoggedCardsTask; class FreeCollectionSetTask; - class ResizeTLABsTask; + class ResizeTLABsAndSwapCardTableTask; class ResetPartialArrayStateManagerTask; public: diff --git a/src/hotspot/share/gc/g1/g1YoungGCPreEvacuateTasks.cpp b/src/hotspot/share/gc/g1/g1YoungGCPreEvacuateTasks.cpp index 7214d624def..b11213ddeb3 100644 --- a/src/hotspot/share/gc/g1/g1YoungGCPreEvacuateTasks.cpp +++ b/src/hotspot/share/gc/g1/g1YoungGCPreEvacuateTasks.cpp @@ -24,7 +24,6 @@ #include "gc/g1/g1CollectedHeap.inline.hpp" #include "gc/g1/g1ConcurrentRefineStats.hpp" -#include "gc/g1/g1DirtyCardQueue.hpp" #include "gc/g1/g1RegionPinCache.inline.hpp" #include "gc/g1/g1ThreadLocalData.hpp" #include "gc/g1/g1YoungGCPreEvacuateTasks.hpp" @@ -35,23 +34,21 @@ #include "runtime/thread.inline.hpp" #include "runtime/threads.hpp" -class G1PreEvacuateCollectionSetBatchTask::JavaThreadRetireTLABAndFlushLogs : public G1AbstractSubTask { +class G1PreEvacuateCollectionSetBatchTask::JavaThreadRetireTLABs : public G1AbstractSubTask { G1JavaThreadsListClaimer _claimer; // Per worker thread statistics. ThreadLocalAllocStats* _local_tlab_stats; - G1ConcurrentRefineStats* _local_refinement_stats; uint _num_workers; // There is relatively little work to do per thread. static const uint ThreadsPerWorker = 250; - struct RetireTLABAndFlushLogsClosure : public ThreadClosure { + struct RetireTLABClosure : public ThreadClosure { ThreadLocalAllocStats _tlab_stats; - G1ConcurrentRefineStats _refinement_stats; - RetireTLABAndFlushLogsClosure() : _tlab_stats(), _refinement_stats() { } + RetireTLABClosure() : _tlab_stats() { } void do_thread(Thread* thread) override { assert(thread->is_Java_thread(), "must be"); @@ -61,37 +58,29 @@ class G1PreEvacuateCollectionSetBatchTask::JavaThreadRetireTLABAndFlushLogs : pu if (UseTLAB) { thread->retire_tlab(&_tlab_stats); } - // Concatenate logs. - G1DirtyCardQueueSet& qset = G1BarrierSet::dirty_card_queue_set(); - _refinement_stats += qset.concatenate_log_and_stats(thread); // Flush region pin count cache. 
G1ThreadLocalData::pin_count_cache(thread).flush(); } }; public: - JavaThreadRetireTLABAndFlushLogs() : - G1AbstractSubTask(G1GCPhaseTimes::RetireTLABsAndFlushLogs), + JavaThreadRetireTLABs() : + G1AbstractSubTask(G1GCPhaseTimes::RetireTLABs), _claimer(ThreadsPerWorker), _local_tlab_stats(nullptr), - _local_refinement_stats(nullptr), _num_workers(0) { } - ~JavaThreadRetireTLABAndFlushLogs() { - static_assert(std::is_trivially_destructible::value, "must be"); - FREE_C_HEAP_ARRAY(G1ConcurrentRefineStats, _local_refinement_stats); - + ~JavaThreadRetireTLABs() { static_assert(std::is_trivially_destructible::value, "must be"); FREE_C_HEAP_ARRAY(ThreadLocalAllocStats, _local_tlab_stats); } void do_work(uint worker_id) override { - RetireTLABAndFlushLogsClosure tc; + RetireTLABClosure tc; _claimer.apply(&tc); _local_tlab_stats[worker_id] = tc._tlab_stats; - _local_refinement_stats[worker_id] = tc._refinement_stats; } double worker_cost() const override { @@ -101,11 +90,9 @@ public: void set_max_workers(uint max_workers) override { _num_workers = max_workers; _local_tlab_stats = NEW_C_HEAP_ARRAY(ThreadLocalAllocStats, _num_workers, mtGC); - _local_refinement_stats = NEW_C_HEAP_ARRAY(G1ConcurrentRefineStats, _num_workers, mtGC); for (uint i = 0; i < _num_workers; i++) { ::new (&_local_tlab_stats[i]) ThreadLocalAllocStats(); - ::new (&_local_refinement_stats[i]) G1ConcurrentRefineStats(); } } @@ -116,85 +103,15 @@ public: } return result; } - - G1ConcurrentRefineStats refinement_stats() const { - G1ConcurrentRefineStats result; - for (uint i = 0; i < _num_workers; i++) { - result += _local_refinement_stats[i]; - } - return result; - } -}; - -class G1PreEvacuateCollectionSetBatchTask::NonJavaThreadFlushLogs : public G1AbstractSubTask { - struct FlushLogsClosure : public ThreadClosure { - G1ConcurrentRefineStats _refinement_stats; - - FlushLogsClosure() : _refinement_stats() { } - - void do_thread(Thread* thread) override { - G1DirtyCardQueueSet& qset = G1BarrierSet::dirty_card_queue_set(); - _refinement_stats += qset.concatenate_log_and_stats(thread); - - assert(G1ThreadLocalData::pin_count_cache(thread).count() == 0, "NonJava thread has pinned Java objects"); - } - } _tc; - -public: - NonJavaThreadFlushLogs() : G1AbstractSubTask(G1GCPhaseTimes::NonJavaThreadFlushLogs), _tc() { } - - void do_work(uint worker_id) override { - Threads::non_java_threads_do(&_tc); - } - - double worker_cost() const override { - return 1.0; - } - - G1ConcurrentRefineStats refinement_stats() const { return _tc._refinement_stats; } }; G1PreEvacuateCollectionSetBatchTask::G1PreEvacuateCollectionSetBatchTask() : G1BatchedTask("Pre Evacuate Prepare", G1CollectedHeap::heap()->phase_times()), - _old_pending_cards(G1BarrierSet::dirty_card_queue_set().num_cards()), - _java_retire_task(new JavaThreadRetireTLABAndFlushLogs()), - _non_java_retire_task(new NonJavaThreadFlushLogs()) { + _java_retire_task(new JavaThreadRetireTLABs()) { - // Disable mutator refinement until concurrent refinement decides otherwise. 
- G1BarrierSet::dirty_card_queue_set().set_mutator_refinement_threshold(SIZE_MAX); - - add_serial_task(_non_java_retire_task); add_parallel_task(_java_retire_task); } -static void verify_empty_dirty_card_logs() { -#ifdef ASSERT - ResourceMark rm; - - struct Verifier : public ThreadClosure { - Verifier() {} - void do_thread(Thread* t) override { - G1DirtyCardQueue& queue = G1ThreadLocalData::dirty_card_queue(t); - assert(queue.is_empty(), "non-empty dirty card queue for thread %s", t->name()); - } - } verifier; - Threads::threads_do(&verifier); -#endif -} - G1PreEvacuateCollectionSetBatchTask::~G1PreEvacuateCollectionSetBatchTask() { _java_retire_task->tlab_stats().publish(); - - G1DirtyCardQueueSet& qset = G1BarrierSet::dirty_card_queue_set(); - - G1ConcurrentRefineStats total_refinement_stats; - total_refinement_stats += _java_retire_task->refinement_stats(); - total_refinement_stats += _non_java_retire_task->refinement_stats(); - qset.update_refinement_stats(total_refinement_stats); - - verify_empty_dirty_card_logs(); - - size_t pending_cards = qset.num_cards(); - size_t thread_buffer_cards = pending_cards - _old_pending_cards; - G1CollectedHeap::heap()->policy()->record_concurrent_refinement_stats(pending_cards, thread_buffer_cards); } diff --git a/src/hotspot/share/gc/g1/g1YoungGCPreEvacuateTasks.hpp b/src/hotspot/share/gc/g1/g1YoungGCPreEvacuateTasks.hpp index 791031d979f..7574862872c 100644 --- a/src/hotspot/share/gc/g1/g1YoungGCPreEvacuateTasks.hpp +++ b/src/hotspot/share/gc/g1/g1YoungGCPreEvacuateTasks.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2023, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -28,18 +28,13 @@ #include "gc/g1/g1BatchedTask.hpp" // Set of pre evacuate collection set tasks containing ("s" means serial): -// - Retire TLAB and Flush Logs (Java threads) +// - Retire TLABs (Java threads) // - Flush pin count cache (Java threads) -// - Flush Logs (s) (Non-Java threads) class G1PreEvacuateCollectionSetBatchTask : public G1BatchedTask { - class JavaThreadRetireTLABAndFlushLogs; - class NonJavaThreadFlushLogs; - - size_t _old_pending_cards; + class JavaThreadRetireTLABs; // References to the tasks to retain access to statistics. 
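With the per-thread queues gone, the pre-evacuate batch above shrinks to TLAB retirement and pin-count-cache flushing; there are no logs to concatenate and no detached refinement statistics to merge. Whether refinement work is worthwhile is instead driven by the number of dirty cards sitting on the card table: the g1_globals.hpp hunk below replaces G1UpdateBufferSize with the diagnostic G1PerThreadPendingCardThreshold, the number of pending cards allowed per GC worker thread before refinement is considered. A tiny sketch of that kind of trigger check, using a hypothetical helper (should_start_refinement) rather than anything in the patch:

    #include <cstddef>

    // Hypothetical trigger shape: with few enough pending cards it is cheaper to
    // leave them to be merged during the next pause than to start refinement.
    static bool should_start_refinement(size_t pending_cards,
                                        unsigned gc_workers,
                                        size_t per_thread_threshold /* G1PerThreadPendingCardThreshold */) {
      return pending_cards > static_cast<size_t>(gc_workers) * per_thread_threshold;
    }

The Juggle2 test added later in this patch sets the threshold to zero (together with -XX:+UnlockDiagnosticVMOptions) so that refinement starts even with very few pending cards, stressing its interaction with (mostly) full collections.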
- JavaThreadRetireTLABAndFlushLogs* _java_retire_task; - NonJavaThreadFlushLogs* _non_java_retire_task; + JavaThreadRetireTLABs* _java_retire_task; public: G1PreEvacuateCollectionSetBatchTask(); diff --git a/src/hotspot/share/gc/g1/g1_globals.hpp b/src/hotspot/share/gc/g1/g1_globals.hpp index 1c712492f74..b338c11d5be 100644 --- a/src/hotspot/share/gc/g1/g1_globals.hpp +++ b/src/hotspot/share/gc/g1/g1_globals.hpp @@ -162,6 +162,11 @@ "a single expand attempt.") \ range(0, 100) \ \ + product(size_t, G1PerThreadPendingCardThreshold, 256, DIAGNOSTIC, \ + "Number of pending cards allowed on the card table per GC " \ + "worker thread before considering starting refinement.") \ + range(0, UINT_MAX) \ + \ product(uint, G1ShrinkByPercentOfAvailable, 50, DIAGNOSTIC, \ "When shrinking, maximum % of free space to free for a single " \ "shrink attempt.") \ @@ -188,10 +193,6 @@ "bound of acceptable deviation range.") \ constraint(G1CPUUsageShrinkConstraintFunc, AfterErgo) \ \ - product(size_t, G1UpdateBufferSize, 256, \ - "Size of an update buffer") \ - constraint(G1UpdateBufferSizeConstraintFunc, AfterErgo) \ - \ product(uint, G1RSetUpdatingPauseTimePercent, 10, \ "A target percentage of time that is allowed to be spend on " \ "processing remembered set update buffers during the collection " \ diff --git a/src/hotspot/share/gc/g1/jvmFlagConstraintsG1.cpp b/src/hotspot/share/gc/g1/jvmFlagConstraintsG1.cpp index 488a9c7aac9..2b084b387bc 100644 --- a/src/hotspot/share/gc/g1/jvmFlagConstraintsG1.cpp +++ b/src/hotspot/share/gc/g1/jvmFlagConstraintsG1.cpp @@ -206,12 +206,6 @@ JVMFlag::Error G1SATBBufferSizeConstraintFunc(size_t value, bool verbose) { verbose); } -JVMFlag::Error G1UpdateBufferSizeConstraintFunc(size_t value, bool verbose) { - return buffer_size_constraint_helper(FLAG_MEMBER_ENUM(G1UpdateBufferSize), - value, - verbose); -} - JVMFlag::Error gc_cpu_usage_threshold_helper(JVMFlagsEnum flagid, uint value, bool verbose) { diff --git a/src/hotspot/share/gc/g1/jvmFlagConstraintsG1.hpp b/src/hotspot/share/gc/g1/jvmFlagConstraintsG1.hpp index 89f05d73dcc..b2c7bb6dc96 100644 --- a/src/hotspot/share/gc/g1/jvmFlagConstraintsG1.hpp +++ b/src/hotspot/share/gc/g1/jvmFlagConstraintsG1.hpp @@ -47,7 +47,6 @@ \ /* G1 PtrQueue buffer size constraints */ \ f(size_t, G1SATBBufferSizeConstraintFunc) \ - f(size_t, G1UpdateBufferSizeConstraintFunc) \ \ /* G1 GC deviation counter threshold constraints */ \ f(uint, G1CPUUsageExpandConstraintFunc) \ diff --git a/src/hotspot/share/gc/g1/vmStructs_g1.hpp b/src/hotspot/share/gc/g1/vmStructs_g1.hpp index 651808b4ba0..67c930e1b63 100644 --- a/src/hotspot/share/gc/g1/vmStructs_g1.hpp +++ b/src/hotspot/share/gc/g1/vmStructs_g1.hpp @@ -82,8 +82,7 @@ declare_constant(G1HeapRegionType::StartsHumongousTag) \ declare_constant(G1HeapRegionType::ContinuesHumongousTag) \ declare_constant(G1HeapRegionType::OldMask) \ - declare_constant(BarrierSet::G1BarrierSet) \ - declare_constant(G1CardTable::g1_young_gen) + declare_constant(BarrierSet::G1BarrierSet) #define VM_TYPES_G1GC(declare_type, \ declare_toplevel_type, \ @@ -100,7 +99,6 @@ declare_toplevel_type(PtrQueue) \ declare_toplevel_type(G1HeapRegionType) \ declare_toplevel_type(SATBMarkQueue) \ - declare_toplevel_type(G1DirtyCardQueue) \ \ declare_toplevel_type(G1CollectedHeap*) \ declare_toplevel_type(G1HeapRegion*) \ diff --git a/src/hotspot/share/gc/shared/bufferNodeList.cpp b/src/hotspot/share/gc/shared/bufferNodeList.cpp deleted file mode 100644 index 768f40e0985..00000000000 --- a/src/hotspot/share/gc/shared/bufferNodeList.cpp 
+++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2019, 2025, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - * - */ - -#include "gc/shared/bufferNodeList.hpp" -#include "utilities/debug.hpp" - -BufferNodeList::BufferNodeList() : - _head(nullptr), _tail(nullptr), _entry_count(0) {} - -BufferNodeList::BufferNodeList(BufferNode* head, - BufferNode* tail, - size_t entry_count) : - _head(head), _tail(tail), _entry_count(entry_count) -{ - assert((_head == nullptr) == (_tail == nullptr), "invariant"); - assert((_head == nullptr) == (_entry_count == 0), "invariant"); -} diff --git a/src/hotspot/share/gc/shared/cardTable.cpp b/src/hotspot/share/gc/shared/cardTable.cpp index 76b8eb4d718..34f1847befe 100644 --- a/src/hotspot/share/gc/shared/cardTable.cpp +++ b/src/hotspot/share/gc/shared/cardTable.cpp @@ -225,6 +225,9 @@ uintx CardTable::ct_max_alignment_constraint() { #ifndef PRODUCT void CardTable::verify_region(MemRegion mr, CardValue val, bool val_equals) { + if (mr.is_empty()) { + return; + } CardValue* start = byte_for(mr.start()); CardValue* end = byte_for(mr.last()); bool failures = false; @@ -255,7 +258,8 @@ void CardTable::verify_dirty_region(MemRegion mr) { } #endif -void CardTable::print_on(outputStream* st) const { - st->print_cr("Card table byte_map: [" PTR_FORMAT "," PTR_FORMAT "] _byte_map_base: " PTR_FORMAT, +void CardTable::print_on(outputStream* st, const char* description) const { + st->print_cr("%s table byte_map: [" PTR_FORMAT "," PTR_FORMAT "] _byte_map_base: " PTR_FORMAT, + description, p2i(_byte_map), p2i(_byte_map + _byte_map_size), p2i(_byte_map_base)); } diff --git a/src/hotspot/share/gc/shared/cardTable.hpp b/src/hotspot/share/gc/shared/cardTable.hpp index ee41be06be0..63dcfe7aecb 100644 --- a/src/hotspot/share/gc/shared/cardTable.hpp +++ b/src/hotspot/share/gc/shared/cardTable.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2000, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -203,12 +203,12 @@ public: virtual bool is_in_young(const void* p) const = 0; - // Print a description of the memory for the card table - virtual void print_on(outputStream* st) const; + // Print card table information. 
+ void print_on(outputStream* st, const char* description = "Card") const; // val_equals -> it will check that all cards covered by mr equal val // !val_equals -> it will check that all cards covered by mr do not equal val - void verify_region(MemRegion mr, CardValue val, bool val_equals) PRODUCT_RETURN; + virtual void verify_region(MemRegion mr, CardValue val, bool val_equals) PRODUCT_RETURN; void verify_not_dirty_region(MemRegion mr) PRODUCT_RETURN; void verify_dirty_region(MemRegion mr) PRODUCT_RETURN; }; diff --git a/src/hotspot/share/gc/shared/workerDataArray.hpp b/src/hotspot/share/gc/shared/workerDataArray.hpp index b2a81bc9482..587f9bbd167 100644 --- a/src/hotspot/share/gc/shared/workerDataArray.hpp +++ b/src/hotspot/share/gc/shared/workerDataArray.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -34,7 +34,7 @@ template class WorkerDataArray : public CHeapObj { friend class WDAPrinter; public: - static const uint MaxThreadWorkItems = 9; + static const uint MaxThreadWorkItems = 10; private: T* _data; uint _length; diff --git a/src/hotspot/share/jvmci/jvmciRuntime.cpp b/src/hotspot/share/jvmci/jvmciRuntime.cpp index 137782f93ef..e75527235f0 100644 --- a/src/hotspot/share/jvmci/jvmciRuntime.cpp +++ b/src/hotspot/share/jvmci/jvmciRuntime.cpp @@ -589,10 +589,6 @@ void JVMCIRuntime::write_barrier_pre(JavaThread* thread, oopDesc* obj) { G1BarrierSetRuntime::write_ref_field_pre_entry(obj, thread); } -void JVMCIRuntime::write_barrier_post(JavaThread* thread, volatile CardValue* card_addr) { - G1BarrierSetRuntime::write_ref_field_post_entry(card_addr, thread); -} - #endif // INCLUDE_G1GC JRT_LEAF(jboolean, JVMCIRuntime::validate_object(JavaThread* thread, oopDesc* parent, oopDesc* child)) diff --git a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp index 3ddf7de0510..7ddb9be540a 100644 --- a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp +++ b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp @@ -560,6 +560,7 @@ declare_constant(BranchData::not_taken_off_set) \ \ declare_constant_with_value("CardTable::dirty_card", CardTable::dirty_card_val()) \ + declare_constant_with_value("CardTable::clean_card", CardTable::clean_card_val()) \ declare_constant_with_value("LockStack::_end_offset", LockStack::end_offset()) \ declare_constant_with_value("OMCache::oop_to_oop_difference", OMCache::oop_to_oop_difference()) \ declare_constant_with_value("OMCache::oop_to_monitor_difference", OMCache::oop_to_monitor_difference()) \ @@ -928,7 +929,6 @@ declare_function(JVMCIRuntime::vm_error) \ declare_function(JVMCIRuntime::load_and_clear_exception) \ G1GC_ONLY(declare_function(JVMCIRuntime::write_barrier_pre)) \ - G1GC_ONLY(declare_function(JVMCIRuntime::write_barrier_post)) \ SHENANDOAHGC_ONLY(declare_function(ShenandoahRuntime::load_reference_barrier_strong)) \ SHENANDOAHGC_ONLY(declare_function(ShenandoahRuntime::load_reference_barrier_strong_narrow)) \ SHENANDOAHGC_ONLY(declare_function(ShenandoahRuntime::load_reference_barrier_weak)) \ @@ -947,12 +947,10 @@ static_field(G1HeapRegion, LogOfHRGrainBytes, uint) #define VM_INT_CONSTANTS_JVMCI_G1GC(declare_constant, declare_constant_with_value, declare_preprocessor_constant) \ - declare_constant_with_value("G1CardTable::g1_young_gen", 
G1CardTable::g1_young_card_val()) \ declare_constant_with_value("G1ThreadLocalData::satb_mark_queue_active_offset", in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset())) \ declare_constant_with_value("G1ThreadLocalData::satb_mark_queue_index_offset", in_bytes(G1ThreadLocalData::satb_mark_queue_index_offset())) \ declare_constant_with_value("G1ThreadLocalData::satb_mark_queue_buffer_offset", in_bytes(G1ThreadLocalData::satb_mark_queue_buffer_offset())) \ - declare_constant_with_value("G1ThreadLocalData::dirty_card_queue_index_offset", in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset())) \ - declare_constant_with_value("G1ThreadLocalData::dirty_card_queue_buffer_offset", in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset())) + declare_constant_with_value("G1ThreadLocalData::card_table_base_offset", in_bytes(G1ThreadLocalData::card_table_base_offset())) \ #endif // INCLUDE_G1GC diff --git a/src/hotspot/share/oops/oop.cpp b/src/hotspot/share/oops/oop.cpp index 51480c68c22..f874a39bf31 100644 --- a/src/hotspot/share/oops/oop.cpp +++ b/src/hotspot/share/oops/oop.cpp @@ -87,7 +87,16 @@ void oopDesc::print_value_on(outputStream* st) const { java_lang_String::print(obj, st); print_address_on(st); } else { - klass()->oop_print_value_on(obj, st); + Klass* k = klass_without_asserts(); + if (k == nullptr) { + st->print("null klass"); + } else if (!Metaspace::contains(k)) { + st->print("klass not in Metaspace"); + } else if (!k->is_klass()) { + st->print("klass not a Klass"); + } else { + k->oop_print_value_on(obj, st); + } } } diff --git a/src/hotspot/share/runtime/arguments.cpp b/src/hotspot/share/runtime/arguments.cpp index 6cfeb1dcb0f..b7ab68e143c 100644 --- a/src/hotspot/share/runtime/arguments.cpp +++ b/src/hotspot/share/runtime/arguments.cpp @@ -546,6 +546,7 @@ static SpecialFlag const special_jvm_flags[] = { { "MetaspaceReclaimPolicy", JDK_Version::undefined(), JDK_Version::jdk(21), JDK_Version::undefined() }, { "ZGenerational", JDK_Version::jdk(23), JDK_Version::jdk(24), JDK_Version::undefined() }, { "ZMarkStackSpaceLimit", JDK_Version::undefined(), JDK_Version::jdk(25), JDK_Version::undefined() }, + { "G1UpdateBufferSize", JDK_Version::undefined(), JDK_Version::jdk(26), JDK_Version::jdk(27) }, #if defined(AARCH64) { "NearCpool", JDK_Version::undefined(), JDK_Version::jdk(25), JDK_Version::undefined() }, #endif diff --git a/src/hotspot/share/runtime/cpuTimeCounters.cpp b/src/hotspot/share/runtime/cpuTimeCounters.cpp index c7e48441662..e5364550b6c 100644 --- a/src/hotspot/share/runtime/cpuTimeCounters.cpp +++ b/src/hotspot/share/runtime/cpuTimeCounters.cpp @@ -36,6 +36,8 @@ const char* CPUTimeGroups::to_string(CPUTimeType val) { return "gc_conc_mark"; case CPUTimeType::gc_conc_refine: return "gc_conc_refine"; + case CPUTimeType::gc_conc_refine_control: + return "gc_conc_refine_control"; case CPUTimeType::gc_service: return "gc_service"; case CPUTimeType::vm: @@ -53,6 +55,7 @@ bool CPUTimeGroups::is_gc_counter(CPUTimeType val) { case CPUTimeType::gc_parallel_workers: case CPUTimeType::gc_conc_mark: case CPUTimeType::gc_conc_refine: + case CPUTimeType::gc_conc_refine_control: case CPUTimeType::gc_service: return true; default: diff --git a/src/hotspot/share/runtime/cpuTimeCounters.hpp b/src/hotspot/share/runtime/cpuTimeCounters.hpp index efa44f9173d..9ad00492731 100644 --- a/src/hotspot/share/runtime/cpuTimeCounters.hpp +++ b/src/hotspot/share/runtime/cpuTimeCounters.hpp @@ -40,6 +40,7 @@ public: gc_parallel_workers, gc_conc_mark, gc_conc_refine, + gc_conc_refine_control, 
gc_service, vm, conc_dedup, diff --git a/src/hotspot/share/runtime/mutexLocker.cpp b/src/hotspot/share/runtime/mutexLocker.cpp index e0eafbc416b..8274d767e4e 100644 --- a/src/hotspot/share/runtime/mutexLocker.cpp +++ b/src/hotspot/share/runtime/mutexLocker.cpp @@ -98,15 +98,15 @@ Mutex* PerfDataManager_lock = nullptr; #if INCLUDE_G1GC Monitor* G1CGC_lock = nullptr; -Mutex* G1DetachedRefinementStats_lock = nullptr; Mutex* G1FreeList_lock = nullptr; Mutex* G1MarkStackChunkList_lock = nullptr; Mutex* G1MarkStackFreeList_lock = nullptr; Monitor* G1OldGCCount_lock = nullptr; Mutex* G1OldSets_lock = nullptr; -Mutex* G1Uncommit_lock = nullptr; +Mutex* G1ReviseYoungLength_lock = nullptr; Monitor* G1RootRegionScan_lock = nullptr; Mutex* G1RareEvent_lock = nullptr; +Mutex* G1Uncommit_lock = nullptr; #endif Mutex* Management_lock = nullptr; @@ -211,7 +211,6 @@ void mutex_init() { #if INCLUDE_G1GC if (UseG1GC) { MUTEX_DEFN(G1CGC_lock , PaddedMonitor, nosafepoint); - MUTEX_DEFN(G1DetachedRefinementStats_lock, PaddedMutex , nosafepoint-2); MUTEX_DEFN(G1FreeList_lock , PaddedMutex , service-1); MUTEX_DEFN(G1MarkStackChunkList_lock , PaddedMutex , nosafepoint); MUTEX_DEFN(G1MarkStackFreeList_lock , PaddedMutex , nosafepoint); @@ -341,8 +340,9 @@ void mutex_init() { #if INCLUDE_G1GC if (UseG1GC) { - MUTEX_DEFL(G1OldGCCount_lock , PaddedMonitor, Threads_lock, true); - MUTEX_DEFL(G1RareEvent_lock , PaddedMutex , Threads_lock, true); + MUTEX_DEFL(G1OldGCCount_lock , PaddedMonitor, Threads_lock, true); + MUTEX_DEFL(G1RareEvent_lock , PaddedMutex , Threads_lock, true); + MUTEX_DEFL(G1ReviseYoungLength_lock , PaddedMutex , Threads_lock, true); } #endif diff --git a/src/hotspot/share/runtime/mutexLocker.hpp b/src/hotspot/share/runtime/mutexLocker.hpp index 3a73edc7bf2..8cd408c99c9 100644 --- a/src/hotspot/share/runtime/mutexLocker.hpp +++ b/src/hotspot/share/runtime/mutexLocker.hpp @@ -93,13 +93,13 @@ extern Mutex* FullGCALot_lock; // a lock to make FullGCALot MT #if INCLUDE_G1GC extern Monitor* G1CGC_lock; // used for coordination between fore- & background G1 concurrent GC threads. -extern Mutex* G1DetachedRefinementStats_lock; // Lock protecting detached refinement stats for G1. extern Mutex* G1FreeList_lock; // protects the G1 free region list during safepoints extern Mutex* G1MarkStackChunkList_lock; // Protects access to the G1 global mark stack chunk list. extern Mutex* G1MarkStackFreeList_lock; // Protects access to the G1 global mark stack free list. extern Monitor* G1OldGCCount_lock; // in support of "concurrent" full gc extern Mutex* G1OldSets_lock; // protects the G1 old region sets extern Mutex* G1RareEvent_lock; // Synchronizes (rare) parallel GC operations. +extern Mutex* G1ReviseYoungLength_lock; // Protects access to young gen length revising operations. extern Monitor* G1RootRegionScan_lock; // used to notify that the G1 CM threads have finished scanning the root regions extern Mutex* G1Uncommit_lock; // protects the G1 uncommit list when not at safepoints #endif diff --git a/src/hotspot/share/runtime/vmOperation.hpp b/src/hotspot/share/runtime/vmOperation.hpp index 89a806bb75d..ada5014beee 100644 --- a/src/hotspot/share/runtime/vmOperation.hpp +++ b/src/hotspot/share/runtime/vmOperation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -59,6 +59,7 @@ template(G1PauseRemark) \ template(G1PauseCleanup) \ template(G1TryInitiateConcMark) \ + template(G1RendezvousGCThreads) \ template(ZMarkEndOld) \ template(ZMarkEndYoung) \ template(ZMarkFlushOperation) \ diff --git a/test/hotspot/jtreg/compiler/gcbarriers/TestG1BarrierGeneration.java b/test/hotspot/jtreg/compiler/gcbarriers/TestG1BarrierGeneration.java index a9df0019ab1..01e015d50cb 100644 --- a/test/hotspot/jtreg/compiler/gcbarriers/TestG1BarrierGeneration.java +++ b/test/hotspot/jtreg/compiler/gcbarriers/TestG1BarrierGeneration.java @@ -506,10 +506,10 @@ public class TestG1BarrierGeneration { @Test @IR(failOn = IRNode.SAFEPOINT) @IR(applyIfAnd = {"UseCompressedOops", "false", "ReduceInitialCardMarks", "false"}, - counts = {IRNode.G1_STORE_P_WITH_BARRIER_FLAG, POST_ONLY, "1"}, + counts = {IRNode.G1_STORE_P_WITH_BARRIER_FLAG, POST_ONLY, ">1"}, phase = CompilePhase.FINAL_CODE) @IR(applyIfAnd = {"UseCompressedOops", "true", "ReduceInitialCardMarks", "false"}, - counts = {IRNode.G1_ENCODE_P_AND_STORE_N_WITH_BARRIER_FLAG, POST_ONLY, "1"}, + counts = {IRNode.G1_ENCODE_P_AND_STORE_N_WITH_BARRIER_FLAG, POST_ONLY, ">1"}, phase = CompilePhase.FINAL_CODE) @IR(applyIfAnd = {"UseCompressedOops", "false", "ReduceInitialCardMarks", "true"}, failOn = {IRNode.G1_STORE_P_WITH_BARRIER_FLAG, ANY}, diff --git a/test/hotspot/jtreg/gc/g1/TestGCLogMessages.java b/test/hotspot/jtreg/gc/g1/TestGCLogMessages.java index 17ae437358d..d28c0888579 100644 --- a/test/hotspot/jtreg/gc/g1/TestGCLogMessages.java +++ b/test/hotspot/jtreg/gc/g1/TestGCLogMessages.java @@ -108,8 +108,7 @@ public class TestGCLogMessages { new LogMessageWithLevel("Other:", Level.INFO), // Pre Evacuate Collection Set - new LogMessageWithLevel("JT Retire TLABs And Flush Logs \\(ms\\):", Level.DEBUG), - new LogMessageWithLevel("Non-JT Flush Logs \\(ms\\):", Level.DEBUG), + new LogMessageWithLevel("JavaThread Retire TLABs \\(ms\\):", Level.DEBUG), new LogMessageWithLevel("Choose Collection Set:", Level.DEBUG), new LogMessageWithLevel("Region Register:", Level.DEBUG), new LogMessageWithLevel("Prepare Heap Roots:", Level.DEBUG), @@ -126,10 +125,11 @@ public class TestGCLogMessages { new LogMessageWithLevel("Merged Howl ArrayOfCards:", Level.DEBUG), new LogMessageWithLevel("Merged Howl BitMap:", Level.DEBUG), new LogMessageWithLevel("Merged Howl Full:", Level.DEBUG), - new LogMessageWithLevel("Log Buffers \\(ms\\):", Level.DEBUG), - new LogMessageWithLevel("Dirty Cards:", Level.DEBUG), - new LogMessageWithLevel("Merged Cards:", Level.DEBUG), - new LogMessageWithLevel("Skipped Cards:", Level.DEBUG), + new LogMessageWithLevel("Merged From RS Cards:", Level.DEBUG), + new LogMessageWithLevel("Total Cards:", Level.DEBUG), + new LogMessageWithLevel("Merge Refinement Table:", Level.DEBUG), + new LogMessageWithLevel("Sweep \\(ms\\):", Level.DEBUG), + // Evacuate Collection Set new LogMessageWithLevel("Ext Root Scanning \\(ms\\):", Level.DEBUG), new LogMessageWithLevel("Thread Roots \\(ms\\):", Level.TRACE), @@ -173,15 +173,16 @@ public class TestGCLogMessages { new LogMessageWithLevel("Merge Per-Thread State \\(ms\\):", Level.DEBUG), new LogMessageWithLevel("LAB Waste:", Level.DEBUG), new LogMessageWithLevel("LAB Undo Waste:", Level.DEBUG), - new LogMessageWithLevel("Evac Fail Extra Cards:", Level.DEBUG), - new LogMessageWithLevel("Clear Logged Cards \\(ms\\):", Level.DEBUG), + new LogMessageWithLevel("Pending Cards:", Level.DEBUG), + new 
LogMessageWithLevel("To-Young-Gen Cards:", Level.DEBUG), + new LogMessageWithLevel("Evac-Fail Cards:", Level.DEBUG), + new LogMessageWithLevel("Marked Cards:", Level.DEBUG), + new LogMessageWithLevel("Clear Pending Cards \\(ms\\):", Level.DEBUG), new LogMessageWithLevel("Recalculate Used Memory \\(ms\\):", Level.DEBUG), // Post Evacuate Cleanup 2 new LogMessageWithLevel("Post Evacuate Cleanup 2:", Level.DEBUG), new LogMessageWithLevelC2OrJVMCIOnly("Update Derived Pointers", Level.DEBUG), - new LogMessageWithLevel("Redirty Logged Cards \\(ms\\):", Level.DEBUG), - new LogMessageWithLevel("Redirtied Cards:", Level.DEBUG), new LogMessageWithLevel("Resize TLABs \\(ms\\):", Level.DEBUG), new LogMessageWithLevel("Free Collection Set \\(ms\\):", Level.DEBUG), new LogMessageWithLevel("Serial Free Collection Set:", Level.TRACE), @@ -243,9 +244,7 @@ public class TestGCLogMessages { } LogMessageWithLevel concRefineMessages[] = new LogMessageWithLevel[] { - new LogMessageWithLevel("Mutator refinement: ", Level.DEBUG), - new LogMessageWithLevel("Concurrent refinement: ", Level.DEBUG), - new LogMessageWithLevel("Total refinement: ", Level.DEBUG), + new LogMessageWithLevel("Refinement: sweep: ", Level.DEBUG), // "Concurrent refinement rate" optionally printed if any. // "Generate dirty cards rate" optionally printed if any. }; diff --git a/test/hotspot/jtreg/runtime/CommandLine/OptionsValidation/TestOptionsWithRanges.java b/test/hotspot/jtreg/runtime/CommandLine/OptionsValidation/TestOptionsWithRanges.java index 033b74f7eb1..d4b47422c38 100644 --- a/test/hotspot/jtreg/runtime/CommandLine/OptionsValidation/TestOptionsWithRanges.java +++ b/test/hotspot/jtreg/runtime/CommandLine/OptionsValidation/TestOptionsWithRanges.java @@ -235,7 +235,6 @@ public class TestOptionsWithRanges { */ excludeTestMaxRange("ConcGCThreads"); excludeTestMaxRange("G1ConcRefinementThreads"); - excludeTestMaxRange("G1UpdateBufferSize"); excludeTestMaxRange("InitialHeapSize"); excludeTestMaxRange("MaxHeapSize"); excludeTestMaxRange("MaxRAM"); diff --git a/test/hotspot/jtreg/testlibrary_tests/ir_framework/tests/TestIRMatching.java b/test/hotspot/jtreg/testlibrary_tests/ir_framework/tests/TestIRMatching.java index c7f8badf83b..5615cce983a 100644 --- a/test/hotspot/jtreg/testlibrary_tests/ir_framework/tests/TestIRMatching.java +++ b/test/hotspot/jtreg/testlibrary_tests/ir_framework/tests/TestIRMatching.java @@ -1486,7 +1486,7 @@ class CompilationOutputOfFails { @Test @IR(failOn = IRNode.ALLOC) - @IR(counts = {IRNode.COUNTED_LOOP, "1"}) // not fail + @IR(counts = {IRNode.COUNTED_LOOP, ">1"}) // not fail public void macro3() { for (int i = 0; i < 100; i++) { obj = new Object(); diff --git a/test/hotspot/jtreg/vmTestbase/gc/ArrayJuggle/Juggle2.java b/test/hotspot/jtreg/vmTestbase/gc/ArrayJuggle/Juggle2.java index 1ab01e8179f..eff35559626 100644 --- a/test/hotspot/jtreg/vmTestbase/gc/ArrayJuggle/Juggle2.java +++ b/test/hotspot/jtreg/vmTestbase/gc/ArrayJuggle/Juggle2.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002, 2023, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2002, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -30,6 +30,11 @@ /* @test @key stress randomness @library /vmTestbase /test/lib @run main/othervm -Xlog:gc=debug:gc.log gc.ArrayJuggle.Juggle2 */ /* @test @key stress randomness @library /vmTestbase /test/lib @run main/othervm -Xlog:gc=debug:gc.log gc.ArrayJuggle.Juggle2 -tg */ +/* + * The next test stresses the interaction between (mostly) full garbage collections and refinement. + */ +/* @test @key stress randomness @library /vmTestbase /test/lib @run main/othervm -XX:-G1UseAdaptiveIHOP -XX:InitiatingHeapOccupancyPercent=0 -XX:G1HeapRegionSize=1m -XX:G1RSetUpdatingPauseTimePercent=0 -XX:+UnlockDiagnosticVMOptions -XX:G1PerThreadPendingCardThreshold=0 -XX:+VerifyAfterGC -Xlog:gc=debug,gc+refine=debug:gc.log gc.ArrayJuggle.Juggle2 -tg */ + package gc.ArrayJuggle; import nsk.share.test.*; diff --git a/test/jdk/jdk/jfr/event/gc/collection/TestG1ParallelPhases.java b/test/jdk/jdk/jfr/event/gc/collection/TestG1ParallelPhases.java index 568104a7b50..d69d47f1911 100644 --- a/test/jdk/jdk/jfr/event/gc/collection/TestG1ParallelPhases.java +++ b/test/jdk/jdk/jfr/event/gc/collection/TestG1ParallelPhases.java @@ -87,8 +87,6 @@ public class TestG1ParallelPhases { .collect(toSet()); Set allPhases = of( - "RetireTLABsAndFlushLogs", - "NonJavaThreadFlushLogs", "ExtRootScan", "ThreadRoots", "VM Global", @@ -100,31 +98,32 @@ public class TestG1ParallelPhases { "CMRefRoots", "MergeER", "MergeRS", - "MergeLB", "ScanHR", "CodeRoots", "ObjCopy", "Termination", - "RedirtyCards", "RecalculateUsed", "ResizeTLABs", "FreeCSet", "UpdateDerivedPointers", "EagerlyReclaimHumongousObjects", "ResetPartialArrayStateManager", - "ClearLoggedCards", + "ClearPendingCards", "MergePSS", "NonYoungFreeCSet", "YoungFreeCSet", "RebuildFreeList", "SampleCandidates", "ResetMarkingState", - "NoteStartOfMark" + "NoteStartOfMark", + "RetireTLABs" ); // Some GC phases may or may not occur depending on environment. Filter them out // since we can not reliably guarantee that they occur (or not). Set optPhases = of( + // Does not always occur + "SweepRT", // The following phases only occur on evacuation failure. "RestoreEvacuationFailedRegions", "RemoveSelfForwards",