8342382: Implement JEP 522: G1 GC: Improve Throughput by Reducing Synchronization

Co-authored-by: Amit Kumar <amitkumar@openjdk.org>
Co-authored-by: Martin Doerr <mdoerr@openjdk.org>
Co-authored-by: Carlo Refice <carlo.refice@oracle.com>
Co-authored-by: Fei Yang <fyang@openjdk.org>
Reviewed-by: iwalulya, rcastanedalo, aph, ayang
Thomas Schatzl 2025-09-22 13:47:45 +00:00
parent ca182912a3
commit 8d5c005642
114 changed files with 3625 additions and 4681 deletions
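
Across the architecture ports shown below (aarch64, arm, ppc, riscv, s390) the diff applies the same transformation to the G1 post write barrier: the young-card filter, the StoreLoad fence, the card re-check, the dirty card queue enqueue and the fallback call to G1BarrierSetRuntime::write_ref_field_post_entry are removed, together with the C1 and C2 slow-path stubs and runtime stubs that carried them. What remains is a fence-free fast path that loads the card table base from G1ThreadLocalData::card_table_base_offset() instead of the global byte map base and dirties the card, optionally skipping cards that are already non-clean when UseCondCardMark is set. The following self-contained C++ sketch models that inline barrier on plain memory; the constants and names are illustrative assumptions, not HotSpot identifiers.

#include <cstddef>
#include <cstdint>

// Illustrative constants; the real values come from G1HeapRegion and G1CardTable.
const int     kLogHRGrainBytes = 22;   // assumed region grain size for this sketch
const int     kCardShift       = 9;    // assumed 512-byte cards
const uint8_t kDirtyCard       = 0;    // the diff asserts dirty_card_val() == 0
const uint8_t kCleanCard       = 0xff; // the diff asserts clean_card_val() == 0xff

// Model of the new inline post barrier: per-thread card table base, no fence,
// no dirty card queue, optional conditional card mark.
inline void g1_post_barrier_model(uint8_t* thread_card_table_base,
                                  uintptr_t store_addr,
                                  uintptr_t new_val,
                                  bool use_cond_card_mark) {
  // Does the store cross heap regions? If not, nothing to remember.
  if (((store_addr ^ new_val) >> kLogHRGrainBytes) == 0) {
    return;
  }
  // Crosses regions, but stores null? Elided when the compiler proves non-null.
  if (new_val == 0) {
    return;
  }
  uint8_t* card = thread_card_table_base + (store_addr >> kCardShift);
  // UseCondCardMark: skip the store if the card is already non-clean.
  if (use_cond_card_mark && *card != kCleanCard) {
    return;
  }
  *card = kDirtyCard;
}
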


@ -86,15 +86,48 @@ void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm
}
}
void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm, DecoratorSet decorators,
Register start, Register count, Register scratch, RegSet saved_regs) {
__ push(saved_regs, sp);
assert_different_registers(start, count, scratch);
assert_different_registers(c_rarg0, count);
__ mov(c_rarg0, start);
__ mov(c_rarg1, count);
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_array_post_entry), 2);
__ pop(saved_regs, sp);
void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm,
DecoratorSet decorators,
Register start,
Register count,
Register scratch,
RegSet saved_regs) {
Label done;
Label loop;
Label next;
__ cbz(count, done);
// Calculate the number of card marks to set. Since the object might start and
// end within a card, we need to calculate this via the card table indexes of
// the actual start and last addresses covered by the object.
// Temporarily use the count register for the last element address.
__ lea(count, Address(start, count, Address::lsl(LogBytesPerHeapOop))); // end = start + count << LogBytesPerHeapOop
__ sub(count, count, BytesPerHeapOop); // Use last element address for end.
__ lsr(start, start, CardTable::card_shift());
__ lsr(count, count, CardTable::card_shift());
__ sub(count, count, start); // Number of bytes to mark - 1.
// Add card table base offset to start.
__ ldr(scratch, Address(rthread, in_bytes(G1ThreadLocalData::card_table_base_offset())));
__ add(start, start, scratch);
__ bind(loop);
if (UseCondCardMark) {
__ ldrb(scratch, Address(start, count));
// Instead of loading clean_card_val and comparing, we exploit the fact that
// the LSB of non-clean cards is always 0, and the LSB of clean cards 1.
__ tbz(scratch, 0, next);
}
static_assert(G1CardTable::dirty_card_val() == 0, "must be to use zr");
__ strb(zr, Address(start, count));
__ bind(next);
__ subs(count, count, 1);
__ br(Assembler::GE, loop);
__ bind(done);
}
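
The array post barrier above no longer calls G1BarrierSetRuntime::write_ref_array_post_entry; it dirties the covered cards inline. As the comment notes, the first and last elements need not be card aligned, so the number of cards is derived from the card table indexes of the first and last element addresses rather than from the element count alone. A hedged C++ model of that computation follows; kCardShift and kBytesPerHeapOop are assumed placeholder values.

#include <cstddef>
#include <cstdint>

// Model of the card-range computation in gen_write_ref_array_post_barrier.
const int    kCardShift       = 9;
const size_t kBytesPerHeapOop = 8;

inline void dirty_cards_for_array_model(uint8_t* card_table_base,
                                        uintptr_t start, size_t count,
                                        bool use_cond_card_mark) {
  if (count == 0) {
    return;
  }
  // Use the address of the last element, not the exclusive end, so that an
  // array ending exactly on a card boundary does not touch one card too many.
  uintptr_t last       = start + count * kBytesPerHeapOop - kBytesPerHeapOop;
  uintptr_t first_card = start >> kCardShift;
  uintptr_t last_card  = last  >> kCardShift;
  // Walk from the last card down to the first, mirroring the descending
  // count register in the assembly loop.
  for (intptr_t i = (intptr_t)(last_card - first_card); i >= 0; i--) {
    uint8_t* card = card_table_base + first_card + (uintptr_t)i;
    if (use_cond_card_mark && *card != 0xff) {  // already non-clean: skip
      continue;
    }
    *card = 0;  // dirty_card_val() == 0 per the static_assert above
  }
}
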
static void generate_queue_test_and_insertion(MacroAssembler* masm, ByteSize index_offset, ByteSize buffer_offset, Label& runtime,
@ -202,10 +235,14 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm,
static void generate_post_barrier_fast_path(MacroAssembler* masm,
const Register store_addr,
const Register new_val,
const Register thread,
const Register tmp1,
const Register tmp2,
Label& done,
bool new_val_may_be_null) {
assert(thread == rthread, "must be");
assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, noreg, rscratch1);
// Does store cross heap regions?
__ eor(tmp1, store_addr, new_val); // tmp1 := store address ^ new value
__ lsr(tmp1, tmp1, G1HeapRegion::LogOfHRGrainBytes); // tmp1 := ((store address ^ new value) >> LogOfHRGrainBytes)
@ -214,33 +251,19 @@ static void generate_post_barrier_fast_path(MacroAssembler* masm,
if (new_val_may_be_null) {
__ cbz(new_val, done);
}
// Storing region crossing non-null, is card young?
// Storing region crossing non-null.
__ lsr(tmp1, store_addr, CardTable::card_shift()); // tmp1 := card address relative to card table base
__ load_byte_map_base(tmp2); // tmp2 := card table base address
__ add(tmp1, tmp1, tmp2); // tmp1 := card address
__ ldrb(tmp2, Address(tmp1)); // tmp2 := card
__ cmpw(tmp2, (int)G1CardTable::g1_young_card_val()); // tmp2 := card == young_card_val?
}
static void generate_post_barrier_slow_path(MacroAssembler* masm,
const Register thread,
const Register tmp1,
const Register tmp2,
Label& done,
Label& runtime) {
__ membar(Assembler::StoreLoad); // StoreLoad membar
__ ldrb(tmp2, Address(tmp1)); // tmp2 := card
__ cbzw(tmp2, done);
// Storing a region crossing, non-null oop, card is clean.
// Dirty card and log.
STATIC_ASSERT(CardTable::dirty_card_val() == 0);
__ strb(zr, Address(tmp1)); // *(card address) := dirty_card_val
generate_queue_test_and_insertion(masm,
G1ThreadLocalData::dirty_card_queue_index_offset(),
G1ThreadLocalData::dirty_card_queue_buffer_offset(),
runtime,
thread, tmp1, tmp2, rscratch1);
__ b(done);
Address card_table_addr(thread, in_bytes(G1ThreadLocalData::card_table_base_offset()));
__ ldr(tmp2, card_table_addr); // tmp2 := card table base address
if (UseCondCardMark) {
__ ldrb(rscratch1, Address(tmp1, tmp2)); // rscratch1 := card
// Instead of loading clean_card_val and comparing, we exploit the fact that
// the LSB of non-clean cards is always 0, and the LSB of clean cards 1.
__ tbz(rscratch1, 0, done);
}
static_assert(G1CardTable::dirty_card_val() == 0, "must be to use zr");
__ strb(zr, Address(tmp1, tmp2)); // *(card address) := dirty_card_val
}
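
Two details of the new fast path are easy to miss. First, the card table base now comes from G1ThreadLocalData::card_table_base_offset() on the current thread rather than from load_byte_map_base, presumably so the base a thread writes through can be switched without a fence; the diff only shows the barrier side of that. Second, the UseCondCardMark filter never materializes clean_card_val(): per the comment above, clean cards are the only ones with the lowest bit set (clean_card_val() is 0xff and dirty_card_val() is 0, as the nearby static_asserts require), so a single tbz on bit 0 distinguishes clean from non-clean. A minimal C++ check of that bit trick, under those assumed card values:

#include <cassert>
#include <cstdint>

int main() {
  // Assumed card values, taken from the static_asserts in this change:
  // clean_card_val() == 0xff (LSB 1), dirty_card_val() == 0 (LSB 0).
  const uint8_t clean_card = 0xff;
  const uint8_t dirty_card = 0x00;
  // tbz(card, 0, skip) branches when bit 0 is clear, i.e. the card is already
  // non-clean and the dirtying store can be skipped.
  assert((clean_card & 1) == 1);  // clean: fall through and dirty the card
  assert((dirty_card & 1) == 0);  // non-clean: branch past the store
  return 0;
}
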
void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm,
@ -249,27 +272,8 @@ void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm,
Register thread,
Register tmp1,
Register tmp2) {
assert(thread == rthread, "must be");
assert_different_registers(store_addr, new_val, thread, tmp1, tmp2,
rscratch1);
assert(store_addr != noreg && new_val != noreg && tmp1 != noreg
&& tmp2 != noreg, "expecting a register");
Label done;
Label runtime;
generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, done, true /* new_val_may_be_null */);
// If card is young, jump to done
__ br(Assembler::EQ, done);
generate_post_barrier_slow_path(masm, thread, tmp1, tmp2, done, runtime);
__ bind(runtime);
// save the live input values
RegSet saved = RegSet::of(store_addr);
__ push(saved, sp);
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), tmp1, thread);
__ pop(saved, sp);
generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, false /* new_val_may_be_null */);
__ bind(done);
}
@ -329,38 +333,10 @@ void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm,
Register thread,
Register tmp1,
Register tmp2,
G1PostBarrierStubC2* stub) {
assert(thread == rthread, "must be");
assert_different_registers(store_addr, new_val, thread, tmp1, tmp2,
rscratch1);
assert(store_addr != noreg && new_val != noreg && tmp1 != noreg
&& tmp2 != noreg, "expecting a register");
stub->initialize_registers(thread, tmp1, tmp2);
bool new_val_may_be_null = (stub->barrier_data() & G1C2BarrierPostNotNull) == 0;
generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, *stub->continuation(), new_val_may_be_null);
// If card is not young, jump to stub (slow path)
__ br(Assembler::NE, *stub->entry());
__ bind(*stub->continuation());
}
void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm,
G1PostBarrierStubC2* stub) const {
Assembler::InlineSkippedInstructionsCounter skip_counter(masm);
Label runtime;
Register thread = stub->thread();
Register tmp1 = stub->tmp1(); // tmp1 holds the card address.
Register tmp2 = stub->tmp2();
assert(stub->tmp3() == noreg, "not needed in this platform");
__ bind(*stub->entry());
generate_post_barrier_slow_path(masm, thread, tmp1, tmp2, *stub->continuation(), runtime);
__ bind(runtime);
generate_c2_barrier_runtime_call(masm, stub, tmp1, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry));
__ b(*stub->continuation());
bool new_val_may_be_null) {
Label done;
generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, new_val_may_be_null);
__ bind(done);
}
#endif // COMPILER2
@ -456,20 +432,19 @@ void G1BarrierSetAssembler::gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrier
__ b(*stub->continuation());
}
void G1BarrierSetAssembler::gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub) {
G1BarrierSetC1* bs = (G1BarrierSetC1*)BarrierSet::barrier_set()->barrier_set_c1();
__ bind(*stub->entry());
assert(stub->addr()->is_register(), "Precondition.");
assert(stub->new_val()->is_register(), "Precondition.");
Register new_val_reg = stub->new_val()->as_register();
__ cbz(new_val_reg, *stub->continuation());
ce->store_parameter(stub->addr()->as_pointer_register(), 0);
__ far_call(RuntimeAddress(bs->post_barrier_c1_runtime_code_blob()->code_begin()));
__ b(*stub->continuation());
}
#undef __
void G1BarrierSetAssembler::g1_write_barrier_post_c1(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register thread,
Register tmp1,
Register tmp2) {
Label done;
generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, true /* new_val_may_be_null */);
masm->bind(done);
}
#define __ sasm->
void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm) {
@ -521,74 +496,6 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler*
__ epilogue();
}
void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* sasm) {
__ prologue("g1_post_barrier", false);
// arg0: store_address
Address store_addr(rfp, 2*BytesPerWord);
BarrierSet* bs = BarrierSet::barrier_set();
CardTableBarrierSet* ctbs = barrier_set_cast<CardTableBarrierSet>(bs);
CardTable* ct = ctbs->card_table();
Label done;
Label runtime;
// At this point we know new_value is non-null and the new_value crosses regions.
// Must check to see if card is already dirty
const Register thread = rthread;
Address queue_index(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset()));
Address buffer(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset()));
const Register card_offset = rscratch2;
// LR is free here, so we can use it to hold the byte_map_base.
const Register byte_map_base = lr;
assert_different_registers(card_offset, byte_map_base, rscratch1);
__ load_parameter(0, card_offset);
__ lsr(card_offset, card_offset, CardTable::card_shift());
__ load_byte_map_base(byte_map_base);
__ ldrb(rscratch1, Address(byte_map_base, card_offset));
__ cmpw(rscratch1, (int)G1CardTable::g1_young_card_val());
__ br(Assembler::EQ, done);
assert((int)CardTable::dirty_card_val() == 0, "must be 0");
__ membar(Assembler::StoreLoad);
__ ldrb(rscratch1, Address(byte_map_base, card_offset));
__ cbzw(rscratch1, done);
// storing region crossing non-null, card is clean.
// dirty card and log.
__ strb(zr, Address(byte_map_base, card_offset));
// Convert card offset into an address in card_addr
Register card_addr = card_offset;
__ add(card_addr, byte_map_base, card_addr);
__ ldr(rscratch1, queue_index);
__ cbz(rscratch1, runtime);
__ sub(rscratch1, rscratch1, wordSize);
__ str(rscratch1, queue_index);
// Reuse LR to hold buffer_addr
const Register buffer_addr = lr;
__ ldr(buffer_addr, buffer);
__ str(card_addr, Address(buffer_addr, rscratch1));
__ b(done);
__ bind(runtime);
__ push_call_clobbered_registers();
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), card_addr, thread);
__ pop_call_clobbered_registers();
__ bind(done);
__ epilogue();
}
#undef __
#endif // COMPILER1


@ -1,5 +1,5 @@
/*
* Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -32,9 +32,7 @@
class LIR_Assembler;
class StubAssembler;
class G1PreBarrierStub;
class G1PostBarrierStub;
class G1PreBarrierStubC2;
class G1PostBarrierStubC2;
class G1BarrierSetAssembler: public ModRefBarrierSetAssembler {
protected:
@ -65,10 +63,15 @@ protected:
public:
#ifdef COMPILER1
void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub);
void gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub);
void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm);
void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm);
void g1_write_barrier_post_c1(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register thread,
Register tmp1,
Register tmp2);
#endif
#ifdef COMPILER2
@ -87,9 +90,7 @@ public:
Register thread,
Register tmp1,
Register tmp2,
G1PostBarrierStubC2* c2_stub);
void generate_c2_post_barrier_stub(MacroAssembler* masm,
G1PostBarrierStubC2* stub) const;
bool new_val_may_be_null);
#endif
void load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,


@ -1,5 +1,5 @@
//
// Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
@ -62,13 +62,13 @@ static void write_barrier_post(MacroAssembler* masm,
Register new_val,
Register tmp1,
Register tmp2) {
if (!G1PostBarrierStubC2::needs_barrier(node)) {
if (!G1BarrierStubC2::needs_post_barrier(node)) {
return;
}
Assembler::InlineSkippedInstructionsCounter skip_counter(masm);
G1BarrierSetAssembler* g1_asm = static_cast<G1BarrierSetAssembler*>(BarrierSet::barrier_set()->barrier_set_assembler());
G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node);
g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, rthread, tmp1, tmp2, stub);
bool new_val_may_be_null = G1BarrierStubC2::post_new_val_may_be_null(node);
g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, rthread, tmp1, tmp2, new_val_may_be_null);
}
%}


@ -201,12 +201,15 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm,
static void generate_post_barrier_fast_path(MacroAssembler* masm,
const Register store_addr,
const Register new_val,
const Register thread,
const Register tmp1,
const Register tmp2,
Label& done,
bool new_val_may_be_null) {
// Does store cross heap regions?
assert(thread == Rthread, "must be");
assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, noreg);
// Does store cross heap regions?
__ eor(tmp1, store_addr, new_val);
__ movs(tmp1, AsmOperand(tmp1, lsr, G1HeapRegion::LogOfHRGrainBytes));
__ b(done, eq);
@ -215,76 +218,34 @@ static void generate_post_barrier_fast_path(MacroAssembler* masm,
if (new_val_may_be_null) {
__ cbz(new_val, done);
}
// storing region crossing non-null, is card already dirty?
const Register card_addr = tmp1;
CardTableBarrierSet* ct = barrier_set_cast<CardTableBarrierSet>(BarrierSet::barrier_set());
__ mov_address(tmp2, (address)ct->card_table()->byte_map_base());
__ add(card_addr, tmp2, AsmOperand(store_addr, lsr, CardTable::card_shift()));
// storing region crossing non-null, is card already non-clean?
Address card_table_addr(thread, in_bytes(G1ThreadLocalData::card_table_base_offset()));
__ ldr(tmp2, card_table_addr);
__ add(tmp1, tmp2, AsmOperand(store_addr, lsr, CardTable::card_shift()));
__ ldrb(tmp2, Address(card_addr));
__ cmp(tmp2, (int)G1CardTable::g1_young_card_val());
if (UseCondCardMark) {
__ ldrb(tmp2, Address(tmp1));
// Instead of loading clean_card_val and comparing, we exploit the fact that
// the LSB of non-clean cards is always 0, and the LSB of clean cards 1.
__ tbz(tmp2, 0, done);
}
static_assert(G1CardTable::dirty_card_val() == 0, "must be to use zero_register()");
__ zero_register(tmp2);
__ strb(tmp2, Address(tmp1)); // *(card address) := dirty_card_val
}
static void generate_post_barrier_slow_path(MacroAssembler* masm,
const Register thread,
const Register tmp1,
const Register tmp2,
const Register tmp3,
Label& done,
Label& runtime) {
__ membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad), tmp2);
assert(CardTable::dirty_card_val() == 0, "adjust this code");
// card_addr is loaded by generate_post_barrier_fast_path
const Register card_addr = tmp1;
__ ldrb(tmp2, Address(card_addr));
__ cbz(tmp2, done);
// storing a region crossing, non-null oop, card is clean.
// dirty card and log.
__ strb(__ zero_register(tmp2), Address(card_addr));
generate_queue_test_and_insertion(masm,
G1ThreadLocalData::dirty_card_queue_index_offset(),
G1ThreadLocalData::dirty_card_queue_buffer_offset(),
runtime,
thread, card_addr, tmp2, tmp3);
__ b(done);
}
// G1 post-barrier.
// Blows all volatile registers (R0-R3, LR).
void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register tmp1,
Register tmp2,
Register tmp3) {
Register store_addr,
Register new_val,
Register tmp1,
Register tmp2,
Register tmp3) {
Label done;
Label runtime;
generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, done, true /* new_val_may_be_null */);
// If card is young, jump to done
// card_addr and card are loaded by generate_post_barrier_fast_path
const Register card = tmp2;
const Register card_addr = tmp1;
__ b(done, eq);
generate_post_barrier_slow_path(masm, Rthread, card_addr, tmp2, tmp3, done, runtime);
__ bind(runtime);
RegisterSet set = RegisterSet(store_addr) | RegisterSet(R0, R3) | RegisterSet(R12);
__ push(set);
if (card_addr != R0) {
__ mov(R0, card_addr);
}
__ mov(R1, Rthread);
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), R0, R1);
__ pop(set);
generate_post_barrier_fast_path(masm, store_addr, new_val, Rthread, tmp1, tmp2, done, true /* new_val_may_be_null */);
__ bind(done);
}
@ -344,35 +305,10 @@ void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm,
Register tmp1,
Register tmp2,
Register tmp3,
G1PostBarrierStubC2* stub) {
assert(thread == Rthread, "must be");
assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, noreg);
stub->initialize_registers(thread, tmp1, tmp2, tmp3);
bool new_val_may_be_null = (stub->barrier_data() & G1C2BarrierPostNotNull) == 0;
generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, *stub->continuation(), new_val_may_be_null);
// If card is not young, jump to stub (slow path)
__ b(*stub->entry(), ne);
__ bind(*stub->continuation());
}
void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm,
G1PostBarrierStubC2* stub) const {
Assembler::InlineSkippedInstructionsCounter skip_counter(masm);
Label runtime;
Register thread = stub->thread();
Register tmp1 = stub->tmp1(); // tmp1 holds the card address.
Register tmp2 = stub->tmp2();
Register tmp3 = stub->tmp3();
__ bind(*stub->entry());
generate_post_barrier_slow_path(masm, thread, tmp1, tmp2, tmp3, *stub->continuation(), runtime);
__ bind(runtime);
generate_c2_barrier_runtime_call(masm, stub, tmp1, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), tmp2);
__ b(*stub->continuation());
bool new_val_may_be_null) {
Label done;
generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, new_val_may_be_null);
__ bind(done);
}
#endif // COMPILER2
@ -463,20 +399,19 @@ void G1BarrierSetAssembler::gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrier
__ b(*stub->continuation());
}
void G1BarrierSetAssembler::gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub) {
G1BarrierSetC1* bs = (G1BarrierSetC1*)BarrierSet::barrier_set()->barrier_set_c1();
__ bind(*stub->entry());
assert(stub->addr()->is_register(), "Precondition.");
assert(stub->new_val()->is_register(), "Precondition.");
Register new_val_reg = stub->new_val()->as_register();
__ cbz(new_val_reg, *stub->continuation());
ce->verify_reserved_argument_area_size(1);
__ str(stub->addr()->as_pointer_register(), Address(SP));
__ call(bs->post_barrier_c1_runtime_code_blob()->code_begin(), relocInfo::runtime_call_type);
__ b(*stub->continuation());
#undef __
void G1BarrierSetAssembler::g1_write_barrier_post_c1(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register thread,
Register tmp1,
Register tmp2) {
Label done;
generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, true /* new_val_may_be_null */);
masm->bind(done);
}
#undef __
#define __ sasm->
void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm) {
@ -536,102 +471,6 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler*
__ b(done);
}
void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* sasm) {
// Input:
// - store_addr, pushed on the stack
__ set_info("g1_post_barrier_slow_id", false);
Label done;
Label recheck;
Label runtime;
Address queue_index(Rthread, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset()));
Address buffer(Rthread, in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset()));
AddressLiteral cardtable(ci_card_table_address_as<address>(), relocInfo::none);
// save at least the registers that need saving if the runtime is called
const RegisterSet saved_regs = RegisterSet(R0,R3) | RegisterSet(R12) | RegisterSet(LR);
const int nb_saved_regs = 6;
assert(nb_saved_regs == saved_regs.size(), "fix nb_saved_regs");
__ push(saved_regs);
const Register r_card_addr_0 = R0; // must be R0 for the slow case
const Register r_obj_0 = R0;
const Register r_card_base_1 = R1;
const Register r_tmp2 = R2;
const Register r_index_2 = R2;
const Register r_buffer_3 = R3;
const Register tmp1 = Rtemp;
__ ldr(r_obj_0, Address(SP, nb_saved_regs*wordSize));
// Note: there is a comment in x86 code about not using
// ExternalAddress / lea, due to relocation not working
// properly for that address. Should be OK for arm, where we
// explicitly specify that 'cardtable' has a relocInfo::none
// type.
__ lea(r_card_base_1, cardtable);
__ add(r_card_addr_0, r_card_base_1, AsmOperand(r_obj_0, lsr, CardTable::card_shift()));
// first quick check without barrier
__ ldrb(r_tmp2, Address(r_card_addr_0));
__ cmp(r_tmp2, (int)G1CardTable::g1_young_card_val());
__ b(recheck, ne);
__ bind(done);
__ pop(saved_regs);
__ ret();
__ bind(recheck);
__ membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad), tmp1);
// reload card state after the barrier that ensures the stored oop was visible
__ ldrb(r_tmp2, Address(r_card_addr_0));
assert(CardTable::dirty_card_val() == 0, "adjust this code");
__ cbz(r_tmp2, done);
// storing region crossing non-null, card is clean.
// dirty card and log.
assert(0 == (int)CardTable::dirty_card_val(), "adjust this code");
if ((ci_card_table_address_as<intptr_t>() & 0xff) == 0) {
// Card table is aligned so the lowest byte of the table address base is zero.
__ strb(r_card_base_1, Address(r_card_addr_0));
} else {
__ strb(__ zero_register(r_tmp2), Address(r_card_addr_0));
}
__ ldr(r_index_2, queue_index);
__ ldr(r_buffer_3, buffer);
__ subs(r_index_2, r_index_2, wordSize);
__ b(runtime, lt); // go to runtime if now negative
__ str(r_index_2, queue_index);
__ str(r_card_addr_0, Address(r_buffer_3, r_index_2));
__ b(done);
__ bind(runtime);
__ save_live_registers();
assert(r_card_addr_0 == c_rarg0, "card_addr should be in R0");
__ mov(c_rarg1, Rthread);
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), c_rarg0, c_rarg1);
__ restore_live_registers_without_return();
__ b(done);
}
#undef __
#endif // COMPILER1


@ -1,5 +1,5 @@
/*
* Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -32,9 +32,7 @@
class LIR_Assembler;
class StubAssembler;
class G1PreBarrierStub;
class G1PostBarrierStub;
class G1PreBarrierStubC2;
class G1PostBarrierStubC2;
class G1BarrierSetAssembler: public ModRefBarrierSetAssembler {
protected:
@ -66,10 +64,15 @@ public:
#ifdef COMPILER1
public:
void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub);
void gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub);
void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm);
void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm);
void g1_write_barrier_post_c1(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register thread,
Register tmp1,
Register tmp2);
#endif
#ifdef COMPILER2
@ -89,9 +92,7 @@ public:
Register tmp1,
Register tmp2,
Register tmp3,
G1PostBarrierStubC2* c2_stub);
void generate_c2_post_barrier_stub(MacroAssembler* masm,
G1PostBarrierStubC2* stub) const;
bool new_val_may_be_null);
#endif
};


@ -1,5 +1,5 @@
//
// Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
@ -63,13 +63,13 @@ static void write_barrier_post(MacroAssembler* masm,
Register tmp1,
Register tmp2,
Register tmp3) {
if (!G1PostBarrierStubC2::needs_barrier(node)) {
if (!G1BarrierStubC2::needs_post_barrier(node)) {
return;
}
Assembler::InlineSkippedInstructionsCounter skip_counter(masm);
G1BarrierSetAssembler* g1_asm = static_cast<G1BarrierSetAssembler*>(BarrierSet::barrier_set()->barrier_set_assembler());
G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node);
g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, Rthread, tmp1, tmp2, tmp3, stub);
bool new_val_may_be_null = G1BarrierStubC2::post_new_val_may_be_null(node);
g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, Rthread, tmp1, tmp2, tmp3, new_val_may_be_null);
}
%}


@ -28,7 +28,6 @@
#include "gc/g1/g1BarrierSetAssembler.hpp"
#include "gc/g1/g1BarrierSetRuntime.hpp"
#include "gc/g1/g1CardTable.hpp"
#include "gc/g1/g1DirtyCardQueue.hpp"
#include "gc/g1/g1HeapRegion.hpp"
#include "gc/g1/g1SATBMarkQueueSet.hpp"
#include "gc/g1/g1ThreadLocalData.hpp"
@ -230,78 +229,52 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, Decorator
__ bind(filtered);
}
static void generate_region_crossing_test(MacroAssembler* masm, const Register store_addr, const Register new_val) {
__ xorr(R0, store_addr, new_val); // tmp1 := store address ^ new value
__ srdi_(R0, R0, G1HeapRegion::LogOfHRGrainBytes); // tmp1 := ((store address ^ new value) >> LogOfHRGrainBytes)
}
static void generate_post_barrier_fast_path(MacroAssembler* masm,
const Register store_addr,
const Register new_val,
const Register thread,
const Register tmp1,
const Register tmp2,
Label& done,
bool new_val_may_be_null) {
assert_different_registers(store_addr, new_val, tmp1, R0);
assert_different_registers(store_addr, tmp1, tmp2, R0);
static Address generate_card_young_test(MacroAssembler* masm, const Register store_addr, const Register tmp1, const Register tmp2) {
CardTableBarrierSet* ct = barrier_set_cast<CardTableBarrierSet>(BarrierSet::barrier_set());
__ load_const_optimized(tmp1, (address)(ct->card_table()->byte_map_base()), tmp2);
__ srdi(tmp2, store_addr, CardTable::card_shift()); // tmp1 := card address relative to card table base
__ lbzx(R0, tmp1, tmp2); // tmp1 := card address
__ cmpwi(CR0, R0, (int)G1CardTable::g1_young_card_val());
return Address(tmp1, tmp2); // return card address
}
__ xorr(R0, store_addr, new_val); // R0 := store address ^ new value
__ srdi_(R0, R0, G1HeapRegion::LogOfHRGrainBytes); // R0 := ((store address ^ new value) >> LogOfHRGrainBytes)
__ beq(CR0, done);
static void generate_card_dirty_test(MacroAssembler* masm, Address card_addr) {
__ membar(Assembler::StoreLoad); // Must reload after StoreLoad membar due to concurrent refinement
__ lbzx(R0, card_addr.base(), card_addr.index()); // tmp2 := card
__ cmpwi(CR0, R0, (int)G1CardTable::dirty_card_val()); // tmp2 := card == dirty_card_val?
// Crosses regions, storing null?
if (!new_val_may_be_null) {
#ifdef ASSERT
__ cmpdi(CR0, new_val, 0);
__ asm_assert_ne("null oop not allowed (G1 post)"); // Checked by caller.
#endif
} else {
__ cmpdi(CR0, new_val, 0);
__ beq(CR0, done);
}
__ ld(tmp1, G1ThreadLocalData::card_table_base_offset(), thread);
__ srdi(tmp2, store_addr, CardTable::card_shift()); // tmp2 := card address relative to card table base
if (UseCondCardMark) {
__ lbzx(R0, tmp1, tmp2);
__ cmpwi(CR0, R0, (int)G1CardTable::clean_card_val());
__ bne(CR0, done);
}
__ li(R0, G1CardTable::dirty_card_val());
__ stbx(R0, tmp1, tmp2);
}
void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, DecoratorSet decorators,
Register store_addr, Register new_val,
Register tmp1, Register tmp2, Register tmp3,
MacroAssembler::PreservationLevel preservation_level) {
Register tmp1, Register tmp2) {
bool not_null = (decorators & IS_NOT_NULL) != 0;
Label runtime, filtered;
assert_different_registers(store_addr, new_val, tmp1, tmp2);
CardTableBarrierSet* ct = barrier_set_cast<CardTableBarrierSet>(BarrierSet::barrier_set());
generate_region_crossing_test(masm, store_addr, new_val);
__ beq(CR0, filtered);
// Crosses regions, storing null?
if (not_null) {
#ifdef ASSERT
__ cmpdi(CR0, new_val, 0);
__ asm_assert_ne("null oop not allowed (G1 post)"); // Checked by caller.
#endif
} else {
__ cmpdi(CR0, new_val, 0);
__ beq(CR0, filtered);
}
Address card_addr = generate_card_young_test(masm, store_addr, tmp1, tmp2);
__ beq(CR0, filtered);
generate_card_dirty_test(masm, card_addr);
__ beq(CR0, filtered);
__ li(R0, (int)G1CardTable::dirty_card_val());
__ stbx(R0, card_addr.base(), card_addr.index()); // *(card address) := dirty_card_val
Register Rcard_addr = tmp3;
__ add(Rcard_addr, card_addr.base(), card_addr.index()); // This is the address which needs to get enqueued.
generate_queue_insertion(masm,
G1ThreadLocalData::dirty_card_queue_index_offset(),
G1ThreadLocalData::dirty_card_queue_buffer_offset(),
runtime, Rcard_addr, tmp1);
__ b(filtered);
__ bind(runtime);
assert(preservation_level == MacroAssembler::PRESERVATION_NONE,
"g1_write_barrier_post doesn't support preservation levels higher than PRESERVATION_NONE");
// Save the live input values.
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), Rcard_addr, R16_thread);
__ bind(filtered);
Label done;
generate_post_barrier_fast_path(masm, store_addr, new_val, R16_thread, tmp1, tmp2, done, !not_null);
__ bind(done);
}
void G1BarrierSetAssembler::oop_store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
@ -333,8 +306,7 @@ void G1BarrierSetAssembler::oop_store_at(MacroAssembler* masm, DecoratorSet deco
}
g1_write_barrier_post(masm, decorators,
base, val,
tmp1, tmp2, tmp3,
preservation_level);
tmp1, tmp2);
}
}
@ -457,70 +429,29 @@ void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm,
Register new_val,
Register tmp1,
Register tmp2,
G1PostBarrierStubC2* stub,
bool new_val_may_be_null,
bool decode_new_val) {
assert_different_registers(store_addr, new_val, tmp1, R0);
assert_different_registers(store_addr, tmp1, tmp2, R0);
stub->initialize_registers(R16_thread, tmp1, tmp2);
Label done;
bool null_check_required = (stub->barrier_data() & G1C2BarrierPostNotNull) == 0;
Register new_val_decoded = new_val;
if (decode_new_val) {
assert(UseCompressedOops, "or should not be here");
if (null_check_required && CompressedOops::base() != nullptr) {
if (new_val_may_be_null && CompressedOops::base() != nullptr) {
// We prefer doing the null check after the region crossing check.
// Only compressed oop modes with base != null require a null check here.
__ cmpwi(CR0, new_val, 0);
__ beq(CR0, *stub->continuation());
null_check_required = false;
__ beq(CR0, done);
new_val_may_be_null = false;
}
new_val_decoded = __ decode_heap_oop_not_null(tmp2, new_val);
}
generate_region_crossing_test(masm, store_addr, new_val_decoded);
__ beq(CR0, *stub->continuation());
// crosses regions, storing null?
if (null_check_required) {
__ cmpdi(CR0, new_val_decoded, 0);
__ beq(CR0, *stub->continuation());
}
Address card_addr = generate_card_young_test(masm, store_addr, tmp1, tmp2);
assert(card_addr.base() == tmp1 && card_addr.index() == tmp2, "needed by post barrier stub");
__ bc_far_optimized(Assembler::bcondCRbiIs0, __ bi0(CR0, Assembler::equal), *stub->entry());
__ bind(*stub->continuation());
}
void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm,
G1PostBarrierStubC2* stub) const {
Assembler::InlineSkippedInstructionsCounter skip_counter(masm);
Label runtime;
Address card_addr(stub->tmp1(), stub->tmp2()); // See above.
__ bind(*stub->entry());
generate_card_dirty_test(masm, card_addr);
__ bc_far_optimized(Assembler::bcondCRbiIs1, __ bi0(CR0, Assembler::equal), *stub->continuation());
__ li(R0, (int)G1CardTable::dirty_card_val());
__ stbx(R0, card_addr.base(), card_addr.index()); // *(card address) := dirty_card_val
Register Rcard_addr = stub->tmp1();
__ add(Rcard_addr, card_addr.base(), card_addr.index()); // This is the address which needs to get enqueued.
generate_queue_insertion(masm,
G1ThreadLocalData::dirty_card_queue_index_offset(),
G1ThreadLocalData::dirty_card_queue_buffer_offset(),
runtime, Rcard_addr, stub->tmp2());
__ b(*stub->continuation());
__ bind(runtime);
generate_c2_barrier_runtime_call(masm, stub, Rcard_addr, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry));
__ b(*stub->continuation());
generate_post_barrier_fast_path(masm, store_addr, new_val_decoded, R16_thread, tmp1, tmp2, done, new_val_may_be_null);
__ bind(done);
}
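
The PPC C2 barrier may be handed a still-compressed new value (decode_new_val). As the comment above says, the null filter is preferably done after the cheap region-crossing test, but with a non-null CompressedOops base a null narrow oop no longer decodes to null, so in that one configuration the null check is pulled in front of the decode and new_val_may_be_null is cleared for the fast path. A small hedged C++ model of that ordering decision; the helper name is hypothetical.

#include <cstdint>

// With a non-null CompressedOops base, a null narrow oop does not decode to 0,
// so the null filter has to look at the narrow value, before decoding.
inline bool skip_barrier_before_decode(uint32_t narrow_new_val,
                                       uintptr_t compressed_oops_base,
                                       bool& new_val_may_be_null) {
  if (new_val_may_be_null && compressed_oops_base != 0) {
    if (narrow_new_val == 0) {
      return true;                  // storing null: no card needs dirtying
    }
    new_val_may_be_null = false;    // fast path may drop its own null check
  }
  return false;                     // decode, then run the usual fast path
}
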
#endif // COMPILER2
@ -558,28 +489,19 @@ void G1BarrierSetAssembler::gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrier
__ b(*stub->continuation());
}
void G1BarrierSetAssembler::gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub) {
G1BarrierSetC1* bs = (G1BarrierSetC1*)BarrierSet::barrier_set()->barrier_set_c1();
__ bind(*stub->entry());
#undef __
assert(stub->addr()->is_register(), "Precondition.");
assert(stub->new_val()->is_register(), "Precondition.");
Register addr_reg = stub->addr()->as_pointer_register();
Register new_val_reg = stub->new_val()->as_register();
__ cmpdi(CR0, new_val_reg, 0);
__ bc_far_optimized(Assembler::bcondCRbiIs1, __ bi0(CR0, Assembler::equal), *stub->continuation());
address c_code = bs->post_barrier_c1_runtime_code_blob()->code_begin();
//__ load_const_optimized(R0, c_code);
__ add_const_optimized(R0, R29_TOC, MacroAssembler::offset_to_global_toc(c_code));
__ mtctr(R0);
__ mr(R0, addr_reg); // Pass addr in R0.
__ bctrl();
__ b(*stub->continuation());
void G1BarrierSetAssembler::g1_write_barrier_post_c1(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register thread,
Register tmp1,
Register tmp2) {
Label done;
generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, true /* new_val_may_be_null */);
masm->bind(done);
}
#undef __
#define __ sasm->
void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm) {
@ -642,86 +564,6 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler*
__ b(restart);
}
void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* sasm) {
G1BarrierSet* bs = barrier_set_cast<G1BarrierSet>(BarrierSet::barrier_set());
__ set_info("g1_post_barrier_slow_id", false);
// Using stack slots: spill addr, spill tmp2
const int stack_slots = 2;
Register tmp = R0;
Register addr = R14;
Register tmp2 = R15;
CardTable::CardValue* byte_map_base = bs->card_table()->byte_map_base();
Label restart, refill, ret;
// Spill
__ std(addr, -8, R1_SP);
__ std(tmp2, -16, R1_SP);
__ srdi(addr, R0, CardTable::card_shift()); // Addr is passed in R0.
__ load_const_optimized(/*cardtable*/ tmp2, byte_map_base, tmp);
__ add(addr, tmp2, addr);
__ lbz(tmp, 0, addr); // tmp := [addr + cardtable]
// Return if young card.
__ cmpwi(CR0, tmp, G1CardTable::g1_young_card_val());
__ beq(CR0, ret);
// Return if sequential consistent value is already dirty.
__ membar(Assembler::StoreLoad);
__ lbz(tmp, 0, addr); // tmp := [addr + cardtable]
__ cmpwi(CR0, tmp, G1CardTable::dirty_card_val());
__ beq(CR0, ret);
// Not dirty.
// First, dirty it.
__ li(tmp, G1CardTable::dirty_card_val());
__ stb(tmp, 0, addr);
int dirty_card_q_index_byte_offset = in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset());
int dirty_card_q_buf_byte_offset = in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset());
__ bind(restart);
// Get the index into the update buffer. G1DirtyCardQueue::_index is
// a size_t so ld_ptr is appropriate here.
__ ld(tmp2, dirty_card_q_index_byte_offset, R16_thread);
// index == 0?
__ cmpdi(CR0, tmp2, 0);
__ beq(CR0, refill);
__ ld(tmp, dirty_card_q_buf_byte_offset, R16_thread);
__ addi(tmp2, tmp2, -oopSize);
__ std(tmp2, dirty_card_q_index_byte_offset, R16_thread);
__ add(tmp2, tmp, tmp2);
__ std(addr, 0, tmp2); // [_buf + index] := <address_of_card>
// Restore temp registers and return-from-leaf.
__ bind(ret);
__ ld(tmp2, -16, R1_SP);
__ ld(addr, -8, R1_SP);
__ blr();
__ bind(refill);
const int nbytes_save = (MacroAssembler::num_volatile_regs + stack_slots) * BytesPerWord;
__ save_volatile_gprs(R1_SP, -nbytes_save); // except R0
__ mflr(R0);
__ std(R0, _abi0(lr), R1_SP);
__ push_frame_reg_args(nbytes_save, R0); // dummy frame for C call
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1DirtyCardQueueSet::handle_zero_index_for_thread), R16_thread);
__ pop_frame();
__ ld(R0, _abi0(lr), R1_SP);
__ mtlr(R0);
__ restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
__ b(restart);
}
#undef __
#endif // COMPILER1


@ -1,5 +1,5 @@
/*
* Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, 2021 SAP SE. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
@ -37,9 +37,7 @@
class LIR_Assembler;
class StubAssembler;
class G1PreBarrierStub;
class G1PostBarrierStub;
class G1PreBarrierStubC2;
class G1PostBarrierStubC2;
class G1BarrierSetAssembler: public ModRefBarrierSetAssembler {
protected:
@ -56,8 +54,7 @@ protected:
MacroAssembler::PreservationLevel preservation_level);
void g1_write_barrier_post(MacroAssembler* masm, DecoratorSet decorators,
Register store_addr, Register new_val,
Register tmp1, Register tmp2, Register tmp3,
MacroAssembler::PreservationLevel preservation_level);
Register tmp1, Register tmp2);
virtual void oop_store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
Register base, RegisterOrConstant ind_or_offs, Register val,
@ -79,17 +76,21 @@ public:
Register new_val,
Register tmp1,
Register tmp2,
G1PostBarrierStubC2* c2_stub,
bool new_val_may_be_null,
bool decode_new_val);
void generate_c2_post_barrier_stub(MacroAssembler* masm,
G1PostBarrierStubC2* stub) const;
#endif
#ifdef COMPILER1
void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub);
void gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub);
void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm);
void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm);
void g1_write_barrier_post_c1(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register thread,
Register tmp1,
Register tmp2);
#endif
virtual void load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,


@ -1,5 +1,5 @@
//
// Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 2025 SAP SE. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
@ -64,13 +64,13 @@ static void post_write_barrier(MacroAssembler* masm,
Register tmp1,
Register tmp2,
bool decode_new_val = false) {
if (!G1PostBarrierStubC2::needs_barrier(node)) {
if (!G1BarrierStubC2::needs_post_barrier(node)) {
return;
}
Assembler::InlineSkippedInstructionsCounter skip_counter(masm);
G1BarrierSetAssembler* g1_asm = static_cast<G1BarrierSetAssembler*>(BarrierSet::barrier_set()->barrier_set_assembler());
G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node);
g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, tmp1, tmp2, stub, decode_new_val);
bool new_val_may_be_null = G1BarrierStubC2::post_new_val_may_be_null(node);
g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, tmp1, tmp2, new_val_may_be_null, decode_new_val);
}
%}


@ -87,15 +87,54 @@ void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm
}
}
void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm, DecoratorSet decorators,
Register start, Register count, Register tmp, RegSet saved_regs) {
__ push_reg(saved_regs, sp);
void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm,
DecoratorSet decorators,
Register start,
Register count,
Register tmp,
RegSet saved_regs) {
assert_different_registers(start, count, tmp);
assert_different_registers(c_rarg0, count);
__ mv(c_rarg0, start);
__ mv(c_rarg1, count);
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_array_post_entry), 2);
__ pop_reg(saved_regs, sp);
Label loop, next, done;
// Zero count? Nothing to do.
__ beqz(count, done);
// Calculate the number of card marks to set. Since the object might start and
// end within a card, we need to calculate this via the card table indexes of
// the actual start and last addresses covered by the object.
// Temporarily use the count register for the last element address.
__ shadd(count, count, start, tmp, LogBytesPerHeapOop); // end = start + count << LogBytesPerHeapOop
__ subi(count, count, BytesPerHeapOop); // Use last element address for end.
__ srli(start, start, CardTable::card_shift());
__ srli(count, count, CardTable::card_shift());
__ sub(count, count, start); // Number of bytes to mark - 1.
// Add card table base offset to start.
Address card_table_address(xthread, G1ThreadLocalData::card_table_base_offset());
__ ld(tmp, card_table_address);
__ add(start, start, tmp);
__ bind(loop);
if (UseCondCardMark) {
__ add(tmp, start, count);
__ lbu(tmp, Address(tmp, 0));
static_assert((uint)G1CardTable::clean_card_val() == 0xff, "must be");
__ subi(tmp, tmp, G1CardTable::clean_card_val()); // Convert the comparison against clean_card_val()
// into a comparison against zero to avoid use of an extra temp.
__ bnez(tmp, next);
}
__ add(tmp, start, count);
static_assert(G1CardTable::dirty_card_val() == 0, "must be to use zr");
__ sb(zr, Address(tmp, 0));
__ bind(next);
__ subi(count, count, 1);
__ bgez(count, loop);
__ bind(done);
}
static void generate_queue_test_and_insertion(MacroAssembler* masm, ByteSize index_offset, ByteSize buffer_offset, Label& runtime,
@ -192,44 +231,37 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm,
static void generate_post_barrier_fast_path(MacroAssembler* masm,
const Register store_addr,
const Register new_val,
const Register tmp1,
const Register tmp2,
Label& done,
bool new_val_may_be_null) {
// Does store cross heap regions?
__ xorr(tmp1, store_addr, new_val); // tmp1 := store address ^ new value
__ srli(tmp1, tmp1, G1HeapRegion::LogOfHRGrainBytes); // tmp1 := ((store address ^ new value) >> LogOfHRGrainBytes)
__ beqz(tmp1, done);
// Crosses regions, storing null?
if (new_val_may_be_null) {
__ beqz(new_val, done);
}
// Storing region crossing non-null, is card young?
__ srli(tmp1, store_addr, CardTable::card_shift()); // tmp1 := card address relative to card table base
__ load_byte_map_base(tmp2); // tmp2 := card table base address
__ add(tmp1, tmp1, tmp2); // tmp1 := card address
__ lbu(tmp2, Address(tmp1)); // tmp2 := card
}
static void generate_post_barrier_slow_path(MacroAssembler* masm,
const Register thread,
const Register tmp1,
const Register tmp2,
Label& done,
Label& runtime) {
__ membar(MacroAssembler::StoreLoad); // StoreLoad membar
__ lbu(tmp2, Address(tmp1)); // tmp2 := card
__ beqz(tmp2, done, true);
// Storing a region crossing, non-null oop, card is clean.
// Dirty card and log.
STATIC_ASSERT(CardTable::dirty_card_val() == 0);
__ sb(zr, Address(tmp1)); // *(card address) := dirty_card_val
generate_queue_test_and_insertion(masm,
G1ThreadLocalData::dirty_card_queue_index_offset(),
G1ThreadLocalData::dirty_card_queue_buffer_offset(),
runtime,
thread, tmp1, tmp2, t0);
__ j(done);
bool new_val_may_be_null) {
assert(thread == xthread, "must be");
assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, noreg);
// Does store cross heap regions?
__ xorr(tmp1, store_addr, new_val); // tmp1 := store address ^ new value
__ srli(tmp1, tmp1, G1HeapRegion::LogOfHRGrainBytes); // tmp1 := ((store address ^ new value) >> LogOfHRGrainBytes)
__ beqz(tmp1, done);
// Crosses regions, storing null?
if (new_val_may_be_null) {
__ beqz(new_val, done);
}
// Storing region crossing non-null, is card clean?
__ srli(tmp1, store_addr, CardTable::card_shift()); // tmp1 := card address relative to card table base
Address card_table_address(xthread, G1ThreadLocalData::card_table_base_offset());
__ ld(tmp2, card_table_address); // tmp2 := card table base address
__ add(tmp1, tmp1, tmp2); // tmp1 := card address
if (UseCondCardMark) {
static_assert((uint)G1CardTable::clean_card_val() == 0xff, "must be");
__ lbu(tmp2, Address(tmp1, 0)); // tmp2 := card
__ subi(tmp2, tmp2, G1CardTable::clean_card_val()); // Convert the comparison against clean_card_val()
// into a comparison against zero to avoid use of an extra temp.
__ bnez(tmp2, done);
}
static_assert((uint)G1CardTable::dirty_card_val() == 0, "must be to use zr");
__ sb(zr, Address(tmp1, 0));
}
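
Unlike the AArch64 version, the RISC-V filter here (and in the array barrier above) cannot use a single bit-test branch; it subtracts clean_card_val() from the loaded card so that the clean-card test becomes a compare against zero that bnez can branch on, avoiding an extra temporary register as the comment notes. A tiny hedged C++ check of that equivalence, assuming clean_card_val() is 0xff as the static_assert states:

#include <cassert>
#include <cstdint>

int main() {
  const int clean_card = 0xff;  // assumed clean_card_val(), per the static_assert
  for (int card = 0; card <= 0xff; card++) {
    // subi(tmp, tmp, clean_card_val()) followed by bnez branches past the
    // dirtying store exactly when the loaded card is not clean.
    bool skips_store = ((uint8_t)(card - clean_card) != 0);
    assert(skips_store == (card != clean_card));
  }
  return 0;
}
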
void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm,
@ -238,27 +270,8 @@ void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm,
Register thread,
Register tmp1,
Register tmp2) {
assert(thread == xthread, "must be");
assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, t0);
assert(store_addr != noreg && new_val != noreg && tmp1 != noreg && tmp2 != noreg,
"expecting a register");
Label done;
Label runtime;
generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, done, true /* new_val_may_be_null */);
// If card is young, jump to done (tmp2 holds the card value)
__ mv(t0, (int)G1CardTable::g1_young_card_val());
__ beq(tmp2, t0, done); // card == young_card_val?
generate_post_barrier_slow_path(masm, thread, tmp1, tmp2, done, runtime);
__ bind(runtime);
// save the live input values
RegSet saved = RegSet::of(store_addr);
__ push_reg(saved, sp);
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), tmp1, thread);
__ pop_reg(saved, sp);
generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, true /* new_val_may_be_null */);
__ bind(done);
}
@ -318,37 +331,10 @@ void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm,
Register thread,
Register tmp1,
Register tmp2,
G1PostBarrierStubC2* stub) {
assert(thread == xthread, "must be");
assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, t0);
assert(store_addr != noreg && new_val != noreg && tmp1 != noreg && tmp2 != noreg,
"expecting a register");
stub->initialize_registers(thread, tmp1, tmp2);
bool new_val_may_be_null = (stub->barrier_data() & G1C2BarrierPostNotNull) == 0;
generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, *stub->continuation(), new_val_may_be_null);
// If card is not young, jump to stub (slow path) (tmp2 holds the card value)
__ mv(t0, (int)G1CardTable::g1_young_card_val());
__ bne(tmp2, t0, *stub->entry(), true);
__ bind(*stub->continuation());
}
void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm,
G1PostBarrierStubC2* stub) const {
Assembler::InlineSkippedInstructionsCounter skip_counter(masm);
Label runtime;
Register thread = stub->thread();
Register tmp1 = stub->tmp1(); // tmp1 holds the card address.
Register tmp2 = stub->tmp2();
__ bind(*stub->entry());
generate_post_barrier_slow_path(masm, thread, tmp1, tmp2, *stub->continuation(), runtime);
__ bind(runtime);
generate_c2_barrier_runtime_call(masm, stub, tmp1, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry));
__ j(*stub->continuation());
bool new_val_may_be_null) {
Label done;
generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, new_val_may_be_null);
__ bind(done);
}
#endif // COMPILER2
@ -443,20 +429,19 @@ void G1BarrierSetAssembler::gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrier
__ j(*stub->continuation());
}
void G1BarrierSetAssembler::gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub) {
G1BarrierSetC1* bs = (G1BarrierSetC1*)BarrierSet::barrier_set()->barrier_set_c1();
__ bind(*stub->entry());
assert(stub->addr()->is_register(), "Precondition");
assert(stub->new_val()->is_register(), "Precondition");
Register new_val_reg = stub->new_val()->as_register();
__ beqz(new_val_reg, *stub->continuation(), /* is_far */ true);
ce->store_parameter(stub->addr()->as_pointer_register(), 0);
__ far_call(RuntimeAddress(bs->post_barrier_c1_runtime_code_blob()->code_begin()));
__ j(*stub->continuation());
}
#undef __
void G1BarrierSetAssembler::g1_write_barrier_post_c1(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register thread,
Register tmp1,
Register tmp2) {
Label done;
generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, true /* new_val_may_be_null */);
masm->bind(done);
}
#define __ sasm->
void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm) {
@ -507,74 +492,6 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler*
__ epilogue();
}
void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* sasm) {
__ prologue("g1_post_barrier", false);
// arg0 : store_address
Address store_addr(fp, 2 * BytesPerWord); // 2 BytesPerWord from fp
BarrierSet* bs = BarrierSet::barrier_set();
CardTableBarrierSet* ctbs = barrier_set_cast<CardTableBarrierSet>(bs);
Label done;
Label runtime;
// At this point we know new_value is non-null and the new_value crosses regions.
// Must check to see if card is already dirty
const Register thread = xthread;
Address queue_index(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset()));
Address buffer(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset()));
const Register card_offset = t1;
// RA is free here, so we can use it to hold the byte_map_base.
const Register byte_map_base = ra;
assert_different_registers(card_offset, byte_map_base, t0);
__ load_parameter(0, card_offset);
__ srli(card_offset, card_offset, CardTable::card_shift());
__ load_byte_map_base(byte_map_base);
// Convert card offset into an address in card_addr
Register card_addr = card_offset;
__ add(card_addr, byte_map_base, card_addr);
__ lbu(t0, Address(card_addr, 0));
__ sub(t0, t0, (int)G1CardTable::g1_young_card_val());
__ beqz(t0, done);
assert((int)CardTable::dirty_card_val() == 0, "must be 0");
__ membar(MacroAssembler::StoreLoad);
__ lbu(t0, Address(card_addr, 0));
__ beqz(t0, done);
// storing region crossing non-null, card is clean.
// dirty card and log.
__ sb(zr, Address(card_addr, 0));
__ ld(t0, queue_index);
__ beqz(t0, runtime);
__ subi(t0, t0, wordSize);
__ sd(t0, queue_index);
// Reuse RA to hold buffer_addr
const Register buffer_addr = ra;
__ ld(buffer_addr, buffer);
__ add(t0, buffer_addr, t0);
__ sd(card_addr, Address(t0, 0));
__ j(done);
__ bind(runtime);
__ push_call_clobbered_registers();
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), card_addr, thread);
__ pop_call_clobbered_registers();
__ bind(done);
__ epilogue();
}
#undef __
#endif // COMPILER1


@ -1,5 +1,5 @@
/*
* Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2020, 2024, Huawei Technologies Co., Ltd. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
@ -35,9 +35,7 @@ class LIR_Assembler;
#endif
class StubAssembler;
class G1PreBarrierStub;
class G1PostBarrierStub;
class G1PreBarrierStubC2;
class G1PostBarrierStubC2;
class G1BarrierSetAssembler: public ModRefBarrierSetAssembler {
protected:
@ -68,10 +66,16 @@ protected:
public:
#ifdef COMPILER1
void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub);
void gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub);
void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm);
void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm);
void g1_write_barrier_post_c1(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register thread,
Register tmp1,
Register tmp2);
#endif
#ifdef COMPILER2
@ -90,9 +94,7 @@ public:
Register thread,
Register tmp1,
Register tmp2,
G1PostBarrierStubC2* c2_stub);
void generate_c2_post_barrier_stub(MacroAssembler* masm,
G1PostBarrierStubC2* stub) const;
bool new_val_may_be_null);
#endif
void load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,


@ -1,5 +1,5 @@
//
// Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
@ -63,13 +63,13 @@ static void write_barrier_post(MacroAssembler* masm,
Register new_val,
Register tmp1,
Register tmp2) {
if (!G1PostBarrierStubC2::needs_barrier(node)) {
if (!G1BarrierStubC2::needs_post_barrier(node)) {
return;
}
Assembler::InlineSkippedInstructionsCounter skip_counter(masm);
G1BarrierSetAssembler* g1_asm = static_cast<G1BarrierSetAssembler*>(BarrierSet::barrier_set()->barrier_set_assembler());
G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node);
g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, xthread, tmp1, tmp2, stub);
bool new_val_may_be_null = G1BarrierStubC2::post_new_val_may_be_null(node);
g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, xthread, tmp1, tmp2, new_val_may_be_null);
}
%}


@ -28,7 +28,6 @@
#include "gc/g1/g1BarrierSetAssembler.hpp"
#include "gc/g1/g1BarrierSetRuntime.hpp"
#include "gc/g1/g1CardTable.hpp"
#include "gc/g1/g1DirtyCardQueue.hpp"
#include "gc/g1/g1HeapRegion.hpp"
#include "gc/g1/g1SATBMarkQueueSet.hpp"
#include "gc/g1/g1ThreadLocalData.hpp"
@ -205,104 +204,71 @@ void G1BarrierSetAssembler::generate_c2_pre_barrier_stub(MacroAssembler* masm,
BLOCK_COMMENT("} generate_c2_pre_barrier_stub");
}
static void generate_post_barrier_fast_path(MacroAssembler* masm,
const Register store_addr,
const Register new_val,
const Register thread,
const Register tmp1,
const Register tmp2,
Label& done,
bool new_val_may_be_null) {
__ block_comment("generate_post_barrier_fast_path {");
assert(thread == Z_thread, "must be");
assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, noreg);
// Does store cross heap regions?
if (VM_Version::has_DistinctOpnds()) {
__ z_xgrk(tmp1, store_addr, new_val); // tmp1 := store address ^ new value
} else {
__ z_lgr(tmp1, store_addr);
__ z_xgr(tmp1, new_val);
}
__ z_srag(tmp1, tmp1, G1HeapRegion::LogOfHRGrainBytes); // tmp1 := ((store address ^ new value) >> LogOfHRGrainBytes)
__ branch_optimized(Assembler::bcondEqual, done);
// Crosses regions, storing null?
if (new_val_may_be_null) {
__ z_ltgr(new_val, new_val);
__ z_bre(done);
} else {
#ifdef ASSERT
__ z_ltgr(new_val, new_val);
__ asm_assert(Assembler::bcondNotZero, "null oop not allowed (G1 post)", 0x322); // Checked by caller.
#endif
}
__ z_srag(tmp1, store_addr, CardTable::card_shift());
Address card_table_addr(thread, in_bytes(G1ThreadLocalData::card_table_base_offset()));
__ z_alg(tmp1, card_table_addr); // tmp1 := card address
if (UseCondCardMark) {
__ z_cli(0, tmp1, G1CardTable::clean_card_val());
__ branch_optimized(Assembler::bcondNotEqual, done);
}
static_assert(G1CardTable::dirty_card_val() == 0, "must be to use z_mvi");
__ z_mvi(0, tmp1, G1CardTable::dirty_card_val()); // *(card address) := dirty_card_val
__ block_comment("} generate_post_barrier_fast_path");
}
void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register thread,
Register tmp1,
Register tmp2,
G1PostBarrierStubC2* stub) {
bool new_val_may_be_null) {
BLOCK_COMMENT("g1_write_barrier_post_c2 {");
assert(thread == Z_thread, "must be");
assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, Z_R1_scratch);
assert(store_addr != noreg && new_val != noreg && tmp1 != noreg && tmp2 != noreg, "expecting a register");
stub->initialize_registers(thread, tmp1, tmp2);
BLOCK_COMMENT("generate_region_crossing_test {");
if (VM_Version::has_DistinctOpnds()) {
__ z_xgrk(tmp1, store_addr, new_val);
} else {
__ z_lgr(tmp1, store_addr);
__ z_xgr(tmp1, new_val);
}
__ z_srag(tmp1, tmp1, G1HeapRegion::LogOfHRGrainBytes);
__ branch_optimized(Assembler::bcondEqual, *stub->continuation());
BLOCK_COMMENT("} generate_region_crossing_test");
// crosses regions, storing null?
if ((stub->barrier_data() & G1C2BarrierPostNotNull) == 0) {
__ z_ltgr(new_val, new_val);
__ branch_optimized(Assembler::bcondEqual, *stub->continuation());
}
BLOCK_COMMENT("generate_card_young_test {");
CardTableBarrierSet* ct = barrier_set_cast<CardTableBarrierSet>(BarrierSet::barrier_set());
// calculate address of card
__ load_const_optimized(tmp2, (address)ct->card_table()->byte_map_base()); // Card table base.
__ z_srlg(tmp1, store_addr, CardTable::card_shift()); // Index into card table.
__ z_algr(tmp1, tmp2); // Explicit calculation needed for cli.
// Filter young.
__ z_cli(0, tmp1, G1CardTable::g1_young_card_val());
BLOCK_COMMENT("} generate_card_young_test");
// From here on, tmp1 holds the card address.
__ branch_optimized(Assembler::bcondNotEqual, *stub->entry());
__ bind(*stub->continuation());
Label done;
generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, new_val_may_be_null);
__ bind(done);
BLOCK_COMMENT("} g1_write_barrier_post_c2");
}
void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm,
G1PostBarrierStubC2* stub) const {
BLOCK_COMMENT("generate_c2_post_barrier_stub {");
Assembler::InlineSkippedInstructionsCounter skip_counter(masm);
Label runtime;
Register thread = stub->thread();
Register tmp1 = stub->tmp1(); // tmp1 holds the card address.
Register tmp2 = stub->tmp2();
Register Rcard_addr = tmp1;
__ bind(*stub->entry());
BLOCK_COMMENT("generate_card_clean_test {");
__ z_sync(); // Required to support concurrent cleaning.
__ z_cli(0, Rcard_addr, 0); // Reload after membar.
__ branch_optimized(Assembler::bcondEqual, *stub->continuation());
BLOCK_COMMENT("} generate_card_clean_test");
BLOCK_COMMENT("generate_dirty_card {");
// Storing a region crossing, non-null oop, card is clean.
// Dirty card and log.
STATIC_ASSERT(CardTable::dirty_card_val() == 0);
__ z_mvi(0, Rcard_addr, CardTable::dirty_card_val());
BLOCK_COMMENT("} generate_dirty_card");
generate_queue_test_and_insertion(masm,
G1ThreadLocalData::dirty_card_queue_index_offset(),
G1ThreadLocalData::dirty_card_queue_buffer_offset(),
runtime,
Z_thread, tmp1, tmp2);
__ branch_optimized(Assembler::bcondAlways, *stub->continuation());
__ bind(runtime);
generate_c2_barrier_runtime_call(masm, stub, tmp1, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry));
__ branch_optimized(Assembler::bcondAlways, *stub->continuation());
BLOCK_COMMENT("} generate_c2_post_barrier_stub");
}
#endif //COMPILER2
void G1BarrierSetAssembler::load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
@ -451,99 +417,9 @@ void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, Decorato
Register Rtmp1, Register Rtmp2, Register Rtmp3) {
bool not_null = (decorators & IS_NOT_NULL) != 0;
assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2); // Most probably, Rnew_val == Rtmp3.
Label callRuntime, filtered;
CardTableBarrierSet* ct = barrier_set_cast<CardTableBarrierSet>(BarrierSet::barrier_set());
BLOCK_COMMENT("g1_write_barrier_post {");
// Does store cross heap regions?
// It does if the two addresses specify different grain addresses.
if (VM_Version::has_DistinctOpnds()) {
__ z_xgrk(Rtmp1, Rstore_addr, Rnew_val);
} else {
__ z_lgr(Rtmp1, Rstore_addr);
__ z_xgr(Rtmp1, Rnew_val);
}
__ z_srag(Rtmp1, Rtmp1, G1HeapRegion::LogOfHRGrainBytes);
__ z_bre(filtered);
// Crosses regions, storing null?
if (not_null) {
#ifdef ASSERT
__ z_ltgr(Rnew_val, Rnew_val);
__ asm_assert(Assembler::bcondNotZero, "null oop not allowed (G1 post)", 0x322); // Checked by caller.
#endif
} else {
__ z_ltgr(Rnew_val, Rnew_val);
__ z_bre(filtered);
}
Rnew_val = noreg; // end of lifetime
// Storing region crossing non-null, is card already dirty?
assert_different_registers(Rtmp1, Rtmp2, Rtmp3);
// Make sure not to use Z_R0 for any of these registers.
Register Rcard_addr = (Rtmp1 != Z_R0_scratch) ? Rtmp1 : Rtmp3;
Register Rbase = (Rtmp2 != Z_R0_scratch) ? Rtmp2 : Rtmp3;
// calculate address of card
__ load_const_optimized(Rbase, (address)ct->card_table()->byte_map_base()); // Card table base.
__ z_srlg(Rcard_addr, Rstore_addr, CardTable::card_shift()); // Index into card table.
__ z_algr(Rcard_addr, Rbase); // Explicit calculation needed for cli.
Rbase = noreg; // end of lifetime
// Filter young.
__ z_cli(0, Rcard_addr, G1CardTable::g1_young_card_val());
__ z_bre(filtered);
// Check the card value. If dirty, we're done.
// This also avoids false sharing of the (already dirty) card.
__ z_sync(); // Required to support concurrent cleaning.
__ z_cli(0, Rcard_addr, G1CardTable::dirty_card_val()); // Reload after membar.
__ z_bre(filtered);
// Storing a region crossing, non-null oop, card is clean.
// Dirty card and log.
__ z_mvi(0, Rcard_addr, G1CardTable::dirty_card_val());
Register Rcard_addr_x = Rcard_addr;
Register Rqueue_index = (Rtmp2 != Z_R0_scratch) ? Rtmp2 : Rtmp1;
if (Rcard_addr == Rqueue_index) {
Rcard_addr_x = Z_R0_scratch; // Register shortage. We have to use Z_R0.
}
__ lgr_if_needed(Rcard_addr_x, Rcard_addr);
generate_queue_test_and_insertion(masm,
G1ThreadLocalData::dirty_card_queue_index_offset(),
G1ThreadLocalData::dirty_card_queue_buffer_offset(),
callRuntime,
Z_thread, Rcard_addr_x, Rqueue_index);
__ z_bru(filtered);
__ bind(callRuntime);
// TODO: do we need a frame? Introduced to be on the safe side.
bool needs_frame = true;
__ lgr_if_needed(Rcard_addr, Rcard_addr_x); // copy back asap. push_frame will destroy Z_R0_scratch!
// VM call need frame to access(write) O register.
if (needs_frame) {
__ save_return_pc();
__ push_frame_abi160(0); // Will use Z_R0 as tmp on old CPUs.
}
// Save the live input values.
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), Rcard_addr, Z_thread);
if (needs_frame) {
__ pop_frame();
__ restore_return_pc();
}
__ bind(filtered);
Label done;
generate_post_barrier_fast_path(masm, Rstore_addr, Rnew_val, Z_thread, Rtmp1, Rtmp2, done, !not_null);
__ bind(done);
BLOCK_COMMENT("} g1_write_barrier_post");
}
@ -615,22 +491,19 @@ void G1BarrierSetAssembler::gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrier
__ branch_optimized(Assembler::bcondAlways, *stub->continuation());
}
void G1BarrierSetAssembler::gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub) {
G1BarrierSetC1* bs = (G1BarrierSetC1*)BarrierSet::barrier_set()->barrier_set_c1();
__ bind(*stub->entry());
ce->check_reserved_argument_area(16); // RT stub needs 2 spill slots.
assert(stub->addr()->is_register(), "Precondition.");
assert(stub->new_val()->is_register(), "Precondition.");
Register new_val_reg = stub->new_val()->as_register();
__ z_ltgr(new_val_reg, new_val_reg);
__ branch_optimized(Assembler::bcondZero, *stub->continuation());
__ z_lgr(Z_R1_scratch, stub->addr()->as_pointer_register());
ce->emit_call_c(bs->post_barrier_c1_runtime_code_blob()->code_begin());
__ branch_optimized(Assembler::bcondAlways, *stub->continuation());
}
#undef __
void G1BarrierSetAssembler::g1_write_barrier_post_c1(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register thread,
Register tmp1,
Register tmp2) {
Label done;
generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, true /* new_val_may_be_null */);
masm->bind(done);
}
#define __ sasm->
static OopMap* save_volatile_registers(StubAssembler* sasm, Register return_pc = Z_R14) {
@ -705,92 +578,6 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler*
__ z_bru(restart);
}
void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* sasm) {
// Z_R1_scratch: oop address, address of updated memory slot
BarrierSet* bs = BarrierSet::barrier_set();
__ set_info("g1_post_barrier_slow_id", false);
Register addr_oop = Z_R1_scratch;
Register addr_card = Z_R1_scratch;
Register r1 = Z_R6; // Must be saved/restored.
Register r2 = Z_R7; // Must be saved/restored.
Register cardtable = r1; // Must be non-volatile, because it is used to save addr_card.
CardTableBarrierSet* ctbs = barrier_set_cast<CardTableBarrierSet>(bs);
CardTable* ct = ctbs->card_table();
CardTable::CardValue* byte_map_base = ct->byte_map_base();
// Save registers used below (see assertion in G1PreBarrierStub::emit_code()).
__ z_stg(r1, 0*BytesPerWord + FrameMap::first_available_sp_in_frame, Z_SP);
Label not_already_dirty, restart, refill, young_card;
// Calculate address of card corresponding to the updated oop slot.
AddressLiteral rs(byte_map_base);
__ z_srlg(addr_card, addr_oop, CardTable::card_shift());
addr_oop = noreg; // dead now
__ load_const_optimized(cardtable, rs); // cardtable := <card table base>
__ z_agr(addr_card, cardtable); // addr_card := addr_oop>>card_shift + cardtable
__ z_cli(0, addr_card, (int)G1CardTable::g1_young_card_val());
__ z_bre(young_card);
__ z_sync(); // Required to support concurrent cleaning.
__ z_cli(0, addr_card, (int)CardTable::dirty_card_val());
__ z_brne(not_already_dirty);
__ bind(young_card);
// We didn't take the branch, so we're already dirty: restore
// used registers and return.
__ z_lg(r1, 0*BytesPerWord + FrameMap::first_available_sp_in_frame, Z_SP);
__ z_br(Z_R14);
// Not dirty.
__ bind(not_already_dirty);
// First, dirty it: [addr_card] := 0
__ z_mvi(0, addr_card, CardTable::dirty_card_val());
Register idx = cardtable; // Must be non-volatile, because it is used to save addr_card.
Register buf = r2;
cardtable = noreg; // now dead
// Save registers used below (see assertion in G1PreBarrierStub::emit_code()).
__ z_stg(r2, 1*BytesPerWord + FrameMap::first_available_sp_in_frame, Z_SP);
ByteSize dirty_card_q_index_byte_offset = G1ThreadLocalData::dirty_card_queue_index_offset();
ByteSize dirty_card_q_buf_byte_offset = G1ThreadLocalData::dirty_card_queue_buffer_offset();
__ bind(restart);
// Get the index into the update buffer. G1DirtyCardQueue::_index is
// a size_t so z_ltg is appropriate here.
__ z_ltg(idx, Address(Z_thread, dirty_card_q_index_byte_offset));
// index == 0?
__ z_brz(refill);
__ z_lg(buf, Address(Z_thread, dirty_card_q_buf_byte_offset));
__ add2reg(idx, -oopSize);
__ z_stg(addr_card, 0, idx, buf); // [_buf + index] := <address_of_card>
__ z_stg(idx, Address(Z_thread, dirty_card_q_index_byte_offset));
// Restore killed registers and return.
__ z_lg(r1, 0*BytesPerWord + FrameMap::first_available_sp_in_frame, Z_SP);
__ z_lg(r2, 1*BytesPerWord + FrameMap::first_available_sp_in_frame, Z_SP);
__ z_br(Z_R14);
__ bind(refill);
save_volatile_registers(sasm);
__ z_lgr(idx, addr_card); // Save addr_card, tmp3 must be non-volatile.
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1DirtyCardQueueSet::handle_zero_index_for_thread),
Z_thread);
__ z_lgr(addr_card, idx);
restore_volatile_registers(sasm); // Restore addr_card.
__ z_bru(restart);
}
#undef __
#endif // COMPILER1

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, 2024 SAP SE. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
@ -33,9 +33,7 @@
class LIR_Assembler;
class StubAssembler;
class G1PreBarrierStub;
class G1PostBarrierStub;
class G1PreBarrierStubC2;
class G1PostBarrierStubC2;
class G1BarrierSetAssembler: public ModRefBarrierSetAssembler {
protected:
@ -60,10 +58,16 @@ class G1BarrierSetAssembler: public ModRefBarrierSetAssembler {
public:
#ifdef COMPILER1
void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub);
void gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub);
void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm);
void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm);
void g1_write_barrier_post_c1(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register thread,
Register tmp1,
Register tmp2);
#endif // COMPILER1
#ifdef COMPILER2
@ -81,9 +85,7 @@ class G1BarrierSetAssembler: public ModRefBarrierSetAssembler {
Register thread,
Register tmp1,
Register tmp2,
G1PostBarrierStubC2* c2_stub);
void generate_c2_post_barrier_stub(MacroAssembler* masm,
G1PostBarrierStubC2* stub) const;
bool new_val_may_be_null);
#endif // COMPILER2
virtual void load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,

View File

@ -1,5 +1,5 @@
//
// Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved.
// Copyright 2024 IBM Corporation. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
@ -62,13 +62,13 @@ static void write_barrier_post(MacroAssembler* masm,
Register new_val,
Register tmp1,
Register tmp2) {
if (!G1PostBarrierStubC2::needs_barrier(node)) {
if (!G1BarrierStubC2::needs_post_barrier(node)) {
return;
}
Assembler::InlineSkippedInstructionsCounter skip_counter(masm);
G1BarrierSetAssembler* g1_asm = static_cast<G1BarrierSetAssembler*>(BarrierSet::barrier_set()->barrier_set_assembler());
G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node);
g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, Z_thread, tmp1, tmp2, stub);
bool new_val_may_be_null = G1BarrierStubC2::post_new_val_may_be_null(node);
g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, Z_thread, tmp1, tmp2, new_val_may_be_null);
}
%} // source

View File

@ -89,19 +89,53 @@ void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm
void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm, DecoratorSet decorators,
Register addr, Register count, Register tmp) {
__ push_call_clobbered_registers(false /* save_fpu */);
if (c_rarg0 == count) { // On win64 c_rarg0 == rcx
assert_different_registers(c_rarg1, addr);
__ mov(c_rarg1, count);
__ mov(c_rarg0, addr);
} else {
assert_different_registers(c_rarg0, count);
__ mov(c_rarg0, addr);
__ mov(c_rarg1, count);
}
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_array_post_entry), 2);
__ pop_call_clobbered_registers(false /* save_fpu */);
Label done;
__ testptr(count, count);
__ jcc(Assembler::zero, done);
// Calculate end address in "count".
Address::ScaleFactor scale = UseCompressedOops ? Address::times_4 : Address::times_8;
__ leaq(count, Address(addr, count, scale));
// Calculate start card address in "addr".
__ shrptr(addr, CardTable::card_shift());
Register thread = r15_thread;
__ movptr(tmp, Address(thread, in_bytes(G1ThreadLocalData::card_table_base_offset())));
__ addptr(addr, tmp);
// Calculate the address of the card covering the last word of the array.
__ subptr(count, 1);
__ shrptr(count, CardTable::card_shift());
__ addptr(count, tmp);
Label loop;
// Iterate from start card to end card (inclusive).
__ bind(loop);
Label is_clean_card;
if (UseCondCardMark) {
__ cmpb(Address(addr, 0), G1CardTable::clean_card_val());
__ jcc(Assembler::equal, is_clean_card);
} else {
__ movb(Address(addr, 0), G1CardTable::dirty_card_val());
}
Label next_card;
__ bind(next_card);
__ addptr(addr, sizeof(CardTable::CardValue));
__ cmpptr(addr, count);
__ jcc(Assembler::belowEqual, loop);
__ jmp(done);
__ bind(is_clean_card);
// Card was clean. Dirty the card and go to the next one.
__ movb(Address(addr, 0), G1CardTable::dirty_card_val());
__ jmp(next_card);
__ bind(done);
}
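For readers who prefer C++ over assembly, the following is a minimal sketch of the card-range computation and marking loop emitted above. It is illustrative only: kCardShift and the card values are placeholder assumptions standing in for the G1CardTable constants, and the function itself is not part of this change (with compressed oops the element size would be 4 bytes; that is ignored here).
#include <cstddef>
#include <cstdint>
// Placeholder constants; the real values come from G1CardTable.
constexpr unsigned kCardShift = 9;      // assumed 512-byte cards
constexpr uint8_t  kCleanCard = 0xff;   // assumed clean card value
constexpr uint8_t  kDirtyCard = 0;      // assumed dirty card value
// Sketch of the array post barrier: mark every card covered by the oop array.
void mark_cards_for_array(uint8_t* card_table_base, void** start, size_t count,
                          bool use_cond_card_mark) {
  if (count == 0) {
    return;
  }
  // Card of the first element and card of the last element; the array may
  // start and end in the middle of a card.
  uint8_t* card      = card_table_base + (reinterpret_cast<uintptr_t>(start) >> kCardShift);
  uint8_t* last_card = card_table_base + (reinterpret_cast<uintptr_t>(start + count - 1) >> kCardShift);
  for (; card <= last_card; card++) {
    if (use_cond_card_mark && *card != kCleanCard) {
      continue;  // already marked; avoid rewriting the card
    }
    *card = kDirtyCard;
  }
}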
void G1BarrierSetAssembler::load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
@ -182,7 +216,6 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm,
// If expand_call is true then we expand the call_VM_leaf macro
// directly to skip generating the check by
// InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.
const Register thread = r15_thread;
Label done;
@ -238,73 +271,46 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm,
static void generate_post_barrier_fast_path(MacroAssembler* masm,
const Register store_addr,
const Register new_val,
const Register tmp,
const Register tmp2,
const Register tmp1,
Label& done,
bool new_val_may_be_null) {
CardTableBarrierSet* ct = barrier_set_cast<CardTableBarrierSet>(BarrierSet::barrier_set());
assert_different_registers(store_addr, new_val, tmp1, noreg);
Register thread = r15_thread;
// Does store cross heap regions?
__ movptr(tmp, store_addr); // tmp := store address
__ xorptr(tmp, new_val); // tmp := store address ^ new value
__ shrptr(tmp, G1HeapRegion::LogOfHRGrainBytes); // ((store address ^ new value) >> LogOfHRGrainBytes) == 0?
__ movptr(tmp1, store_addr); // tmp1 := store address
__ xorptr(tmp1, new_val); // tmp1 := store address ^ new value
__ shrptr(tmp1, G1HeapRegion::LogOfHRGrainBytes); // ((store address ^ new value) >> LogOfHRGrainBytes) == 0?
__ jcc(Assembler::equal, done);
// Crosses regions, storing null?
if (new_val_may_be_null) {
__ cmpptr(new_val, NULL_WORD); // new value == null?
__ cmpptr(new_val, NULL_WORD); // new value == null?
__ jcc(Assembler::equal, done);
}
// Storing region crossing non-null, is card young?
__ movptr(tmp, store_addr); // tmp := store address
__ shrptr(tmp, CardTable::card_shift()); // tmp := card address relative to card table base
// Do not use ExternalAddress to load 'byte_map_base', since 'byte_map_base' is NOT
// a valid address and therefore is not properly handled by the relocation code.
__ movptr(tmp2, (intptr_t)ct->card_table()->byte_map_base()); // tmp2 := card table base address
__ addptr(tmp, tmp2); // tmp := card address
__ cmpb(Address(tmp, 0), G1CardTable::g1_young_card_val()); // *(card address) == young_card_val?
}
static void generate_post_barrier_slow_path(MacroAssembler* masm,
const Register thread,
const Register tmp,
const Register tmp2,
Label& done,
Label& runtime) {
__ membar(Assembler::Membar_mask_bits(Assembler::StoreLoad)); // StoreLoad membar
__ cmpb(Address(tmp, 0), G1CardTable::dirty_card_val()); // *(card address) == dirty_card_val?
__ jcc(Assembler::equal, done);
__ movptr(tmp1, store_addr); // tmp1 := store address
__ shrptr(tmp1, CardTable::card_shift()); // tmp1 := card address relative to card table base
Address card_table_addr(thread, in_bytes(G1ThreadLocalData::card_table_base_offset()));
__ addptr(tmp1, card_table_addr); // tmp1 := card address
if (UseCondCardMark) {
__ cmpb(Address(tmp1, 0), G1CardTable::clean_card_val()); // *(card address) == clean_card_val?
__ jcc(Assembler::notEqual, done);
}
// Storing a region crossing, non-null oop, card is clean.
// Dirty card and log.
__ movb(Address(tmp, 0), G1CardTable::dirty_card_val()); // *(card address) := dirty_card_val
generate_queue_insertion(masm,
G1ThreadLocalData::dirty_card_queue_index_offset(),
G1ThreadLocalData::dirty_card_queue_buffer_offset(),
runtime,
thread, tmp, tmp2);
__ jmp(done);
// Dirty card.
__ movb(Address(tmp1, 0), G1CardTable::dirty_card_val()); // *(card address) := dirty_card_val
}
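Taken together, the fast path above now is the whole barrier: region check, null check, thread-local card lookup, optional clean check, card write. As a hedged illustration of that shape (placeholder constants and a free function stand in for the real G1CardTable, G1HeapRegion and G1ThreadLocalData accessors; this is a sketch, not the HotSpot implementation):
#include <cstdint>
// Placeholder constants; the real values come from G1CardTable / G1HeapRegion.
constexpr unsigned kLogRegionSize = 21;   // assumed 2 MB regions
constexpr unsigned kCardShift     = 9;    // assumed 512-byte cards
constexpr uint8_t  kCleanCard     = 0xff; // assumed
constexpr uint8_t  kDirtyCard     = 0;    // assumed
// Sketch of the per-store G1 post barrier after this change.
inline void g1_post_barrier(uint8_t* thread_card_table_base,
                            void* store_addr, void* new_val,
                            bool use_cond_card_mark) {
  uintptr_t p = reinterpret_cast<uintptr_t>(store_addr);
  uintptr_t q = reinterpret_cast<uintptr_t>(new_val);
  // Stores within the same region never need a remembered set entry.
  if (((p ^ q) >> kLogRegionSize) == 0) {
    return;
  }
  // Storing null cannot create a cross-region reference.
  if (new_val == nullptr) {
    return;
  }
  // Mark the card through the thread-local card table base. Note that there is
  // no young-card filter, no StoreLoad fence and no dirty card queue any more.
  uint8_t* card = thread_card_table_base + (p >> kCardShift);
  if (use_cond_card_mark && *card != kCleanCard) {
    return;  // card already marked by an earlier store
  }
  *card = kDirtyCard;
}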
void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register tmp,
Register tmp2) {
const Register thread = r15_thread;
Register tmp) {
Label done;
Label runtime;
generate_post_barrier_fast_path(masm, store_addr, new_val, tmp, tmp2, done, true /* new_val_may_be_null */);
// If card is young, jump to done
__ jcc(Assembler::equal, done);
generate_post_barrier_slow_path(masm, thread, tmp, tmp2, done, runtime);
__ bind(runtime);
// save the live input values
RegSet saved = RegSet::of(store_addr);
__ push_set(saved);
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), tmp, thread);
__ pop_set(saved);
generate_post_barrier_fast_path(masm, store_addr, new_val, tmp, done, true /* new_val_may_be_null */);
__ bind(done);
}
@ -367,34 +373,10 @@ void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register tmp,
Register tmp2,
G1PostBarrierStubC2* stub) {
const Register thread = r15_thread;
stub->initialize_registers(thread, tmp, tmp2);
bool new_val_may_be_null = (stub->barrier_data() & G1C2BarrierPostNotNull) == 0;
generate_post_barrier_fast_path(masm, store_addr, new_val, tmp, tmp2, *stub->continuation(), new_val_may_be_null);
// If card is not young, jump to stub (slow path)
__ jcc(Assembler::notEqual, *stub->entry());
__ bind(*stub->continuation());
}
void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm,
G1PostBarrierStubC2* stub) const {
Assembler::InlineSkippedInstructionsCounter skip_counter(masm);
Label runtime;
Register thread = stub->thread();
Register tmp = stub->tmp1(); // tmp holds the card address.
Register tmp2 = stub->tmp2();
assert(stub->tmp3() == noreg, "not needed in this platform");
__ bind(*stub->entry());
generate_post_barrier_slow_path(masm, thread, tmp, tmp2, *stub->continuation(), runtime);
__ bind(runtime);
generate_c2_barrier_runtime_call(masm, stub, tmp, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry));
__ jmp(*stub->continuation());
bool new_val_may_be_null) {
Label done;
generate_post_barrier_fast_path(masm, store_addr, new_val, tmp, done, new_val_may_be_null);
__ bind(done);
}
#endif // COMPILER2
@ -441,8 +423,7 @@ void G1BarrierSetAssembler::oop_store_at(MacroAssembler* masm, DecoratorSet deco
g1_write_barrier_post(masm /*masm*/,
tmp1 /* store_adr */,
new_val /* new_val */,
tmp3 /* tmp */,
tmp2 /* tmp2 */);
tmp3 /* tmp */);
}
}
}
@ -476,21 +457,19 @@ void G1BarrierSetAssembler::gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrier
}
void G1BarrierSetAssembler::gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub) {
G1BarrierSetC1* bs = (G1BarrierSetC1*)BarrierSet::barrier_set()->barrier_set_c1();
__ bind(*stub->entry());
assert(stub->addr()->is_register(), "Precondition.");
assert(stub->new_val()->is_register(), "Precondition.");
Register new_val_reg = stub->new_val()->as_register();
__ cmpptr(new_val_reg, NULL_WORD);
__ jcc(Assembler::equal, *stub->continuation());
ce->store_parameter(stub->addr()->as_pointer_register(), 0);
__ call(RuntimeAddress(bs->post_barrier_c1_runtime_code_blob()->code_begin()));
__ jmp(*stub->continuation());
}
#undef __
void G1BarrierSetAssembler::g1_write_barrier_post_c1(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register thread,
Register tmp1,
Register tmp2 /* unused on x86 */) {
Label done;
generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, done, true /* new_val_may_be_null */);
masm->bind(done);
}
#define __ sasm->
void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm) {
@ -555,78 +534,6 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler*
__ epilogue();
}
void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* sasm) {
__ prologue("g1_post_barrier", false);
CardTableBarrierSet* ct =
barrier_set_cast<CardTableBarrierSet>(BarrierSet::barrier_set());
Label done;
Label enqueued;
Label runtime;
// At this point we know new_value is non-null and the new_value crosses regions.
// Must check to see if card is already dirty
const Register thread = r15_thread;
Address queue_index(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset()));
Address buffer(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset()));
__ push_ppx(rax);
__ push_ppx(rcx);
const Register cardtable = rax;
const Register card_addr = rcx;
__ load_parameter(0, card_addr);
__ shrptr(card_addr, CardTable::card_shift());
// Do not use ExternalAddress to load 'byte_map_base', since 'byte_map_base' is NOT
// a valid address and therefore is not properly handled by the relocation code.
__ movptr(cardtable, (intptr_t)ct->card_table()->byte_map_base());
__ addptr(card_addr, cardtable);
__ cmpb(Address(card_addr, 0), G1CardTable::g1_young_card_val());
__ jcc(Assembler::equal, done);
__ membar(Assembler::Membar_mask_bits(Assembler::StoreLoad));
__ cmpb(Address(card_addr, 0), CardTable::dirty_card_val());
__ jcc(Assembler::equal, done);
// storing region crossing non-null, card is clean.
// dirty card and log.
__ movb(Address(card_addr, 0), CardTable::dirty_card_val());
const Register tmp = rdx;
__ push_ppx(rdx);
__ movptr(tmp, queue_index);
__ testptr(tmp, tmp);
__ jcc(Assembler::zero, runtime);
__ subptr(tmp, wordSize);
__ movptr(queue_index, tmp);
__ addptr(tmp, buffer);
__ movptr(Address(tmp, 0), card_addr);
__ jmp(enqueued);
__ bind(runtime);
__ push_call_clobbered_registers();
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), card_addr, thread);
__ pop_call_clobbered_registers();
__ bind(enqueued);
__ pop_ppx(rdx);
__ bind(done);
__ pop_ppx(rcx);
__ pop_ppx(rax);
__ epilogue();
}
#undef __
#endif // COMPILER1

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -31,10 +31,8 @@
class LIR_Assembler;
class StubAssembler;
class G1PreBarrierStub;
class G1PostBarrierStub;
class G1BarrierStubC2;
class G1PreBarrierStubC2;
class G1PostBarrierStubC2;
class G1BarrierSetAssembler: public ModRefBarrierSetAssembler {
protected:
@ -51,22 +49,28 @@ class G1BarrierSetAssembler: public ModRefBarrierSetAssembler {
void g1_write_barrier_post(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register tmp,
Register tmp2);
Register tmp);
virtual void oop_store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
Address dst, Register val, Register tmp1, Register tmp2, Register tmp3);
public:
void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub);
void gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub);
void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm);
void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm);
virtual void load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
Register dst, Address src, Register tmp1);
#ifdef COMPILER1
void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub);
void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm);
void g1_write_barrier_post_c1(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register thread,
Register tmp1,
Register tmp2);
#endif
#ifdef COMPILER2
void g1_write_barrier_pre_c2(MacroAssembler* masm,
Register obj,
@ -79,10 +83,7 @@ class G1BarrierSetAssembler: public ModRefBarrierSetAssembler {
Register store_addr,
Register new_val,
Register tmp,
Register tmp2,
G1PostBarrierStubC2* c2_stub);
void generate_c2_post_barrier_stub(MacroAssembler* masm,
G1PostBarrierStubC2* stub) const;
bool new_val_may_be_null);
#endif // COMPILER2
};

View File

@ -1,5 +1,5 @@
//
// Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
@ -59,15 +59,14 @@ static void write_barrier_post(MacroAssembler* masm,
const MachNode* node,
Register store_addr,
Register new_val,
Register tmp1,
Register tmp2) {
if (!G1PostBarrierStubC2::needs_barrier(node)) {
Register tmp1) {
if (!G1BarrierStubC2::needs_post_barrier(node)) {
return;
}
Assembler::InlineSkippedInstructionsCounter skip_counter(masm);
G1BarrierSetAssembler* g1_asm = static_cast<G1BarrierSetAssembler*>(BarrierSet::barrier_set()->barrier_set_assembler());
G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node);
g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, tmp1, tmp2, stub);
bool new_val_may_be_null = G1BarrierStubC2::post_new_val_may_be_null(node);
g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, tmp1, new_val_may_be_null);
}
%}
@ -95,8 +94,7 @@ instruct g1StoreP(memory mem, any_RegP src, rRegP tmp1, rRegP tmp2, rRegP tmp3,
write_barrier_post(masm, this,
$tmp1$$Register /* store_addr */,
$src$$Register /* new_val */,
$tmp3$$Register /* tmp1 */,
$tmp2$$Register /* tmp2 */);
$tmp3$$Register /* tmp1 */);
%}
ins_pipe(ialu_mem_reg);
%}
@ -127,8 +125,7 @@ instruct g1StoreN(memory mem, rRegN src, rRegP tmp1, rRegP tmp2, rRegP tmp3, rFl
write_barrier_post(masm, this,
$tmp1$$Register /* store_addr */,
$tmp2$$Register /* new_val */,
$tmp3$$Register /* tmp1 */,
$tmp2$$Register /* tmp2 */);
$tmp3$$Register /* tmp1 */);
%}
ins_pipe(ialu_mem_reg);
%}
@ -158,8 +155,7 @@ instruct g1EncodePAndStoreN(memory mem, any_RegP src, rRegP tmp1, rRegP tmp2, rR
write_barrier_post(masm, this,
$tmp1$$Register /* store_addr */,
$src$$Register /* new_val */,
$tmp3$$Register /* tmp1 */,
$tmp2$$Register /* tmp2 */);
$tmp3$$Register /* tmp1 */);
%}
ins_pipe(ialu_mem_reg);
%}
@ -187,8 +183,7 @@ instruct g1CompareAndExchangeP(indirect mem, rRegP newval, rRegP tmp1, rRegP tmp
write_barrier_post(masm, this,
$mem$$Register /* store_addr */,
$tmp1$$Register /* new_val */,
$tmp2$$Register /* tmp1 */,
$tmp3$$Register /* tmp2 */);
$tmp2$$Register /* tmp1 */);
%}
ins_pipe(pipe_cmpxchg);
%}
@ -214,8 +209,7 @@ instruct g1CompareAndExchangeN(indirect mem, rRegN newval, rRegP tmp1, rRegP tmp
write_barrier_post(masm, this,
$mem$$Register /* store_addr */,
$tmp1$$Register /* new_val */,
$tmp2$$Register /* tmp1 */,
$tmp3$$Register /* tmp2 */);
$tmp2$$Register /* tmp1 */);
%}
ins_pipe(pipe_cmpxchg);
%}
@ -246,8 +240,7 @@ instruct g1CompareAndSwapP(rRegI res, indirect mem, rRegP newval, rRegP tmp1, rR
write_barrier_post(masm, this,
$mem$$Register /* store_addr */,
$tmp1$$Register /* new_val */,
$tmp2$$Register /* tmp1 */,
$tmp3$$Register /* tmp2 */);
$tmp2$$Register /* tmp1 */);
%}
ins_pipe(pipe_cmpxchg);
%}
@ -279,8 +272,7 @@ instruct g1CompareAndSwapN(rRegI res, indirect mem, rRegN newval, rRegP tmp1, rR
write_barrier_post(masm, this,
$mem$$Register /* store_addr */,
$tmp1$$Register /* new_val */,
$tmp2$$Register /* tmp1 */,
$tmp3$$Register /* tmp2 */);
$tmp2$$Register /* tmp1 */);
%}
ins_pipe(pipe_cmpxchg);
%}
@ -303,8 +295,7 @@ instruct g1GetAndSetP(indirect mem, rRegP newval, rRegP tmp1, rRegP tmp2, rRegP
write_barrier_post(masm, this,
$mem$$Register /* store_addr */,
$tmp1$$Register /* new_val */,
$tmp2$$Register /* tmp1 */,
$tmp3$$Register /* tmp2 */);
$tmp2$$Register /* tmp1 */);
%}
ins_pipe(pipe_cmpxchg);
%}
@ -328,8 +319,7 @@ instruct g1GetAndSetN(indirect mem, rRegN newval, rRegP tmp1, rRegP tmp2, rRegP
write_barrier_post(masm, this,
$mem$$Register /* store_addr */,
$tmp1$$Register /* new_val */,
$tmp2$$Register /* tmp1 */,
$tmp3$$Register /* tmp2 */);
$tmp2$$Register /* tmp1 */);
%}
ins_pipe(pipe_cmpxchg);
%}

View File

@ -1365,7 +1365,6 @@ void AOTCodeAddressTable::init_extrs() {
#endif // COMPILER2
#if INCLUDE_G1GC
SET_ADDRESS(_extrs, G1BarrierSetRuntime::write_ref_field_post_entry);
SET_ADDRESS(_extrs, G1BarrierSetRuntime::write_ref_field_pre_entry);
#endif
#if INCLUDE_SHENANDOAHGC

View File

@ -23,12 +23,15 @@
*/
#include "c1/c1_CodeStubs.hpp"
#include "c1/c1_LIRAssembler.hpp"
#include "c1/c1_LIRGenerator.hpp"
#include "c1/c1_MacroAssembler.hpp"
#include "gc/g1/c1/g1BarrierSetC1.hpp"
#include "gc/g1/g1BarrierSet.hpp"
#include "gc/g1/g1BarrierSetAssembler.hpp"
#include "gc/g1/g1HeapRegion.hpp"
#include "gc/g1/g1ThreadLocalData.hpp"
#include "utilities/formatBuffer.hpp"
#include "utilities/macros.hpp"
#ifdef ASSERT
@ -42,11 +45,6 @@ void G1PreBarrierStub::emit_code(LIR_Assembler* ce) {
bs->gen_pre_barrier_stub(ce, this);
}
void G1PostBarrierStub::emit_code(LIR_Assembler* ce) {
G1BarrierSetAssembler* bs = (G1BarrierSetAssembler*)BarrierSet::barrier_set()->barrier_set_assembler();
bs->gen_post_barrier_stub(ce, this);
}
void G1BarrierSetC1::pre_barrier(LIRAccess& access, LIR_Opr addr_opr,
LIR_Opr pre_val, CodeEmitInfo* info) {
LIRGenerator* gen = access.gen();
@ -114,6 +112,87 @@ void G1BarrierSetC1::pre_barrier(LIRAccess& access, LIR_Opr addr_opr,
__ branch_destination(slow->continuation());
}
class LIR_OpG1PostBarrier : public LIR_Op {
friend class LIR_OpVisitState;
private:
LIR_Opr _addr;
LIR_Opr _new_val;
LIR_Opr _thread;
LIR_Opr _tmp1;
LIR_Opr _tmp2;
public:
LIR_OpG1PostBarrier(LIR_Opr addr,
LIR_Opr new_val,
LIR_Opr thread,
LIR_Opr tmp1,
LIR_Opr tmp2)
: LIR_Op(lir_none, lir_none, nullptr),
_addr(addr),
_new_val(new_val),
_thread(thread),
_tmp1(tmp1),
_tmp2(tmp2)
{}
virtual void visit(LIR_OpVisitState* state) {
state->do_input(_addr);
state->do_input(_new_val);
state->do_input(_thread);
// Use temps to enforce different registers.
state->do_temp(_addr);
state->do_temp(_new_val);
state->do_temp(_thread);
state->do_temp(_tmp1);
state->do_temp(_tmp2);
if (_info != nullptr) {
state->do_info(_info);
}
}
virtual void emit_code(LIR_Assembler* ce) {
if (_info != nullptr) {
ce->add_debug_info_for_null_check_here(_info);
}
Register addr = _addr->as_pointer_register();
Register new_val = _new_val->as_pointer_register();
Register thread = _thread->as_pointer_register();
Register tmp1 = _tmp1->as_pointer_register();
Register tmp2 = _tmp2->as_pointer_register();
// This may happen for a store of x.a = x - we do not need a post barrier for such
// stores as the cross-region test will always exit early anyway.
// The post barrier implementations can therefore assume that addr and new_val
// are different.
if (addr == new_val) {
ce->masm()->block_comment(err_msg("same addr/new_val due to self-referential store with imprecise card mark %s", addr->name()));
return;
}
G1BarrierSetAssembler* bs_asm = static_cast<G1BarrierSetAssembler*>(BarrierSet::barrier_set()->barrier_set_assembler());
bs_asm->g1_write_barrier_post_c1(ce->masm(), addr, new_val, thread, tmp1, tmp2);
}
virtual void print_instr(outputStream* out) const {
_addr->print(out); out->print(" ");
_new_val->print(out); out->print(" ");
_thread->print(out); out->print(" ");
_tmp1->print(out); out->print(" ");
_tmp2->print(out); out->print(" ");
out->cr();
}
#ifndef PRODUCT
virtual const char* name() const {
return "lir_g1_post_barrier";
}
#endif // PRODUCT
};
void G1BarrierSetC1::post_barrier(LIRAccess& access, LIR_Opr addr, LIR_Opr new_val) {
LIRGenerator* gen = access.gen();
DecoratorSet decorators = access.decorators();
@ -150,29 +229,11 @@ void G1BarrierSetC1::post_barrier(LIRAccess& access, LIR_Opr addr, LIR_Opr new_v
}
assert(addr->is_register(), "must be a register at this point");
LIR_Opr xor_res = gen->new_pointer_register();
LIR_Opr xor_shift_res = gen->new_pointer_register();
if (two_operand_lir_form) {
__ move(addr, xor_res);
__ logical_xor(xor_res, new_val, xor_res);
__ move(xor_res, xor_shift_res);
__ unsigned_shift_right(xor_shift_res,
LIR_OprFact::intConst(checked_cast<jint>(G1HeapRegion::LogOfHRGrainBytes)),
xor_shift_res,
LIR_Opr::illegalOpr());
} else {
__ logical_xor(addr, new_val, xor_res);
__ unsigned_shift_right(xor_res,
LIR_OprFact::intConst(checked_cast<jint>(G1HeapRegion::LogOfHRGrainBytes)),
xor_shift_res,
LIR_Opr::illegalOpr());
}
__ cmp(lir_cond_notEqual, xor_shift_res, LIR_OprFact::intptrConst(NULL_WORD));
CodeStub* slow = new G1PostBarrierStub(addr, new_val);
__ branch(lir_cond_notEqual, slow);
__ branch_destination(slow->continuation());
__ append(new LIR_OpG1PostBarrier(addr,
new_val,
gen->getThreadPointer() /* thread */,
gen->new_pointer_register() /* tmp1 */,
gen->new_pointer_register() /* tmp2 */));
}
void G1BarrierSetC1::load_at_resolved(LIRAccess& access, LIR_Opr result) {
@ -207,20 +268,9 @@ class C1G1PreBarrierCodeGenClosure : public StubAssemblerCodeGenClosure {
}
};
class C1G1PostBarrierCodeGenClosure : public StubAssemblerCodeGenClosure {
virtual OopMapSet* generate_code(StubAssembler* sasm) {
G1BarrierSetAssembler* bs = (G1BarrierSetAssembler*)BarrierSet::barrier_set()->barrier_set_assembler();
bs->generate_c1_post_barrier_runtime_stub(sasm);
return nullptr;
}
};
bool G1BarrierSetC1::generate_c1_runtime_stubs(BufferBlob* buffer_blob) {
C1G1PreBarrierCodeGenClosure pre_code_gen_cl;
C1G1PostBarrierCodeGenClosure post_code_gen_cl;
_pre_barrier_c1_runtime_code_blob = Runtime1::generate_blob(buffer_blob, StubId::NO_STUBID, "g1_pre_barrier_slow",
false, &pre_code_gen_cl);
_post_barrier_c1_runtime_code_blob = Runtime1::generate_blob(buffer_blob, StubId::NO_STUBID, "g1_post_barrier_slow",
false, &post_code_gen_cl);
return _pre_barrier_c1_runtime_code_blob != nullptr && _post_barrier_c1_runtime_code_blob != nullptr;
return _pre_barrier_c1_runtime_code_blob != nullptr;
}

View File

@ -91,40 +91,11 @@ class G1PreBarrierStub: public CodeStub {
#endif // PRODUCT
};
class G1PostBarrierStub: public CodeStub {
friend class G1BarrierSetC1;
private:
LIR_Opr _addr;
LIR_Opr _new_val;
public:
// addr (the address of the object head) and new_val must be registers.
G1PostBarrierStub(LIR_Opr addr, LIR_Opr new_val): _addr(addr), _new_val(new_val) {
FrameMap* f = Compilation::current()->frame_map();
f->update_reserved_argument_area_size(2 * BytesPerWord);
}
LIR_Opr addr() const { return _addr; }
LIR_Opr new_val() const { return _new_val; }
virtual void emit_code(LIR_Assembler* e);
virtual void visit(LIR_OpVisitState* visitor) {
// don't pass in the code emit info since it's processed in the fast path
visitor->do_slow_case();
visitor->do_input(_addr);
visitor->do_input(_new_val);
}
#ifndef PRODUCT
virtual void print_name(outputStream* out) const { out->print("G1PostBarrierStub"); }
#endif // PRODUCT
};
class CodeBlob;
class G1BarrierSetC1 : public ModRefBarrierSetC1 {
protected:
CodeBlob* _pre_barrier_c1_runtime_code_blob;
CodeBlob* _post_barrier_c1_runtime_code_blob;
virtual void pre_barrier(LIRAccess& access, LIR_Opr addr_opr,
LIR_Opr pre_val, CodeEmitInfo* info);
@ -134,11 +105,9 @@ class G1BarrierSetC1 : public ModRefBarrierSetC1 {
public:
G1BarrierSetC1()
: _pre_barrier_c1_runtime_code_blob(nullptr),
_post_barrier_c1_runtime_code_blob(nullptr) {}
: _pre_barrier_c1_runtime_code_blob(nullptr) {}
CodeBlob* pre_barrier_c1_runtime_code_blob() { return _pre_barrier_c1_runtime_code_blob; }
CodeBlob* post_barrier_c1_runtime_code_blob() { return _post_barrier_c1_runtime_code_blob; }
virtual bool generate_c1_runtime_stubs(BufferBlob* buffer_blob);
};

View File

@ -298,7 +298,13 @@ uint G1BarrierSetC2::estimated_barrier_size(const Node* node) const {
nodes += 6;
}
if ((barrier_data & G1C2BarrierPost) != 0) {
nodes += 60;
// Approximate the number of nodes needed; an if costs 4 nodes (Cmp, Bool,
// If, If projection), any other (Assembly) instruction is approximated with
// a cost of 1.
nodes += 4 // base cost for the card write: getting the card table base offset, address calculation and the card write itself
+ 6 // same region check: Uncompress (new_val) oop, xor, shr, (cmp), jmp
+ 4 // new_val is null check
+ (UseCondCardMark ? 4 : 0); // card not clean check.
}
return nodes;
}
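As a worked example of the new estimate: with UseCondCardMark enabled (which this change makes the ergonomic default for G1), a post barrier is costed at 4 + 6 + 4 + 4 = 18 nodes; with conditional card marking disabled it is 4 + 6 + 4 = 14 nodes, compared to the previous flat estimate of 60.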
@ -386,8 +392,9 @@ public:
}
bool needs_liveness_data(const MachNode* mach) const {
return G1PreBarrierStubC2::needs_barrier(mach) ||
G1PostBarrierStubC2::needs_barrier(mach);
// Liveness data is only required to compute registers that must be preserved
// across the runtime call in the pre-barrier stub.
return G1BarrierStubC2::needs_pre_barrier(mach);
}
bool needs_livein_data() const {
@ -401,10 +408,22 @@ static G1BarrierSetC2State* barrier_set_state() {
G1BarrierStubC2::G1BarrierStubC2(const MachNode* node) : BarrierStubC2(node) {}
bool G1BarrierStubC2::needs_pre_barrier(const MachNode* node) {
return (node->barrier_data() & G1C2BarrierPre) != 0;
}
bool G1BarrierStubC2::needs_post_barrier(const MachNode* node) {
return (node->barrier_data() & G1C2BarrierPost) != 0;
}
bool G1BarrierStubC2::post_new_val_may_be_null(const MachNode* node) {
return (node->barrier_data() & G1C2BarrierPostNotNull) == 0;
}
G1PreBarrierStubC2::G1PreBarrierStubC2(const MachNode* node) : G1BarrierStubC2(node) {}
bool G1PreBarrierStubC2::needs_barrier(const MachNode* node) {
return (node->barrier_data() & G1C2BarrierPre) != 0;
return needs_pre_barrier(node);
}
G1PreBarrierStubC2* G1PreBarrierStubC2::create(const MachNode* node) {
@ -448,48 +467,6 @@ void G1PreBarrierStubC2::emit_code(MacroAssembler& masm) {
bs->generate_c2_pre_barrier_stub(&masm, this);
}
G1PostBarrierStubC2::G1PostBarrierStubC2(const MachNode* node) : G1BarrierStubC2(node) {}
bool G1PostBarrierStubC2::needs_barrier(const MachNode* node) {
return (node->barrier_data() & G1C2BarrierPost) != 0;
}
G1PostBarrierStubC2* G1PostBarrierStubC2::create(const MachNode* node) {
G1PostBarrierStubC2* const stub = new (Compile::current()->comp_arena()) G1PostBarrierStubC2(node);
if (!Compile::current()->output()->in_scratch_emit_size()) {
barrier_set_state()->stubs()->append(stub);
}
return stub;
}
void G1PostBarrierStubC2::initialize_registers(Register thread, Register tmp1, Register tmp2, Register tmp3) {
_thread = thread;
_tmp1 = tmp1;
_tmp2 = tmp2;
_tmp3 = tmp3;
}
Register G1PostBarrierStubC2::thread() const {
return _thread;
}
Register G1PostBarrierStubC2::tmp1() const {
return _tmp1;
}
Register G1PostBarrierStubC2::tmp2() const {
return _tmp2;
}
Register G1PostBarrierStubC2::tmp3() const {
return _tmp3;
}
void G1PostBarrierStubC2::emit_code(MacroAssembler& masm) {
G1BarrierSetAssembler* bs = static_cast<G1BarrierSetAssembler*>(BarrierSet::barrier_set()->barrier_set_assembler());
bs->generate_c2_post_barrier_stub(&masm, this);
}
void* G1BarrierSetC2::create_barrier_state(Arena* comp_arena) const {
return new (comp_arena) G1BarrierSetC2State(comp_arena);
}

View File

@ -37,6 +37,10 @@ const int G1C2BarrierPostNotNull = 4;
class G1BarrierStubC2 : public BarrierStubC2 {
public:
static bool needs_pre_barrier(const MachNode* node);
static bool needs_post_barrier(const MachNode* node);
static bool post_new_val_may_be_null(const MachNode* node);
G1BarrierStubC2(const MachNode* node);
virtual void emit_code(MacroAssembler& masm) = 0;
};
@ -64,27 +68,6 @@ public:
virtual void emit_code(MacroAssembler& masm);
};
class G1PostBarrierStubC2 : public G1BarrierStubC2 {
private:
Register _thread;
Register _tmp1;
Register _tmp2;
Register _tmp3;
protected:
G1PostBarrierStubC2(const MachNode* node);
public:
static bool needs_barrier(const MachNode* node);
static G1PostBarrierStubC2* create(const MachNode* node);
void initialize_registers(Register thread, Register tmp1 = noreg, Register tmp2 = noreg, Register tmp3 = noreg);
Register thread() const;
Register tmp1() const;
Register tmp2() const;
Register tmp3() const;
virtual void emit_code(MacroAssembler& masm);
};
class G1BarrierSetC2: public CardTableBarrierSetC2 {
private:
void analyze_dominating_barriers() const;

View File

@ -262,9 +262,6 @@ HeapWord* G1Allocator::survivor_attempt_allocation(uint node_index,
}
}
}
if (result != nullptr) {
_g1h->dirty_young_block(result, *actual_word_size);
}
return result;
}

View File

@ -37,12 +37,10 @@
// They were chosen by running GCOld and SPECjbb on debris with different
// numbers of GC threads and choosing them based on the results
static double cost_per_logged_card_ms_defaults[] = {
0.01, 0.005, 0.005, 0.003, 0.003, 0.002, 0.002, 0.0015
};
static double cost_per_pending_card_ms_default = 0.01;
// all the same
static double young_card_scan_to_merge_ratio_defaults[] = {
static double young_card_merge_to_scan_ratio_defaults[] = {
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
};
@ -78,8 +76,7 @@ G1Analytics::G1Analytics(const G1Predictions* predictor) :
_concurrent_gc_cpu_time_ms(),
_concurrent_refine_rate_ms_seq(TruncatedSeqLength),
_dirtied_cards_rate_ms_seq(TruncatedSeqLength),
_dirtied_cards_in_thread_buffers_seq(TruncatedSeqLength),
_card_scan_to_merge_ratio_seq(TruncatedSeqLength),
_card_merge_to_scan_ratio_seq(TruncatedSeqLength),
_cost_per_card_scan_ms_seq(TruncatedSeqLength),
_cost_per_card_merge_ms_seq(TruncatedSeqLength),
_cost_per_code_root_ms_seq(TruncatedSeqLength),
@ -87,6 +84,7 @@ G1Analytics::G1Analytics(const G1Predictions* predictor) :
_pending_cards_seq(TruncatedSeqLength),
_card_rs_length_seq(TruncatedSeqLength),
_code_root_rs_length_seq(TruncatedSeqLength),
_merge_refinement_table_ms_seq(TruncatedSeqLength),
_constant_other_time_ms_seq(TruncatedSeqLength),
_young_other_cost_per_region_ms_seq(TruncatedSeqLength),
_non_young_other_cost_per_region_ms_seq(TruncatedSeqLength),
@ -100,17 +98,17 @@ G1Analytics::G1Analytics(const G1Predictions* predictor) :
uint index = MIN2(ParallelGCThreads - 1, 7u);
// Start with inverse of maximum STW cost.
_concurrent_refine_rate_ms_seq.add(1/cost_per_logged_card_ms_defaults[0]);
// Some applications have very low rates for logging cards.
_concurrent_refine_rate_ms_seq.add(1 / cost_per_pending_card_ms_default);
// Some applications have very low rates for dirtying cards.
_dirtied_cards_rate_ms_seq.add(0.0);
_card_scan_to_merge_ratio_seq.set_initial(young_card_scan_to_merge_ratio_defaults[index]);
_card_merge_to_scan_ratio_seq.set_initial(young_card_merge_to_scan_ratio_defaults[index]);
_cost_per_card_scan_ms_seq.set_initial(young_only_cost_per_card_scan_ms_defaults[index]);
_card_rs_length_seq.set_initial(0);
_code_root_rs_length_seq.set_initial(0);
_cost_per_byte_copied_ms_seq.set_initial(cost_per_byte_ms_defaults[index]);
_merge_refinement_table_ms_seq.add(0);
_constant_other_time_ms_seq.add(constant_other_time_ms_defaults[index]);
_young_other_cost_per_region_ms_seq.add(young_other_cost_per_region_ms_defaults[index]);
_non_young_other_cost_per_region_ms_seq.add(non_young_other_cost_per_region_ms_defaults[index]);
@ -196,10 +194,6 @@ void G1Analytics::report_dirtied_cards_rate_ms(double cards_per_ms) {
_dirtied_cards_rate_ms_seq.add(cards_per_ms);
}
void G1Analytics::report_dirtied_cards_in_thread_buffers(size_t cards) {
_dirtied_cards_in_thread_buffers_seq.add(double(cards));
}
void G1Analytics::report_cost_per_card_scan_ms(double cost_per_card_ms, bool for_young_only_phase) {
_cost_per_card_scan_ms_seq.add(cost_per_card_ms, for_young_only_phase);
}
@ -212,8 +206,8 @@ void G1Analytics::report_cost_per_code_root_scan_ms(double cost_per_code_root_ms
_cost_per_code_root_ms_seq.add(cost_per_code_root_ms, for_young_only_phase);
}
void G1Analytics::report_card_scan_to_merge_ratio(double merge_to_scan_ratio, bool for_young_only_phase) {
_card_scan_to_merge_ratio_seq.add(merge_to_scan_ratio, for_young_only_phase);
void G1Analytics::report_card_merge_to_scan_ratio(double merge_to_scan_ratio, bool for_young_only_phase) {
_card_merge_to_scan_ratio_seq.add(merge_to_scan_ratio, for_young_only_phase);
}
void G1Analytics::report_cost_per_byte_ms(double cost_per_byte_ms, bool for_young_only_phase) {
@ -228,6 +222,10 @@ void G1Analytics::report_non_young_other_cost_per_region_ms(double other_cost_pe
_non_young_other_cost_per_region_ms_seq.add(other_cost_per_region_ms);
}
void G1Analytics::report_merge_refinement_table_time_ms(double merge_refinement_table_time_ms) {
_merge_refinement_table_ms_seq.add(merge_refinement_table_time_ms);
}
void G1Analytics::report_constant_other_time_ms(double constant_other_time_ms) {
_constant_other_time_ms_seq.add(constant_other_time_ms);
}
@ -260,12 +258,8 @@ double G1Analytics::predict_dirtied_cards_rate_ms() const {
return predict_zero_bounded(&_dirtied_cards_rate_ms_seq);
}
size_t G1Analytics::predict_dirtied_cards_in_thread_buffers() const {
return predict_size(&_dirtied_cards_in_thread_buffers_seq);
}
size_t G1Analytics::predict_scan_card_num(size_t card_rs_length, bool for_young_only_phase) const {
return card_rs_length * predict_in_unit_interval(&_card_scan_to_merge_ratio_seq, for_young_only_phase);
return card_rs_length * predict_in_unit_interval(&_card_merge_to_scan_ratio_seq, for_young_only_phase);
}
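For example, assuming a predicted merge-to-scan ratio of 0.6 (predict_in_unit_interval clamps the value to the unit interval) and a card_rs_length of 50000, the predicted number of cards to scan is 50000 * 0.6 = 30000.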
double G1Analytics::predict_card_merge_time_ms(size_t card_num, bool for_young_only_phase) const {
@ -284,6 +278,10 @@ double G1Analytics::predict_object_copy_time_ms(size_t bytes_to_copy, bool for_y
return bytes_to_copy * predict_zero_bounded(&_cost_per_byte_copied_ms_seq, for_young_only_phase);
}
double G1Analytics::predict_merge_refinement_table_time_ms() const {
return predict_zero_bounded(&_merge_refinement_table_ms_seq);
}
double G1Analytics::predict_constant_other_time_ms() const {
return predict_zero_bounded(&_constant_other_time_ms_seq);
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2016, 2022, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2016, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -56,14 +56,13 @@ class G1Analytics: public CHeapObj<mtGC> {
TruncatedSeq _concurrent_refine_rate_ms_seq;
TruncatedSeq _dirtied_cards_rate_ms_seq;
TruncatedSeq _dirtied_cards_in_thread_buffers_seq;
// The ratio between the number of scanned cards and actually merged cards, for
// young-only and mixed gcs.
G1PhaseDependentSeq _card_scan_to_merge_ratio_seq;
// The ratio of the number of merged cards to actually scanned cards for
// card-based remembered sets, for young-only and mixed gcs.
G1PhaseDependentSeq _card_merge_to_scan_ratio_seq;
// The cost to scan a card during young-only and mixed gcs in ms.
G1PhaseDependentSeq _cost_per_card_scan_ms_seq;
// The cost to merge a card during young-only and mixed gcs in ms.
// The cost to merge a card from the remembered sets for non-young regions in ms.
G1PhaseDependentSeq _cost_per_card_merge_ms_seq;
// The cost to scan entries in the code root remembered set in ms.
G1PhaseDependentSeq _cost_per_code_root_ms_seq;
@ -74,6 +73,8 @@ class G1Analytics: public CHeapObj<mtGC> {
G1PhaseDependentSeq _card_rs_length_seq;
G1PhaseDependentSeq _code_root_rs_length_seq;
// Prediction for the time to merge the refinement table into the card table during GC.
TruncatedSeq _merge_refinement_table_ms_seq;
TruncatedSeq _constant_other_time_ms_seq;
TruncatedSeq _young_other_cost_per_region_ms_seq;
TruncatedSeq _non_young_other_cost_per_region_ms_seq;
@ -149,14 +150,14 @@ public:
void report_alloc_rate_ms(double alloc_rate);
void report_concurrent_refine_rate_ms(double cards_per_ms);
void report_dirtied_cards_rate_ms(double cards_per_ms);
void report_dirtied_cards_in_thread_buffers(size_t num_cards);
void report_cost_per_card_scan_ms(double cost_per_remset_card_ms, bool for_young_only_phase);
void report_cost_per_card_merge_ms(double cost_per_card_ms, bool for_young_only_phase);
void report_cost_per_code_root_scan_ms(double cost_per_code_root_ms, bool for_young_only_phase);
void report_card_scan_to_merge_ratio(double cards_per_entry_ratio, bool for_young_only_phase);
void report_card_merge_to_scan_ratio(double merge_to_scan_ratio, bool for_young_only_phase);
void report_cost_per_byte_ms(double cost_per_byte_ms, bool for_young_only_phase);
void report_young_other_cost_per_region_ms(double other_cost_per_region_ms);
void report_non_young_other_cost_per_region_ms(double other_cost_per_region_ms);
void report_merge_refinement_table_time_ms(double pending_card_merge_time_ms);
void report_constant_other_time_ms(double constant_other_time_ms);
void report_pending_cards(double pending_cards, bool for_young_only_phase);
void report_card_rs_length(double card_rs_length, bool for_young_only_phase);
@ -167,7 +168,6 @@ public:
double predict_concurrent_refine_rate_ms() const;
double predict_dirtied_cards_rate_ms() const;
size_t predict_dirtied_cards_in_thread_buffers() const;
// Predict how many of the given remembered set of length card_rs_length will add to
// the number of total cards scanned.
@ -180,6 +180,7 @@ public:
double predict_object_copy_time_ms(size_t bytes_to_copy, bool for_young_only_phase) const;
double predict_merge_refinement_table_time_ms() const;
double predict_constant_other_time_ms() const;
double predict_young_other_time_ms(size_t young_num) const;

View File

@ -68,6 +68,12 @@ void G1Arguments::initialize_alignments() {
if (FLAG_IS_DEFAULT(G1EagerReclaimRemSetThreshold)) {
FLAG_SET_ERGO(G1EagerReclaimRemSetThreshold, G1RemSetArrayOfCardsEntries);
}
// G1 prefers to use conditional card marking to avoid overwriting cards that
// have already been found to contain a to-collection-set reference. This reduces
// refinement effort.
if (FLAG_IS_DEFAULT(UseCondCardMark)) {
FLAG_SET_ERGO(UseCondCardMark, true);
}
}
size_t G1Arguments::conservative_max_heap_alignment() {
@ -241,9 +247,8 @@ void G1Arguments::initialize() {
// Verify that the maximum parallelism isn't too high to eventually overflow
// the refcount in G1CardSetContainer.
uint max_parallel_refinement_threads = G1ConcRefinementThreads + G1DirtyCardQueueSet::num_par_ids();
uint const divisor = 3; // Safe divisor; we increment by 2 for each claim, but there is a small initial value.
if (max_parallel_refinement_threads > UINT_MAX / divisor) {
if (G1ConcRefinementThreads > UINT_MAX / divisor) {
vm_exit_during_initialization("Too large parallelism for remembered sets.");
}

View File

@ -32,12 +32,14 @@
#include "gc/g1/g1ThreadLocalData.hpp"
#include "gc/shared/satbMarkQueue.hpp"
#include "logging/log.hpp"
#include "memory/iterator.hpp"
#include "oops/access.inline.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/oop.inline.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/orderAccess.hpp"
#include "runtime/threads.hpp"
#include "utilities/macros.hpp"
#ifdef COMPILER1
#include "gc/g1/c1/g1BarrierSetC1.hpp"
@ -49,18 +51,38 @@
class G1BarrierSetC1;
class G1BarrierSetC2;
G1BarrierSet::G1BarrierSet(G1CardTable* card_table) :
G1BarrierSet::G1BarrierSet(G1CardTable* card_table,
G1CardTable* refinement_table) :
CardTableBarrierSet(make_barrier_set_assembler<G1BarrierSetAssembler>(),
make_barrier_set_c1<G1BarrierSetC1>(),
make_barrier_set_c2<G1BarrierSetC2>(),
card_table,
BarrierSet::FakeRtti(BarrierSet::G1BarrierSet)),
_satb_mark_queue_buffer_allocator("SATB Buffer Allocator", G1SATBBufferSize),
_dirty_card_queue_buffer_allocator("DC Buffer Allocator", G1UpdateBufferSize),
_satb_mark_queue_set(&_satb_mark_queue_buffer_allocator),
_dirty_card_queue_set(&_dirty_card_queue_buffer_allocator)
_refinement_table(refinement_table)
{}
G1BarrierSet::~G1BarrierSet() {
delete _refinement_table;
}
void G1BarrierSet::swap_global_card_table() {
G1CardTable* temp = static_cast<G1CardTable*>(_card_table);
_card_table = _refinement_table;
_refinement_table = temp;
}
void G1BarrierSet::update_card_table_base(Thread* thread) {
#ifdef ASSERT
{
ResourceMark rm;
assert(thread->is_Java_thread(), "may only update card table base of JavaThreads, not %s", thread->name());
}
#endif
G1ThreadLocalData::set_byte_map_base(thread, _card_table->byte_map_base());
}
template <class T> void
G1BarrierSet::write_ref_array_pre_work(T* dst, size_t count) {
G1SATBMarkQueueSet& queue_set = G1BarrierSet::satb_mark_queue_set();
@ -89,28 +111,14 @@ void G1BarrierSet::write_ref_array_pre(narrowOop* dst, size_t count, bool dest_u
}
}
void G1BarrierSet::write_ref_field_post_slow(volatile CardValue* byte) {
// In the slow path, we know a card is not young
assert(*byte != G1CardTable::g1_young_card_val(), "slow path invoked without filtering");
OrderAccess::storeload();
if (*byte != G1CardTable::dirty_card_val()) {
*byte = G1CardTable::dirty_card_val();
Thread* thr = Thread::current();
G1DirtyCardQueue& queue = G1ThreadLocalData::dirty_card_queue(thr);
G1BarrierSet::dirty_card_queue_set().enqueue(queue, byte);
}
}
void G1BarrierSet::write_region(JavaThread* thread, MemRegion mr) {
if (mr.is_empty()) {
return;
}
volatile CardValue* byte = _card_table->byte_for(mr.start());
CardValue* last_byte = _card_table->byte_for(mr.last());
// skip young gen cards
if (*byte == G1CardTable::g1_young_card_val()) {
// MemRegion should not span multiple regions for the young gen.
// Skip writes to young gen.
if (G1CollectedHeap::heap()->heap_region_containing(mr.start())->is_young()) {
// MemRegion should not span multiple regions for arrays in young gen.
DEBUG_ONLY(G1HeapRegion* containing_hr = G1CollectedHeap::heap()->heap_region_containing(mr.start());)
assert(containing_hr->is_young(), "it should be young");
assert(containing_hr->is_in(mr.start()), "it should contain start");
@ -118,16 +126,25 @@ void G1BarrierSet::write_region(JavaThread* thread, MemRegion mr) {
return;
}
OrderAccess::storeload();
// Enqueue if necessary.
G1DirtyCardQueueSet& qset = G1BarrierSet::dirty_card_queue_set();
G1DirtyCardQueue& queue = G1ThreadLocalData::dirty_card_queue(thread);
// We need to make sure that we get the start/end byte information for the area
// to mark from the same card table to avoid getting confused in the mark loop
// further below - we might execute while the global card table is being switched.
//
// It does not matter which card table we write to: at worst we may write to the
// new card table (after the switch), which means that we will catch the
// marks next time.
// If we write to the old card table (which after the switch has become the
// refinement table), the upcoming handshake will do the memory synchronization.
CardTable* card_table = AtomicAccess::load(&_card_table);
volatile CardValue* byte = card_table->byte_for(mr.start());
CardValue* last_byte = card_table->byte_for(mr.last());
// Dirty cards only if necessary.
for (; byte <= last_byte; byte++) {
CardValue bv = *byte;
assert(bv != G1CardTable::g1_young_card_val(), "Invalid card");
if (bv != G1CardTable::dirty_card_val()) {
if (bv == G1CardTable::clean_card_val()) {
*byte = G1CardTable::dirty_card_val();
qset.enqueue(queue, byte);
}
}
}
@ -148,14 +165,15 @@ void G1BarrierSet::on_thread_attach(Thread* thread) {
assert(!satbq.is_active(), "SATB queue should not be active");
assert(satbq.buffer() == nullptr, "SATB queue should not have a buffer");
assert(satbq.index() == 0, "SATB queue index should be zero");
G1DirtyCardQueue& dirtyq = G1ThreadLocalData::dirty_card_queue(thread);
assert(dirtyq.buffer() == nullptr, "Dirty Card queue should not have a buffer");
assert(dirtyq.index() == 0, "Dirty Card queue index should be zero");
// If we are creating the thread during a marking cycle, we should
// set the active field of the SATB queue to true. That involves
// copying the global is_active value to this thread's queue.
satbq.set_active(_satb_mark_queue_set.is_active());
if (thread->is_Java_thread()) {
assert(Threads_lock->is_locked(), "must be, synchronization with refinement.");
update_card_table_base(thread);
}
}
void G1BarrierSet::on_thread_detach(Thread* thread) {
@ -165,14 +183,13 @@ void G1BarrierSet::on_thread_detach(Thread* thread) {
SATBMarkQueue& queue = G1ThreadLocalData::satb_mark_queue(thread);
G1BarrierSet::satb_mark_queue_set().flush_queue(queue);
}
{
G1DirtyCardQueue& queue = G1ThreadLocalData::dirty_card_queue(thread);
G1DirtyCardQueueSet& qset = G1BarrierSet::dirty_card_queue_set();
qset.flush_queue(queue);
qset.record_detached_refinement_stats(queue.refinement_stats());
}
{
G1RegionPinCache& cache = G1ThreadLocalData::pin_count_cache(thread);
cache.flush();
}
}
void G1BarrierSet::print_on(outputStream* st) const {
_card_table->print_on(st, "Card");
_refinement_table->print_on(st, "Refinement");
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2001, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2001, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -25,32 +25,65 @@
#ifndef SHARE_GC_G1_G1BARRIERSET_HPP
#define SHARE_GC_G1_G1BARRIERSET_HPP
#include "gc/g1/g1DirtyCardQueue.hpp"
#include "gc/g1/g1SATBMarkQueueSet.hpp"
#include "gc/shared/bufferNode.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
class G1CardTable;
class Thread;
// This barrier is specialized to use a logging barrier to support
// snapshot-at-the-beginning marking.
// This barrier set is specialized to manage two card tables:
// * one the mutator is currently working on ("card table")
// * one the refinement threads or GC during pause are working on ("refinement table")
//
// The card table acts like a regular card table where the mutator dirties cards
// containing potentially interesting references.
//
// When the number of dirty cards on the card table exceeds a threshold, G1 swaps
// the card tables and has the refinement threads process the dirty cards by
// "refining" them.
// I.e. refinement looks at all dirty cards on the refinement table, and updates
// the remembered sets accordingly, clearing the cards on the refinement table.
//
// Meanwhile the mutator continues dirtying the now empty card table.
//
// This separation of data the mutator and refinement threads are working on
// removes the need for any fine-grained (per mutator write) synchronization between
// them, keeping the write barrier simple.
//
// The refinement threads mark cards in the current collection set specially on the
// card table - this is fine wrt synchronization with the mutator, because at
// most the mutator will overwrite it again if there is a race, as G1 will scan the
// entire card either way during the GC pause.
//
// During garbage collection, if the refinement table is known to be non-empty, G1
// merges it back into the card table (cleaning the refinement table in the process);
// the card table is then scanned for dirty cards.
//
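The practical consequence of the design above is that the mutator's post-write barrier shrinks to a plain "if the card is clean, make it dirty" store on the current card table, with no enqueueing and no StoreLoad fence. The following stand-alone sketch is illustrative only and not part of this patch; the example names, the 512-byte card size and the biased byte_map_base are assumptions, while the actual HotSpot code is the inline G1BarrierSet::write_ref_field_post shown further down in this change.
#include <cstdint>
namespace example {                       // illustrative model, not HotSpot code
  using CardValue = uint8_t;
  constexpr CardValue clean_card = 0xff;  // LSB == 1 -> clean
  constexpr CardValue dirty_card = 0x00;  // LSB == 0 -> non-clean
  constexpr int card_shift = 9;           // 512-byte cards (assumed)
  struct CardTableModel {
    CardValue* byte_map_base;             // biased base, indexed by (address >> card_shift)
    // Mutator post-write barrier under the two-table design: no queueing, no
    // StoreLoad fence, no young-gen filtering - just "if clean, make dirty".
    void write_ref_field_post(const void* field) {
      CardValue* byte = byte_map_base + (reinterpret_cast<uintptr_t>(field) >> card_shift);
      if (*byte == clean_card) {
        *byte = dirty_card;
      }
    }
  };
}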
class G1BarrierSet: public CardTableBarrierSet {
friend class VMStructs;
private:
BufferNode::Allocator _satb_mark_queue_buffer_allocator;
BufferNode::Allocator _dirty_card_queue_buffer_allocator;
G1SATBMarkQueueSet _satb_mark_queue_set;
G1DirtyCardQueueSet _dirty_card_queue_set;
G1CardTable* _refinement_table;
public:
G1BarrierSet(G1CardTable* card_table, G1CardTable* refinement_table);
virtual ~G1BarrierSet();
static G1BarrierSet* g1_barrier_set() {
return barrier_set_cast<G1BarrierSet>(BarrierSet::barrier_set());
}
public:
G1BarrierSet(G1CardTable* table);
~G1BarrierSet() { }
G1CardTable* refinement_table() const { return _refinement_table; }
// Swap the global card table references, without synchronization.
void swap_global_card_table();
// Update the given thread's card table (byte map) base to the current card table's.
void update_card_table_base(Thread* thread);
virtual bool card_mark_must_follow_store() const {
return true;
@ -74,9 +107,8 @@ class G1BarrierSet: public CardTableBarrierSet {
inline void write_region(MemRegion mr);
void write_region(JavaThread* thread, MemRegion mr);
template <DecoratorSet decorators, typename T>
template <DecoratorSet decorators = DECORATORS_NONE, typename T>
void write_ref_field_post(T* field);
void write_ref_field_post_slow(volatile CardValue* byte);
virtual void on_thread_create(Thread* thread);
virtual void on_thread_destroy(Thread* thread);
@ -87,9 +119,7 @@ class G1BarrierSet: public CardTableBarrierSet {
return g1_barrier_set()->_satb_mark_queue_set;
}
static G1DirtyCardQueueSet& dirty_card_queue_set() {
return g1_barrier_set()->_dirty_card_queue_set;
}
virtual void print_on(outputStream* st) const;
// Callbacks for runtime accesses.
template <DecoratorSet decorators, typename BarrierSetT = G1BarrierSet>

View File

@ -75,9 +75,8 @@ inline void G1BarrierSet::write_region(MemRegion mr) {
template <DecoratorSet decorators, typename T>
inline void G1BarrierSet::write_ref_field_post(T* field) {
volatile CardValue* byte = _card_table->byte_for(field);
if (*byte != G1CardTable::g1_young_card_val()) {
// Take a slow path for cards in old
write_ref_field_post_slow(byte);
if (*byte == G1CardTable::clean_card_val()) {
*byte = G1CardTable::dirty_card_val();
}
}
@ -127,7 +126,7 @@ inline void G1BarrierSet::AccessBarrier<decorators, BarrierSetT>::
oop_store_not_in_heap(T* addr, oop new_value) {
// Apply SATB barriers for all non-heap references, to allow
// concurrent scanning of such references.
G1BarrierSet *bs = barrier_set_cast<G1BarrierSet>(BarrierSet::barrier_set());
G1BarrierSet *bs = g1_barrier_set();
bs->write_ref_field_pre<decorators>(addr);
Raw::oop_store(addr, new_value);
}

View File

@ -29,17 +29,17 @@
#include "utilities/macros.hpp"
void G1BarrierSetRuntime::write_ref_array_pre_oop_entry(oop* dst, size_t length) {
G1BarrierSet *bs = barrier_set_cast<G1BarrierSet>(BarrierSet::barrier_set());
G1BarrierSet *bs = G1BarrierSet::g1_barrier_set();
bs->write_ref_array_pre(dst, length, false);
}
void G1BarrierSetRuntime::write_ref_array_pre_narrow_oop_entry(narrowOop* dst, size_t length) {
G1BarrierSet *bs = barrier_set_cast<G1BarrierSet>(BarrierSet::barrier_set());
G1BarrierSet *bs = G1BarrierSet::g1_barrier_set();
bs->write_ref_array_pre(dst, length, false);
}
void G1BarrierSetRuntime::write_ref_array_post_entry(HeapWord* dst, size_t length) {
G1BarrierSet *bs = barrier_set_cast<G1BarrierSet>(BarrierSet::barrier_set());
G1BarrierSet *bs = G1BarrierSet::g1_barrier_set();
bs->G1BarrierSet::write_ref_array(dst, length);
}
@ -53,14 +53,6 @@ JRT_LEAF(void, G1BarrierSetRuntime::write_ref_field_pre_entry(oopDesc* orig, Jav
G1BarrierSet::satb_mark_queue_set().enqueue_known_active(queue, orig);
JRT_END
// G1 post write barrier slowpath
JRT_LEAF(void, G1BarrierSetRuntime::write_ref_field_post_entry(volatile G1CardTable::CardValue* card_addr,
JavaThread* thread))
assert(thread == JavaThread::current(), "pre-condition");
G1DirtyCardQueue& queue = G1ThreadLocalData::dirty_card_queue(thread);
G1BarrierSet::dirty_card_queue_set().enqueue(queue, card_addr);
JRT_END
JRT_LEAF(void, G1BarrierSetRuntime::clone(oopDesc* src, oopDesc* dst, size_t size))
HeapAccess<>::clone(src, dst, size);
JRT_END

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -47,7 +47,6 @@ public:
// C2 slow-path runtime calls.
static void write_ref_field_pre_entry(oopDesc* orig, JavaThread *thread);
static void write_ref_field_post_entry(volatile CardValue* card_addr, JavaThread* thread);
static address clone_addr();
};

View File

@ -28,18 +28,37 @@
#include "logging/log.hpp"
#include "runtime/os.hpp"
void G1CardTable::g1_mark_as_young(const MemRegion& mr) {
CardValue *const first = byte_for(mr.start());
CardValue *const last = byte_after(mr.last());
void G1CardTable::verify_region(MemRegion mr, CardValue val, bool val_equals) {
if (mr.is_empty()) {
return;
}
CardValue* start = byte_for(mr.start());
CardValue* end = byte_for(mr.last());
memset_with_concurrent_readers(first, g1_young_gen, pointer_delta(last, first, sizeof(CardValue)));
}
G1CollectedHeap* g1h = G1CollectedHeap::heap();
G1HeapRegion* r = g1h->heap_region_containing(mr.start());
#ifndef PRODUCT
void G1CardTable::verify_g1_young_region(MemRegion mr) {
verify_region(mr, g1_young_gen, true);
assert(r == g1h->heap_region_containing(mr.last()), "MemRegion crosses region");
bool failures = false;
for (CardValue* curr = start; curr <= end; ++curr) {
CardValue curr_val = *curr;
bool failed = (val_equals) ? (curr_val != val) : (curr_val == val);
if (failed) {
if (!failures) {
log_error(gc, verify)("== CT verification failed: [" PTR_FORMAT "," PTR_FORMAT "] r: %d (%s) %sexpecting value: %d",
p2i(start), p2i(end), r->hrm_index(), r->get_short_type_str(),
(val_equals) ? "" : "not ", val);
failures = true;
}
log_error(gc, verify)("== card " PTR_FORMAT " [" PTR_FORMAT "," PTR_FORMAT "], val: %d",
p2i(curr), p2i(addr_for(curr)),
p2i((HeapWord*) (((size_t) addr_for(curr)) + _card_size)),
(int) curr_val);
}
}
guarantee(!failures, "there should not have been any failures");
}
#endif
void G1CardTableChangedListener::on_commit(uint start_idx, size_t num_regions, bool zero_filled) {
// Default value for a clean card on the card table is -1. So we cannot take advantage of the zero_filled parameter.
@ -74,6 +93,5 @@ void G1CardTable::initialize(G1RegionToSpaceMapper* mapper) {
}
bool G1CardTable::is_in_young(const void* p) const {
volatile CardValue* card = byte_for(p);
return *card == G1CardTable::g1_young_card_val();
return G1CollectedHeap::heap()->heap_region_containing(p)->is_young();
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2001, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2001, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -52,8 +52,6 @@ class G1CardTable : public CardTable {
public:
enum G1CardValues {
g1_young_gen = CT_MR_BS_last_reserved << 1,
// During evacuation we use the card table to consolidate the cards we need to
// scan for roots onto the card table from the various sources. Further it is
// used to record already completely scanned cards to avoid re-scanning them
@ -63,18 +61,43 @@ public:
// The merge at the start of each evacuation round simply sets cards to dirty
// that are clean; scanned cards are set to 0x1.
//
// This means that the LSB determines what to do with the card during evacuation
// given the following possible values:
// This means that the LSB determines whether the card is clean or non-clean
// (LSB is 1 -> clean, LSB is 0 -> non-clean) given the following possible values:
//
// 11111111 - clean, do not scan
// 00000001 - already scanned, do not scan
// xxxxxxx1 - clean, already scanned, do not scan again (during GC only).
// 00000100 - dirty, needs to be scanned, dirty from remembered set (during GC only)
// 00000010 - dirty, needs to be scanned, contains reference to collection set.
// 00000000 - dirty, needs to be scanned.
//
g1_card_already_scanned = 0x1
// g1_to_cset_card and g1_from_remset_card are both used for optimization and
// for more accurate prediction of the card generation rate.
//
// g1_to_cset_card allows separating the dirty card generation rate of the mutator
// (which just dirties cards) from cards that will be scanned during the next garbage
// collection anyway.
// Further, it enables the optimization of not refining such cards, assuming that
// their references to the young gen do not change, and of not adding them to any
// other remembered set.
// This color is sticky during mutator time: refinement threads encountering
// this card on the refinement table just copy it over to the regular card
// table without re-refining it. This saves refinement effort because, most of
// the time, already found interesting references stay interesting.
//
// g1_from_remset_card allows separation of cards generated by the mutator from
// cards in the remembered set, again to make mutator dirty card generation
// prediction more accurate.
//
// More accurate prediction allows better (less wasteful) refinement control.
g1_dirty_card = dirty_card,
g1_card_already_scanned = 0x1,
g1_to_cset_card = 0x2,
g1_from_remset_card = 0x4
};
static const size_t WordAllClean = SIZE_MAX;
static const size_t WordAllDirty = 0;
static const size_t WordAllFromRemset = (SIZE_MAX / 255) * g1_from_remset_card;
STATIC_ASSERT(BitsPerByte == 8);
static const size_t WordAlreadyScanned = (SIZE_MAX / 255) * g1_card_already_scanned;
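As a side note on the word-sized constants above: SIZE_MAX / 255 is the word with 0x01 in every byte, so multiplying it by a card value replicates that value into every card slot of a word, and masking a word of cards with WordAlreadyScanned isolates each card's LSB, which by the encoding above is set exactly for clean (and already-scanned) cards. A minimal stand-alone check, illustrative only, not part of this patch, and assuming a 64-bit size_t; it mirrors the word test used by G1ChunkScanner::find_first_dirty_card later in this change.
#include <cstddef>
#include <cstdint>
static_assert(SIZE_MAX / 255 == 0x0101010101010101ull, "assumes a 64-bit size_t");
static_assert((SIZE_MAX / 255) * 0x4 == 0x0404040404040404ull, "WordAllFromRemset bit pattern");
// True iff a word of eight cards contains at least one card whose LSB is 0,
// i.e. at least one non-clean card.
inline bool word_has_non_clean_card(size_t word_of_cards) {
  const size_t lsb_mask = SIZE_MAX / 255;  // same value as WordAlreadyScanned
  return (~word_of_cards & lsb_mask) != 0;
}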
@ -83,27 +106,27 @@ public:
_listener.set_card_table(this);
}
static CardValue g1_young_card_val() { return g1_young_gen; }
static CardValue g1_scanned_card_val() { return g1_card_already_scanned; }
void verify_g1_young_region(MemRegion mr) PRODUCT_RETURN;
void g1_mark_as_young(const MemRegion& mr);
void verify_region(MemRegion mr, CardValue val, bool val_equals) override;
size_t index_for_cardvalue(CardValue const* p) const {
return pointer_delta(p, _byte_map, sizeof(CardValue));
}
// Mark the given card as Dirty if it is Clean. Returns whether the card was
// Mark the given card as From Remset if it is Clean. Returns whether the card was
// Clean before this operation. This result may be inaccurate as it does not
// perform the dirtying atomically.
inline bool mark_clean_as_dirty(CardValue* card);
inline bool mark_clean_as_from_remset(CardValue* card);
// Change Clean cards in a (large) area on the card table as Dirty, preserving
// already scanned cards. Assumes that most cards in that area are Clean.
inline void mark_range_dirty(size_t start_card_index, size_t num_cards);
// Change Clean cards in a (large) area on the card table to From_Remset, preserving
// cards already marked otherwise. Assumes that most cards in that area are Clean.
// Not atomic.
inline size_t mark_clean_range_as_from_remset(size_t start_card_index, size_t num_cards);
// Change the given range of dirty cards to "which". All of these cards must be Dirty.
inline void change_dirty_cards_to(CardValue* start_card, CardValue* end_card, CardValue which);
// Change the given range of dirty cards to "which". All of these cards must be non-clean.
// Returns the number of pending cards found.
inline size_t change_dirty_cards_to(CardValue* start_card, CardValue* end_card, CardValue which);
inline uint region_idx_for(CardValue* p);

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2001, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2001, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -28,25 +28,39 @@
#include "gc/g1/g1CardTable.hpp"
#include "gc/g1/g1HeapRegion.hpp"
#include "utilities/population_count.hpp"
inline uint G1CardTable::region_idx_for(CardValue* p) {
size_t const card_idx = pointer_delta(p, _byte_map, sizeof(CardValue));
return (uint)(card_idx >> G1HeapRegion::LogCardsPerRegion);
}
inline bool G1CardTable::mark_clean_as_dirty(CardValue* card) {
inline bool G1CardTable::mark_clean_as_from_remset(CardValue* card) {
CardValue value = *card;
if (value == clean_card_val()) {
*card = dirty_card_val();
*card = g1_from_remset_card;
return true;
}
return false;
}
inline void G1CardTable::mark_range_dirty(size_t start_card_index, size_t num_cards) {
// Returns bits from a where mask is 0, and bits from b where mask is 1.
//
// Example:
// a = 0xAAAAAAAA
// b = 0xBBBBBBBB
// mask = 0xFF00FF00
// result = 0xBBAABBAA
inline size_t blend(size_t a, size_t b, size_t mask) {
return (a & ~mask) | (b & mask);
}
inline size_t G1CardTable::mark_clean_range_as_from_remset(size_t start_card_index, size_t num_cards) {
assert(is_aligned(start_card_index, sizeof(size_t)), "Start card index must be aligned.");
assert(is_aligned(num_cards, sizeof(size_t)), "Number of cards to change must be evenly divisible.");
size_t result = 0;
size_t const num_chunks = num_cards / sizeof(size_t);
size_t* cur_word = (size_t*)&_byte_map[start_card_index];
@ -54,31 +68,33 @@ inline void G1CardTable::mark_range_dirty(size_t start_card_index, size_t num_ca
while (cur_word < end_word_map) {
size_t value = *cur_word;
if (value == WordAllClean) {
*cur_word = WordAllDirty;
} else if (value == WordAllDirty) {
// do nothing.
*cur_word = WordAllFromRemset;
result += sizeof(size_t);
} else if ((value & WordAlreadyScanned) == 0) {
// Do nothing if there is no "Clean" card in it.
} else {
// There is a mix of cards in there. Tread slowly.
CardValue* cur = (CardValue*)cur_word;
for (size_t i = 0; i < sizeof(size_t); i++) {
CardValue value = *cur;
if (value == clean_card_val()) {
*cur = dirty_card_val();
}
cur++;
}
// There is a mix of cards in there. Tread "slowly".
size_t clean_card_mask = (value & WordAlreadyScanned) * 0xff; // All "Clean" cards have 0xff, all other places 0x00 now.
result += population_count(clean_card_mask) / BitsPerByte;
*cur_word = blend(value, WordAllFromRemset, clean_card_mask);
}
cur_word++;
}
return result;
}
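A worked example of the mixed-word branch above, illustrative only and assuming a 64-bit size_t and the card values from g1CardTable.hpp (Clean == 0xff, dirty == 0x00, g1_from_remset_card == 0x04): the (value & WordAlreadyScanned) * 0xff step expands each Clean card's LSB into an all-ones byte lane, population_count of that mask divided by BitsPerByte counts the converted cards, and blend() writes From Remset only into those lanes.
#include <cstddef>
namespace example {
  constexpr size_t WordAlreadyScanned = 0x0101010101010101ull;  // LSB of every card slot
  constexpr size_t WordAllFromRemset  = 0x0404040404040404ull;
  constexpr size_t blend(size_t a, size_t b, size_t mask) { return (a & ~mask) | (b & mask); }
  constexpr size_t value = 0x00000000000000ffull;               // one Clean card, seven dirty cards
  constexpr size_t clean_card_mask = (value & WordAlreadyScanned) * 0xff;
  static_assert(clean_card_mask == 0x00000000000000ffull,
                "only the Clean card's byte lane is selected");
  // population_count(clean_card_mask) / 8 == 1, i.e. exactly one card is counted as converted.
  static_assert(blend(value, WordAllFromRemset, clean_card_mask) == 0x0000000000000004ull,
                "the Clean card becomes From Remset; the dirty cards are left untouched");
}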
inline void G1CardTable::change_dirty_cards_to(CardValue* start_card, CardValue* end_card, CardValue which) {
inline size_t G1CardTable::change_dirty_cards_to(CardValue* start_card, CardValue* end_card, CardValue which) {
size_t result = 0;
for (CardValue* i_card = start_card; i_card < end_card; ++i_card) {
CardValue value = *i_card;
assert(value == dirty_card_val(),
assert((value & g1_card_already_scanned) == 0,
"Must have been dirty %d start " PTR_FORMAT " " PTR_FORMAT, value, p2i(start_card), p2i(end_card));
if (value == g1_dirty_card) {
result++;
}
*i_card = which;
}
return result;
}
#endif /* SHARE_GC_G1_G1CARDTABLE_INLINE_HPP */

View File

@ -0,0 +1,97 @@
/*
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "gc/g1/g1CardTableClaimTable.inline.hpp"
#include "gc/g1/g1CollectedHeap.inline.hpp"
#include "gc/g1/g1HeapRegion.inline.hpp"
#include "gc/shared/workerThread.hpp"
#include "memory/allocation.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/powerOfTwo.hpp"
G1CardTableClaimTable::G1CardTableClaimTable(uint chunks_per_region) :
_max_reserved_regions(0),
_card_claims(nullptr),
_cards_per_chunk(checked_cast<uint>(G1HeapRegion::CardsPerRegion / chunks_per_region))
{
guarantee(chunks_per_region > 0, "%u chunks per region", chunks_per_region);
}
G1CardTableClaimTable::~G1CardTableClaimTable() {
FREE_C_HEAP_ARRAY(uint, _card_claims);
}
void G1CardTableClaimTable::initialize(uint max_reserved_regions) {
assert(_card_claims == nullptr, "Must not be initialized twice");
_card_claims = NEW_C_HEAP_ARRAY(uint, max_reserved_regions, mtGC);
_max_reserved_regions = max_reserved_regions;
reset_all_to_unclaimed();
}
void G1CardTableClaimTable::reset_all_to_unclaimed() {
for (uint i = 0; i < _max_reserved_regions; i++) {
_card_claims[i] = 0;
}
}
void G1CardTableClaimTable::reset_all_to_claimed() {
for (uint i = 0; i < _max_reserved_regions; i++) {
_card_claims[i] = (uint)G1HeapRegion::CardsPerRegion;
}
}
void G1CardTableClaimTable::heap_region_iterate_from_worker_offset(G1HeapRegionClosure* cl, uint worker_id, uint max_workers) {
// Every worker will actually look at all regions, skipping over regions that
// are completed.
const size_t n_regions = _max_reserved_regions;
const uint start_index = (uint)(worker_id * n_regions / max_workers);
for (uint count = 0; count < n_regions; count++) {
const uint index = (start_index + count) % n_regions;
assert(index < n_regions, "sanity");
// Skip over fully processed regions
if (!has_unclaimed_cards(index)) {
continue;
}
G1HeapRegion* r = G1CollectedHeap::heap()->region_at(index);
bool res = cl->do_heap_region(r);
if (res) {
return;
}
}
}
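A quick illustration of the start-offset arithmetic above (not part of this patch, numbers chosen for the example):
// With max_workers == 8 and 2048 reserved regions, worker 3 starts its iteration at
// region 3 * 2048 / 8 == 768 and wraps around modulo 2048, so workers begin in
// different parts of the heap instead of all contending for the first regions.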
G1CardTableChunkClaimer::G1CardTableChunkClaimer(G1CardTableClaimTable* scan_state, uint region_idx) :
_claim_values(scan_state),
_region_idx(region_idx),
_cur_claim(0) {
guarantee(size() <= G1HeapRegion::CardsPerRegion, "Should not claim more space than possible.");
}
G1ChunkScanner::G1ChunkScanner(CardValue* const start_card, CardValue* const end_card) :
_start_card(start_card),
_end_card(end_card) {
assert(is_word_aligned(start_card), "precondition");
assert(is_word_aligned(end_card), "precondition");
}

View File

@ -0,0 +1,137 @@
/*
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#ifndef SHARE_GC_G1_G1CARDTABLECLAIMTABLE_HPP
#define SHARE_GC_G1_G1CARDTABLECLAIMTABLE_HPP
#include "gc/g1/g1CardTable.hpp"
#include "memory/allocation.hpp"
class G1HeapRegionClosure;
// Helper class representing claim values for the cards in the card table corresponding
// to a region.
// I.e. for every region this class stores an atomic counter that represents the
// number of cards from 0 to the number of cards per region already claimed for
// this region.
// If the claimed value is >= the number of cards of a region, the region can be
// considered fully claimed.
//
// Claiming works on a full region (all cards in the region) or on a range of
// contiguous cards (a chunk). The chunk size is given at construction time.
class G1CardTableClaimTable : public CHeapObj<mtGC> {
uint _max_reserved_regions;
// Card table iteration claim values for every heap region, from 0 (completely unclaimed)
// to (>=) G1HeapRegion::CardsPerRegion (completely claimed).
uint volatile* _card_claims;
uint _cards_per_chunk; // For conversion between card index and chunk index.
// Claim increment number of cards, returning the previous claim value.
inline uint claim_cards(uint region, uint increment);
public:
G1CardTableClaimTable(uint chunks_per_region);
~G1CardTableClaimTable();
// Allocates the data structure and initializes the claims to unclaimed.
void initialize(uint max_reserved_regions);
void reset_all_to_unclaimed();
void reset_all_to_claimed();
inline bool has_unclaimed_cards(uint region);
inline void reset_to_unclaimed(uint region);
// Claims all cards in that region, returning the previous claim value.
inline uint claim_all_cards(uint region);
// Claim a single chunk in that region, returning the previous claim value.
inline uint claim_chunk(uint region);
inline uint cards_per_chunk() const;
size_t max_reserved_regions() { return _max_reserved_regions; }
void heap_region_iterate_from_worker_offset(G1HeapRegionClosure* cl, uint worker_id, uint max_workers);
};
// Helper class to claim dirty chunks within the card table for a given region.
class G1CardTableChunkClaimer {
G1CardTableClaimTable* _claim_values;
uint _region_idx;
uint _cur_claim;
public:
G1CardTableChunkClaimer(G1CardTableClaimTable* claim_table, uint region_idx);
inline bool has_next();
inline uint value() const;
inline uint size() const;
};
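An illustrative usage sketch (not part of this patch) of the claiming scheme described above: a worker repeatedly claims chunks of a region until the claim counter reaches CardsPerRegion. The process callback is a placeholder for whatever the caller does with the claimed card range.
template <typename ProcessCards>  // e.g. [](uint first_card_in_region, uint num_cards) { ... }
void example_process_region(G1CardTableClaimTable* claim_table, uint region_idx, ProcessCards process) {
  G1CardTableChunkClaimer claim(claim_table, region_idx);
  while (claim.has_next()) {
    // Cards [claim.value(), claim.value() + claim.size()) of this region are now
    // exclusively claimed by this thread; other claimers will skip them.
    process(claim.value(), claim.size());
  }
}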
// Helper class to locate consecutive dirty cards inside a range of cards.
class G1ChunkScanner {
using Word = size_t;
using CardValue = G1CardTable::CardValue;
CardValue* const _start_card;
CardValue* const _end_card;
static const size_t ExpandedToScanMask = G1CardTable::WordAlreadyScanned;
static const size_t ToScanMask = G1CardTable::g1_card_already_scanned;
inline bool is_card_dirty(const CardValue* const card) const;
inline bool is_word_aligned(const void* const addr) const;
inline CardValue* find_first_dirty_card(CardValue* i_card) const;
inline CardValue* find_first_non_dirty_card(CardValue* i_card) const;
public:
G1ChunkScanner(CardValue* const start_card, CardValue* const end_card);
template<typename Func>
void on_dirty_cards(Func&& f) {
for (CardValue* cur_card = _start_card; cur_card < _end_card; /* empty */) {
CardValue* dirty_l = find_first_dirty_card(cur_card);
CardValue* dirty_r = find_first_non_dirty_card(dirty_l);
assert(dirty_l <= dirty_r, "inv");
if (dirty_l == dirty_r) {
assert(dirty_r == _end_card, "finished the entire chunk");
return;
}
f(dirty_l, dirty_r);
cur_card = dirty_r + 1;
}
}
};
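An illustrative usage sketch (not part of this patch) of on_dirty_cards(): counting the dirty cards of a claimed, word-aligned card range (the alignment is required by the constructor asserts). The callback receives each maximal run [dirty_l, dirty_r) of consecutive dirty cards.
inline size_t count_dirty_cards(G1CardTable::CardValue* start_card,
                                G1CardTable::CardValue* end_card) {
  size_t dirty = 0;
  G1ChunkScanner scanner(start_card, end_card);
  scanner.on_dirty_cards([&](G1CardTable::CardValue* dirty_l, G1CardTable::CardValue* dirty_r) {
    dirty += (size_t)(dirty_r - dirty_l);  // length of this run of dirty cards
  });
  return dirty;
}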
#endif // SHARE_GC_G1_G1CARDTABLECLAIMTABLE_HPP

View File

@ -0,0 +1,128 @@
/*
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#ifndef SHARE_GC_G1_G1CARDTABLECLAIMTABLE_INLINE_HPP
#define SHARE_GC_G1_G1CARDTABLECLAIMTABLE_INLINE_HPP
#include "gc/g1/g1CardTableClaimTable.hpp"
#include "gc/g1/g1CollectedHeap.inline.hpp"
#include "gc/g1/g1HeapRegion.inline.hpp"
#include "runtime/atomicAccess.hpp"
bool G1CardTableClaimTable::has_unclaimed_cards(uint region) {
assert(region < _max_reserved_regions, "Tried to access invalid region %u", region);
return AtomicAccess::load(&_card_claims[region]) < G1HeapRegion::CardsPerRegion;
}
void G1CardTableClaimTable::reset_to_unclaimed(uint region) {
assert(region < _max_reserved_regions, "Tried to access invalid region %u", region);
AtomicAccess::store(&_card_claims[region], 0u);
}
uint G1CardTableClaimTable::claim_cards(uint region, uint increment) {
assert(region < _max_reserved_regions, "Tried to access invalid region %u", region);
return AtomicAccess::fetch_then_add(&_card_claims[region], increment, memory_order_relaxed);
}
uint G1CardTableClaimTable::claim_chunk(uint region) {
assert(region < _max_reserved_regions, "Tried to access invalid region %u", region);
return AtomicAccess::fetch_then_add(&_card_claims[region], cards_per_chunk(), memory_order_relaxed);
}
uint G1CardTableClaimTable::claim_all_cards(uint region) {
return claim_cards(region, (uint)G1HeapRegion::CardsPerRegion);
}
uint G1CardTableClaimTable::cards_per_chunk() const { return _cards_per_chunk; }
bool G1CardTableChunkClaimer::has_next() {
_cur_claim = _claim_values->claim_chunk(_region_idx);
return (_cur_claim < G1HeapRegion::CardsPerRegion);
}
uint G1CardTableChunkClaimer::value() const { return _cur_claim; }
uint G1CardTableChunkClaimer::size() const { return _claim_values->cards_per_chunk(); }
bool G1ChunkScanner::is_card_dirty(const CardValue* const card) const {
return (*card & ToScanMask) == 0;
}
bool G1ChunkScanner::is_word_aligned(const void* const addr) const {
return ((uintptr_t)addr) % sizeof(Word) == 0;
}
G1CardTable::CardValue* G1ChunkScanner::find_first_dirty_card(CardValue* i_card) const {
while (!is_word_aligned(i_card)) {
if (is_card_dirty(i_card)) {
return i_card;
}
i_card++;
}
for (/* empty */; i_card < _end_card; i_card += sizeof(Word)) {
Word word_value = *reinterpret_cast<Word*>(i_card);
bool has_dirty_cards_in_word = (~word_value & ExpandedToScanMask) != 0;
if (has_dirty_cards_in_word) {
for (uint i = 0; i < sizeof(Word); ++i) {
if (is_card_dirty(i_card)) {
return i_card;
}
i_card++;
}
ShouldNotReachHere();
}
}
return _end_card;
}
G1CardTable::CardValue* G1ChunkScanner::find_first_non_dirty_card(CardValue* i_card) const {
while (!is_word_aligned(i_card)) {
if (!is_card_dirty(i_card)) {
return i_card;
}
i_card++;
}
for (/* empty */; i_card < _end_card; i_card += sizeof(Word)) {
Word word_value = *reinterpret_cast<Word*>(i_card);
bool all_cards_dirty = (word_value & ExpandedToScanMask) == 0;
if (!all_cards_dirty) {
for (uint i = 0; i < sizeof(Word); ++i) {
if (!is_card_dirty(i_card)) {
return i_card;
}
i_card++;
}
ShouldNotReachHere();
}
}
return _end_card;
}
#endif // SHARE_GC_G1_G1CARDTABLECLAIMTABLE_INLINE_HPP

View File

@ -38,7 +38,6 @@
#include "gc/g1/g1ConcurrentMarkThread.inline.hpp"
#include "gc/g1/g1ConcurrentRefine.hpp"
#include "gc/g1/g1ConcurrentRefineThread.hpp"
#include "gc/g1/g1DirtyCardQueue.hpp"
#include "gc/g1/g1EvacStats.inline.hpp"
#include "gc/g1/g1FullCollector.hpp"
#include "gc/g1/g1GCCounters.hpp"
@ -60,10 +59,10 @@
#include "gc/g1/g1ParScanThreadState.inline.hpp"
#include "gc/g1/g1PeriodicGCTask.hpp"
#include "gc/g1/g1Policy.hpp"
#include "gc/g1/g1RedirtyCardsQueue.hpp"
#include "gc/g1/g1RegionPinCache.inline.hpp"
#include "gc/g1/g1RegionToSpaceMapper.hpp"
#include "gc/g1/g1RemSet.hpp"
#include "gc/g1/g1ReviseYoungLengthTask.hpp"
#include "gc/g1/g1RootClosures.hpp"
#include "gc/g1/g1RootProcessor.hpp"
#include "gc/g1/g1SATBMarkQueueSet.hpp"
@ -111,6 +110,7 @@
#include "runtime/init.hpp"
#include "runtime/java.hpp"
#include "runtime/orderAccess.hpp"
#include "runtime/threads.hpp"
#include "runtime/threadSMR.hpp"
#include "runtime/vmThread.hpp"
#include "utilities/align.hpp"
@ -146,7 +146,7 @@ void G1CollectedHeap::run_batch_task(G1BatchedTask* cl) {
workers()->run_task(cl, num_workers);
}
uint G1CollectedHeap::get_chunks_per_region() {
uint G1CollectedHeap::get_chunks_per_region_for_scan() {
uint log_region_size = G1HeapRegion::LogOfHRGrainBytes;
// Limit the expected input values to current known possible values of the
// (log) region size. Adjust as necessary after testing if changing the permissible
@ -156,6 +156,18 @@ uint G1CollectedHeap::get_chunks_per_region() {
return 1u << (log_region_size / 2 - 4);
}
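A quick check of the formula above against the table in the header comment (illustrative only, not part of this patch):
// For 1 MB regions log_region_size == 20, so 1u << (20 / 2 - 4) == 64 chunks per region;
// for 4 MB regions (log_region_size == 22) it is 1u << (22 / 2 - 4) == 128;
// for 16 MB regions (log_region_size == 24) it is 1u << (24 / 2 - 4) == 256.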
uint G1CollectedHeap::get_chunks_per_region_for_merge() {
uint log_region_size = G1HeapRegion::LogOfHRGrainBytes;
// Limit the expected input values to current known possible values of the
// (log) region size. Adjust as necessary after testing if changing the permissible
// values for region size.
assert(log_region_size >= 20 && log_region_size <= 29,
"expected value in [20,29], but got %u", log_region_size);
uint half_log_region_size = (log_region_size + 1) / 2;
return 1 << (half_log_region_size - 9);
}
G1HeapRegion* G1CollectedHeap::new_heap_region(uint hrs_index,
MemRegion mr) {
return new G1HeapRegion(hrs_index, bot(), mr, &_card_set_config);
@ -614,7 +626,6 @@ inline HeapWord* G1CollectedHeap::attempt_allocation(size_t min_word_size,
assert_heap_not_locked();
if (result != nullptr) {
assert(*actual_word_size != 0, "Actual size must have been set here");
dirty_young_block(result, *actual_word_size);
} else {
*actual_word_size = 0;
}
@ -809,11 +820,27 @@ void G1CollectedHeap::prepare_for_mutator_after_full_collection(size_t allocatio
}
void G1CollectedHeap::abort_refinement() {
// Discard all remembered set updates and reset refinement statistics.
G1BarrierSet::dirty_card_queue_set().abandon_logs_and_stats();
assert(G1BarrierSet::dirty_card_queue_set().num_cards() == 0,
"DCQS should be empty");
concurrent_refine()->get_and_reset_refinement_stats();
G1ConcurrentRefineSweepState& sweep_state = concurrent_refine()->sweep_state();
if (sweep_state.is_in_progress()) {
if (!sweep_state.are_java_threads_synched()) {
// Synchronize Java threads with global card table that has already been swapped.
class SwapThreadCardTableClosure : public ThreadClosure {
public:
virtual void do_thread(Thread* t) {
G1BarrierSet* bs = G1BarrierSet::g1_barrier_set();
bs->update_card_table_base(t);
}
} cl;
Threads::java_threads_do(&cl);
}
// Record any available refinement statistics.
policy()->record_refinement_stats(sweep_state.stats());
sweep_state.complete_work(false /* concurrent */, false /* print_log */);
}
sweep_state.reset_stats();
}
void G1CollectedHeap::verify_after_full_collection() {
@ -825,6 +852,7 @@ void G1CollectedHeap::verify_after_full_collection() {
}
_hrm.verify_optional();
_verifier->verify_region_sets_optional();
_verifier->verify_card_tables_clean(true /* both_card_tables */);
_verifier->verify_after_gc();
_verifier->verify_bitmap_clear(false /* above_tams_only */);
@ -1168,8 +1196,13 @@ G1CollectedHeap::G1CollectedHeap() :
_service_thread(nullptr),
_periodic_gc_task(nullptr),
_free_arena_memory_task(nullptr),
_revise_young_length_task(nullptr),
_workers(nullptr),
_card_table(nullptr),
_refinement_epoch(0),
_last_synchronized_start(0),
_last_refinement_epoch_start(0),
_yield_duration_in_refinement_epoch(0),
_last_safepoint_refinement_epoch(0),
_collection_pause_end(Ticks::now()),
_old_set("Old Region Set", new OldRegionSetChecker()),
_humongous_set("Humongous Region Set", new HumongousRegionSetChecker()),
@ -1289,7 +1322,7 @@ G1RegionToSpaceMapper* G1CollectedHeap::create_aux_memory_mapper(const char* des
jint G1CollectedHeap::initialize_concurrent_refinement() {
jint ecode = JNI_OK;
_cr = G1ConcurrentRefine::create(policy(), &ecode);
_cr = G1ConcurrentRefine::create(this, &ecode);
return ecode;
}
@ -1345,18 +1378,12 @@ jint G1CollectedHeap::initialize() {
initialize_reserved_region(heap_rs);
// Create the barrier set for the entire reserved region.
G1CardTable* ct = new G1CardTable(_reserved);
G1BarrierSet* bs = new G1BarrierSet(ct);
G1CardTable* card_table = new G1CardTable(_reserved);
G1CardTable* refinement_table = new G1CardTable(_reserved);
G1BarrierSet* bs = new G1BarrierSet(card_table, refinement_table);
bs->initialize();
assert(bs->is_a(BarrierSet::G1BarrierSet), "sanity");
BarrierSet::set_barrier_set(bs);
_card_table = ct;
{
G1SATBMarkQueueSet& satbqs = bs->satb_mark_queue_set();
satbqs.set_process_completed_buffers_threshold(G1SATBProcessCompletedThreshold);
satbqs.set_buffer_enqueue_threshold_percentage(G1SATBBufferEnqueueingThresholdPercent);
}
// Create space mappers.
size_t page_size = heap_rs.page_size();
@ -1391,12 +1418,26 @@ jint G1CollectedHeap::initialize() {
G1CardTable::compute_size(heap_rs.size() / HeapWordSize),
G1CardTable::heap_map_factor());
G1RegionToSpaceMapper* refinement_cards_storage =
create_aux_memory_mapper("Refinement Card Table",
G1CardTable::compute_size(heap_rs.size() / HeapWordSize),
G1CardTable::heap_map_factor());
size_t bitmap_size = G1CMBitMap::compute_size(heap_rs.size());
G1RegionToSpaceMapper* bitmap_storage =
create_aux_memory_mapper("Mark Bitmap", bitmap_size, G1CMBitMap::heap_map_factor());
_hrm.initialize(heap_storage, bitmap_storage, bot_storage, cardtable_storage);
_card_table->initialize(cardtable_storage);
_hrm.initialize(heap_storage, bitmap_storage, bot_storage, cardtable_storage, refinement_cards_storage);
card_table->initialize(cardtable_storage);
refinement_table->initialize(refinement_cards_storage);
BarrierSet::set_barrier_set(bs);
{
G1SATBMarkQueueSet& satbqs = bs->satb_mark_queue_set();
satbqs.set_process_completed_buffers_threshold(G1SATBProcessCompletedThreshold);
satbqs.set_buffer_enqueue_threshold_percentage(G1SATBBufferEnqueueingThresholdPercent);
}
// 6843694 - ensure that the maximum region index can fit
// in the remembered set structures.
@ -1408,7 +1449,7 @@ jint G1CollectedHeap::initialize() {
guarantee((uintptr_t)(heap_rs.base()) >= G1CardTable::card_size(), "Java heap must not start within the first card.");
G1FromCardCache::initialize(max_num_regions());
// Also create a G1 rem set.
_rem_set = new G1RemSet(this, _card_table);
_rem_set = new G1RemSet(this);
_rem_set->initialize(max_num_regions());
size_t max_cards_per_region = ((size_t)1 << (sizeof(CardIdx_t)*BitsPerByte-1)) - 1;
@ -1467,6 +1508,11 @@ jint G1CollectedHeap::initialize() {
_free_arena_memory_task = new G1MonotonicArenaFreeMemoryTask("Card Set Free Memory Task");
_service_thread->register_task(_free_arena_memory_task);
if (policy()->use_adaptive_young_list_length()) {
_revise_young_length_task = new G1ReviseYoungLengthTask("Revise Young Length List Task");
_service_thread->register_task(_revise_young_length_task);
}
// Here we allocate the dummy G1HeapRegion that is required by the
// G1AllocRegion class.
G1HeapRegion* dummy_region = _hrm.get_dummy_region();
@ -1495,6 +1541,7 @@ jint G1CollectedHeap::initialize() {
CPUTimeCounters::create_counter(CPUTimeGroups::CPUTimeType::gc_parallel_workers);
CPUTimeCounters::create_counter(CPUTimeGroups::CPUTimeType::gc_conc_mark);
CPUTimeCounters::create_counter(CPUTimeGroups::CPUTimeType::gc_conc_refine);
CPUTimeCounters::create_counter(CPUTimeGroups::CPUTimeType::gc_conc_refine_control);
CPUTimeCounters::create_counter(CPUTimeGroups::CPUTimeType::gc_service);
G1InitLogger::print();
@ -1519,12 +1566,35 @@ void G1CollectedHeap::stop() {
void G1CollectedHeap::safepoint_synchronize_begin() {
SuspendibleThreadSet::synchronize();
_last_synchronized_start = os::elapsed_counter();
}
void G1CollectedHeap::safepoint_synchronize_end() {
jlong now = os::elapsed_counter();
jlong synchronize_duration = now - _last_synchronized_start;
if (_last_safepoint_refinement_epoch == _refinement_epoch) {
_yield_duration_in_refinement_epoch += synchronize_duration;
} else {
_last_refinement_epoch_start = now;
_last_safepoint_refinement_epoch = _refinement_epoch;
_yield_duration_in_refinement_epoch = 0;
}
SuspendibleThreadSet::desynchronize();
}
void G1CollectedHeap::set_last_refinement_epoch_start(jlong epoch_start, jlong last_yield_duration) {
_last_refinement_epoch_start = epoch_start;
guarantee(_yield_duration_in_refinement_epoch >= last_yield_duration, "should be");
_yield_duration_in_refinement_epoch -= last_yield_duration;
}
jlong G1CollectedHeap::yield_duration_in_refinement_epoch() {
return _yield_duration_in_refinement_epoch;
}
void G1CollectedHeap::post_initialize() {
CollectedHeap::post_initialize();
ref_processing_init();
@ -2336,6 +2406,7 @@ void G1CollectedHeap::gc_epilogue(bool full) {
&_collection_set_candidates_card_set_stats);
update_perf_counter_cpu_time();
_refinement_epoch++;
}
uint G1CollectedHeap::uncommit_regions(uint region_limit) {
@ -2468,7 +2539,6 @@ void G1CollectedHeap::verify_before_young_collection(G1HeapVerifier::G1VerifyTyp
Ticks start = Ticks::now();
_verifier->prepare_for_verify();
_verifier->verify_region_sets_optional();
_verifier->verify_dirty_young_regions();
_verifier->verify_before_gc();
verify_numa_regions("GC Start");
phase_times()->record_verify_before_time_ms((Ticks::now() - start).seconds() * MILLIUNITS);
@ -2734,6 +2804,11 @@ void G1CollectedHeap::free_region(G1HeapRegion* hr, G1FreeRegionList* free_list)
if (free_list != nullptr) {
free_list->add_ordered(hr);
}
if (VerifyDuringGC) {
// Card and refinement table must be clear for freed regions.
card_table()->verify_region(MemRegion(hr->bottom(), hr->end()), G1CardTable::clean_card_val(), true);
refinement_table()->verify_region(MemRegion(hr->bottom(), hr->end()), G1CardTable::clean_card_val(), true);
}
}
void G1CollectedHeap::retain_region(G1HeapRegion* hr) {

View File

@ -75,6 +75,7 @@ class G1GCPhaseTimes;
class G1HeapSizingPolicy;
class G1NewTracer;
class G1RemSet;
class G1ReviseYoungLengthTask;
class G1ServiceTask;
class G1ServiceThread;
class GCMemoryManager;
@ -171,9 +172,23 @@ private:
G1ServiceThread* _service_thread;
G1ServiceTask* _periodic_gc_task;
G1MonotonicArenaFreeMemoryTask* _free_arena_memory_task;
G1ReviseYoungLengthTask* _revise_young_length_task;
WorkerThreads* _workers;
G1CardTable* _card_table;
// The current epoch for refinement, i.e. the number of times the card tables
// have been swapped by a garbage collection.
// Used for detecting whether concurrent refinement has been interrupted by a
// garbage collection.
size_t _refinement_epoch;
// The following members are for tracking safepoint durations between garbage
// collections.
jlong _last_synchronized_start;
jlong _last_refinement_epoch_start;
jlong _yield_duration_in_refinement_epoch; // Time spent in safepoints since beginning of last refinement epoch.
size_t _last_safepoint_refinement_epoch; // Refinement epoch before last safepoint.
Ticks _collection_pause_end;
@ -541,12 +556,17 @@ public:
void run_batch_task(G1BatchedTask* cl);
// Return "optimal" number of chunks per region we want to use for claiming areas
// within a region to claim.
// within a region to claim during card table scanning.
// The returned value is a trade-off between granularity of work distribution and
// memory usage and maintenance costs of that table.
// Testing showed that 64 for 1M/2M region, 128 for 4M/8M regions, 256 for 16/32M regions,
// and so on seems to be such a good trade-off.
static uint get_chunks_per_region();
static uint get_chunks_per_region_for_scan();
// Return "optimal" number of chunks per region we want to use for claiming areas
// within a region to claim during card table merging.
// This is much smaller than for scanning as the merge work is much smaller.
// Currently 1 for 1M regions, 2 for 2/4M regions, 4 for 8/16M regions and so on.
static uint get_chunks_per_region_for_merge();
G1Allocator* allocator() {
return _allocator;
@ -687,11 +707,6 @@ public:
// Add the given region to the retained regions collection set candidates.
void retain_region(G1HeapRegion* hr);
// It dirties the cards that cover the block so that the post
// write barrier never queues anything when updating objects on this
// block. It is assumed (and in fact we assert) that the block
// belongs to a young region.
inline void dirty_young_block(HeapWord* start, size_t word_size);
// Frees a humongous region by collapsing it into individual regions
// and calling free_region() for each of them. The freed regions
@ -905,6 +920,10 @@ public:
void safepoint_synchronize_begin() override;
void safepoint_synchronize_end() override;
jlong last_refinement_epoch_start() const { return _last_refinement_epoch_start; }
void set_last_refinement_epoch_start(jlong epoch_start, jlong last_yield_duration);
jlong yield_duration_in_refinement_epoch();
// Does operations required after initialization has been done.
void post_initialize() override;
@ -1069,7 +1088,16 @@ public:
}
G1CardTable* card_table() const {
return _card_table;
return static_cast<G1CardTable*>(G1BarrierSet::g1_barrier_set()->card_table());
}
G1CardTable* refinement_table() const {
return G1BarrierSet::g1_barrier_set()->refinement_table();
}
G1CardTable::CardValue* card_table_base() const {
assert(card_table() != nullptr, "must be");
return card_table()->byte_map_base();
}
// Iteration functions.

View File

@ -149,30 +149,6 @@ inline void G1CollectedHeap::old_set_remove(G1HeapRegion* hr) {
_old_set.remove(hr);
}
// It dirties the cards that cover the block so that the post
// write barrier never queues anything when updating objects on this
// block. It is assumed (and in fact we assert) that the block
// belongs to a young region.
inline void
G1CollectedHeap::dirty_young_block(HeapWord* start, size_t word_size) {
assert_heap_not_locked();
// Assign the containing region to containing_hr so that we don't
// have to keep calling heap_region_containing() in the
// asserts below.
DEBUG_ONLY(G1HeapRegion* containing_hr = heap_region_containing(start);)
assert(word_size > 0, "pre-condition");
assert(containing_hr->is_in(start), "it should contain start");
assert(containing_hr->is_young(), "it should be young");
assert(!containing_hr->is_humongous(), "it should not be humongous");
HeapWord* end = start + word_size;
assert(containing_hr->is_in(end - 1), "it should also contain end - 1");
MemRegion mr(start, end);
card_table()->g1_mark_as_young(mr);
}
inline G1ScannerTasksQueueSet* G1CollectedHeap::task_queues() const {
return _task_queues;
}

View File

@ -308,7 +308,8 @@ double G1CollectionSet::finalize_young_part(double target_pause_time_ms, G1Survi
guarantee(target_pause_time_ms > 0.0,
"target_pause_time_ms = %1.6lf should be positive", target_pause_time_ms);
size_t pending_cards = _policy->pending_cards_at_gc_start();
bool in_young_only_phase = _policy->collector_state()->in_young_only_phase();
size_t pending_cards = _policy->analytics()->predict_pending_cards(in_young_only_phase);
log_trace(gc, ergo, cset)("Start choosing CSet. Pending cards: %zu target pause time: %1.2fms",
pending_cards, target_pause_time_ms);
@ -323,10 +324,8 @@ double G1CollectionSet::finalize_young_part(double target_pause_time_ms, G1Survi
verify_young_cset_indices();
size_t num_young_cards = _g1h->young_regions_cardset()->occupied();
_policy->record_card_rs_length(num_young_cards);
double predicted_base_time_ms = _policy->predict_base_time_ms(pending_cards, num_young_cards);
size_t card_rs_length = _policy->analytics()->predict_card_rs_length(in_young_only_phase);
double predicted_base_time_ms = _policy->predict_base_time_ms(pending_cards, card_rs_length);
// Base time already includes the whole remembered set related time, so do not add that here
// again.
double predicted_eden_time = _policy->predict_young_region_other_time_ms(eden_region_length) +

View File

@ -27,6 +27,7 @@
#include "gc/g1/g1BarrierSet.hpp"
#include "gc/g1/g1BatchedTask.hpp"
#include "gc/g1/g1CardSetMemory.hpp"
#include "gc/g1/g1CardTableClaimTable.inline.hpp"
#include "gc/g1/g1CollectedHeap.inline.hpp"
#include "gc/g1/g1CollectionSetChooser.hpp"
#include "gc/g1/g1CollectorState.hpp"
@ -34,7 +35,7 @@
#include "gc/g1/g1ConcurrentMarkRemarkTasks.hpp"
#include "gc/g1/g1ConcurrentMarkThread.inline.hpp"
#include "gc/g1/g1ConcurrentRebuildAndScrub.hpp"
#include "gc/g1/g1DirtyCardQueue.hpp"
#include "gc/g1/g1ConcurrentRefine.hpp"
#include "gc/g1/g1HeapRegion.inline.hpp"
#include "gc/g1/g1HeapRegionManager.hpp"
#include "gc/g1/g1HeapRegionPrinter.hpp"
@ -483,7 +484,7 @@ G1ConcurrentMark::G1ConcurrentMark(G1CollectedHeap* g1h,
// _finger set in set_non_marking_state
_worker_id_offset(G1DirtyCardQueueSet::num_par_ids() + G1ConcRefinementThreads),
_worker_id_offset(G1ConcRefinementThreads), // The refinement control thread does not refine cards, so it's just the worker threads.
_max_num_tasks(MAX2(ConcGCThreads, ParallelGCThreads)),
// _num_active_tasks set in set_non_marking_state()
// _tasks set inside the constructor
@ -1141,7 +1142,7 @@ void G1ConcurrentMark::mark_from_roots() {
// worker threads may currently exist and more may not be
// available.
active_workers = _concurrent_workers->set_active_workers(active_workers);
log_info(gc, task)("Using %u workers of %u for marking", active_workers, _concurrent_workers->max_workers());
log_info(gc, task)("Concurrent Mark Using %u of %u Workers", active_workers, _concurrent_workers->max_workers());
_num_concurrent_workers = active_workers;

View File

@ -580,6 +580,8 @@ public:
// TARS for the given region during remembered set rebuilding.
inline HeapWord* top_at_rebuild_start(G1HeapRegion* r) const;
uint worker_id_offset() const { return _worker_id_offset; }
// Clear statistics gathered during the concurrent cycle for the given region after
// it has been reclaimed.
void clear_statistics(G1HeapRegion* r);

View File

@ -25,6 +25,7 @@
#include "gc/g1/g1CollectedHeap.inline.hpp"
#include "gc/g1/g1ConcurrentMark.inline.hpp"
#include "gc/g1/g1ConcurrentMarkRemarkTasks.hpp"
#include "gc/g1/g1ConcurrentRefine.hpp"
#include "gc/g1/g1HeapRegion.inline.hpp"
#include "gc/g1/g1HeapRegionPrinter.hpp"
#include "gc/g1/g1RemSetTrackingPolicy.hpp"
@ -54,15 +55,16 @@ struct G1UpdateRegionLivenessAndSelectForRebuildTask::G1OnRegionClosure : public
_num_humongous_regions_removed(0),
_local_cleanup_list(local_cleanup_list) {}
void reclaim_empty_region(G1HeapRegion* hr) {
void reclaim_empty_region_common(G1HeapRegion* hr) {
assert(!hr->has_pinned_objects(), "precondition");
assert(hr->used() > 0, "precondition");
_freed_bytes += hr->used();
hr->set_containing_set(nullptr);
hr->clear_cardtable();
hr->clear_both_card_tables();
_cm->clear_statistics(hr);
G1HeapRegionPrinter::mark_reclaim(hr);
_g1h->concurrent_refine()->notify_region_reclaimed(hr);
}
void reclaim_empty_humongous_region(G1HeapRegion* hr) {
@ -71,8 +73,8 @@ struct G1UpdateRegionLivenessAndSelectForRebuildTask::G1OnRegionClosure : public
auto on_humongous_region = [&] (G1HeapRegion* hr) {
assert(hr->is_humongous(), "precondition");
reclaim_empty_region(hr);
_num_humongous_regions_removed++;
reclaim_empty_region_common(hr);
_g1h->free_humongous_region(hr, _local_cleanup_list);
};
@ -82,8 +84,8 @@ struct G1UpdateRegionLivenessAndSelectForRebuildTask::G1OnRegionClosure : public
void reclaim_empty_old_region(G1HeapRegion* hr) {
assert(hr->is_old(), "precondition");
reclaim_empty_region(hr);
_num_old_regions_removed++;
reclaim_empty_region_common(hr);
_g1h->free_region(hr, _local_cleanup_list);
}

View File

@ -245,7 +245,7 @@ class G1RebuildRSAndScrubTask : public WorkerTask {
G1RebuildRSAndScrubRegionClosure(G1ConcurrentMark* cm, bool should_rebuild_remset, uint worker_id) :
_cm(cm),
_bitmap(_cm->mark_bitmap()),
_rebuild_closure(G1CollectedHeap::heap(), worker_id),
_rebuild_closure(G1CollectedHeap::heap(), worker_id + cm->worker_id_offset()),
_should_rebuild_remset(should_rebuild_remset),
_processed_words(0) { }

View File

@ -22,15 +22,20 @@
*
*/
#include "gc/g1/g1Analytics.hpp"
#include "gc/g1/g1BarrierSet.hpp"
#include "gc/g1/g1CardTableClaimTable.inline.hpp"
#include "gc/g1/g1CollectedHeap.inline.hpp"
#include "gc/g1/g1CollectionSet.hpp"
#include "gc/g1/g1ConcurrentRefine.hpp"
#include "gc/g1/g1ConcurrentRefineSweepTask.hpp"
#include "gc/g1/g1ConcurrentRefineThread.hpp"
#include "gc/g1/g1DirtyCardQueue.hpp"
#include "gc/g1/g1HeapRegion.inline.hpp"
#include "gc/g1/g1HeapRegionRemSet.inline.hpp"
#include "gc/g1/g1Policy.hpp"
#include "gc/shared/gc_globals.hpp"
#include "gc/shared/gcTraceTime.inline.hpp"
#include "gc/shared/workerThread.hpp"
#include "logging/log.hpp"
#include "memory/allocation.inline.hpp"
#include "memory/iterator.hpp"
@ -38,17 +43,15 @@
#include "runtime/mutexLocker.hpp"
#include "utilities/debug.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/ticks.hpp"
#include <math.h>
G1ConcurrentRefineThread* G1ConcurrentRefineThreadControl::create_refinement_thread(uint worker_id, bool initializing) {
G1ConcurrentRefineThread* G1ConcurrentRefineThreadControl::create_refinement_thread() {
G1ConcurrentRefineThread* result = nullptr;
if (initializing || !InjectGCWorkerCreationFailure) {
result = G1ConcurrentRefineThread::create(_cr, worker_id);
}
result = G1ConcurrentRefineThread::create(_cr);
if (result == nullptr || result->osthread() == nullptr) {
log_warning(gc)("Failed to create refinement thread %u, no more %s",
worker_id,
log_warning(gc)("Failed to create refinement control thread, no more %s",
result == nullptr ? "memory" : "OS threads");
if (result != nullptr) {
delete result;
@ -60,106 +63,392 @@ G1ConcurrentRefineThread* G1ConcurrentRefineThreadControl::create_refinement_thr
G1ConcurrentRefineThreadControl::G1ConcurrentRefineThreadControl(uint max_num_threads) :
_cr(nullptr),
_threads(max_num_threads)
_control_thread(nullptr),
_workers(nullptr),
_max_num_threads(max_num_threads)
{}
G1ConcurrentRefineThreadControl::~G1ConcurrentRefineThreadControl() {
while (_threads.is_nonempty()) {
delete _threads.pop();
}
}
bool G1ConcurrentRefineThreadControl::ensure_threads_created(uint worker_id, bool initializing) {
assert(worker_id < max_num_threads(), "precondition");
while ((uint)_threads.length() <= worker_id) {
G1ConcurrentRefineThread* rt = create_refinement_thread(_threads.length(), initializing);
if (rt == nullptr) {
return false;
}
_threads.push(rt);
}
return true;
delete _control_thread;
delete _workers;
}
jint G1ConcurrentRefineThreadControl::initialize(G1ConcurrentRefine* cr) {
assert(cr != nullptr, "G1ConcurrentRefine must not be null");
_cr = cr;
if (max_num_threads() > 0) {
_threads.push(create_refinement_thread(0, true));
if (_threads.at(0) == nullptr) {
vm_shutdown_during_initialization("Could not allocate primary refinement thread");
if (is_refinement_enabled()) {
_control_thread = create_refinement_thread();
if (_control_thread == nullptr) {
vm_shutdown_during_initialization("Could not allocate refinement control thread");
return JNI_ENOMEM;
}
if (!UseDynamicNumberOfGCThreads) {
if (!ensure_threads_created(max_num_threads() - 1, true)) {
vm_shutdown_during_initialization("Could not allocate refinement threads");
return JNI_ENOMEM;
}
}
_workers = new WorkerThreads("G1 Refinement Workers", max_num_threads());
_workers->initialize_workers();
}
return JNI_OK;
}
#ifdef ASSERT
void G1ConcurrentRefineThreadControl::assert_current_thread_is_primary_refinement_thread() const {
assert(Thread::current() == _threads.at(0), "Not primary thread");
void G1ConcurrentRefineThreadControl::assert_current_thread_is_control_refinement_thread() const {
assert(Thread::current() == _control_thread, "Not refinement control thread");
}
#endif // ASSERT
bool G1ConcurrentRefineThreadControl::activate(uint worker_id) {
if (ensure_threads_created(worker_id, false)) {
_threads.at(worker_id)->activate();
return true;
}
void G1ConcurrentRefineThreadControl::activate() {
_control_thread->activate();
}
return false;
void G1ConcurrentRefineThreadControl::run_task(WorkerTask* task, uint num_workers) {
assert(num_workers >= 1, "must be");
WithActiveWorkers w(_workers, num_workers);
_workers->run_task(task);
}
void G1ConcurrentRefineThreadControl::control_thread_do(ThreadClosure* tc) {
if (is_refinement_enabled()) {
tc->do_thread(_control_thread);
}
}
void G1ConcurrentRefineThreadControl::worker_threads_do(ThreadClosure* tc) {
for (G1ConcurrentRefineThread* t : _threads) {
tc->do_thread(t);
if (is_refinement_enabled()) {
_workers->threads_do(tc);
}
}
void G1ConcurrentRefineThreadControl::stop() {
for (G1ConcurrentRefineThread* t : _threads) {
t->stop();
if (is_refinement_enabled()) {
_control_thread->stop();
}
}
G1ConcurrentRefineSweepState::G1ConcurrentRefineSweepState(uint max_reserved_regions) :
_state(State::Idle),
_sweep_table(new G1CardTableClaimTable(G1CollectedHeap::get_chunks_per_region_for_merge())),
_stats()
{
_sweep_table->initialize(max_reserved_regions);
}
G1ConcurrentRefineSweepState::~G1ConcurrentRefineSweepState() {
delete _sweep_table;
}
void G1ConcurrentRefineSweepState::set_state_start_time() {
_state_start[static_cast<uint>(_state)] = Ticks::now();
}
Tickspan G1ConcurrentRefineSweepState::get_duration(State start, State end) {
return _state_start[static_cast<uint>(end)] - _state_start[static_cast<uint>(start)];
}
void G1ConcurrentRefineSweepState::reset_stats() {
stats()->reset();
}
void G1ConcurrentRefineSweepState::add_yield_during_sweep_duration(jlong duration) {
stats()->inc_yield_during_sweep_duration(duration);
}
bool G1ConcurrentRefineSweepState::advance_state(State next_state) {
bool result = is_in_progress();
if (result) {
_state = next_state;
} else {
_state = State::Idle;
}
return result;
}
void G1ConcurrentRefineSweepState::assert_state(State expected) {
assert(_state == expected, "must be %s but is %s", state_name(expected), state_name(_state));
}
void G1ConcurrentRefineSweepState::start_work() {
assert_state(State::Idle);
set_state_start_time();
_stats.reset();
_state = State::SwapGlobalCT;
}
bool G1ConcurrentRefineSweepState::swap_global_card_table() {
assert_state(State::SwapGlobalCT);
GCTraceTime(Info, gc, refine) tm("Concurrent Refine Global Card Table Swap");
set_state_start_time();
{
// We can't have any new threads in the process of being created while we
// swap the card table because we read the current card table state during
// initialization.
// A safepoint may occur during that time, so leave the STS temporarily.
SuspendibleThreadSetLeaver sts_leave;
MutexLocker mu(Threads_lock);
// A GC that advanced the epoch might have happened, which already switched
// the global card table. In that case do nothing.
if (is_in_progress()) {
G1BarrierSet::g1_barrier_set()->swap_global_card_table();
}
}
return advance_state(State::SwapJavaThreadsCT);
}
bool G1ConcurrentRefineSweepState::swap_java_threads_ct() {
assert_state(State::SwapJavaThreadsCT);
GCTraceTime(Info, gc, refine) tm("Concurrent Refine Java Thread CT swap");
set_state_start_time();
{
// Need to leave the STS to avoid potential deadlock in the handshake.
SuspendibleThreadSetLeaver sts;
class G1SwapThreadCardTableClosure : public HandshakeClosure {
public:
G1SwapThreadCardTableClosure() : HandshakeClosure("G1 Java Thread CT swap") { }
virtual void do_thread(Thread* thread) {
G1BarrierSet* bs = G1BarrierSet::g1_barrier_set();
bs->update_card_table_base(thread);
}
} cl;
Handshake::execute(&cl);
}
return advance_state(State::SynchronizeGCThreads);
}
bool G1ConcurrentRefineSweepState::swap_gc_threads_ct() {
assert_state(State::SynchronizeGCThreads);
GCTraceTime(Info, gc, refine) tm("Concurrent Refine GC Thread CT swap");
set_state_start_time();
{
class RendezvousGCThreads: public VM_Operation {
public:
VMOp_Type type() const { return VMOp_G1RendezvousGCThreads; }
virtual bool evaluate_at_safepoint() const {
// We only care about synchronizing the GC threads.
// Leave the Java threads running.
return false;
}
virtual bool skip_thread_oop_barriers() const {
fatal("Concurrent VMOps should not call this");
return true;
}
void doit() {
// Lightweight "handshake" of the GC threads for memory synchronization;
// both the changes to the Java heap and the previous global card table
// reference change need to be synchronized, so that no GC thread
// accesses the wrong card table.
// For example, in the remembered set rebuild process the marking threads
// write marks into the card table, and that card table reference must be
// the correct one.
SuspendibleThreadSet::synchronize();
SuspendibleThreadSet::desynchronize();
};
} op;
SuspendibleThreadSetLeaver sts_leave;
VMThread::execute(&op);
}
return advance_state(State::SnapshotHeap);
}
void G1ConcurrentRefineSweepState::snapshot_heap(bool concurrent) {
if (concurrent) {
GCTraceTime(Info, gc, refine) tm("Concurrent Refine Snapshot Heap");
assert_state(State::SnapshotHeap);
set_state_start_time();
snapshot_heap_inner();
advance_state(State::SweepRT);
} else {
assert_state(State::Idle);
assert_at_safepoint();
snapshot_heap_inner();
}
}
void G1ConcurrentRefineSweepState::sweep_refinement_table_start() {
assert_state(State::SweepRT);
set_state_start_time();
}
bool G1ConcurrentRefineSweepState::sweep_refinement_table_step() {
assert_state(State::SweepRT);
GCTraceTime(Info, gc, refine) tm("Concurrent Refine Table Step");
G1ConcurrentRefine* cr = G1CollectedHeap::heap()->concurrent_refine();
G1ConcurrentRefineSweepTask task(_sweep_table, &_stats, cr->num_threads_wanted());
cr->run_with_refinement_workers(&task);
if (task.sweep_completed()) {
advance_state(State::CompleteRefineWork);
return true;
} else {
return false;
}
}
bool G1ConcurrentRefineSweepState::complete_work(bool concurrent, bool print_log) {
if (concurrent) {
assert_state(State::CompleteRefineWork);
} else {
// May have been forced to complete at any other time.
assert(is_in_progress() && _state != State::CompleteRefineWork, "must be but is %s", state_name(_state));
}
set_state_start_time();
if (print_log) {
G1ConcurrentRefineStats* s = &_stats;
log_debug(gc, refine)("Refinement took %.2fms (pre-sweep %.2fms card refine %.2f) "
"(scanned %zu clean %zu (%.2f%%) not_clean %zu (%.2f%%) not_parsable %zu "
"refers_to_cset %zu (%.2f%%) still_refers_to_cset %zu (%.2f%%) no_cross_region %zu pending %zu)",
get_duration(State::Idle, _state).seconds() * 1000.0,
get_duration(State::Idle, State::SweepRT).seconds() * 1000.0,
TimeHelper::counter_to_millis(s->refine_duration()),
s->cards_scanned(),
s->cards_clean(),
percent_of(s->cards_clean(), s->cards_scanned()),
s->cards_not_clean(),
percent_of(s->cards_not_clean(), s->cards_scanned()),
s->cards_not_parsable(),
s->cards_refer_to_cset(),
percent_of(s->cards_refer_to_cset(), s->cards_not_clean()),
s->cards_already_refer_to_cset(),
percent_of(s->cards_already_refer_to_cset(), s->cards_not_clean()),
s->cards_no_cross_region(),
s->cards_pending()
);
}
bool has_sweep_rt_work = _state == State::SweepRT;
advance_state(State::Idle);
return has_sweep_rt_work;
}
void G1ConcurrentRefineSweepState::snapshot_heap_inner() {
// G1CollectedHeap::heap_region_iterate() below will only visit currently committed
// regions. Initialize all entries in the state table here, and later in this method
// selectively enable the regions we are interested in. This way regions committed
// later will automatically be excluded from iteration.
// Their refinement table must be completely empty anyway.
_sweep_table->reset_all_to_claimed();
class SnapshotRegionsClosure : public G1HeapRegionClosure {
G1CardTableClaimTable* _sweep_table;
public:
SnapshotRegionsClosure(G1CardTableClaimTable* sweep_table) : G1HeapRegionClosure(), _sweep_table(sweep_table) { }
bool do_heap_region(G1HeapRegion* r) override {
if (!r->is_free()) {
// Need to scan all parts of non-free regions, so reset the claim.
// No need for synchronization: we are only interested in regions
// that were allocated before the handshake; the handshake makes such
// regions' metadata visible to all threads, and we do not care about
// humongous regions that were allocated afterwards.
_sweep_table->reset_to_unclaimed(r->hrm_index());
}
return false;
}
} cl(_sweep_table);
G1CollectedHeap::heap()->heap_region_iterate(&cl);
}
bool G1ConcurrentRefineSweepState::is_in_progress() const {
return _state != State::Idle;
}
bool G1ConcurrentRefineSweepState::are_java_threads_synched() const {
return _state > State::SwapJavaThreadsCT || !is_in_progress();
}
uint64_t G1ConcurrentRefine::adjust_threads_period_ms() const {
// Instead of a fixed value, this could be a command line option. But then
// we might also want to allow configuration of adjust_threads_wait_ms().
return 50;
// Use a prime number close to 50ms, different from other components that derive
// their wait time from the try_get_available_bytes_estimate() call, to minimize
// interference.
return 53;
}
static size_t minimum_pending_cards_target() {
// One buffer per thread.
return ParallelGCThreads * G1UpdateBufferSize;
return ParallelGCThreads * G1PerThreadPendingCardThreshold;
}
G1ConcurrentRefine::G1ConcurrentRefine(G1Policy* policy) :
_policy(policy),
_threads_wanted(0),
G1ConcurrentRefine::G1ConcurrentRefine(G1CollectedHeap* g1h) :
_policy(g1h->policy()),
_num_threads_wanted(0),
_pending_cards_target(PendingCardsTargetUninitialized),
_last_adjust(),
_needs_adjust(false),
_threads_needed(policy, adjust_threads_period_ms()),
_heap_was_locked(false),
_threads_needed(g1h->policy(), adjust_threads_period_ms()),
_thread_control(G1ConcRefinementThreads),
_dcqs(G1BarrierSet::dirty_card_queue_set())
{}
_sweep_state(g1h->max_num_regions())
{ }
jint G1ConcurrentRefine::initialize() {
return _thread_control.initialize(this);
}
G1ConcurrentRefine* G1ConcurrentRefine::create(G1Policy* policy, jint* ecode) {
G1ConcurrentRefine* cr = new G1ConcurrentRefine(policy);
G1ConcurrentRefineSweepState& G1ConcurrentRefine::sweep_state_for_merge() {
bool has_sweep_claims = sweep_state().complete_work(false /* concurrent */);
if (has_sweep_claims) {
log_debug(gc, refine)("Continue existing work");
} else {
// Refinement has been interrupted without having a snapshot. There may
// be a mix of already swapped and not-swapped card tables assigned to threads,
// so they might have already dirtied the swapped card tables.
// Conservatively scan all (non-free, committed) regions' card tables,
// creating the snapshot right now.
log_debug(gc, refine)("Create work from scratch");
sweep_state().snapshot_heap(false /* concurrent */);
}
return sweep_state();
}
void G1ConcurrentRefine::run_with_refinement_workers(WorkerTask* task) {
_thread_control.run_task(task, num_threads_wanted());
}
void G1ConcurrentRefine::notify_region_reclaimed(G1HeapRegion* r) {
assert_at_safepoint();
if (_sweep_state.is_in_progress()) {
_sweep_state.sweep_table()->claim_all_cards(r->hrm_index());
}
}
G1ConcurrentRefine* G1ConcurrentRefine::create(G1CollectedHeap* g1h, jint* ecode) {
G1ConcurrentRefine* cr = new G1ConcurrentRefine(g1h);
*ecode = cr->initialize();
if (*ecode != 0) {
delete cr;
@ -176,25 +465,31 @@ G1ConcurrentRefine::~G1ConcurrentRefine() {
}
void G1ConcurrentRefine::threads_do(ThreadClosure *tc) {
worker_threads_do(tc);
control_thread_do(tc);
}
void G1ConcurrentRefine::worker_threads_do(ThreadClosure *tc) {
_thread_control.worker_threads_do(tc);
}
void G1ConcurrentRefine::update_pending_cards_target(double logged_cards_time_ms,
size_t processed_logged_cards,
size_t predicted_thread_buffer_cards,
void G1ConcurrentRefine::control_thread_do(ThreadClosure *tc) {
_thread_control.control_thread_do(tc);
}
void G1ConcurrentRefine::update_pending_cards_target(double pending_cards_time_ms,
size_t processed_pending_cards,
double goal_ms) {
size_t minimum = minimum_pending_cards_target();
if ((processed_logged_cards < minimum) || (logged_cards_time_ms == 0.0)) {
log_debug(gc, ergo, refine)("Unchanged pending cards target: %zu",
_pending_cards_target);
if ((processed_pending_cards < minimum) || (pending_cards_time_ms == 0.0)) {
log_debug(gc, ergo, refine)("Unchanged pending cards target: %zu (processed %zu minimum %zu time %1.2f)",
_pending_cards_target, processed_pending_cards, minimum, pending_cards_time_ms);
return;
}
// Base the pending cards budget on the measured rate.
double rate = processed_logged_cards / logged_cards_time_ms;
size_t budget = static_cast<size_t>(goal_ms * rate);
// Deduct predicted cards in thread buffers to get target.
size_t new_target = budget - MIN2(budget, predicted_thread_buffer_cards);
double rate = processed_pending_cards / pending_cards_time_ms;
size_t new_target = static_cast<size_t>(goal_ms * rate);
// Add some hysteresis with previous values.
if (is_pending_cards_target_initialized()) {
new_target = (new_target + _pending_cards_target) / 2;
@ -205,46 +500,36 @@ void G1ConcurrentRefine::update_pending_cards_target(double logged_cards_time_ms
log_debug(gc, ergo, refine)("New pending cards target: %zu", new_target);
}
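For illustration, the target computation above boils down to: measured rate = processed cards / time, raw budget = goal_ms * rate, then averaging with the previous target for hysteresis. A standalone sketch of the same arithmetic (plain C++, not HotSpot code; the function name and example numbers are invented):

// Standalone sketch of the pending cards target calculation (not HotSpot code;
// the function name and the example numbers are made up).
#include <cstddef>
#include <cstdio>

static size_t updated_target(double pending_cards_time_ms,
                             size_t processed_pending_cards,
                             double goal_ms,
                             size_t previous_target,
                             bool target_initialized) {
  // Measured refinement rate in cards per millisecond.
  double rate = processed_pending_cards / pending_cards_time_ms;
  // Number of cards that fit into the time goal at that rate.
  size_t new_target = static_cast<size_t>(goal_ms * rate);
  // Hysteresis: average with the previous target to damp oscillation.
  if (target_initialized) {
    new_target = (new_target + previous_target) / 2;
  }
  return new_target;
}

int main() {
  // 20000 cards processed in 5 ms -> 4000 cards/ms; a 2 ms goal gives a raw
  // budget of 8000 cards, averaged with a previous target of 6000.
  std::printf("%zu\n", updated_target(5.0, 20000, 2.0, 6000, true)); // prints 7000
  return 0;
}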
void G1ConcurrentRefine::adjust_after_gc(double logged_cards_time_ms,
size_t processed_logged_cards,
size_t predicted_thread_buffer_cards,
void G1ConcurrentRefine::adjust_after_gc(double pending_cards_time_ms,
size_t processed_pending_cards,
double goal_ms) {
if (!G1UseConcRefinement) return;
if (!G1UseConcRefinement) {
return;
}
update_pending_cards_target(logged_cards_time_ms,
processed_logged_cards,
predicted_thread_buffer_cards,
update_pending_cards_target(pending_cards_time_ms,
processed_pending_cards,
goal_ms);
if (_thread_control.max_num_threads() == 0) {
// If no refinement threads then the mutator threshold is the target.
_dcqs.set_mutator_refinement_threshold(_pending_cards_target);
} else {
// Provisionally make the mutator threshold unlimited, to be updated by
// the next periodic adjustment. Because card state may have changed
// drastically, record that adjustment is needed and kick the primary
// thread, in case it is waiting.
_dcqs.set_mutator_refinement_threshold(SIZE_MAX);
if (_thread_control.is_refinement_enabled()) {
_needs_adjust = true;
if (is_pending_cards_target_initialized()) {
_thread_control.activate(0);
_thread_control.activate();
}
}
}
// Wake up the primary thread less frequently when the time available until
// the next GC is longer. But don't increase the wait time too rapidly.
// This reduces the number of primary thread wakeups that just immediately
// go back to waiting, while still being responsive to behavior changes.
static uint64_t compute_adjust_wait_time_ms(double available_ms) {
return static_cast<uint64_t>(sqrt(available_ms) * 4.0);
}
uint64_t G1ConcurrentRefine::adjust_threads_wait_ms() const {
assert_current_thread_is_primary_refinement_thread();
assert_current_thread_is_control_refinement_thread();
if (is_pending_cards_target_initialized()) {
double available_ms = _threads_needed.predicted_time_until_next_gc_ms();
uint64_t wait_time_ms = compute_adjust_wait_time_ms(available_ms);
return MAX2(wait_time_ms, adjust_threads_period_ms());
// Retry as soon as possible when the reason for not getting a prediction was that we
// temporarily did not get the heap lock. Otherwise we might wait too long until we get
// back here.
if (_heap_was_locked) {
return 1;
}
double available_time_ms = _threads_needed.predicted_time_until_next_gc_ms();
return _policy->adjust_wait_time_ms(available_time_ms, adjust_threads_period_ms());
} else {
// If target not yet initialized then wait forever (until explicitly
// activated). This happens during startup, when we don't bother with
@ -253,185 +538,74 @@ uint64_t G1ConcurrentRefine::adjust_threads_wait_ms() const {
}
}
class G1ConcurrentRefine::RemSetSamplingClosure : public G1HeapRegionClosure {
size_t _sampled_code_root_rs_length;
bool G1ConcurrentRefine::adjust_num_threads_periodically() {
assert_current_thread_is_control_refinement_thread();
public:
RemSetSamplingClosure() :
_sampled_code_root_rs_length(0) {}
bool do_heap_region(G1HeapRegion* r) override {
G1HeapRegionRemSet* rem_set = r->rem_set();
_sampled_code_root_rs_length += rem_set->code_roots_list_length();
return false;
}
size_t sampled_code_root_rs_length() const { return _sampled_code_root_rs_length; }
};
// Adjust the target length (in regions) of the young gen, based on the
// current length of the remembered sets.
//
// At the end of the GC G1 determines the length of the young gen based on
// how much time the next GC can take, and when the next GC may occur
// according to the MMU.
//
// The assumption is that a significant part of the GC is spent on scanning
// the remembered sets (and many other components), so this thread constantly
// reevaluates the prediction for the remembered set scanning costs, and potentially
// resizes the young gen. This may do a premature GC or even increase the young
// gen size to keep pause time length goal.
void G1ConcurrentRefine::adjust_young_list_target_length() {
if (_policy->use_adaptive_young_list_length()) {
G1CollectedHeap* g1h = G1CollectedHeap::heap();
G1CollectionSet* cset = g1h->collection_set();
RemSetSamplingClosure cl;
cset->iterate(&cl);
size_t card_rs_length = g1h->young_regions_cardset()->occupied();
size_t sampled_code_root_rs_length = cl.sampled_code_root_rs_length();
_policy->revise_young_list_target_length(card_rs_length, sampled_code_root_rs_length);
}
}
bool G1ConcurrentRefine::adjust_threads_periodically() {
assert_current_thread_is_primary_refinement_thread();
// Check whether it's time to do a periodic adjustment.
_heap_was_locked = false;
// Check whether it's time to do a periodic adjustment if there is no explicit
// request pending. We might have spuriously woken up.
if (!_needs_adjust) {
Tickspan since_adjust = Ticks::now() - _last_adjust;
if (since_adjust.milliseconds() >= adjust_threads_period_ms()) {
_needs_adjust = true;
if (since_adjust.milliseconds() < adjust_threads_period_ms()) {
_num_threads_wanted = 0;
return false;
}
}
// If needed, try to adjust threads wanted.
if (_needs_adjust) {
// Getting used young bytes requires holding Heap_lock. But we can't use
// normal lock and block until available. Blocking on the lock could
// deadlock with a GC VMOp that is holding the lock and requesting a
// safepoint. Instead try to lock, and if fail then skip adjustment for
// this iteration of the thread, do some refinement work, and retry the
// adjustment later.
if (Heap_lock->try_lock()) {
size_t used_bytes = _policy->estimate_used_young_bytes_locked();
Heap_lock->unlock();
adjust_young_list_target_length();
size_t young_bytes = _policy->young_list_target_length() * G1HeapRegion::GrainBytes;
size_t available_bytes = young_bytes - MIN2(young_bytes, used_bytes);
adjust_threads_wanted(available_bytes);
_needs_adjust = false;
_last_adjust = Ticks::now();
return true;
}
// Reset pending request.
_needs_adjust = false;
size_t available_bytes = 0;
if (_policy->try_get_available_bytes_estimate(available_bytes)) {
adjust_threads_wanted(available_bytes);
_last_adjust = Ticks::now();
} else {
_heap_was_locked = true;
// Defer adjustment to next time.
_needs_adjust = true;
}
return false;
}
bool G1ConcurrentRefine::is_in_last_adjustment_period() const {
return _threads_needed.predicted_time_until_next_gc_ms() <= adjust_threads_period_ms();
return (_num_threads_wanted > 0) && !heap_was_locked();
}
void G1ConcurrentRefine::adjust_threads_wanted(size_t available_bytes) {
assert_current_thread_is_primary_refinement_thread();
size_t num_cards = _dcqs.num_cards();
size_t mutator_threshold = SIZE_MAX;
uint old_wanted = AtomicAccess::load(&_threads_wanted);
assert_current_thread_is_control_refinement_thread();
_threads_needed.update(old_wanted,
G1Policy* policy = G1CollectedHeap::heap()->policy();
const G1Analytics* analytics = policy->analytics();
size_t num_cards = policy->current_pending_cards();
_threads_needed.update(_num_threads_wanted,
available_bytes,
num_cards,
_pending_cards_target);
uint new_wanted = _threads_needed.threads_needed();
if (new_wanted > _thread_control.max_num_threads()) {
// If running all the threads can't reach goal, turn on refinement by
// mutator threads. Using target as the threshold may be stronger
// than required, but will do the most to get us under goal, and we'll
// reevaluate with the next adjustment.
mutator_threshold = _pending_cards_target;
// Bound the wanted threads by maximum available.
new_wanted = _thread_control.max_num_threads();
} else if (is_in_last_adjustment_period()) {
// If very little time remains until GC, enable mutator refinement. If
// the target has been reached, this keeps the number of pending cards on
// target even if refinement threads deactivate in the meantime. And if
// the target hasn't been reached, this prevents things from getting
// worse.
mutator_threshold = _pending_cards_target;
}
AtomicAccess::store(&_threads_wanted, new_wanted);
_dcqs.set_mutator_refinement_threshold(mutator_threshold);
log_debug(gc, refine)("Concurrent refinement: wanted %u, cards: %zu, "
"predicted: %zu, time: %1.2fms",
_num_threads_wanted = new_wanted;
log_debug(gc, refine)("Concurrent refinement: wanted %u, pending cards: %zu (pending-from-gc %zu), "
"predicted: %zu, goal %zu, time-until-next-gc: %1.2fms pred-refine-rate %1.2fc/ms log-rate %1.2fc/ms",
new_wanted,
num_cards,
G1CollectedHeap::heap()->policy()->pending_cards_from_gc(),
_threads_needed.predicted_cards_at_next_gc(),
_threads_needed.predicted_time_until_next_gc_ms());
// Activate newly wanted threads. The current thread is the primary
// refinement thread, so is already active.
for (uint i = MAX2(old_wanted, 1u); i < new_wanted; ++i) {
if (!_thread_control.activate(i)) {
// Failed to allocate and activate thread. Stop trying to activate, and
// instead use mutator threads to make up the gap.
AtomicAccess::store(&_threads_wanted, i);
_dcqs.set_mutator_refinement_threshold(_pending_cards_target);
break;
}
}
}
void G1ConcurrentRefine::reduce_threads_wanted() {
assert_current_thread_is_primary_refinement_thread();
if (!_needs_adjust) { // Defer if adjustment request is active.
uint wanted = AtomicAccess::load(&_threads_wanted);
if (wanted > 0) {
AtomicAccess::store(&_threads_wanted, --wanted);
}
// If very little time remains until GC, enable mutator refinement. If
// the target has been reached, this keeps the number of pending cards on
// target even as refinement threads deactivate in the meantime.
if (is_in_last_adjustment_period()) {
_dcqs.set_mutator_refinement_threshold(_pending_cards_target);
}
}
}
bool G1ConcurrentRefine::is_thread_wanted(uint worker_id) const {
return worker_id < AtomicAccess::load(&_threads_wanted);
_pending_cards_target,
_threads_needed.predicted_time_until_next_gc_ms(),
analytics->predict_concurrent_refine_rate_ms(),
analytics->predict_dirtied_cards_rate_ms()
);
}
bool G1ConcurrentRefine::is_thread_adjustment_needed() const {
assert_current_thread_is_primary_refinement_thread();
assert_current_thread_is_control_refinement_thread();
return _needs_adjust;
}
void G1ConcurrentRefine::record_thread_adjustment_needed() {
assert_current_thread_is_primary_refinement_thread();
assert_current_thread_is_control_refinement_thread();
_needs_adjust = true;
}
G1ConcurrentRefineStats G1ConcurrentRefine::get_and_reset_refinement_stats() {
struct CollectStats : public ThreadClosure {
G1ConcurrentRefineStats _total_stats;
virtual void do_thread(Thread* t) {
G1ConcurrentRefineThread* crt = static_cast<G1ConcurrentRefineThread*>(t);
G1ConcurrentRefineStats& stats = *crt->refinement_stats();
_total_stats += stats;
stats.reset();
}
} collector;
threads_do(&collector);
return collector._total_stats;
}
uint G1ConcurrentRefine::worker_id_offset() {
return G1DirtyCardQueueSet::num_par_ids();
}
bool G1ConcurrentRefine::try_refinement_step(uint worker_id,
size_t stop_at,
G1ConcurrentRefineStats* stats) {
uint adjusted_id = worker_id + worker_id_offset();
return _dcqs.refine_completed_buffer_concurrently(adjusted_id, stop_at, stats);
}

@ -1,5 +1,5 @@
/*
* Copyright (c) 2001, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2001, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -34,23 +34,28 @@
#include "utilities/macros.hpp"
// Forward decl
class G1CardTableClaimTable;
class G1CollectedHeap;
class G1ConcurrentRefine;
class G1ConcurrentRefineThread;
class G1DirtyCardQueueSet;
class G1HeapRegion;
class G1Policy;
class ThreadClosure;
class WorkerTask;
class WorkerThreads;
// Helper class for refinement thread management. Used to start, stop and
// iterate over the refinement threads.
class G1ConcurrentRefineThreadControl {
G1ConcurrentRefine* _cr;
GrowableArrayCHeap<G1ConcurrentRefineThread*, mtGC> _threads;
G1ConcurrentRefineThread* _control_thread;
WorkerThreads* _workers;
uint _max_num_threads;
// Create the refinement thread for the given worker id.
// If initializing is true, ignore InjectGCWorkerCreationFailure.
G1ConcurrentRefineThread* create_refinement_thread(uint worker_id, bool initializing);
bool ensure_threads_created(uint worker_id, bool initializing);
G1ConcurrentRefineThread* create_refinement_thread();
NONCOPYABLE(G1ConcurrentRefineThreadControl);
@ -60,21 +65,119 @@ public:
jint initialize(G1ConcurrentRefine* cr);
void assert_current_thread_is_primary_refinement_thread() const NOT_DEBUG_RETURN;
void assert_current_thread_is_control_refinement_thread() const NOT_DEBUG_RETURN;
uint max_num_threads() const { return _threads.capacity(); }
uint max_num_threads() const { return _max_num_threads; }
bool is_refinement_enabled() const { return _max_num_threads > 0; }
// Activate the indicated thread. If the thread has not yet been allocated,
// allocate and then activate. If allocation is needed and fails, return
// false. Otherwise return true.
// precondition: worker_id < max_num_threads().
// precondition: current thread is not the designated worker.
bool activate(uint worker_id);
// Activate the control thread.
void activate();
void run_task(WorkerTask* task, uint num_workers);
void control_thread_do(ThreadClosure* tc);
void worker_threads_do(ThreadClosure* tc);
void stop();
};
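The interface above separates a single control thread from a pool of refinement workers that execute a task with a bounded degree of parallelism. A rough standalone model of that split using plain C++ threads (all names here are invented; this is not the HotSpot WorkerThreads API):

// Minimal standalone model of the control-thread / worker split (plain C++
// threads; MiniWorkerPool and its run_task bounding are invented).
#include <algorithm>
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

struct MiniWorkerPool {
  unsigned _max_workers;
  explicit MiniWorkerPool(unsigned max_workers) : _max_workers(max_workers) {}

  // Run 'task' on 'num_workers' workers, bounded by the configured maximum,
  // mirroring run_task(WorkerTask*, uint num_workers) declared above.
  template <typename Task>
  void run_task(Task task, unsigned num_workers) {
    unsigned n = std::min(num_workers, _max_workers);
    std::vector<std::thread> workers;
    for (unsigned id = 0; id < n; ++id) {
      workers.emplace_back(task, id);
    }
    for (std::thread& t : workers) {
      t.join();
    }
  }
};

int main() {
  MiniWorkerPool pool(4);
  std::atomic<unsigned> chunks{0};
  // The "control thread" (main here) decides that two workers are enough and
  // dispatches a sweep-like task to them.
  pool.run_task([&](unsigned worker_id) {
    chunks.fetch_add(1 + worker_id, std::memory_order_relaxed);
  }, 2);
  std::printf("claimed %u chunks\n", chunks.load()); // prints: claimed 3 chunks
  return 0;
}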
// Tracks the current state of re-examining the dirty cards from idle to completion
// (and reset back to idle).
//
// The process steps are as follows:
//
// 1) Swap global card table pointers
//
// 2) Swap Java Thread's card table pointers
//
// 3) Synchronize GC Threads
// Ensures memory visibility
//
// After this point mutator threads should not mark the refinement table.
//
// 4) Snapshot the heap
// Determines which regions need to be swept.
//
// 5) Sweep Refinement table
// Examines non-Clean cards on the refinement table.
//
// 6) Completion Work
// Calculates statistics about the process to be used in various parts of
// the garbage collection.
//
// All but step 4 are interruptible by safepoints. In case of a garbage collection,
// the garbage collection interrupts this process, which then returns to the Idle state.
//
class G1ConcurrentRefineSweepState {
enum class State : uint {
Idle, // Refinement is doing nothing.
SwapGlobalCT, // Swap global card table.
SwapJavaThreadsCT, // Swap java thread's card tables.
SynchronizeGCThreads, // Synchronize GC thread's memory view.
SnapshotHeap, // Take a snapshot of the region's top() values.
SweepRT, // Sweep the refinement table for pending (dirty) cards.
CompleteRefineWork, // Cleanup of refinement work, reset to idle.
Last
} _state;
static const char* state_name(State state) {
static const char* _state_names[] = {
"Idle",
"Swap Global Card Table",
"Swap JavaThread Card Table",
"Synchronize GC Threads",
"Snapshot Heap",
"Sweep Refinement Table",
"Complete Sweep Work"
};
return _state_names[static_cast<uint>(state)];
}
// Current heap snapshot.
G1CardTableClaimTable* _sweep_table;
// Start times for all states.
Ticks _state_start[static_cast<uint>(State::Last)];
void set_state_start_time();
Tickspan get_duration(State start, State end);
G1ConcurrentRefineStats _stats;
// Advances the state to next_state if not interrupted by a changed epoch. Returns
// to Idle otherwise.
bool advance_state(State next_state);
void assert_state(State expected);
void snapshot_heap_inner();
public:
G1ConcurrentRefineSweepState(uint max_reserved_regions);
~G1ConcurrentRefineSweepState();
void start_work();
bool swap_global_card_table();
bool swap_java_threads_ct();
bool swap_gc_threads_ct();
void snapshot_heap(bool concurrent = true);
void sweep_refinement_table_start();
bool sweep_refinement_table_step();
bool complete_work(bool concurrent, bool print_log = true);
G1CardTableClaimTable* sweep_table() { return _sweep_table; }
G1ConcurrentRefineStats* stats() { return &_stats; }
void reset_stats();
void add_yield_during_sweep_duration(jlong duration);
bool is_in_progress() const;
bool are_java_threads_synched() const;
};
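The state progression described in the comment block above can be pictured as a simple driver loop; the following standalone sketch (plain C++, simplified, with a stubbed-out "interrupted by GC" check) only illustrates the order of the steps and the fall-back to Idle, not the HotSpot implementation:

// Standalone sketch of the sweep state machine (plain C++; enum, driver loop
// and interrupted_by_gc() stub are simplified for illustration).
#include <cstdio>

enum class SweepState {
  Idle, SwapGlobalCT, SwapJavaThreadsCT, SynchronizeGCThreads,
  SnapshotHeap, SweepRT, CompleteRefineWork
};

// Stand-in for "a safepoint/GC reset the state back to Idle meanwhile".
static bool interrupted_by_gc() { return false; }

static void run_one_sweep() {
  SweepState state = SweepState::SwapGlobalCT;             // start_work()
  while (state != SweepState::Idle) {
    if (interrupted_by_gc()) {                             // advance_state() fell back to Idle
      state = SweepState::Idle;
      break;
    }
    switch (state) {
      case SweepState::SwapGlobalCT:         state = SweepState::SwapJavaThreadsCT;    break;
      case SweepState::SwapJavaThreadsCT:    state = SweepState::SynchronizeGCThreads; break;
      case SweepState::SynchronizeGCThreads: state = SweepState::SnapshotHeap;         break;
      case SweepState::SnapshotHeap:         state = SweepState::SweepRT;              break;
      case SweepState::SweepRT:              state = SweepState::CompleteRefineWork;   break; // real sweep loops in steps
      default:                               state = SweepState::Idle;                 break; // CompleteRefineWork
    }
  }
  std::printf("sweep finished, back to Idle\n");
}

int main() {
  run_one_sweep();
  return 0;
}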
// Controls concurrent refinement.
//
// Mutator threads produce dirty cards, which need to be examined for updates
@ -84,49 +187,43 @@ public:
// pending dirty cards at the start of a GC can be processed within that time
// budget.
//
// Concurrent refinement is performed by a combination of dedicated threads
// and by mutator threads as they produce dirty cards. If configured to not
// have any dedicated threads (-XX:G1ConcRefinementThreads=0) then all
// concurrent refinement work is performed by mutator threads. When there are
// dedicated threads, they generally do most of the concurrent refinement
// work, to minimize throughput impact of refinement work on mutator threads.
// Concurrent refinement is performed by a set of dedicated threads. If configured
// to not have any dedicated threads (-XX:G1ConcRefinementThreads=0) then no
// refinement work is performed at all.
//
// This class determines the target number of dirty cards pending for the next
// GC. It also owns the dedicated refinement threads and controls their
// activation in order to achieve that target.
//
// There are two kinds of dedicated refinement threads, a single primary
// thread and some number of secondary threads. When active, all refinement
// threads take buffers of dirty cards from the dirty card queue and process
// them. Between buffers they query this owning object to find out whether
// they should continue running, deactivating themselves if not.
// There are two kinds of dedicated refinement threads, a single control
// thread and some number of refinement worker threads.
// The control thread determines whether there is need to do work, and then starts
// an appropriate number of refinement worker threads to get back to the target
// number of pending dirty cards.
//
// The control thread wakes up periodically to check whether there is a need to do
// refinement work, starting the refinement process as necessary.
//
// The primary thread drives the control system that determines how many
// refinement threads should be active. If inactive, it wakes up periodically
// to recalculate the number of active threads needed, and activates
// additional threads as necessary. While active it also periodically
// recalculates the number wanted and activates more threads if needed. It
// also reduces the number of wanted threads when the target has been reached,
// triggering deactivations.
class G1ConcurrentRefine : public CHeapObj<mtGC> {
G1Policy* _policy;
volatile uint _threads_wanted;
volatile uint _num_threads_wanted;
size_t _pending_cards_target;
Ticks _last_adjust;
Ticks _last_deactivate;
bool _needs_adjust;
bool _heap_was_locked; // The heap was locked the last time we tried to adjust the number of refinement threads.
G1ConcurrentRefineThreadsNeeded _threads_needed;
G1ConcurrentRefineThreadControl _thread_control;
G1DirtyCardQueueSet& _dcqs;
G1ConcurrentRefine(G1Policy* policy);
G1ConcurrentRefineSweepState _sweep_state;
static uint worker_id_offset();
G1ConcurrentRefine(G1CollectedHeap* g1h);
jint initialize();
void assert_current_thread_is_primary_refinement_thread() const {
_thread_control.assert_current_thread_is_primary_refinement_thread();
void assert_current_thread_is_control_refinement_thread() const {
_thread_control.assert_current_thread_is_control_refinement_thread();
}
// For the first few collection cycles we don't have a target (and so don't
@ -138,16 +235,11 @@ class G1ConcurrentRefine : public CHeapObj<mtGC> {
return _pending_cards_target != PendingCardsTargetUninitialized;
}
void update_pending_cards_target(double logged_cards_scan_time_ms,
size_t processed_logged_cards,
size_t predicted_thread_buffer_cards,
void update_pending_cards_target(double pending_cards_scan_time_ms,
size_t processed_pending_cards,
double goal_ms);
uint64_t adjust_threads_period_ms() const;
bool is_in_last_adjustment_period() const;
class RemSetSamplingClosure; // Helper class for adjusting young length.
void adjust_young_list_target_length();
void adjust_threads_wanted(size_t available_bytes);
@ -156,67 +248,66 @@ class G1ConcurrentRefine : public CHeapObj<mtGC> {
public:
~G1ConcurrentRefine();
G1ConcurrentRefineSweepState& sweep_state() { return _sweep_state; }
G1ConcurrentRefineSweepState& sweep_state_for_merge();
void run_with_refinement_workers(WorkerTask* task);
void notify_region_reclaimed(G1HeapRegion* r);
// Returns a G1ConcurrentRefine instance if succeeded to create/initialize the
// G1ConcurrentRefine instance. Otherwise, returns null with error code.
static G1ConcurrentRefine* create(G1Policy* policy, jint* ecode);
static G1ConcurrentRefine* create(G1CollectedHeap* g1h, jint* ecode);
// Stop all the refinement threads.
void stop();
// Called at the end of a GC to prepare for refinement during the next
// concurrent phase. Updates the target for the number of pending dirty
// cards. Updates the mutator refinement threshold. Ensures the primary
// refinement thread (if it exists) is active, so it will adjust the number
// cards. Updates the mutator refinement threshold. Ensures the refinement
// control thread (if it exists) is active, so it will adjust the number
// of running threads.
void adjust_after_gc(double logged_cards_scan_time_ms,
size_t processed_logged_cards,
size_t predicted_thread_buffer_cards,
void adjust_after_gc(double pending_cards_scan_time_ms,
size_t processed_pending_cards,
double goal_ms);
// Target number of pending dirty cards at the start of the next GC.
size_t pending_cards_target() const { return _pending_cards_target; }
// May recalculate the number of refinement threads that should be active in
// order to meet the pending cards target. Returns true if adjustment was
// performed, and clears any pending request. Returns false if the
// adjustment period has not expired, or because a timed or requested
// adjustment could not be performed immediately and so was deferred.
// precondition: current thread is the primary refinement thread.
bool adjust_threads_periodically();
// Recalculates the number of refinement threads that should be active in
// order to meet the pending cards target.
// Returns true if it could recalculate the number of threads and
// refinement threads should be started.
// Returns false if the adjustment period has not expired, or because a timed
// or requested adjustment could not be performed immediately and so was deferred.
bool adjust_num_threads_periodically();
// The amount of time (in ms) the primary refinement thread should sleep
// The amount of time (in ms) the refinement control thread should sleep
// when it is inactive. It requests adjustment whenever it is reactivated.
// precondition: current thread is the primary refinement thread.
// precondition: current thread is the refinement control thread.
uint64_t adjust_threads_wait_ms() const;
// Record a request for thread adjustment as soon as possible.
// precondition: current thread is the primary refinement thread.
// precondition: current thread is the refinement control thread.
void record_thread_adjustment_needed();
// Test whether there is a pending request for thread adjustment.
// precondition: current thread is the primary refinement thread.
// precondition: current thread is the refinement control thread.
bool is_thread_adjustment_needed() const;
// Reduce the number of active threads wanted.
// precondition: current thread is the primary refinement thread.
void reduce_threads_wanted();
// Indicates whether the last refinement adjustment was deferred because the
// heap lock could not be obtained.
bool heap_was_locked() const { return _heap_was_locked; }
// Test whether the thread designated by worker_id should be active.
bool is_thread_wanted(uint worker_id) const;
// Return total of concurrent refinement stats for the
// ConcurrentRefineThreads. Also reset the stats for the threads.
G1ConcurrentRefineStats get_and_reset_refinement_stats();
// Perform a single refinement step; called by the refinement
// threads. Returns true if there was refinement work available.
// Updates stats.
bool try_refinement_step(uint worker_id,
size_t stop_at,
G1ConcurrentRefineStats* stats);
uint num_threads_wanted() const { return _num_threads_wanted; }
uint max_num_threads() const { return _thread_control.max_num_threads(); }
// Iterate over all concurrent refinement threads applying the given closure.
void threads_do(ThreadClosure *tc);
// Iterate over specific refinement threads applying the given closure.
void worker_threads_do(ThreadClosure *tc);
void control_thread_do(ThreadClosure *tc);
};
#endif // SHARE_GC_G1_G1CONCURRENTREFINE_HPP

@ -23,41 +23,33 @@
*/
#include "gc/g1/g1ConcurrentRefineStats.hpp"
#include "runtime/atomicAccess.hpp"
#include "runtime/timer.hpp"
G1ConcurrentRefineStats::G1ConcurrentRefineStats() :
_refinement_time(),
_refined_cards(0),
_precleaned_cards(0),
_dirtied_cards(0)
_sweep_duration(0),
_yield_during_sweep_duration(0),
_cards_scanned(0),
_cards_clean(0),
_cards_not_parsable(0),
_cards_already_refer_to_cset(0),
_cards_refer_to_cset(0),
_cards_no_cross_region(0),
_refine_duration(0)
{}
double G1ConcurrentRefineStats::refinement_rate_ms() const {
// Report 0 when no time recorded because no refinement performed.
double secs = refinement_time().seconds();
return (secs > 0) ? (refined_cards() / (secs * MILLIUNITS)) : 0.0;
}
void G1ConcurrentRefineStats::add_atomic(G1ConcurrentRefineStats* other) {
AtomicAccess::add(&_sweep_duration, other->_sweep_duration, memory_order_relaxed);
AtomicAccess::add(&_yield_during_sweep_duration, other->_yield_during_sweep_duration, memory_order_relaxed);
G1ConcurrentRefineStats&
G1ConcurrentRefineStats::operator+=(const G1ConcurrentRefineStats& other) {
_refinement_time += other._refinement_time;
_refined_cards += other._refined_cards;
_precleaned_cards += other._precleaned_cards;
_dirtied_cards += other._dirtied_cards;
return *this;
}
AtomicAccess::add(&_cards_scanned, other->_cards_scanned, memory_order_relaxed);
AtomicAccess::add(&_cards_clean, other->_cards_clean, memory_order_relaxed);
AtomicAccess::add(&_cards_not_parsable, other->_cards_not_parsable, memory_order_relaxed);
AtomicAccess::add(&_cards_already_refer_to_cset, other->_cards_already_refer_to_cset, memory_order_relaxed);
AtomicAccess::add(&_cards_refer_to_cset, other->_cards_refer_to_cset, memory_order_relaxed);
AtomicAccess::add(&_cards_no_cross_region, other->_cards_no_cross_region, memory_order_relaxed);
template<typename T>
static T clipped_sub(T x, T y) {
return (x < y) ? T() : (x - y);
}
G1ConcurrentRefineStats&
G1ConcurrentRefineStats::operator-=(const G1ConcurrentRefineStats& other) {
_refinement_time = clipped_sub(_refinement_time, other._refinement_time);
_refined_cards = clipped_sub(_refined_cards, other._refined_cards);
_precleaned_cards = clipped_sub(_precleaned_cards, other._precleaned_cards);
_dirtied_cards = clipped_sub(_dirtied_cards, other._dirtied_cards);
return *this;
AtomicAccess::add(&_refine_duration, other->_refine_duration, memory_order_relaxed);
}
void G1ConcurrentRefineStats::reset() {

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -33,47 +33,56 @@
// Used for collecting per-thread statistics and for summaries over a
// collection of threads.
class G1ConcurrentRefineStats : public CHeapObj<mtGC> {
Tickspan _refinement_time;
size_t _refined_cards;
size_t _precleaned_cards;
size_t _dirtied_cards;
jlong _sweep_duration; // Time spent sweeping the table finding non-clean cards
// and refining them.
jlong _yield_during_sweep_duration; // Time spent yielding during the sweep (not doing the sweep).
size_t _cards_scanned; // Total number of cards scanned.
size_t _cards_clean; // Number of cards found clean.
size_t _cards_not_parsable; // Number of cards we could not parse and left unrefined.
size_t _cards_already_refer_to_cset; // Number of cards found to be already marked as referring to the collection set.
size_t _cards_refer_to_cset; // Number of dirty cards that were recently found to contain a to-cset reference.
size_t _cards_no_cross_region; // Number of dirty cards found to contain no cross-region references.
jlong _refine_duration; // Time spent during actual refinement.
public:
G1ConcurrentRefineStats();
// Time spent performing concurrent refinement.
Tickspan refinement_time() const { return _refinement_time; }
// Time spent sweeping the refinement table (includes actual refinement,
// but not yield time).
jlong sweep_duration() const { return _sweep_duration - _yield_during_sweep_duration; }
jlong yield_during_sweep_duration() const { return _yield_during_sweep_duration; }
jlong refine_duration() const { return _refine_duration; }
// Number of refined cards.
size_t refined_cards() const { return _refined_cards; }
size_t refined_cards() const { return cards_not_clean(); }
// Refinement rate, in cards per ms.
double refinement_rate_ms() const;
size_t cards_scanned() const { return _cards_scanned; }
size_t cards_clean() const { return _cards_clean; }
size_t cards_not_clean() const { return _cards_scanned - _cards_clean; }
size_t cards_not_parsable() const { return _cards_not_parsable; }
size_t cards_already_refer_to_cset() const { return _cards_already_refer_to_cset; }
size_t cards_refer_to_cset() const { return _cards_refer_to_cset; }
size_t cards_no_cross_region() const { return _cards_no_cross_region; }
// Number of cards that were marked dirty and in need of refinement. This includes cards
// recently found to refer to the collection set, since they were originally dirty as well.
size_t cards_pending() const { return cards_not_clean() - _cards_already_refer_to_cset; }
// Number of cards for which refinement was skipped because some other
// thread had already refined them.
size_t precleaned_cards() const { return _precleaned_cards; }
size_t cards_to_cset() const { return _cards_already_refer_to_cset + _cards_refer_to_cset; }
// Number of cards marked dirty and in need of refinement.
size_t dirtied_cards() const { return _dirtied_cards; }
void inc_sweep_time(jlong t) { _sweep_duration += t; }
void inc_yield_during_sweep_duration(jlong t) { _yield_during_sweep_duration += t; }
void inc_refine_duration(jlong t) { _refine_duration += t; }
void inc_refinement_time(Tickspan t) { _refinement_time += t; }
void inc_refined_cards(size_t cards) { _refined_cards += cards; }
void inc_precleaned_cards(size_t cards) { _precleaned_cards += cards; }
void inc_dirtied_cards(size_t cards) { _dirtied_cards += cards; }
void inc_cards_scanned(size_t increment) { _cards_scanned += increment; }
void inc_cards_clean(size_t increment) { _cards_clean += increment; }
void inc_cards_not_parsable() { _cards_not_parsable++; }
void inc_cards_already_refer_to_cset() { _cards_already_refer_to_cset++; }
void inc_cards_refer_to_cset() { _cards_refer_to_cset++; }
void inc_cards_no_cross_region() { _cards_no_cross_region++; }
G1ConcurrentRefineStats& operator+=(const G1ConcurrentRefineStats& other);
G1ConcurrentRefineStats& operator-=(const G1ConcurrentRefineStats& other);
friend G1ConcurrentRefineStats operator+(G1ConcurrentRefineStats x,
const G1ConcurrentRefineStats& y) {
return x += y;
}
friend G1ConcurrentRefineStats operator-(G1ConcurrentRefineStats x,
const G1ConcurrentRefineStats& y) {
return x -= y;
}
void add_atomic(G1ConcurrentRefineStats* other);
void reset();
};
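A small worked example of the derived counters declared above, with invented numbers:

// Worked example of the derived counters (standalone, numbers made up).
#include <cstddef>
#include <cstdio>

int main() {
  size_t scanned = 1000, clean = 700;
  size_t already_to_cset = 50, refer_to_cset = 120;

  size_t not_clean = scanned - clean;                 // 300 cards were not clean
  size_t pending   = not_clean - already_to_cset;     // 250 were genuinely pending work
  size_t to_cset   = already_to_cset + refer_to_cset; // 170 end up as to-cset cards

  std::printf("not_clean=%zu pending=%zu to_cset=%zu\n", not_clean, pending, to_cset);
  return 0;
}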

@ -0,0 +1,191 @@
/*
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "gc/g1/g1CardTableClaimTable.inline.hpp"
#include "gc/g1/g1CollectedHeap.inline.hpp"
#include "gc/g1/g1ConcurrentRefineSweepTask.hpp"
class G1RefineRegionClosure : public G1HeapRegionClosure {
using CardValue = G1CardTable::CardValue;
G1RemSet* _rem_set;
G1CardTableClaimTable* _scan_state;
uint _worker_id;
size_t _num_collections_at_start;
bool has_work(G1HeapRegion* r) {
return _scan_state->has_unclaimed_cards(r->hrm_index());
}
void verify_card_pair_refers_to_same_card(CardValue* source_card, CardValue* dest_card) {
#ifdef ASSERT
G1CollectedHeap* g1h = G1CollectedHeap::heap();
G1HeapRegion* refinement_r = g1h->heap_region_containing(g1h->refinement_table()->addr_for(source_card));
G1HeapRegion* card_r = g1h->heap_region_containing(g1h->card_table()->addr_for(dest_card));
size_t refinement_i = g1h->refinement_table()->index_for_cardvalue(source_card);
size_t card_i = g1h->card_table()->index_for_cardvalue(dest_card);
assert(refinement_r == card_r, "not same region source %u (%zu) dest %u (%zu) ", refinement_r->hrm_index(), refinement_i, card_r->hrm_index(), card_i);
assert(refinement_i == card_i, "indexes are not same %zu %zu", refinement_i, card_i);
#endif
}
void do_dirty_card(CardValue* source_card, CardValue* dest_card) {
verify_card_pair_refers_to_same_card(source_card, dest_card);
G1RemSet::RefineResult res = _rem_set->refine_card_concurrently(source_card, _worker_id);
// Gather statistics based on the result.
switch (res) {
case G1RemSet::HasRefToCSet: {
*dest_card = G1CardTable::g1_to_cset_card;
_refine_stats.inc_cards_refer_to_cset();
break;
}
case G1RemSet::AlreadyToCSet: {
*dest_card = G1CardTable::g1_to_cset_card;
_refine_stats.inc_cards_already_refer_to_cset();
break;
}
case G1RemSet::NoCrossRegion: {
_refine_stats.inc_cards_no_cross_region();
break;
}
case G1RemSet::CouldNotParse: {
// Could not refine - redirty with the original value.
*dest_card = *source_card;
_refine_stats.inc_cards_not_parsable();
break;
}
case G1RemSet::HasRefToOld : break; // Nothing special to do.
}
// Clean card on source card table.
*source_card = G1CardTable::clean_card_val();
}
void do_claimed_block(CardValue* dirty_l, CardValue* dirty_r, CardValue* dest_card) {
for (CardValue* source = dirty_l; source < dirty_r; ++source, ++dest_card) {
do_dirty_card(source, dest_card);
}
}
public:
bool _completed;
G1ConcurrentRefineStats _refine_stats;
G1RefineRegionClosure(uint worker_id, G1CardTableClaimTable* scan_state) :
G1HeapRegionClosure(),
_rem_set(G1CollectedHeap::heap()->rem_set()),
_scan_state(scan_state),
_worker_id(worker_id),
_completed(true),
_refine_stats() { }
bool do_heap_region(G1HeapRegion* r) override {
if (!has_work(r)) {
return false;
}
G1CollectedHeap* g1h = G1CollectedHeap::heap();
if (r->is_young()) {
if (_scan_state->claim_all_cards(r->hrm_index()) == 0) {
// Clear the pre-dirtying information.
r->clear_refinement_table();
}
return false;
}
G1CardTable* card_table = g1h->card_table();
G1CardTable* refinement_table = g1h->refinement_table();
G1CardTableChunkClaimer claim(_scan_state, r->hrm_index());
size_t const region_card_base_idx = (size_t)r->hrm_index() << G1HeapRegion::LogCardsPerRegion;
while (claim.has_next()) {
size_t const start_idx = region_card_base_idx + claim.value();
CardValue* const start_card = refinement_table->byte_for_index(start_idx);
CardValue* const end_card = start_card + claim.size();
CardValue* dest_card = card_table->byte_for_index(start_idx);
G1ChunkScanner scanner{start_card, end_card};
size_t num_dirty_cards = 0;
scanner.on_dirty_cards([&] (CardValue* dirty_l, CardValue* dirty_r) {
jlong refine_start = os::elapsed_counter();
do_claimed_block(dirty_l, dirty_r, dest_card + pointer_delta(dirty_l, start_card, sizeof(CardValue)));
num_dirty_cards += pointer_delta(dirty_r, dirty_l, sizeof(CardValue));
_refine_stats.inc_refine_duration(os::elapsed_counter() - refine_start);
});
if (VerifyDuringGC) {
for (CardValue* i = start_card; i < end_card; ++i) {
guarantee(*i == G1CardTable::clean_card_val(), "must be");
}
}
_refine_stats.inc_cards_scanned(claim.size());
_refine_stats.inc_cards_clean(claim.size() - num_dirty_cards);
if (SuspendibleThreadSet::should_yield()) {
_completed = false;
break;
}
}
return !_completed;
}
};
G1ConcurrentRefineSweepTask::G1ConcurrentRefineSweepTask(G1CardTableClaimTable* scan_state,
G1ConcurrentRefineStats* stats,
uint max_workers) :
WorkerTask("G1 Refine Task"),
_scan_state(scan_state),
_stats(stats),
_max_workers(max_workers),
_sweep_completed(true)
{ }
void G1ConcurrentRefineSweepTask::work(uint worker_id) {
jlong start = os::elapsed_counter();
G1RefineRegionClosure sweep_cl(worker_id, _scan_state);
_scan_state->heap_region_iterate_from_worker_offset(&sweep_cl, worker_id, _max_workers);
if (!sweep_cl._completed) {
_sweep_completed = false;
}
sweep_cl._refine_stats.inc_sweep_time(os::elapsed_counter() - start);
_stats->add_atomic(&sweep_cl._refine_stats);
}
bool G1ConcurrentRefineSweepTask::sweep_completed() const { return _sweep_completed; }
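A self-contained toy model of the per-chunk sweep performed by G1RefineRegionClosure above: find dirty cards on the refinement table, "refine" them, mirror the result onto the main card table and clean the source. Card values, table size, and the assumption that every refined card stays pending are invented for illustration and omit the real refine_card_concurrently() result handling:

// Toy model of the refinement-table sweep (standalone; not G1's card encoding).
#include <array>
#include <cstddef>
#include <cstdio>

constexpr unsigned char kClean = 1; // placeholder values for the model only
constexpr unsigned char kDirty = 0;

int main() {
  std::array<unsigned char, 16> refinement_table;
  std::array<unsigned char, 16> card_table;
  refinement_table.fill(kClean);
  card_table.fill(kClean);
  // Pretend the mutator dirtied a few cards on the refinement table.
  for (size_t i : {3, 4, 5, 11}) {
    refinement_table[i] = kDirty;
  }

  size_t refined = 0;
  for (size_t i = 0; i < refinement_table.size(); ++i) {
    if (refinement_table[i] != kDirty) {
      continue;                     // clean card, nothing to do
    }
    // "Refine" the card; in this toy model every refined card is left for the
    // next GC to scan, so it stays dirty on the main card table.
    card_table[i] = kDirty;
    refinement_table[i] = kClean;   // clean the source (refinement) table
    ++refined;
  }
  std::printf("refined %zu cards\n", refined); // prints: refined 4 cards
  return 0;
}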

@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -22,20 +22,27 @@
*
*/
#ifndef SHARE_GC_SHARED_BUFFERNODELIST_HPP
#define SHARE_GC_SHARED_BUFFERNODELIST_HPP
#ifndef SHARE_GC_G1_G1CONCURRENTREFINESWEEPTASK_HPP
#define SHARE_GC_G1_G1CONCURRENTREFINESWEEPTASK_HPP
#include "utilities/globalDefinitions.hpp"
#include "gc/g1/g1ConcurrentRefineStats.hpp"
#include "gc/shared/workerThread.hpp"
class BufferNode;
class G1CardTableClaimTable;
struct BufferNodeList {
BufferNode* _head; // First node in list or null if empty.
BufferNode* _tail; // Last node in list or null if empty.
size_t _entry_count; // Sum of entries in nodes in list.
class G1ConcurrentRefineSweepTask : public WorkerTask {
G1CardTableClaimTable* _scan_state;
G1ConcurrentRefineStats* _stats;
uint _max_workers;
bool _sweep_completed;
BufferNodeList();
BufferNodeList(BufferNode* head, BufferNode* tail, size_t entry_count);
public:
G1ConcurrentRefineSweepTask(G1CardTableClaimTable* scan_state, G1ConcurrentRefineStats* stats, uint max_workers);
void work(uint worker_id) override;
bool sweep_completed() const;
};
#endif // SHARE_GC_SHARED_BUFFERNODELIST_HPP
#endif /* SHARE_GC_G1_G1CONCURRENTREFINESWEEPTASK_HPP */

@ -23,10 +23,13 @@
*/
#include "gc/g1/g1BarrierSet.hpp"
#include "gc/g1/g1CardTableClaimTable.inline.hpp"
#include "gc/g1/g1CollectedHeap.inline.hpp"
#include "gc/g1/g1ConcurrentRefine.hpp"
#include "gc/g1/g1ConcurrentRefineStats.hpp"
#include "gc/g1/g1ConcurrentRefineSweepTask.hpp"
#include "gc/g1/g1ConcurrentRefineThread.hpp"
#include "gc/g1/g1DirtyCardQueue.hpp"
#include "gc/shared/gcTraceTime.inline.hpp"
#include "gc/shared/suspendibleThreadSet.hpp"
#include "logging/log.hpp"
#include "runtime/cpuTimeCounters.hpp"
@ -38,60 +41,61 @@
#include "utilities/globalDefinitions.hpp"
#include "utilities/ticks.hpp"
G1ConcurrentRefineThread::G1ConcurrentRefineThread(G1ConcurrentRefine* cr, uint worker_id) :
G1ConcurrentRefineThread::G1ConcurrentRefineThread(G1ConcurrentRefine* cr) :
ConcurrentGCThread(),
_notifier(Mutex::nosafepoint, FormatBuffer<>("G1 Refine#%d", worker_id), true),
_notifier(Mutex::nosafepoint, "G1 Refine Control", true),
_requested_active(false),
_refinement_stats(),
_worker_id(worker_id),
_cr(cr)
{
// set name
set_name("G1 Refine#%d", worker_id);
set_name("G1 Refine Control");
}
void G1ConcurrentRefineThread::run_service() {
while (wait_for_completed_buffers()) {
while (wait_for_work()) {
SuspendibleThreadSetJoiner sts_join;
G1ConcurrentRefineStats active_stats_start = _refinement_stats;
report_active("Activated");
while (!should_terminate()) {
if (sts_join.should_yield()) {
report_inactive("Paused", _refinement_stats - active_stats_start);
report_inactive("Paused");
sts_join.yield();
// Reset after yield rather than accumulating across yields, else a
// very long running thread could overflow.
active_stats_start = _refinement_stats;
report_active("Resumed");
} else if (maybe_deactivate()) {
break;
}
// Check whether we want to do refinement. If we don't, then do not do any refinement
// this round. This thread may have just woken up but no threads are currently
// needed, which is common. In this case we want to just go back to
// waiting, with a minimum of fuss; in particular, don't do any "premature"
// refinement. However, adjustment may be pending but temporarily
// blocked. In that case we wait for adjustment to succeed.
Ticks adjust_start = Ticks::now();
if (cr()->adjust_num_threads_periodically()) {
GCTraceTime(Info, gc, refine) tm("Concurrent Refine Cycle");
do_refinement();
} else {
do_refinement_step();
log_debug(gc, refine)("Concurrent Refine Adjust Only (#threads wanted: %u adjustment_needed: %s wait_for_heap_lock: %s) %.2fms",
cr()->num_threads_wanted(),
BOOL_TO_STR(cr()->is_thread_adjustment_needed()),
BOOL_TO_STR(cr()->heap_was_locked()),
(Ticks::now() - adjust_start).seconds() * MILLIUNITS);
deactivate();
break;
}
}
report_inactive("Deactivated", _refinement_stats - active_stats_start);
report_inactive("Deactivated");
update_perf_counter_cpu_time();
}
log_debug(gc, refine)("Stopping %d", _worker_id);
log_debug(gc, refine)("Stopping %s", name());
}
void G1ConcurrentRefineThread::report_active(const char* reason) const {
log_trace(gc, refine)("%s worker %u, current: %zu",
reason,
_worker_id,
G1BarrierSet::dirty_card_queue_set().num_cards());
log_trace(gc, refine)("%s active (%s)", name(), reason);
}
void G1ConcurrentRefineThread::report_inactive(const char* reason,
const G1ConcurrentRefineStats& stats) const {
log_trace(gc, refine)
("%s worker %u, cards: %zu, refined %zu, rate %1.2fc/ms",
reason,
_worker_id,
G1BarrierSet::dirty_card_queue_set().num_cards(),
stats.refined_cards(),
stats.refinement_rate_ms());
void G1ConcurrentRefineThread::report_inactive(const char* reason) const {
log_trace(gc, refine)("%s inactive (%s)", name(), reason);
}
void G1ConcurrentRefineThread::activate() {
@ -103,21 +107,12 @@ void G1ConcurrentRefineThread::activate() {
}
}
bool G1ConcurrentRefineThread::maybe_deactivate() {
bool G1ConcurrentRefineThread::deactivate() {
assert(this == Thread::current(), "precondition");
if (cr()->is_thread_wanted(_worker_id)) {
return false;
} else {
MutexLocker ml(&_notifier, Mutex::_no_safepoint_check_flag);
bool requested = _requested_active;
_requested_active = false;
return !requested; // Deactivate only if not recently requested active.
}
}
bool G1ConcurrentRefineThread::try_refinement_step(size_t stop_at) {
assert(this == Thread::current(), "precondition");
return _cr->try_refinement_step(_worker_id, stop_at, &_refinement_stats);
MutexLocker ml(&_notifier, Mutex::_no_safepoint_check_flag);
bool requested = _requested_active;
_requested_active = false;
return !requested; // Deactivate only if not recently requested active.
}
void G1ConcurrentRefineThread::stop_service() {
@ -128,23 +123,9 @@ jlong G1ConcurrentRefineThread::cpu_time() {
return os::thread_cpu_time(this);
}
// The (single) primary thread drives the controller for the refinement threads.
class G1PrimaryConcurrentRefineThread final : public G1ConcurrentRefineThread {
bool wait_for_completed_buffers() override;
bool maybe_deactivate() override;
void do_refinement_step() override;
// Updates jstat cpu usage for all refinement threads.
void update_perf_counter_cpu_time() override;
public:
G1PrimaryConcurrentRefineThread(G1ConcurrentRefine* cr) :
G1ConcurrentRefineThread(cr, 0)
{}
};
// When inactive, the primary thread periodically wakes up and requests
// adjustment of the number of active refinement threads.
bool G1PrimaryConcurrentRefineThread::wait_for_completed_buffers() {
// When inactive, the control thread periodically wakes up to check if there is
// refinement work pending.
bool G1ConcurrentRefineThread::wait_for_work() {
assert(this == Thread::current(), "precondition");
MonitorLocker ml(notifier(), Mutex::_no_safepoint_check_flag);
if (!requested_active() && !should_terminate()) {
@ -157,78 +138,115 @@ bool G1PrimaryConcurrentRefineThread::wait_for_completed_buffers() {
return !should_terminate();
}
bool G1PrimaryConcurrentRefineThread::maybe_deactivate() {
// Don't deactivate while needing to adjust the number of active threads.
return !cr()->is_thread_adjustment_needed() &&
G1ConcurrentRefineThread::maybe_deactivate();
void G1ConcurrentRefineThread::do_refinement() {
G1ConcurrentRefineSweepState& state = _cr->sweep_state();
state.start_work();
// Swap card tables.
// 1. Global card table
if (!state.swap_global_card_table()) {
log_debug(gc, refine)("GC pause after Global Card Table Swap");
return;
}
// 2. Java threads
if (!state.swap_java_threads_ct()) {
log_debug(gc, refine)("GC pause after Java Thread CT swap");
return;
}
// 3. GC threads
if (!state.swap_gc_threads_ct()) {
log_debug(gc, refine)("GC pause after GC Thread CT swap");
return;
}
G1CollectedHeap* g1h = G1CollectedHeap::heap();
jlong epoch_yield_duration = g1h->yield_duration_in_refinement_epoch();
jlong next_epoch_start = os::elapsed_counter();
jlong total_yield_during_sweep_duration = 0;
// 4. Snapshot heap.
state.snapshot_heap();
// 5. Sweep refinement table until done
bool interrupted_by_gc = false;
log_info(gc, task)("Concurrent Refine Sweep Using %u of %u Workers", _cr->num_threads_wanted(), _cr->max_num_threads());
state.sweep_refinement_table_start();
while (true) {
bool completed = state.sweep_refinement_table_step();
if (completed) {
break;
}
if (SuspendibleThreadSet::should_yield()) {
jlong yield_during_sweep_start = os::elapsed_counter();
SuspendibleThreadSet::yield();
// The yielding may have completed the task, check.
if (!state.is_in_progress()) {
log_debug(gc, refine)("GC completed sweeping, aborting concurrent operation");
interrupted_by_gc = true;
break;
} else {
jlong yield_during_sweep_duration = os::elapsed_counter() - yield_during_sweep_start;
log_debug(gc, refine)("Yielded from card table sweeping for %.2fms, no GC inbetween, continue",
TimeHelper::counter_to_millis(yield_during_sweep_duration));
total_yield_during_sweep_duration += yield_during_sweep_duration;
}
}
}
if (!interrupted_by_gc) {
GCTraceTime(Info, gc, refine) tm("Concurrent Refine Complete Work");
state.add_yield_during_sweep_duration(total_yield_during_sweep_duration);
state.complete_work(true);
G1CollectedHeap* g1h = G1CollectedHeap::heap();
G1Policy* policy = g1h->policy();
G1ConcurrentRefineStats* stats = state.stats();
policy->record_refinement_stats(stats);
{
// The young gen revising mechanism reads the predictor and the values set
// here. Avoid inconsistencies by locking.
MutexLocker x(G1ReviseYoungLength_lock, Mutex::_no_safepoint_check_flag);
policy->record_dirtying_stats(TimeHelper::counter_to_millis(G1CollectedHeap::heap()->last_refinement_epoch_start()),
TimeHelper::counter_to_millis(next_epoch_start),
stats->cards_pending(),
TimeHelper::counter_to_millis(epoch_yield_duration),
0 /* pending_cards_from_gc */,
stats->cards_to_cset());
G1CollectedHeap::heap()->set_last_refinement_epoch_start(next_epoch_start, epoch_yield_duration);
}
stats->reset();
}
}
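// Minimal sketch of the sweep loop shape above: perform bounded steps, yield
// when a safepoint is requested, and abort if the GC finished the sweep while
// this thread was yielded. The function-pointer parameters are stand-ins for
// the G1ConcurrentRefineSweepState and SuspendibleThreadSet calls used by the
// real code.
static bool sweep_until_done_or_aborted(bool (*do_step)(),          // true when the table is fully swept
                                        bool (*should_yield)(),
                                        void (*yield)(),
                                        bool (*still_in_progress)()) {
  while (true) {
    if (do_step()) {
      return true;                       // swept everything concurrently
    }
    if (should_yield()) {
      yield();                           // let the safepoint proceed
      if (!still_in_progress()) {
        return false;                    // a GC pause completed the work
      }
    }
  }
}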
void G1PrimaryConcurrentRefineThread::do_refinement_step() {
// Try adjustment first. If it succeeds then don't do any refinement this
// round. This thread may have just woken up but no threads are currently
// needed, which is common. In this case we want to just go back to
// waiting, with a minimum of fuss; in particular, don't do any "premature"
// refinement. However, adjustment may be pending but temporarily
// blocked. In that case we *do* try refinement, rather than possibly
// uselessly spinning while waiting for adjustment to succeed.
if (!cr()->adjust_threads_periodically()) {
// No adjustment, so try refinement, with the target as a cutoff.
if (!try_refinement_step(cr()->pending_cards_target())) {
// Refinement was cut off, so proceed with fewer threads.
cr()->reduce_threads_wanted();
void G1ConcurrentRefineThread::update_perf_counter_cpu_time() {
// The control thread is responsible for updating the CPU time for all workers.
if (UsePerfData) {
{
ThreadTotalCPUTimeClosure tttc(CPUTimeGroups::CPUTimeType::gc_conc_refine);
cr()->worker_threads_do(&tttc);
}
{
ThreadTotalCPUTimeClosure tttc(CPUTimeGroups::CPUTimeType::gc_conc_refine_control);
cr()->control_thread_do(&tttc);
}
}
}
void G1PrimaryConcurrentRefineThread::update_perf_counter_cpu_time() {
if (UsePerfData) {
ThreadTotalCPUTimeClosure tttc(CPUTimeGroups::CPUTimeType::gc_conc_refine);
cr()->threads_do(&tttc);
}
}
class G1SecondaryConcurrentRefineThread final : public G1ConcurrentRefineThread {
bool wait_for_completed_buffers() override;
void do_refinement_step() override;
void update_perf_counter_cpu_time() override { /* Nothing to do. The primary thread does all the work. */ }
public:
G1SecondaryConcurrentRefineThread(G1ConcurrentRefine* cr, uint worker_id) :
G1ConcurrentRefineThread(cr, worker_id)
{
assert(worker_id > 0, "precondition");
}
};
bool G1SecondaryConcurrentRefineThread::wait_for_completed_buffers() {
assert(this == Thread::current(), "precondition");
MonitorLocker ml(notifier(), Mutex::_no_safepoint_check_flag);
while (!requested_active() && !should_terminate()) {
ml.wait();
}
return !should_terminate();
}
void G1SecondaryConcurrentRefineThread::do_refinement_step() {
assert(this == Thread::current(), "precondition");
// Secondary threads ignore the target and just drive the number of pending
// dirty cards down. The primary thread is responsible for noticing the
// target has been reached and reducing the number of wanted threads. This
// makes the control of wanted threads all under the primary, while avoiding
// useless spinning by secondary threads until the primary thread notices.
// (Useless spinning is still possible if there are no pending cards, but
// that should rarely happen.)
try_refinement_step(0);
}
G1ConcurrentRefineThread*
G1ConcurrentRefineThread::create(G1ConcurrentRefine* cr, uint worker_id) {
G1ConcurrentRefineThread* crt;
if (worker_id == 0) {
crt = new (std::nothrow) G1PrimaryConcurrentRefineThread(cr);
} else {
crt = new (std::nothrow) G1SecondaryConcurrentRefineThread(cr, worker_id);
}
G1ConcurrentRefineThread* G1ConcurrentRefineThread::create(G1ConcurrentRefine* cr) {
G1ConcurrentRefineThread* crt = new (std::nothrow) G1ConcurrentRefineThread(cr);
if (crt != nullptr) {
crt->create_and_start();
}


@ -33,8 +33,8 @@
// Forward Decl.
class G1ConcurrentRefine;
// One or more G1 Concurrent Refinement Threads may be active if concurrent
// refinement is in progress.
// Concurrent refinement control thread watching card mark accrual on the card table
// and starting refinement work.
class G1ConcurrentRefineThread: public ConcurrentGCThread {
friend class VMStructs;
friend class G1CollectedHeap;
@ -42,43 +42,34 @@ class G1ConcurrentRefineThread: public ConcurrentGCThread {
Monitor _notifier;
bool _requested_active;
G1ConcurrentRefineStats _refinement_stats;
uint _worker_id;
G1ConcurrentRefine* _cr;
NONCOPYABLE(G1ConcurrentRefineThread);
protected:
G1ConcurrentRefineThread(G1ConcurrentRefine* cr, uint worker_id);
G1ConcurrentRefineThread(G1ConcurrentRefine* cr);
Monitor* notifier() { return &_notifier; }
bool requested_active() const { return _requested_active; }
// Returns !should_terminate().
// precondition: this is the current thread.
virtual bool wait_for_completed_buffers() = 0;
bool wait_for_work();
// Deactivate if appropriate. Returns true if deactivated.
// precondition: this is the current thread.
virtual bool maybe_deactivate();
bool deactivate();
// Attempt to do some refinement work.
// precondition: this is the current thread.
virtual void do_refinement_step() = 0;
// Swap card table and do a complete re-examination/refinement pass over the
// refinement table.
void do_refinement();
// Update concurrent refine threads cpu time stats.
virtual void update_perf_counter_cpu_time() = 0;
// Helper for do_refinement_step implementations. Try to perform some
// refinement work, limited by stop_at. Returns true if any refinement work
// was performed, false if no work available per stop_at.
// precondition: this is the current thread.
bool try_refinement_step(size_t stop_at);
void update_perf_counter_cpu_time();
void report_active(const char* reason) const;
void report_inactive(const char* reason, const G1ConcurrentRefineStats& stats) const;
void report_inactive(const char* reason) const;
G1ConcurrentRefine* cr() const { return _cr; }
@ -86,23 +77,12 @@ protected:
void stop_service() override;
public:
static G1ConcurrentRefineThread* create(G1ConcurrentRefine* cr, uint worker_id);
virtual ~G1ConcurrentRefineThread() = default;
uint worker_id() const { return _worker_id; }
static G1ConcurrentRefineThread* create(G1ConcurrentRefine* cr);
// Activate this thread.
// precondition: this is not the current thread.
void activate();
G1ConcurrentRefineStats* refinement_stats() {
return &_refinement_stats;
}
const G1ConcurrentRefineStats* refinement_stats() const {
return &_refinement_stats;
}
// Total cpu time spent in this thread so far.
jlong cpu_time();
};


@ -45,48 +45,22 @@ G1ConcurrentRefineThreadsNeeded::G1ConcurrentRefineThreadsNeeded(G1Policy* polic
//
// 1. Minimize the number of refinement threads running at once.
//
// 2. Minimize the number of activations and deactivations for the
// refinement threads that run.
//
// 3. Delay performing refinement work. Having more dirty cards waiting to
// 2. Delay performing refinement work. Having more dirty cards waiting to
// be refined can be beneficial, as further writes to the same card don't
// create more work.
void G1ConcurrentRefineThreadsNeeded::update(uint active_threads,
size_t available_bytes,
size_t num_cards,
size_t target_num_cards) {
_predicted_time_until_next_gc_ms = _policy->predict_time_to_next_gc_ms(available_bytes);
// Estimate number of cards that need to be processed before next GC.
const G1Analytics* analytics = _policy->analytics();
// Estimate time until next GC, based on remaining bytes available for
// allocation and the allocation rate.
double alloc_region_rate = analytics->predict_alloc_rate_ms();
double alloc_bytes_rate = alloc_region_rate * G1HeapRegion::GrainBytes;
if (alloc_bytes_rate == 0.0) {
// A zero rate indicates we don't yet have data to use for predictions.
// Since we don't have any idea how long until the next GC, use a time of
// zero.
_predicted_time_until_next_gc_ms = 0.0;
} else {
// If the heap size is large and the allocation rate is small, we can get
// a predicted time until next GC that is so large it can cause problems
// (such as overflow) in other calculations. Limit the prediction to one
// hour, which is still large in this context.
const double one_hour_ms = 60.0 * 60.0 * MILLIUNITS;
double raw_time_ms = available_bytes / alloc_bytes_rate;
_predicted_time_until_next_gc_ms = MIN2(raw_time_ms, one_hour_ms);
}
double incoming_rate = analytics->predict_dirtied_cards_rate_ms();
double raw_cards = incoming_rate * _predicted_time_until_next_gc_ms;
size_t incoming_cards = static_cast<size_t>(raw_cards);
// Estimate number of cards that need to be processed before next GC. There
// are no incoming cards when time is short, because in that case the
// controller activates refinement by mutator threads to stay on target even
// if threads deactivate in the meantime. This also covers the case of not
// having a real prediction of time until GC.
size_t incoming_cards = 0;
if (_predicted_time_until_next_gc_ms > _update_period_ms) {
double incoming_rate = analytics->predict_dirtied_cards_rate_ms();
double raw_cards = incoming_rate * _predicted_time_until_next_gc_ms;
incoming_cards = static_cast<size_t>(raw_cards);
}
size_t total_cards = num_cards + incoming_cards;
_predicted_cards_at_next_gc = total_cards;
@ -100,9 +74,8 @@ void G1ConcurrentRefineThreadsNeeded::update(uint active_threads,
// The calculation of the number of threads needed isn't very stable when
// time is short, and can lead to starting up lots of threads for not much
// profit. If we're in the last update period, don't change the number of
// threads running, other than to treat the current thread as running. That
// might not be sufficient, but hopefully we were already reasonably close.
// We won't accumulate more because mutator refinement will be activated.
// threads needed. That might not be sufficient, but hopefully we were
// already reasonably close.
if (_predicted_time_until_next_gc_ms <= _update_period_ms) {
_threads_needed = MAX2(active_threads, 1u);
return;
@ -133,11 +106,12 @@ void G1ConcurrentRefineThreadsNeeded::update(uint active_threads,
// close to the next GC we want to drive toward the target, so round up
// then. The rest of the time we round to nearest, trying to remain near
// the middle of the range.
double rthreads = nthreads;
if (_predicted_time_until_next_gc_ms <= _update_period_ms * 5.0) {
nthreads = ::ceil(nthreads);
rthreads = ::ceil(nthreads);
} else {
nthreads = ::round(nthreads);
rthreads = ::round(nthreads);
}
_threads_needed = static_cast<uint>(MIN2<size_t>(nthreads, UINT_MAX));
_threads_needed = static_cast<uint>(MIN2<size_t>(rthreads, UINT_MAX));
}
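// Standalone sketch of the prediction arithmetic above, with illustrative
// names and numbers rather than the HotSpot types or tuning values.
#include <algorithm>
#include <cstddef>

static double predict_time_to_next_gc_ms(double available_bytes,
                                         double alloc_bytes_per_ms) {
  if (alloc_bytes_per_ms == 0.0) {
    return 0.0;                                  // no allocation data yet
  }
  const double one_hour_ms = 60.0 * 60.0 * 1000.0;
  return std::min(available_bytes / alloc_bytes_per_ms, one_hour_ms);
}

static size_t predict_cards_at_next_gc(size_t pending_cards,
                                       double dirtied_cards_per_ms,
                                       double time_to_gc_ms,
                                       double update_period_ms) {
  size_t incoming = 0;
  // Project incoming cards only when there is meaningful time left; close to
  // the GC, mutator refinement keeps the count near the target.
  if (time_to_gc_ms > update_period_ms) {
    incoming = static_cast<size_t>(dirtied_cards_per_ms * time_to_gc_ms);
  }
  return pending_cards + incoming;
}
// Example: 512 MB free at 1 MB/ms gives ~512 ms to the next GC; at
// 100 dirtied cards/ms with a 200 ms update period and 40000 cards already
// pending, the prediction is 40000 + 51200 = 91200 cards at the next GC.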


@ -1,599 +0,0 @@
/*
* Copyright (c) 2001, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "gc/g1/g1BarrierSet.inline.hpp"
#include "gc/g1/g1CardTableEntryClosure.hpp"
#include "gc/g1/g1CollectedHeap.inline.hpp"
#include "gc/g1/g1ConcurrentRefineStats.hpp"
#include "gc/g1/g1ConcurrentRefineThread.hpp"
#include "gc/g1/g1DirtyCardQueue.hpp"
#include "gc/g1/g1FreeIdSet.hpp"
#include "gc/g1/g1HeapRegionRemSet.inline.hpp"
#include "gc/g1/g1RedirtyCardsQueue.hpp"
#include "gc/g1/g1RemSet.hpp"
#include "gc/g1/g1ThreadLocalData.hpp"
#include "gc/shared/bufferNode.hpp"
#include "gc/shared/bufferNodeList.hpp"
#include "gc/shared/suspendibleThreadSet.hpp"
#include "memory/iterator.hpp"
#include "runtime/atomicAccess.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/mutex.hpp"
#include "runtime/mutexLocker.hpp"
#include "runtime/os.hpp"
#include "runtime/safepoint.hpp"
#include "runtime/threads.hpp"
#include "runtime/threadSMR.hpp"
#include "utilities/globalCounter.inline.hpp"
#include "utilities/macros.hpp"
#include "utilities/nonblockingQueue.inline.hpp"
#include "utilities/pair.hpp"
#include "utilities/quickSort.hpp"
#include "utilities/ticks.hpp"
G1DirtyCardQueue::G1DirtyCardQueue(G1DirtyCardQueueSet* qset) :
PtrQueue(qset),
_refinement_stats(new G1ConcurrentRefineStats())
{ }
G1DirtyCardQueue::~G1DirtyCardQueue() {
delete _refinement_stats;
}
// Assumed to be zero by concurrent threads.
static uint par_ids_start() { return 0; }
G1DirtyCardQueueSet::G1DirtyCardQueueSet(BufferNode::Allocator* allocator) :
PtrQueueSet(allocator),
_num_cards(0),
_mutator_refinement_threshold(SIZE_MAX),
_completed(),
_paused(),
_free_ids(par_ids_start(), num_par_ids()),
_detached_refinement_stats()
{}
G1DirtyCardQueueSet::~G1DirtyCardQueueSet() {
abandon_completed_buffers();
}
// Determines how many mutator threads can process the buffers in parallel.
uint G1DirtyCardQueueSet::num_par_ids() {
return (uint)os::initial_active_processor_count();
}
void G1DirtyCardQueueSet::flush_queue(G1DirtyCardQueue& queue) {
if (queue.buffer() != nullptr) {
G1ConcurrentRefineStats* stats = queue.refinement_stats();
stats->inc_dirtied_cards(queue.size());
}
PtrQueueSet::flush_queue(queue);
}
void G1DirtyCardQueueSet::enqueue(G1DirtyCardQueue& queue,
volatile CardValue* card_ptr) {
CardValue* value = const_cast<CardValue*>(card_ptr);
if (!try_enqueue(queue, value)) {
handle_zero_index(queue);
retry_enqueue(queue, value);
}
}
void G1DirtyCardQueueSet::handle_zero_index(G1DirtyCardQueue& queue) {
assert(queue.index() == 0, "precondition");
BufferNode* old_node = exchange_buffer_with_new(queue);
if (old_node != nullptr) {
assert(old_node->index() == 0, "invariant");
G1ConcurrentRefineStats* stats = queue.refinement_stats();
stats->inc_dirtied_cards(old_node->capacity());
handle_completed_buffer(old_node, stats);
}
}
void G1DirtyCardQueueSet::handle_zero_index_for_thread(Thread* t) {
G1DirtyCardQueue& queue = G1ThreadLocalData::dirty_card_queue(t);
G1BarrierSet::dirty_card_queue_set().handle_zero_index(queue);
}
size_t G1DirtyCardQueueSet::num_cards() const {
return AtomicAccess::load(&_num_cards);
}
void G1DirtyCardQueueSet::enqueue_completed_buffer(BufferNode* cbn) {
assert(cbn != nullptr, "precondition");
// Increment _num_cards before adding to queue, so queue removal doesn't
// need to deal with _num_cards possibly going negative.
AtomicAccess::add(&_num_cards, cbn->size());
// Perform push in CS. The old tail may be popped while the push is
// observing it (attaching it to the new buffer). We need to ensure it
// can't be reused until the push completes, to avoid ABA problems.
GlobalCounter::CriticalSection cs(Thread::current());
_completed.push(*cbn);
}
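// Standalone sketch of the accounting order above: the producer adds to the
// shared counter before publishing the buffer, and the consumer subtracts
// only after removing one, so the counter is never observed going negative.
#include <atomic>
#include <cstddef>

static std::atomic<size_t> pending_cards_sketch{0};

static void producer_publish(size_t cards_in_buffer /*, buffer */) {
  pending_cards_sketch.fetch_add(cards_in_buffer);  // 1. account first
  // 2. ...then push the buffer onto the shared queue.
}

static void consumer_take(size_t cards_in_buffer /*, buffer */) {
  // 1. ...pop the buffer from the shared queue first.
  pending_cards_sketch.fetch_sub(cards_in_buffer);  // 2. then un-account it
}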
// Thread-safe attempt to remove and return the first buffer from
// the _completed queue, using the NonblockingQueue::try_pop() underneath.
// It has a limitation that it may return null when there are objects
// in the queue if there is a concurrent push/append operation.
BufferNode* G1DirtyCardQueueSet::dequeue_completed_buffer() {
Thread* current_thread = Thread::current();
BufferNode* result = nullptr;
while (true) {
// Use GlobalCounter critical section to avoid ABA problem.
// The release of a buffer to its allocator's free list uses
// GlobalCounter::write_synchronize() to coordinate with this
// dequeuing operation.
// We use a CS per iteration, rather than over the whole loop,
// because we're not guaranteed to make progress. Lingering in
// one CS could defer releasing buffer to the free list for reuse,
// leading to excessive allocations.
GlobalCounter::CriticalSection cs(current_thread);
if (_completed.try_pop(&result)) return result;
}
}
BufferNode* G1DirtyCardQueueSet::get_completed_buffer() {
BufferNode* result = dequeue_completed_buffer();
if (result == nullptr) { // Unlikely if no paused buffers.
enqueue_previous_paused_buffers();
result = dequeue_completed_buffer();
if (result == nullptr) return nullptr;
}
AtomicAccess::sub(&_num_cards, result->size());
return result;
}
#ifdef ASSERT
void G1DirtyCardQueueSet::verify_num_cards() const {
size_t actual = 0;
for (BufferNode* cur = _completed.first();
!_completed.is_end(cur);
cur = cur->next()) {
actual += cur->size();
}
assert(actual == AtomicAccess::load(&_num_cards),
"Num entries in completed buffers should be %zu but are %zu",
AtomicAccess::load(&_num_cards), actual);
}
#endif // ASSERT
G1DirtyCardQueueSet::PausedBuffers::PausedList::PausedList() :
_head(nullptr), _tail(nullptr),
_safepoint_id(SafepointSynchronize::safepoint_id())
{}
#ifdef ASSERT
G1DirtyCardQueueSet::PausedBuffers::PausedList::~PausedList() {
assert(AtomicAccess::load(&_head) == nullptr, "precondition");
assert(_tail == nullptr, "precondition");
}
#endif // ASSERT
bool G1DirtyCardQueueSet::PausedBuffers::PausedList::is_next() const {
assert_not_at_safepoint();
return _safepoint_id == SafepointSynchronize::safepoint_id();
}
void G1DirtyCardQueueSet::PausedBuffers::PausedList::add(BufferNode* node) {
assert_not_at_safepoint();
assert(is_next(), "precondition");
BufferNode* old_head = AtomicAccess::xchg(&_head, node);
if (old_head == nullptr) {
assert(_tail == nullptr, "invariant");
_tail = node;
} else {
node->set_next(old_head);
}
}
G1DirtyCardQueueSet::HeadTail G1DirtyCardQueueSet::PausedBuffers::PausedList::take() {
BufferNode* head = AtomicAccess::load(&_head);
BufferNode* tail = _tail;
AtomicAccess::store(&_head, (BufferNode*)nullptr);
_tail = nullptr;
return HeadTail(head, tail);
}
G1DirtyCardQueueSet::PausedBuffers::PausedBuffers() : _plist(nullptr) {}
#ifdef ASSERT
G1DirtyCardQueueSet::PausedBuffers::~PausedBuffers() {
assert(AtomicAccess::load(&_plist) == nullptr, "invariant");
}
#endif // ASSERT
void G1DirtyCardQueueSet::PausedBuffers::add(BufferNode* node) {
assert_not_at_safepoint();
PausedList* plist = AtomicAccess::load_acquire(&_plist);
if (plist == nullptr) {
// Try to install a new next list.
plist = new PausedList();
PausedList* old_plist = AtomicAccess::cmpxchg(&_plist, (PausedList*)nullptr, plist);
if (old_plist != nullptr) {
// Some other thread installed a new next list. Use it instead.
delete plist;
plist = old_plist;
}
}
assert(plist->is_next(), "invariant");
plist->add(node);
}
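// Standalone sketch of the install-or-discard pattern above, using
// std::atomic instead of AtomicAccess; ListSketch is an illustrative type.
#include <atomic>

struct ListSketch { /* ... */ };

static ListSketch* get_or_install(std::atomic<ListSketch*>& slot) {
  ListSketch* list = slot.load(std::memory_order_acquire);
  if (list == nullptr) {
    ListSketch* fresh = new ListSketch();
    ListSketch* expected = nullptr;
    if (slot.compare_exchange_strong(expected, fresh)) {
      list = fresh;             // we won the race and installed our list
    } else {
      delete fresh;             // another thread installed one first...
      list = expected;          // ...use the winner's list instead
    }
  }
  return list;
}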
G1DirtyCardQueueSet::HeadTail G1DirtyCardQueueSet::PausedBuffers::take_previous() {
assert_not_at_safepoint();
PausedList* previous;
{
// Deal with plist in a critical section, to prevent it from being
// deleted out from under us by a concurrent take_previous().
GlobalCounter::CriticalSection cs(Thread::current());
previous = AtomicAccess::load_acquire(&_plist);
if ((previous == nullptr) || // Nothing to take.
previous->is_next() || // Not from a previous safepoint.
// Some other thread stole it.
(AtomicAccess::cmpxchg(&_plist, previous, (PausedList*)nullptr) != previous)) {
return HeadTail();
}
}
// We now own previous.
HeadTail result = previous->take();
// There might be other threads examining previous (in concurrent
// take_previous()). Synchronize to wait until any such threads are
// done with such examination before deleting.
GlobalCounter::write_synchronize();
delete previous;
return result;
}
G1DirtyCardQueueSet::HeadTail G1DirtyCardQueueSet::PausedBuffers::take_all() {
assert_at_safepoint();
HeadTail result;
PausedList* plist = AtomicAccess::load(&_plist);
if (plist != nullptr) {
AtomicAccess::store(&_plist, (PausedList*)nullptr);
result = plist->take();
delete plist;
}
return result;
}
void G1DirtyCardQueueSet::record_paused_buffer(BufferNode* node) {
assert_not_at_safepoint();
assert(node->next() == nullptr, "precondition");
// Ensure there aren't any paused buffers from a previous safepoint.
enqueue_previous_paused_buffers();
// Cards for paused buffers are included in count, to contribute to
// notification checking after the coming safepoint if it doesn't GC.
// Note that this means the queue's _num_cards differs from the number
// of cards in the queued buffers when there are paused buffers.
AtomicAccess::add(&_num_cards, node->size());
_paused.add(node);
}
void G1DirtyCardQueueSet::enqueue_paused_buffers_aux(const HeadTail& paused) {
if (paused._head != nullptr) {
assert(paused._tail != nullptr, "invariant");
// Cards from paused buffers are already recorded in the queue count.
_completed.append(*paused._head, *paused._tail);
}
}
void G1DirtyCardQueueSet::enqueue_previous_paused_buffers() {
assert_not_at_safepoint();
enqueue_paused_buffers_aux(_paused.take_previous());
}
void G1DirtyCardQueueSet::enqueue_all_paused_buffers() {
assert_at_safepoint();
enqueue_paused_buffers_aux(_paused.take_all());
}
void G1DirtyCardQueueSet::abandon_completed_buffers() {
BufferNodeList list = take_all_completed_buffers();
BufferNode* buffers_to_delete = list._head;
while (buffers_to_delete != nullptr) {
BufferNode* bn = buffers_to_delete;
buffers_to_delete = bn->next();
bn->set_next(nullptr);
deallocate_buffer(bn);
}
}
// Merge lists of buffers. The source queue set is emptied as a
// result. The queue sets must share the same allocator.
void G1DirtyCardQueueSet::merge_bufferlists(G1RedirtyCardsQueueSet* src) {
assert(allocator() == src->allocator(), "precondition");
const BufferNodeList from = src->take_all_completed_buffers();
if (from._head != nullptr) {
AtomicAccess::add(&_num_cards, from._entry_count);
_completed.append(*from._head, *from._tail);
}
}
BufferNodeList G1DirtyCardQueueSet::take_all_completed_buffers() {
enqueue_all_paused_buffers();
verify_num_cards();
Pair<BufferNode*, BufferNode*> pair = _completed.take_all();
size_t num_cards = AtomicAccess::load(&_num_cards);
AtomicAccess::store(&_num_cards, size_t(0));
return BufferNodeList(pair.first, pair.second, num_cards);
}
class G1RefineBufferedCards : public StackObj {
BufferNode* const _node;
CardTable::CardValue** const _node_buffer;
const size_t _node_buffer_capacity;
const uint _worker_id;
G1ConcurrentRefineStats* _stats;
G1RemSet* const _g1rs;
static inline ptrdiff_t compare_cards(const CardTable::CardValue* p1,
const CardTable::CardValue* p2) {
return p2 - p1;
}
// Sorts the cards from start_index to _node_buffer_capacity in *decreasing*
// address order. Tests showed that this order is preferable to not sorting
// or increasing address order.
void sort_cards(size_t start_index) {
QuickSort::sort(&_node_buffer[start_index],
_node_buffer_capacity - start_index,
compare_cards);
}
// Returns the index to the first clean card in the buffer.
size_t clean_cards() {
const size_t start = _node->index();
assert(start <= _node_buffer_capacity, "invariant");
// Two-fingered compaction algorithm similar to the filtering mechanism in
// SATBMarkQueue. The main difference is that clean_card_before_refine()
// could change the buffer element in-place.
// We don't check for SuspendibleThreadSet::should_yield(), because
// cleaning and redirtying the cards is fast.
CardTable::CardValue** src = &_node_buffer[start];
CardTable::CardValue** dst = &_node_buffer[_node_buffer_capacity];
assert(src <= dst, "invariant");
for ( ; src < dst; ++src) {
// Search low to high for a card to keep.
if (_g1rs->clean_card_before_refine(src)) {
// Found keeper. Search high to low for a card to discard.
while (src < --dst) {
if (!_g1rs->clean_card_before_refine(dst)) {
*dst = *src; // Replace discard with keeper.
break;
}
}
// If discard search failed (src == dst), the outer loop will also end.
}
}
// dst points to the first retained clean card, or the end of the buffer
// if all the cards were discarded.
const size_t first_clean = dst - _node_buffer;
assert(first_clean >= start && first_clean <= _node_buffer_capacity, "invariant");
// Discarded cards are considered as refined.
_stats->inc_refined_cards(first_clean - start);
_stats->inc_precleaned_cards(first_clean - start);
return first_clean;
}
bool refine_cleaned_cards(size_t start_index) {
bool result = true;
size_t i = start_index;
for ( ; i < _node_buffer_capacity; ++i) {
if (SuspendibleThreadSet::should_yield()) {
redirty_unrefined_cards(i);
result = false;
break;
}
_g1rs->refine_card_concurrently(_node_buffer[i], _worker_id);
}
_node->set_index(i);
_stats->inc_refined_cards(i - start_index);
return result;
}
void redirty_unrefined_cards(size_t start) {
for ( ; start < _node_buffer_capacity; ++start) {
*_node_buffer[start] = G1CardTable::dirty_card_val();
}
}
public:
G1RefineBufferedCards(BufferNode* node,
uint worker_id,
G1ConcurrentRefineStats* stats) :
_node(node),
_node_buffer(reinterpret_cast<CardTable::CardValue**>(BufferNode::make_buffer_from_node(node))),
_node_buffer_capacity(node->capacity()),
_worker_id(worker_id),
_stats(stats),
_g1rs(G1CollectedHeap::heap()->rem_set()) {}
bool refine() {
size_t first_clean_index = clean_cards();
if (first_clean_index == _node_buffer_capacity) {
_node->set_index(first_clean_index);
return true;
}
// This fence serves two purposes. First, the cards must be cleaned
// before processing the contents. Second, we can't proceed with
// processing a region until after the read of the region's top in
// collect_and_clean_cards(), for synchronization with possibly concurrent
// humongous object allocation (see comment at the StoreStore fence before
// setting the regions' tops in humongous allocation path).
// It's okay that reading the region's top and reading the region's type are
// racy with respect to each other. We need both set, in any order, to proceed.
OrderAccess::fence();
sort_cards(first_clean_index);
return refine_cleaned_cards(first_clean_index);
}
};
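// Standalone sketches of the two helpers above, on plain arrays rather than
// the HotSpot card buffer types. First, the sort order: compare_cards()
// returning p2 - p1 yields *decreasing* address order, equivalent to:
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <functional>

static void sort_cards_decreasing(uint8_t** cards, size_t n) {
  std::sort(cards, cards + n, std::greater<uint8_t*>());  // highest address first
}

// Second, the two-fingered compaction in clean_cards(): move every element
// that the predicate keeps into the tail of [start, capacity) and return the
// index of the first kept element; everything below that index is discarded.
template <typename Pred>
static size_t compact_keepers_to_tail(int* buf, size_t start, size_t capacity, Pred keep) {
  int* src = buf + start;
  int* dst = buf + capacity;
  for (; src < dst; ++src) {
    if (keep(*src)) {                // keeper found scanning low -> high
      while (src < --dst) {
        if (!keep(*dst)) {           // discard found scanning high -> low
          *dst = *src;               // replace the discard with the keeper
          break;
        }
      }
      // If no discard slot was found (src == dst), the outer loop ends too.
    }
  }
  return static_cast<size_t>(dst - buf);   // first element of the kept tail
}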
bool G1DirtyCardQueueSet::refine_buffer(BufferNode* node,
uint worker_id,
G1ConcurrentRefineStats* stats) {
Ticks start_time = Ticks::now();
G1RefineBufferedCards buffered_cards(node, worker_id, stats);
bool result = buffered_cards.refine();
stats->inc_refinement_time(Ticks::now() - start_time);
return result;
}
void G1DirtyCardQueueSet::handle_refined_buffer(BufferNode* node,
bool fully_processed) {
if (fully_processed) {
assert(node->is_empty(), "Buffer not fully consumed: index: %zu, size: %zu",
node->index(), node->capacity());
deallocate_buffer(node);
} else {
assert(!node->is_empty(), "Buffer fully consumed.");
// Buffer incompletely processed because there is a pending safepoint.
// Record partially processed buffer, to be finished later.
record_paused_buffer(node);
}
}
void G1DirtyCardQueueSet::handle_completed_buffer(BufferNode* new_node,
G1ConcurrentRefineStats* stats) {
enqueue_completed_buffer(new_node);
// No need for mutator refinement if number of cards is below limit.
if (AtomicAccess::load(&_num_cards) <= AtomicAccess::load(&_mutator_refinement_threshold)) {
return;
}
// Don't try to process a buffer that will just get immediately paused.
// When going into a safepoint it's just a waste of effort.
// When coming out of a safepoint, Java threads may be running before the
// yield request (for non-Java threads) has been cleared.
if (SuspendibleThreadSet::should_yield()) {
return;
}
// Only Java threads perform mutator refinement.
if (!Thread::current()->is_Java_thread()) {
return;
}
BufferNode* node = get_completed_buffer();
if (node == nullptr) return; // Didn't get a buffer to process.
// Refine cards in buffer.
uint worker_id = _free_ids.claim_par_id(); // temporarily claim an id
bool fully_processed = refine_buffer(node, worker_id, stats);
_free_ids.release_par_id(worker_id); // release the id
// Deal with buffer after releasing id, to let another thread use id.
handle_refined_buffer(node, fully_processed);
}
bool G1DirtyCardQueueSet::refine_completed_buffer_concurrently(uint worker_id,
size_t stop_at,
G1ConcurrentRefineStats* stats) {
// Not enough cards to trigger processing.
if (AtomicAccess::load(&_num_cards) <= stop_at) return false;
BufferNode* node = get_completed_buffer();
if (node == nullptr) return false; // Didn't get a buffer to process.
bool fully_processed = refine_buffer(node, worker_id, stats);
handle_refined_buffer(node, fully_processed);
return true;
}
void G1DirtyCardQueueSet::abandon_logs_and_stats() {
assert_at_safepoint();
// Disable mutator refinement until concurrent refinement decides otherwise.
set_mutator_refinement_threshold(SIZE_MAX);
// Iterate over all the threads, resetting per-thread queues and stats.
struct AbandonThreadLogClosure : public ThreadClosure {
G1DirtyCardQueueSet& _qset;
AbandonThreadLogClosure(G1DirtyCardQueueSet& qset) : _qset(qset) {}
virtual void do_thread(Thread* t) {
G1DirtyCardQueue& queue = G1ThreadLocalData::dirty_card_queue(t);
_qset.reset_queue(queue);
queue.refinement_stats()->reset();
}
} closure(*this);
Threads::threads_do(&closure);
enqueue_all_paused_buffers();
abandon_completed_buffers();
// Reset stats from detached threads.
MutexLocker ml(G1DetachedRefinementStats_lock, Mutex::_no_safepoint_check_flag);
_detached_refinement_stats.reset();
}
void G1DirtyCardQueueSet::update_refinement_stats(G1ConcurrentRefineStats& stats) {
assert_at_safepoint();
_concatenated_refinement_stats = stats;
enqueue_all_paused_buffers();
verify_num_cards();
// Collect and reset stats from detached threads.
MutexLocker ml(G1DetachedRefinementStats_lock, Mutex::_no_safepoint_check_flag);
_concatenated_refinement_stats += _detached_refinement_stats;
_detached_refinement_stats.reset();
}
G1ConcurrentRefineStats G1DirtyCardQueueSet::concatenate_log_and_stats(Thread* thread) {
assert_at_safepoint();
G1DirtyCardQueue& queue = G1ThreadLocalData::dirty_card_queue(thread);
// Flush the buffer if non-empty. Flush before accumulating and
// resetting stats, since flushing may modify the stats.
if (!queue.is_empty()) {
flush_queue(queue);
}
G1ConcurrentRefineStats result = *queue.refinement_stats();
queue.refinement_stats()->reset();
return result;
}
G1ConcurrentRefineStats G1DirtyCardQueueSet::concatenated_refinement_stats() const {
assert_at_safepoint();
return _concatenated_refinement_stats;
}
void G1DirtyCardQueueSet::record_detached_refinement_stats(G1ConcurrentRefineStats* stats) {
MutexLocker ml(G1DetachedRefinementStats_lock, Mutex::_no_safepoint_check_flag);
_detached_refinement_stats += *stats;
stats->reset();
}
size_t G1DirtyCardQueueSet::mutator_refinement_threshold() const {
return AtomicAccess::load(&_mutator_refinement_threshold);
}
void G1DirtyCardQueueSet::set_mutator_refinement_threshold(size_t value) {
AtomicAccess::store(&_mutator_refinement_threshold, value);
}


@ -1,302 +0,0 @@
/*
* Copyright (c) 2001, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#ifndef SHARE_GC_G1_G1DIRTYCARDQUEUE_HPP
#define SHARE_GC_G1_G1DIRTYCARDQUEUE_HPP
#include "gc/g1/g1CardTable.hpp"
#include "gc/g1/g1ConcurrentRefineStats.hpp"
#include "gc/g1/g1FreeIdSet.hpp"
#include "gc/shared/bufferNode.hpp"
#include "gc/shared/bufferNodeList.hpp"
#include "gc/shared/ptrQueue.hpp"
#include "memory/allocation.hpp"
#include "memory/padded.hpp"
#include "utilities/nonblockingQueue.hpp"
class G1PrimaryConcurrentRefineThread;
class G1DirtyCardQueueSet;
class G1RedirtyCardsQueueSet;
class Thread;
// A PtrQueue whose elements are pointers to card table entries for dirty cards.
class G1DirtyCardQueue: public PtrQueue {
G1ConcurrentRefineStats* _refinement_stats;
public:
G1DirtyCardQueue(G1DirtyCardQueueSet* qset);
// Flush before destroying; queue may be used to capture pending work while
// doing something else, with auto-flush on completion.
~G1DirtyCardQueue();
G1ConcurrentRefineStats* refinement_stats() const {
return _refinement_stats;
}
// Compiler support.
static ByteSize byte_offset_of_index() {
return PtrQueue::byte_offset_of_index<G1DirtyCardQueue>();
}
using PtrQueue::byte_width_of_index;
static ByteSize byte_offset_of_buf() {
return PtrQueue::byte_offset_of_buf<G1DirtyCardQueue>();
}
using PtrQueue::byte_width_of_buf;
};
class G1DirtyCardQueueSet: public PtrQueueSet {
// Head and tail of a list of BufferNodes, linked through their next()
// fields. Similar to BufferNodeList, but without the _entry_count.
struct HeadTail {
BufferNode* _head;
BufferNode* _tail;
HeadTail() : _head(nullptr), _tail(nullptr) {}
HeadTail(BufferNode* head, BufferNode* tail) : _head(head), _tail(tail) {}
};
// Concurrent refinement may stop processing in the middle of a buffer if
// there is a pending safepoint, to avoid long delays to safepoint. A
// partially processed buffer needs to be recorded for processing by the
// safepoint if it's a GC safepoint; otherwise it needs to be recorded for
// further concurrent refinement work after the safepoint. But if the
// buffer was obtained from the completed buffer queue then it can't simply
// be added back to the queue, as that would introduce a new source of ABA
// for the queue.
//
// The PausedBuffer object is used to record such buffers for the upcoming
// safepoint, and provides access to the buffers recorded for previous
// safepoints. Before obtaining a buffer from the completed buffers queue,
// we first transfer any buffers from previous safepoints to the queue.
// This is ABA-safe because threads cannot be in the midst of a queue pop
// across a safepoint.
//
// The paused buffers are conceptually an extension of the completed buffers
// queue, and operations which need to deal with all of the queued buffers
// (such as concatenating or abandoning logs) also need to deal with any
// paused buffers. In general, if a safepoint performs a GC then the paused
// buffers will be processed as part of it, and there won't be any paused
// buffers after a GC safepoint.
class PausedBuffers {
class PausedList : public CHeapObj<mtGC> {
BufferNode* volatile _head;
BufferNode* _tail;
size_t _safepoint_id;
NONCOPYABLE(PausedList);
public:
PausedList();
DEBUG_ONLY(~PausedList();)
// Return true if this list was created to hold buffers for the
// next safepoint.
// precondition: not at safepoint.
bool is_next() const;
// Thread-safe add the buffer to the list.
// precondition: not at safepoint.
// precondition: is_next().
void add(BufferNode* node);
// Take all the buffers from the list. Not thread-safe.
HeadTail take();
};
// The most recently created list, which might be for either the next or
// a previous safepoint, or might be null if the next list hasn't been
// created yet. We only need one list because of the requirement that
// threads calling add() must first ensure there are no paused buffers
// from a previous safepoint. There might be many list instances existing
// at the same time though; there can be many threads competing to create
// and install the next list, and meanwhile there can be a thread dealing
// with the previous list.
PausedList* volatile _plist;
DEFINE_PAD_MINUS_SIZE(1, DEFAULT_PADDING_SIZE, sizeof(PausedList*));
NONCOPYABLE(PausedBuffers);
public:
PausedBuffers();
DEBUG_ONLY(~PausedBuffers();)
// Thread-safe add the buffer to paused list for next safepoint.
// precondition: not at safepoint.
// precondition: does not have paused buffers from a previous safepoint.
void add(BufferNode* node);
// Thread-safe take all paused buffers for previous safepoints.
// precondition: not at safepoint.
HeadTail take_previous();
// Take all the paused buffers.
// precondition: at safepoint.
HeadTail take_all();
};
DEFINE_PAD_MINUS_SIZE(0, DEFAULT_PADDING_SIZE, 0);
// Upper bound on the number of cards in the completed and paused buffers.
volatile size_t _num_cards;
DEFINE_PAD_MINUS_SIZE(1, DEFAULT_PADDING_SIZE, sizeof(size_t));
// If the queue contains more cards than configured here, the
// mutator must start doing some of the concurrent refinement work.
volatile size_t _mutator_refinement_threshold;
DEFINE_PAD_MINUS_SIZE(2, DEFAULT_PADDING_SIZE, sizeof(size_t));
// Buffers ready for refinement.
// NonblockingQueue has inner padding of one cache line.
NonblockingQueue<BufferNode, &BufferNode::next_ptr> _completed;
// Add a trailer padding after NonblockingQueue.
DEFINE_PAD_MINUS_SIZE(3, DEFAULT_PADDING_SIZE, sizeof(BufferNode*));
// Buffers for which refinement is temporarily paused.
// PausedBuffers has inner padding, including trailer.
PausedBuffers _paused;
G1FreeIdSet _free_ids;
G1ConcurrentRefineStats _concatenated_refinement_stats;
G1ConcurrentRefineStats _detached_refinement_stats;
// Verify _num_cards == sum of cards in the completed queue.
void verify_num_cards() const NOT_DEBUG_RETURN;
// Thread-safe add a buffer to paused list for next safepoint.
// precondition: not at safepoint.
void record_paused_buffer(BufferNode* node);
void enqueue_paused_buffers_aux(const HeadTail& paused);
// Thread-safe transfer paused buffers for previous safepoints to the queue.
// precondition: not at safepoint.
void enqueue_previous_paused_buffers();
// Transfer all paused buffers to the queue.
// precondition: at safepoint.
void enqueue_all_paused_buffers();
void abandon_completed_buffers();
// Refine the cards in "node" from its index to buffer_capacity.
// Stops processing if SuspendibleThreadSet::should_yield() is true.
// Returns true if the entire buffer was processed, false if there
// is a pending yield request. The node's index is updated to exclude
// the processed elements, e.g. up to the element before processing
// stopped, or one past the last element if the entire buffer was
// processed. Updates stats.
bool refine_buffer(BufferNode* node,
uint worker_id,
G1ConcurrentRefineStats* stats);
// Deal with buffer after a call to refine_buffer. If fully processed,
// deallocate the buffer. Otherwise, record it as paused.
void handle_refined_buffer(BufferNode* node, bool fully_processed);
// Thread-safe attempt to remove and return the first buffer from
// the _completed queue.
// Returns null if the queue is empty, or if a concurrent push/append
// interferes. It uses GlobalCounter critical section to avoid ABA problem.
BufferNode* dequeue_completed_buffer();
// Remove and return a completed buffer from the list, or return null
// if none available.
BufferNode* get_completed_buffer();
// Called when queue is full or has no buffer.
void handle_zero_index(G1DirtyCardQueue& queue);
// Enqueue the buffer, and optionally perform refinement by the mutator.
// Mutator refinement is only done by Java threads, and only if there
// are more than mutator_refinement_threshold cards in the completed buffers.
// Updates stats.
//
// Mutator refinement, if performed, stops processing a buffer if
// SuspendibleThreadSet::should_yield(), recording the incompletely
// processed buffer for later processing of the remainder.
void handle_completed_buffer(BufferNode* node, G1ConcurrentRefineStats* stats);
public:
G1DirtyCardQueueSet(BufferNode::Allocator* allocator);
~G1DirtyCardQueueSet();
// The number of parallel ids that can be claimed to allow collector or
// mutator threads to do card-processing work.
static uint num_par_ids();
static void handle_zero_index_for_thread(Thread* t);
virtual void enqueue_completed_buffer(BufferNode* node);
// Upper bound on the number of cards currently in this queue set.
// Read without synchronization. The value may be high because there
// is a concurrent modification of the set of buffers.
size_t num_cards() const;
void merge_bufferlists(G1RedirtyCardsQueueSet* src);
BufferNodeList take_all_completed_buffers();
void flush_queue(G1DirtyCardQueue& queue);
using CardValue = G1CardTable::CardValue;
void enqueue(G1DirtyCardQueue& queue, volatile CardValue* card_ptr);
// If there are more than stop_at cards in the completed buffers, pop
// a buffer, refine its contents, and return true. Otherwise return
// false. Updates stats.
//
// Stops processing a buffer if SuspendibleThreadSet::should_yield(),
// recording the incompletely processed buffer for later processing of
// the remainder.
bool refine_completed_buffer_concurrently(uint worker_id,
size_t stop_at,
G1ConcurrentRefineStats* stats);
// If a full collection is happening, reset per-thread refinement stats and
// partial logs, and release completed logs. The full collection will make
// them all irrelevant.
// precondition: at safepoint.
void abandon_logs_and_stats();
// Update global refinement statistics with the ones given and the ones from
// detached threads.
// precondition: at safepoint.
void update_refinement_stats(G1ConcurrentRefineStats& stats);
// Add the given thread's partial logs to the global list and return and reset
// its refinement stats.
// precondition: at safepoint.
G1ConcurrentRefineStats concatenate_log_and_stats(Thread* thread);
// Return the total of mutator refinement stats for all threads.
// precondition: at safepoint.
// precondition: only call after concatenate_log_and_stats.
G1ConcurrentRefineStats concatenated_refinement_stats() const;
// Accumulate refinement stats from threads that are detaching.
void record_detached_refinement_stats(G1ConcurrentRefineStats* stats);
// Number of cards above which mutator threads should do refinement.
size_t mutator_refinement_threshold() const;
// Set number of cards above which mutator threads should do refinement.
void set_mutator_refinement_threshold(size_t value);
};
#endif // SHARE_GC_G1_G1DIRTYCARDQUEUE_HPP


@ -22,8 +22,6 @@
*
*/
#include "gc/g1/g1ConcurrentRefine.hpp"
#include "gc/g1/g1DirtyCardQueue.hpp"
#include "gc/g1/g1FromCardCache.hpp"
#include "gc/shared/gc_globals.hpp"
#include "memory/padded.inline.hpp"
@ -80,7 +78,7 @@ void G1FromCardCache::print(outputStream* out) {
#endif
uint G1FromCardCache::num_par_rem_sets() {
return G1DirtyCardQueueSet::num_par_ids() + G1ConcRefinementThreads + MAX2(ConcGCThreads, ParallelGCThreads);
return G1ConcRefinementThreads + ConcGCThreads;
}
void G1FromCardCache::clear(uint region_idx) {


@ -147,6 +147,10 @@ void G1FullGCCompactTask::free_non_overlapping_regions(uint src_start_idx, uint
for (uint i = non_overlapping_start; i <= src_end_idx; ++i) {
G1HeapRegion* hr = _g1h->region_at(i);
if (VerifyDuringGC) {
// Satisfy some asserts in free_..._region
hr->clear_both_card_tables();
}
_g1h->free_humongous_region(hr, nullptr);
}
}


@ -35,6 +35,10 @@
#include "gc/shared/fullGCForwarding.inline.hpp"
void G1DetermineCompactionQueueClosure::free_empty_humongous_region(G1HeapRegion* hr) {
if (VerifyDuringGC) {
// Satisfy some asserts in free_..._region.
hr->clear_both_card_tables();
}
_g1h->free_humongous_region(hr, nullptr);
_collector->set_free(hr->hrm_index());
add_to_compaction_queue(hr);


@ -32,7 +32,7 @@ G1FullGCResetMetadataTask::G1ResetMetadataClosure::G1ResetMetadataClosure(G1Full
void G1FullGCResetMetadataTask::G1ResetMetadataClosure::reset_region_metadata(G1HeapRegion* hr) {
hr->rem_set()->clear();
hr->clear_cardtable();
hr->clear_both_card_tables();
}
bool G1FullGCResetMetadataTask::G1ResetMetadataClosure::do_heap_region(G1HeapRegion* hr) {


@ -50,8 +50,7 @@ G1GCPhaseTimes::G1GCPhaseTimes(STWGCTimer* gc_timer, uint max_gc_threads) :
{
assert(max_gc_threads > 0, "Must have some GC threads");
_gc_par_phases[RetireTLABsAndFlushLogs] = new WorkerDataArray<double>("RetireTLABsAndFlushLogs", "JT Retire TLABs And Flush Logs (ms):", max_gc_threads);
_gc_par_phases[NonJavaThreadFlushLogs] = new WorkerDataArray<double>("NonJavaThreadFlushLogs", "Non-JT Flush Logs (ms):", max_gc_threads);
_gc_par_phases[RetireTLABs] = new WorkerDataArray<double>("RetireTLABs", "JavaThread Retire TLABs (ms):", max_gc_threads);
_gc_par_phases[GCWorkerStart] = new WorkerDataArray<double>("GCWorkerStart", "GC Worker Start (ms):", max_gc_threads);
_gc_par_phases[ExtRootScan] = new WorkerDataArray<double>("ExtRootScan", "Ext Root Scanning (ms):", max_gc_threads);
@ -83,7 +82,7 @@ G1GCPhaseTimes::G1GCPhaseTimes(STWGCTimer* gc_timer, uint max_gc_threads) :
_gc_par_phases[OptMergeRS]->create_thread_work_items(GCMergeRSWorkItemsStrings[i], i);
}
_gc_par_phases[MergeLB] = new WorkerDataArray<double>("MergeLB", "Log Buffers (ms):", max_gc_threads);
_gc_par_phases[SweepRT] = new WorkerDataArray<double>("SweepRT", "Sweep (ms):", max_gc_threads);
_gc_par_phases[ScanHR] = new WorkerDataArray<double>("ScanHR", "Scan Heap Roots (ms):", max_gc_threads);
_gc_par_phases[OptScanHR] = new WorkerDataArray<double>("OptScanHR", "Optional Scan Heap Roots (ms):", max_gc_threads);
_gc_par_phases[CodeRoots] = new WorkerDataArray<double>("CodeRoots", "Code Root Scan (ms):", max_gc_threads);
@ -98,7 +97,7 @@ G1GCPhaseTimes::G1GCPhaseTimes(STWGCTimer* gc_timer, uint max_gc_threads) :
_gc_par_phases[MergePSS] = new WorkerDataArray<double>("MergePSS", "Merge Per-Thread State (ms):", max_gc_threads);
_gc_par_phases[RestoreEvacuationFailedRegions] = new WorkerDataArray<double>("RestoreEvacuationFailedRegions", "Restore Evacuation Failed Regions (ms):", max_gc_threads);
_gc_par_phases[RemoveSelfForwards] = new WorkerDataArray<double>("RemoveSelfForwards", "Remove Self Forwards (ms):", max_gc_threads);
_gc_par_phases[ClearCardTable] = new WorkerDataArray<double>("ClearLoggedCards", "Clear Logged Cards (ms):", max_gc_threads);
_gc_par_phases[ClearCardTable] = new WorkerDataArray<double>("ClearPendingCards", "Clear Pending Cards (ms):", max_gc_threads);
_gc_par_phases[RecalculateUsed] = new WorkerDataArray<double>("RecalculateUsed", "Recalculate Used Memory (ms):", max_gc_threads);
#if COMPILER2_OR_JVMCI
_gc_par_phases[UpdateDerivedPointers] = new WorkerDataArray<double>("UpdateDerivedPointers", "Update Derived Pointers (ms):", max_gc_threads);
@ -107,11 +106,15 @@ G1GCPhaseTimes::G1GCPhaseTimes(STWGCTimer* gc_timer, uint max_gc_threads) :
_gc_par_phases[ResetPartialArrayStateManager] = new WorkerDataArray<double>("ResetPartialArrayStateManager", "Reset Partial Array State Manager (ms):", max_gc_threads);
_gc_par_phases[ProcessEvacuationFailedRegions] = new WorkerDataArray<double>("ProcessEvacuationFailedRegions", "Process Evacuation Failed Regions (ms):", max_gc_threads);
_gc_par_phases[ScanHR]->create_thread_work_items("Pending Cards:", ScanHRPendingCards);
_gc_par_phases[ScanHR]->create_thread_work_items("Scanned Empty:", ScanHRScannedEmptyCards);
_gc_par_phases[ScanHR]->create_thread_work_items("Scanned Cards:", ScanHRScannedCards);
_gc_par_phases[ScanHR]->create_thread_work_items("Scanned Blocks:", ScanHRScannedBlocks);
_gc_par_phases[ScanHR]->create_thread_work_items("Claimed Chunks:", ScanHRClaimedChunks);
_gc_par_phases[ScanHR]->create_thread_work_items("Found Roots:", ScanHRFoundRoots);
_gc_par_phases[OptScanHR]->create_thread_work_items("Pending Cards:", ScanHRPendingCards);
_gc_par_phases[OptScanHR]->create_thread_work_items("Scanned Empty:", ScanHRScannedEmptyCards);
_gc_par_phases[OptScanHR]->create_thread_work_items("Scanned Cards:", ScanHRScannedCards);
_gc_par_phases[OptScanHR]->create_thread_work_items("Scanned Blocks:", ScanHRScannedBlocks);
_gc_par_phases[OptScanHR]->create_thread_work_items("Claimed Chunks:", ScanHRClaimedChunks);
@ -119,9 +122,6 @@ G1GCPhaseTimes::G1GCPhaseTimes(STWGCTimer* gc_timer, uint max_gc_threads) :
_gc_par_phases[OptScanHR]->create_thread_work_items("Scanned Refs:", ScanHRScannedOptRefs);
_gc_par_phases[OptScanHR]->create_thread_work_items("Used Memory:", ScanHRUsedMemory);
_gc_par_phases[MergeLB]->create_thread_work_items("Dirty Cards:", MergeLBDirtyCards);
_gc_par_phases[MergeLB]->create_thread_work_items("Skipped Cards:", MergeLBSkippedCards);
_gc_par_phases[CodeRoots]->create_thread_work_items("Scanned Nmethods:", CodeRootsScannedNMethods);
_gc_par_phases[OptCodeRoots]->create_thread_work_items("Scanned Nmethods:", CodeRootsScannedNMethods);
@ -129,7 +129,10 @@ G1GCPhaseTimes::G1GCPhaseTimes(STWGCTimer* gc_timer, uint max_gc_threads) :
_gc_par_phases[MergePSS]->create_thread_work_items("Copied Bytes:", MergePSSCopiedBytes);
_gc_par_phases[MergePSS]->create_thread_work_items("LAB Waste:", MergePSSLABWasteBytes);
_gc_par_phases[MergePSS]->create_thread_work_items("LAB Undo Waste:", MergePSSLABUndoWasteBytes);
_gc_par_phases[MergePSS]->create_thread_work_items("Evac Fail Extra Cards:", MergePSSEvacFailExtra);
_gc_par_phases[MergePSS]->create_thread_work_items("Pending Cards:", MergePSSPendingCards);
_gc_par_phases[MergePSS]->create_thread_work_items("To-Young-Gen Cards:", MergePSSToYoungGenCards);
_gc_par_phases[MergePSS]->create_thread_work_items("Evac-Fail Cards:", MergePSSEvacFail);
_gc_par_phases[MergePSS]->create_thread_work_items("Marked Cards:", MergePSSMarked);
_gc_par_phases[RestoreEvacuationFailedRegions]->create_thread_work_items("Evacuation Failed Regions:", RestoreEvacFailureRegionsEvacFailedNum);
_gc_par_phases[RestoreEvacuationFailedRegions]->create_thread_work_items("Pinned Regions:", RestoreEvacFailureRegionsPinnedNum);
@ -150,9 +153,6 @@ G1GCPhaseTimes::G1GCPhaseTimes(STWGCTimer* gc_timer, uint max_gc_threads) :
_gc_par_phases[OptTermination]->create_thread_work_items("Optional Termination Attempts:");
_gc_par_phases[RedirtyCards] = new WorkerDataArray<double>("RedirtyCards", "Redirty Logged Cards (ms):", max_gc_threads);
_gc_par_phases[RedirtyCards]->create_thread_work_items("Redirtied Cards:");
_gc_par_phases[ResizeThreadLABs] = new WorkerDataArray<double>("ResizeTLABs", "Resize TLABs (ms):", max_gc_threads);
_gc_par_phases[FreeCollectionSet] = new WorkerDataArray<double>("FreeCSet", "Free Collection Set (ms):", max_gc_threads);
@ -171,9 +171,9 @@ void G1GCPhaseTimes::reset() {
_cur_optional_evac_time_ms = 0.0;
_cur_collection_nmethod_list_cleanup_time_ms = 0.0;
_cur_merge_heap_roots_time_ms = 0.0;
_cur_merge_refinement_table_time_ms = 0.0;
_cur_optional_merge_heap_roots_time_ms = 0.0;
_cur_prepare_merge_heap_roots_time_ms = 0.0;
_cur_distribute_log_buffers_time_ms = 0.0;
_cur_optional_prepare_merge_heap_roots_time_ms = 0.0;
_cur_pre_evacuate_prepare_time_ms = 0.0;
_cur_post_evacuate_cleanup_1_time_ms = 0.0;
@ -249,7 +249,7 @@ void G1GCPhaseTimes::record_gc_pause_end() {
ASSERT_PHASE_UNINITIALIZED(MergeER);
ASSERT_PHASE_UNINITIALIZED(MergeRS);
ASSERT_PHASE_UNINITIALIZED(OptMergeRS);
ASSERT_PHASE_UNINITIALIZED(MergeLB);
ASSERT_PHASE_UNINITIALIZED(SweepRT);
ASSERT_PHASE_UNINITIALIZED(ScanHR);
ASSERT_PHASE_UNINITIALIZED(CodeRoots);
ASSERT_PHASE_UNINITIALIZED(OptCodeRoots);
@ -425,8 +425,7 @@ double G1GCPhaseTimes::print_pre_evacuate_collection_set() const {
}
debug_time("Pre Evacuate Prepare", _cur_pre_evacuate_prepare_time_ms);
debug_phase(_gc_par_phases[RetireTLABsAndFlushLogs], 1);
debug_phase(_gc_par_phases[NonJavaThreadFlushLogs], 1);
debug_phase(_gc_par_phases[RetireTLABs], 1);
debug_time("Choose Collection Set", (_recorded_young_cset_choice_time_ms + _recorded_non_young_cset_choice_time_ms));
debug_time("Region Register", _cur_region_register_time);
@ -458,8 +457,8 @@ double G1GCPhaseTimes::print_evacuate_initial_collection_set() const {
debug_time("Prepare Merge Heap Roots", _cur_prepare_merge_heap_roots_time_ms);
debug_phase_merge_remset();
debug_time("Distribute Log Buffers", _cur_distribute_log_buffers_time_ms);
debug_phase(_gc_par_phases[MergeLB]);
debug_time("Merge Refinement Table", _cur_merge_refinement_table_time_ms);
debug_phase(_gc_par_phases[SweepRT], 1);
info_time("Evacuate Collection Set", _cur_collection_initial_evac_time_ms);
@ -521,7 +520,6 @@ double G1GCPhaseTimes::print_post_evacuate_collection_set(bool evacuation_failed
if (G1CollectedHeap::heap()->should_sample_collection_set_candidates()) {
debug_phase(_gc_par_phases[SampleCollectionSetCandidates], 1);
}
debug_phase(_gc_par_phases[RedirtyCards], 1);
if (UseTLAB && ResizeTLAB) {
debug_phase(_gc_par_phases[ResizeThreadLABs], 1);
}


@ -46,8 +46,7 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
public:
enum GCParPhases {
RetireTLABsAndFlushLogs,
NonJavaThreadFlushLogs,
RetireTLABs,
GCWorkerStart,
ExtRootScan,
ThreadRoots,
@ -59,7 +58,7 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
MergeER = StrongOopStorageSetRoots + EnumRange<OopStorageSet::StrongId>().size(),
MergeRS,
OptMergeRS,
MergeLB,
SweepRT,
ScanHR,
OptScanHR,
CodeRoots,
@ -71,7 +70,6 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
Other,
GCWorkerTotal,
GCWorkerEnd,
RedirtyCards,
FreeCollectionSet,
YoungFreeCSet,
NonYoungFreeCSet,
@ -111,16 +109,19 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
MergeRSHowlArrayOfCards,
MergeRSHowlBitmap,
MergeRSHowlFull,
MergeRSCards,
MergeRSFromRemSetCards,
MergeRSTotalCards,
MergeRSContainersSentinel
};
static constexpr const char* GCMergeRSWorkItemsStrings[MergeRSContainersSentinel] =
{ "Merged Inline:", "Merged ArrayOfCards:", "Merged Howl:", "Merged Full:",
"Merged Howl Inline:", "Merged Howl ArrayOfCards:", "Merged Howl BitMap:", "Merged Howl Full:",
"Merged Cards:" };
"Merged From RS Cards:", "Total Cards:" };
enum GCScanHRWorkItems {
ScanHRPendingCards,
ScanHRScannedEmptyCards,
ScanHRScannedCards,
ScanHRScannedBlocks,
ScanHRClaimedChunks,
@ -129,11 +130,6 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
ScanHRUsedMemory
};
enum GCMergeLBWorkItems {
MergeLBDirtyCards,
MergeLBSkippedCards
};
enum GCCodeRootsWorkItems {
CodeRootsScannedNMethods
};
@ -143,7 +139,10 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
MergePSSLABSize,
MergePSSLABWasteBytes,
MergePSSLABUndoWasteBytes,
MergePSSEvacFailExtra
MergePSSPendingCards, // Cards still to be scanned, generated by the GC (from cross-references and evacuation failure).
MergePSSToYoungGenCards, // To-young-gen cards generated by the GC.
MergePSSEvacFail, // Dirty cards generated by evacuation failure during the GC.
MergePSSMarked, // Total newly marked cards.
};
enum RestoreEvacFailureRegionsWorkItems {
@ -176,9 +175,9 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
double _cur_collection_nmethod_list_cleanup_time_ms;
double _cur_merge_heap_roots_time_ms;
// Merge refinement table time. Note that this time is included in _cur_merge_heap_roots_time_ms.
double _cur_merge_refinement_table_time_ms;
double _cur_optional_merge_heap_roots_time_ms;
// Included in above merge and optional-merge time.
double _cur_distribute_log_buffers_time_ms;
double _cur_prepare_merge_heap_roots_time_ms;
double _cur_optional_prepare_merge_heap_roots_time_ms;
@ -302,6 +301,10 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
_cur_merge_heap_roots_time_ms += ms;
}
void record_merge_refinement_table_time(double ms) {
_cur_merge_refinement_table_time_ms = ms;
}
void record_or_add_optional_merge_heap_roots_time(double ms) {
_cur_optional_merge_heap_roots_time_ms += ms;
}
@ -310,10 +313,6 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
_cur_prepare_merge_heap_roots_time_ms += ms;
}
void record_distribute_log_buffers_time_ms(double ms) {
_cur_distribute_log_buffers_time_ms += ms;
}
void record_or_add_optional_prepare_merge_heap_roots_time(double ms) {
_cur_optional_prepare_merge_heap_roots_time_ms += ms;
}
@ -382,10 +381,6 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
_recorded_prepare_heap_roots_time_ms = recorded_prepare_heap_roots_time_ms;
}
double cur_distribute_log_buffers_time_ms() {
return _cur_distribute_log_buffers_time_ms;
}
double cur_collection_par_time_ms() {
return _cur_collection_initial_evac_time_ms +
_cur_optional_evac_time_ms +
@ -396,6 +391,10 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
_cur_collection_nmethod_list_cleanup_time_ms;
}
double cur_merge_refinement_table_time() const {
return _cur_merge_refinement_table_time_ms;
}
double cur_resize_heap_time_ms() {
return _cur_resize_heap_time_ms;
}


@ -39,6 +39,7 @@
#include "logging/log.hpp"
#include "logging/logStream.hpp"
#include "memory/iterator.inline.hpp"
#include "memory/memRegion.hpp"
#include "memory/resourceArea.hpp"
#include "oops/access.inline.hpp"
#include "oops/compressedOops.inline.hpp"
@ -137,11 +138,21 @@ void G1HeapRegion::hr_clear(bool clear_space) {
if (clear_space) clear(SpaceDecorator::Mangle);
}
void G1HeapRegion::clear_cardtable() {
void G1HeapRegion::clear_card_table() {
G1CardTable* ct = G1CollectedHeap::heap()->card_table();
ct->clear_MemRegion(MemRegion(bottom(), end()));
}
void G1HeapRegion::clear_refinement_table() {
G1CardTable* ct = G1CollectedHeap::heap()->refinement_table();
ct->clear_MemRegion(MemRegion(bottom(), end()));
}
void G1HeapRegion::clear_both_card_tables() {
clear_card_table();
clear_refinement_table();
}
void G1HeapRegion::set_free() {
if (!is_free()) {
report_region_type_change(G1HeapRegionTraceType::Free);
@ -591,8 +602,12 @@ class G1VerifyLiveAndRemSetClosure : public BasicOopIterateClosure {
G1HeapRegion* _from;
G1HeapRegion* _to;
CardValue _cv_obj;
CardValue _cv_field;
CardValue _cv_obj_ct; // In card table.
CardValue _cv_field_ct;
CardValue _cv_obj_rt; // In refinement table.
CardValue _cv_field_rt;
RemSetChecker(G1VerifyFailureCounter* failures, oop containing_obj, T* p, oop obj)
: Checker<T>(failures, containing_obj, p, obj) {
@ -600,19 +615,23 @@ class G1VerifyLiveAndRemSetClosure : public BasicOopIterateClosure {
_to = this->_g1h->heap_region_containing(obj);
CardTable* ct = this->_g1h->card_table();
_cv_obj = *ct->byte_for_const(this->_containing_obj);
_cv_field = *ct->byte_for_const(p);
_cv_obj_ct = *ct->byte_for_const(this->_containing_obj);
_cv_field_ct = *ct->byte_for_const(p);
ct = this->_g1h->refinement_table();
_cv_obj_rt = *ct->byte_for_const(this->_containing_obj);
_cv_field_rt = *ct->byte_for_const(p);
}
bool failed() const {
if (_from != _to && !_from->is_young() &&
_to->rem_set()->is_complete() &&
_from->rem_set()->cset_group() != _to->rem_set()->cset_group()) {
const CardValue dirty = G1CardTable::dirty_card_val();
const CardValue clean = G1CardTable::clean_card_val();
return !(_to->rem_set()->contains_reference(this->_p) ||
(this->_containing_obj->is_objArray() ?
_cv_field == dirty :
_cv_obj == dirty || _cv_field == dirty));
(_cv_field_ct != clean || _cv_field_rt != clean) :
(_cv_obj_ct != clean || _cv_field_ct != clean || _cv_obj_rt != clean || _cv_field_rt != clean)));
}
return false;
}
@ -630,7 +649,8 @@ class G1VerifyLiveAndRemSetClosure : public BasicOopIterateClosure {
log.error("Missing rem set entry:");
this->print_containing_obj(&ls, _from);
this->print_referenced_obj(&ls, _to, "");
log.error("Obj head CV = %d, field CV = %d.", _cv_obj, _cv_field);
log.error("CT obj head CV = %d, field CV = %d.", _cv_obj_ct, _cv_field_ct);
log.error("RT Obj head CV = %d, field CV = %d.", _cv_obj_rt, _cv_field_rt);
log.error("----------");
}
};


@ -42,7 +42,6 @@ class G1CollectedHeap;
class G1CMBitMap;
class G1CSetCandidateGroup;
class G1Predictions;
class G1HeapRegion;
class G1HeapRegionRemSet;
class G1HeapRegionSetBase;
class nmethod;
@ -478,7 +477,10 @@ public:
// Callers must ensure this is not called by multiple threads at the same time.
void hr_clear(bool clear_space);
// Clear the card table corresponding to this region.
void clear_cardtable();
void clear_card_table();
void clear_refinement_table();
void clear_both_card_tables();
// Notify the region that an evacuation failure occurred for an object within this
// region.


@ -63,7 +63,8 @@ public:
G1HeapRegionManager::G1HeapRegionManager() :
_bot_mapper(nullptr),
_cardtable_mapper(nullptr),
_card_table_mapper(nullptr),
_refinement_table_mapper(nullptr),
_committed_map(),
_next_highest_used_hrm_index(0),
_regions(), _heap_mapper(nullptr),
@ -74,7 +75,8 @@ G1HeapRegionManager::G1HeapRegionManager() :
void G1HeapRegionManager::initialize(G1RegionToSpaceMapper* heap_storage,
G1RegionToSpaceMapper* bitmap,
G1RegionToSpaceMapper* bot,
G1RegionToSpaceMapper* cardtable) {
G1RegionToSpaceMapper* card_table,
G1RegionToSpaceMapper* refinement_table) {
_next_highest_used_hrm_index = 0;
_heap_mapper = heap_storage;
@ -82,7 +84,8 @@ void G1HeapRegionManager::initialize(G1RegionToSpaceMapper* heap_storage,
_bitmap_mapper = bitmap;
_bot_mapper = bot;
_cardtable_mapper = cardtable;
_card_table_mapper = card_table;
_refinement_table_mapper = refinement_table;
_regions.initialize(heap_storage->reserved(), G1HeapRegion::GrainBytes);
@ -186,7 +189,8 @@ void G1HeapRegionManager::commit_regions(uint index, size_t num_regions, WorkerT
_bitmap_mapper->commit_regions(index, num_regions, pretouch_workers);
_bot_mapper->commit_regions(index, num_regions, pretouch_workers);
_cardtable_mapper->commit_regions(index, num_regions, pretouch_workers);
_card_table_mapper->commit_regions(index, num_regions, pretouch_workers);
_refinement_table_mapper->commit_regions(index, num_regions, pretouch_workers);
}
void G1HeapRegionManager::uncommit_regions(uint start, uint num_regions) {
@ -209,7 +213,8 @@ void G1HeapRegionManager::uncommit_regions(uint start, uint num_regions) {
_bitmap_mapper->uncommit_regions(start, num_regions);
_bot_mapper->uncommit_regions(start, num_regions);
_cardtable_mapper->uncommit_regions(start, num_regions);
_card_table_mapper->uncommit_regions(start, num_regions);
_refinement_table_mapper->uncommit_regions(start, num_regions);
_committed_map.uncommit(start, end);
}
@ -261,19 +266,23 @@ void G1HeapRegionManager::clear_auxiliary_data_structures(uint start, uint num_r
// Signal G1BlockOffsetTable to clear the given regions.
_bot_mapper->signal_mapping_changed(start, num_regions);
// Signal G1CardTable to clear the given regions.
_cardtable_mapper->signal_mapping_changed(start, num_regions);
_card_table_mapper->signal_mapping_changed(start, num_regions);
// Signal refinement table to clear the given regions.
_refinement_table_mapper->signal_mapping_changed(start, num_regions);
}
MemoryUsage G1HeapRegionManager::get_auxiliary_data_memory_usage() const {
size_t used_sz =
_bitmap_mapper->committed_size() +
_bot_mapper->committed_size() +
_cardtable_mapper->committed_size();
_card_table_mapper->committed_size() +
_refinement_table_mapper->committed_size();
size_t committed_sz =
_bitmap_mapper->reserved_size() +
_bot_mapper->reserved_size() +
_cardtable_mapper->reserved_size();
_card_table_mapper->reserved_size() +
_refinement_table_mapper->reserved_size();
return MemoryUsage(0, used_sz, committed_sz, committed_sz);
}


@ -74,7 +74,8 @@ class G1HeapRegionManager: public CHeapObj<mtGC> {
friend class G1HeapRegionClaimer;
G1RegionToSpaceMapper* _bot_mapper;
G1RegionToSpaceMapper* _cardtable_mapper;
G1RegionToSpaceMapper* _card_table_mapper;
G1RegionToSpaceMapper* _refinement_table_mapper;
// Keeps track of the currently committed regions in the heap. The committed regions
// can either be active (ready for use) or inactive (ready for uncommit).
@ -161,7 +162,8 @@ public:
void initialize(G1RegionToSpaceMapper* heap_storage,
G1RegionToSpaceMapper* bitmap,
G1RegionToSpaceMapper* bot,
G1RegionToSpaceMapper* cardtable);
G1RegionToSpaceMapper* card_table,
G1RegionToSpaceMapper* refinement_table);
// Return the "dummy" region used for G1AllocRegion. This is currently a hardwired
// new G1HeapRegion that owns G1HeapRegion at index 0. Since at the moment we commit


@ -42,6 +42,7 @@
#include "oops/compressedOops.inline.hpp"
#include "oops/oop.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/threads.hpp"
int G1HeapVerifier::_enabled_verification_types = G1HeapVerifier::G1VerifyAll;
@ -528,6 +529,7 @@ void G1HeapVerifier::verify_before_gc() {
void G1HeapVerifier::verify_after_gc() {
verify(VerifyOption::G1UseConcMarking, "After GC");
verify_card_tables_in_sync();
}
void G1HeapVerifier::verify_bitmap_clear(bool from_tams) {
@ -556,17 +558,17 @@ void G1HeapVerifier::verify_bitmap_clear(bool from_tams) {
G1CollectedHeap::heap()->heap_region_iterate(&cl);
}
#ifndef PRODUCT
class G1VerifyCardTableCleanup: public G1HeapRegionClosure {
G1HeapVerifier* _verifier;
public:
G1VerifyCardTableCleanup(G1HeapVerifier* verifier)
: _verifier(verifier) { }
virtual bool do_heap_region(G1HeapRegion* r) {
_verifier->verify_ct_clean_region(r);
if (r->is_survivor()) {
_verifier->verify_dirty_region(r);
_verifier->verify_rt_clean_region(r);
} else {
_verifier->verify_not_dirty_region(r);
_verifier->verify_rt_clean_from_top(r);
}
return false;
}
@ -579,14 +581,35 @@ void G1HeapVerifier::verify_card_table_cleanup() {
}
}
void G1HeapVerifier::verify_not_dirty_region(G1HeapRegion* hr) {
// All of the region should be clean.
G1CardTable* ct = _g1h->card_table();
MemRegion mr(hr->bottom(), hr->end());
ct->verify_not_dirty_region(mr);
class G1VerifyCardTablesClean: public G1HeapRegionClosure {
G1HeapVerifier* _verifier;
bool _both_card_tables;
public:
G1VerifyCardTablesClean(G1HeapVerifier* verifier, bool both_card_tables = true)
: _verifier(verifier), _both_card_tables(both_card_tables) { }
virtual bool do_heap_region(G1HeapRegion* r) {
_verifier->verify_rt_clean_region(r); // Must be all Clean from bottom -> end.
if (_both_card_tables) {
_verifier->verify_ct_clean_region(r);
}
return false;
}
};
void G1HeapVerifier::verify_card_tables_clean(bool both_card_tables) {
G1VerifyCardTablesClean cl(this, both_card_tables);
_g1h->heap_region_iterate(&cl);
}
void G1HeapVerifier::verify_dirty_region(G1HeapRegion* hr) {
void G1HeapVerifier::verify_rt_clean_from_top(G1HeapRegion* hr) {
G1CardTable* ct = _g1h->refinement_table();
MemRegion mr(align_up(hr->top(), G1CardTable::card_size()), hr->end());
ct->verify_region(mr, G1CardTable::clean_card_val(), true);
}
void G1HeapVerifier::verify_rt_dirty_to_dummy_top(G1HeapRegion* hr) {
// We cannot guarantee that [bottom(),end()] is dirty. Threads
// dirty allocated blocks as they allocate them. The thread that
// retires each region and replaces it with a new one will do a
@ -594,29 +617,56 @@ void G1HeapVerifier::verify_dirty_region(G1HeapRegion* hr) {
// not dirty that area (one less thing to have to do while holding
// a lock). So we can only verify that [bottom(),pre_dummy_top()]
// is dirty.
G1CardTable* ct = _g1h->card_table();
G1CardTable* ct = _g1h->refinement_table();
MemRegion mr(hr->bottom(), hr->pre_dummy_top());
if (hr->is_young()) {
ct->verify_g1_young_region(mr);
} else {
ct->verify_dirty_region(mr);
}
ct->verify_dirty_region(mr);
}
class G1VerifyDirtyYoungListClosure : public G1HeapRegionClosure {
private:
G1HeapVerifier* _verifier;
public:
G1VerifyDirtyYoungListClosure(G1HeapVerifier* verifier) : G1HeapRegionClosure(), _verifier(verifier) { }
virtual bool do_heap_region(G1HeapRegion* r) {
_verifier->verify_dirty_region(r);
return false;
}
};
void G1HeapVerifier::verify_ct_clean_region(G1HeapRegion* hr) {
G1CardTable* ct = _g1h->card_table();
MemRegion mr(hr->bottom(), hr->end());
ct->verify_region(mr, G1CardTable::clean_card_val(), true);
}
void G1HeapVerifier::verify_dirty_young_regions() {
G1VerifyDirtyYoungListClosure cl(this);
_g1h->collection_set()->iterate(&cl);
void G1HeapVerifier::verify_rt_clean_region(G1HeapRegion* hr) {
G1CardTable* ct = _g1h->refinement_table();
MemRegion mr(hr->bottom(), hr->end());
ct->verify_region(mr, G1CardTable::clean_card_val(), true);
}
#ifndef PRODUCT
void G1HeapVerifier::verify_card_tables_in_sync() {
// Non-Java thread card tables must be null.
class AssertCardTableBaseNull : public ThreadClosure {
public:
void do_thread(Thread* thread) {
ResourceMark rm;
assert(G1ThreadLocalData::get_byte_map_base(thread) == nullptr, "thread " PTR_FORMAT " (%s) has non-null card table base",
p2i(thread), thread->name());
}
} check_null_cl;
Threads::non_java_threads_do(&check_null_cl);
// Java thread card tables must be the same as the global card table.
class AssertSameCardTableClosure : public ThreadClosure {
public:
void do_thread(Thread* thread) {
G1CardTable::CardValue* global_ct_base = G1CollectedHeap::heap()->card_table_base();
G1CardTable::CardValue* cur_ct_base = G1ThreadLocalData::get_byte_map_base(thread);
ResourceMark rm;
assert(cur_ct_base == global_ct_base,
"thread " PTR_FORMAT " (%s) has wrong card table base, should be " PTR_FORMAT " is " PTR_FORMAT,
p2i(thread), thread->name(), p2i(global_ct_base), p2i(cur_ct_base));
}
} check_same_cl;
Threads::java_threads_do(&check_same_cl);
}
class G1CheckRegionAttrTableClosure : public G1HeapRegionClosure {


@ -1,5 +1,5 @@
/*
* Copyright (c) 2016, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2016, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -78,11 +78,16 @@ public:
// Do sanity check on the contents of the in-cset fast test table.
bool check_region_attr_table() PRODUCT_RETURN_( return true; );
void verify_card_table_cleanup() PRODUCT_RETURN;
void verify_card_table_cleanup();
void verify_card_tables_clean(bool both_card_tables);
void verify_not_dirty_region(G1HeapRegion* hr) PRODUCT_RETURN;
void verify_dirty_region(G1HeapRegion* hr) PRODUCT_RETURN;
void verify_dirty_young_regions() PRODUCT_RETURN;
void verify_ct_clean_region(G1HeapRegion* hr);
void verify_rt_dirty_to_dummy_top(G1HeapRegion* hr);
void verify_rt_clean_from_top(G1HeapRegion* hr);
void verify_rt_clean_region(G1HeapRegion* hr);
// Verify that the global card table and the per-thread card tables are in sync.
void verify_card_tables_in_sync() PRODUCT_RETURN;
};
#endif // SHARE_GC_G1_G1HEAPVERIFIER_HPP


@ -1,5 +1,5 @@
/*
* Copyright (c) 2001, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2001, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -86,19 +86,19 @@ public:
// This closure is applied to the fields of the objects that have just been copied during evacuation.
class G1ScanEvacuatedObjClosure : public G1ScanClosureBase {
friend class G1SkipCardEnqueueSetter;
friend class G1SkipCardMarkSetter;
enum SkipCardEnqueueTristate {
enum SkipCardMarkTristate {
False = 0,
True,
Uninitialized
};
SkipCardEnqueueTristate _skip_card_enqueue;
SkipCardMarkTristate _skip_card_mark;
public:
G1ScanEvacuatedObjClosure(G1CollectedHeap* g1h, G1ParScanThreadState* par_scan_state) :
G1ScanClosureBase(g1h, par_scan_state), _skip_card_enqueue(Uninitialized) { }
G1ScanClosureBase(g1h, par_scan_state), _skip_card_mark(Uninitialized) { }
template <class T> void do_oop_work(T* p);
virtual void do_oop(oop* p) { do_oop_work(p); }
@ -109,22 +109,22 @@ public:
}
#ifdef ASSERT
bool skip_card_enqueue_set() const { return _skip_card_enqueue != Uninitialized; }
bool skip_card_mark_set() const { return _skip_card_mark != Uninitialized; }
#endif
};
// RAII object to properly set the _skip_card_enqueue field in G1ScanEvacuatedObjClosure.
class G1SkipCardEnqueueSetter : public StackObj {
// RAII object to properly set the _skip_card_mark field in G1ScanEvacuatedObjClosure.
class G1SkipCardMarkSetter : public StackObj {
G1ScanEvacuatedObjClosure* _closure;
public:
G1SkipCardEnqueueSetter(G1ScanEvacuatedObjClosure* closure, bool skip_card_enqueue) : _closure(closure) {
assert(_closure->_skip_card_enqueue == G1ScanEvacuatedObjClosure::Uninitialized, "Must not be set");
_closure->_skip_card_enqueue = skip_card_enqueue ? G1ScanEvacuatedObjClosure::True : G1ScanEvacuatedObjClosure::False;
G1SkipCardMarkSetter(G1ScanEvacuatedObjClosure* closure, bool skip_card_mark) : _closure(closure) {
assert(_closure->_skip_card_mark == G1ScanEvacuatedObjClosure::Uninitialized, "Must not be set");
_closure->_skip_card_mark = skip_card_mark ? G1ScanEvacuatedObjClosure::True : G1ScanEvacuatedObjClosure::False;
}
~G1SkipCardEnqueueSetter() {
DEBUG_ONLY(_closure->_skip_card_enqueue = G1ScanEvacuatedObjClosure::Uninitialized;)
~G1SkipCardMarkSetter() {
DEBUG_ONLY(_closure->_skip_card_mark = G1ScanEvacuatedObjClosure::Uninitialized;)
}
};
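
For orientation, a usage sketch of this RAII helper, mirroring the call sites changed later in this commit (do_partial_array, do_copy_to_survivor_space and handle_evacuation_failure_par in g1ParScanThreadState.cpp); it is illustrative only, not additional new API:

    // Card marking is skipped exactly when the destination is a newly allocated
    // survivor region: such regions are scanned anyway during the next GC.
    {
      G1SkipCardMarkSetter x(&_scanner, dest_attr.is_new_survivor());
      obj->oop_iterate_backwards(&_scanner, klass);
    } // In debug builds the destructor resets _skip_card_mark to Uninitialized.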
@ -206,13 +206,20 @@ public:
class G1ConcurrentRefineOopClosure: public BasicOopIterateClosure {
G1CollectedHeap* _g1h;
uint _worker_id;
bool _has_ref_to_cset;
bool _has_ref_to_old;
public:
G1ConcurrentRefineOopClosure(G1CollectedHeap* g1h, uint worker_id) :
_g1h(g1h),
_worker_id(worker_id) {
_worker_id(worker_id),
_has_ref_to_cset(false),
_has_ref_to_old(false) {
}
bool has_ref_to_cset() const { return _has_ref_to_cset; }
bool has_ref_to_old() const { return _has_ref_to_old; }
virtual ReferenceIterationMode reference_iteration_mode() { return DO_FIELDS; }
template <class T> void do_oop_work(T* p);
@ -223,6 +230,7 @@ public:
class G1RebuildRemSetClosure : public BasicOopIterateClosure {
G1CollectedHeap* _g1h;
uint _worker_id;
public:
G1RebuildRemSetClosure(G1CollectedHeap* g1h, uint worker_id) : _g1h(g1h), _worker_id(worker_id) {
}


@ -90,11 +90,11 @@ inline void G1ScanEvacuatedObjClosure::do_oop_work(T* p) {
prefetch_and_push(p, obj);
} else if (!G1HeapRegion::is_in_same_region(p, obj)) {
handle_non_cset_obj_common(region_attr, p, obj);
assert(_skip_card_enqueue != Uninitialized, "Scan location has not been initialized.");
if (_skip_card_enqueue == True) {
assert(_skip_card_mark != Uninitialized, "Scan location has not been initialized.");
if (_skip_card_mark == True) {
return;
}
_par_scan_state->enqueue_card_if_tracked(region_attr, p, obj);
_par_scan_state->mark_card_if_tracked(region_attr, p, obj);
}
}
@ -127,6 +127,11 @@ inline static void check_obj_during_refinement(T* p, oop const obj) {
template <class T>
inline void G1ConcurrentRefineOopClosure::do_oop_work(T* p) {
// Early out if we already found a to-young reference.
if (_has_ref_to_cset) {
return;
}
T o = RawAccess<MO_RELAXED>::oop_load(p);
if (CompressedOops::is_null(o)) {
return;
@ -146,7 +151,12 @@ inline void G1ConcurrentRefineOopClosure::do_oop_work(T* p) {
return;
}
G1HeapRegionRemSet* to_rem_set = _g1h->heap_region_containing(obj)->rem_set();
G1HeapRegion* to_region = _g1h->heap_region_containing(obj);
if (to_region->is_young()) {
_has_ref_to_cset = true;
return;
}
G1HeapRegionRemSet* to_rem_set = to_region->rem_set();
assert(to_rem_set != nullptr, "Need per-region 'into' remsets.");
if (to_rem_set->is_tracked()) {
@ -154,6 +164,7 @@ inline void G1ConcurrentRefineOopClosure::do_oop_work(T* p) {
if (from->rem_set()->cset_group() != to_rem_set->cset_group()) {
to_rem_set->add_reference(p, _worker_id);
_has_ref_to_old = true;
}
}
}
@ -180,7 +191,7 @@ inline void G1ScanCardClosure::do_oop_work(T* p) {
_heap_roots_found++;
} else if (!G1HeapRegion::is_in_same_region(p, obj)) {
handle_non_cset_obj_common(region_attr, p, obj);
_par_scan_state->enqueue_card_if_tracked(region_attr, p, obj);
_par_scan_state->mark_card_if_tracked(region_attr, p, obj);
}
}
@ -272,10 +283,14 @@ template <class T> void G1RebuildRemSetClosure::do_oop_work(T* p) {
G1HeapRegion* to = _g1h->heap_region_containing(obj);
G1HeapRegionRemSet* rem_set = to->rem_set();
if (rem_set->is_tracked()) {
G1HeapRegion* from = _g1h->heap_region_containing(p);
if (to->is_young()) {
G1BarrierSet::g1_barrier_set()->write_ref_field_post(p);
} else {
G1HeapRegion* from = _g1h->heap_region_containing(p);
if (from->rem_set()->cset_group() != rem_set->cset_group()) {
rem_set->add_reference(p, _worker_id);
if (from->rem_set()->cset_group() != rem_set->cset_group()) {
rem_set->add_reference(p, _worker_id);
}
}
}
}


@ -57,22 +57,21 @@
#define MAYBE_INLINE_EVACUATION NOT_DEBUG(inline) DEBUG_ONLY(NOINLINE)
G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h,
G1RedirtyCardsQueueSet* rdcqs,
uint worker_id,
uint num_workers,
G1CollectionSet* collection_set,
G1EvacFailureRegions* evac_failure_regions)
: _g1h(g1h),
_task_queue(g1h->task_queue(worker_id)),
_rdc_local_qset(rdcqs),
_ct(g1h->card_table()),
_ct(g1h->refinement_table()),
_closures(nullptr),
_plab_allocator(nullptr),
_age_table(false),
_tenuring_threshold(g1h->policy()->tenuring_threshold()),
_scanner(g1h, this),
_worker_id(worker_id),
_last_enqueued_card(SIZE_MAX),
_num_cards_marked_dirty(0),
_num_cards_marked_to_cset(0),
_stack_trim_upper_threshold(GCDrainStackTargetSize * 2 + 1),
_stack_trim_lower_threshold(GCDrainStackTargetSize),
_trim_ticks(),
@ -88,7 +87,7 @@ G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h,
ALLOCATION_FAILURE_INJECTOR_ONLY(_allocation_failure_inject_counter(0) COMMA)
_evacuation_failed_info(),
_evac_failure_regions(evac_failure_regions),
_evac_failure_enqueued_cards(0)
_num_cards_from_evac_failure(0)
{
// We allocate one entry more than the number of young gen regions in the collection
// set, since entry 0 keeps track of surviving bytes for non-young regions.
@ -112,8 +111,7 @@ G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h,
initialize_numa_stats();
}
size_t G1ParScanThreadState::flush_stats(size_t* surviving_young_words, uint num_workers, BufferNodeList* rdc_buffers) {
*rdc_buffers = _rdc_local_qset.flush();
size_t G1ParScanThreadState::flush_stats(size_t* surviving_young_words, uint num_workers) {
flush_numa_stats();
// Update allocation statistics.
_plab_allocator->flush_and_retire_stats(num_workers);
@ -147,8 +145,16 @@ size_t G1ParScanThreadState::lab_undo_waste_words() const {
return _plab_allocator->undo_waste();
}
size_t G1ParScanThreadState::evac_failure_enqueued_cards() const {
return _evac_failure_enqueued_cards;
size_t G1ParScanThreadState::num_cards_pending() const {
return _num_cards_marked_dirty + _num_cards_from_evac_failure;
}
size_t G1ParScanThreadState::num_cards_marked() const {
return num_cards_pending() + _num_cards_marked_to_cset;
}
size_t G1ParScanThreadState::num_cards_from_evac_failure() const {
return _num_cards_from_evac_failure;
}
#ifdef ASSERT
@ -230,7 +236,7 @@ void G1ParScanThreadState::do_partial_array(PartialArrayState* state, bool stole
PartialArraySplitter::Claim claim =
_partial_array_splitter.claim(state, _task_queue, stolen);
G1HeapRegionAttr dest_attr = _g1h->region_attr(to_array);
G1SkipCardEnqueueSetter x(&_scanner, dest_attr.is_new_survivor());
G1SkipCardMarkSetter x(&_scanner, dest_attr.is_new_survivor());
// Process claimed task.
to_array->oop_iterate_range(&_scanner,
checked_cast<int>(claim._start),
@ -250,7 +256,7 @@ void G1ParScanThreadState::start_partial_objarray(oop from_obj,
// The source array is unused when processing states.
_partial_array_splitter.start(_task_queue, nullptr, to_array, array_length);
assert(_scanner.skip_card_enqueue_set(), "must be");
assert(_scanner.skip_card_mark_set(), "must be");
// Process the initial chunk. No need to process the type in the
// klass, as it will already be handled by processing the built-in
// module.
@ -451,7 +457,7 @@ void G1ParScanThreadState::do_iterate_object(oop const obj,
_string_dedup_requests.add(old);
}
assert(_scanner.skip_card_enqueue_set(), "must be");
assert(_scanner.skip_card_mark_set(), "must be");
obj->oop_iterate_backwards(&_scanner, klass);
}
@ -546,7 +552,7 @@ oop G1ParScanThreadState::do_copy_to_survivor_space(G1HeapRegionAttr const regio
// Instead, we use dest_attr.is_young() because the two values are always
// equal: successfully allocated young regions must be survivor regions.
assert(dest_attr.is_young() == _g1h->heap_region_containing(obj)->is_survivor(), "must be");
G1SkipCardEnqueueSetter x(&_scanner, dest_attr.is_young());
G1SkipCardMarkSetter x(&_scanner, dest_attr.is_young());
do_iterate_object(obj, old, klass, region_attr, dest_attr, age);
}
@ -569,7 +575,7 @@ G1ParScanThreadState* G1ParScanThreadStateSet::state_for_worker(uint worker_id)
assert(worker_id < _num_workers, "out of bounds access");
if (_states[worker_id] == nullptr) {
_states[worker_id] =
new G1ParScanThreadState(_g1h, rdcqs(),
new G1ParScanThreadState(_g1h,
worker_id,
_num_workers,
_collection_set,
@ -595,22 +601,24 @@ void G1ParScanThreadStateSet::flush_stats() {
// because it resets the PLAB allocator where we get this info from.
size_t lab_waste_bytes = pss->lab_waste_words() * HeapWordSize;
size_t lab_undo_waste_bytes = pss->lab_undo_waste_words() * HeapWordSize;
size_t copied_bytes = pss->flush_stats(_surviving_young_words_total, _num_workers, &_rdc_buffers[worker_id]) * HeapWordSize;
size_t evac_fail_enqueued_cards = pss->evac_failure_enqueued_cards();
size_t copied_bytes = pss->flush_stats(_surviving_young_words_total, _num_workers) * HeapWordSize;
size_t pending_cards = pss->num_cards_pending();
size_t to_young_gen_cards = pss->num_cards_marked() - pss->num_cards_pending();
size_t evac_failure_cards = pss->num_cards_from_evac_failure();
size_t marked_cards = pss->num_cards_marked();
p->record_or_add_thread_work_item(G1GCPhaseTimes::MergePSS, worker_id, copied_bytes, G1GCPhaseTimes::MergePSSCopiedBytes);
p->record_or_add_thread_work_item(G1GCPhaseTimes::MergePSS, worker_id, lab_waste_bytes, G1GCPhaseTimes::MergePSSLABWasteBytes);
p->record_or_add_thread_work_item(G1GCPhaseTimes::MergePSS, worker_id, lab_undo_waste_bytes, G1GCPhaseTimes::MergePSSLABUndoWasteBytes);
p->record_or_add_thread_work_item(G1GCPhaseTimes::MergePSS, worker_id, evac_fail_enqueued_cards, G1GCPhaseTimes::MergePSSEvacFailExtra);
p->record_or_add_thread_work_item(G1GCPhaseTimes::MergePSS, worker_id, pending_cards, G1GCPhaseTimes::MergePSSPendingCards);
p->record_or_add_thread_work_item(G1GCPhaseTimes::MergePSS, worker_id, to_young_gen_cards, G1GCPhaseTimes::MergePSSToYoungGenCards);
p->record_or_add_thread_work_item(G1GCPhaseTimes::MergePSS, worker_id, evac_failure_cards, G1GCPhaseTimes::MergePSSEvacFail);
p->record_or_add_thread_work_item(G1GCPhaseTimes::MergePSS, worker_id, marked_cards, G1GCPhaseTimes::MergePSSMarked);
delete pss;
_states[worker_id] = nullptr;
}
G1DirtyCardQueueSet& dcq = G1BarrierSet::dirty_card_queue_set();
dcq.merge_bufferlists(rdcqs());
rdcqs()->verify_empty();
_flushed = true;
}
@ -652,7 +660,7 @@ oop G1ParScanThreadState::handle_evacuation_failure_par(oop old, markWord m, Kla
// existing closure to scan evacuated objects; since we are iterating from a
// collection set region (i.e. never a Survivor region), we always need to
// gather cards for this case.
G1SkipCardEnqueueSetter x(&_scanner, false /* skip_card_enqueue */);
G1SkipCardMarkSetter x(&_scanner, false /* skip_card_mark */);
do_iterate_object(old, old, klass, attr, attr, m.age());
}
@ -709,9 +717,7 @@ G1ParScanThreadStateSet::G1ParScanThreadStateSet(G1CollectedHeap* g1h,
G1EvacFailureRegions* evac_failure_regions) :
_g1h(g1h),
_collection_set(collection_set),
_rdcqs(G1BarrierSet::dirty_card_queue_set().allocator()),
_states(NEW_C_HEAP_ARRAY(G1ParScanThreadState*, num_workers, mtGC)),
_rdc_buffers(NEW_C_HEAP_ARRAY(BufferNodeList, num_workers, mtGC)),
_surviving_young_words_total(NEW_C_HEAP_ARRAY(size_t, collection_set->young_region_length() + 1, mtGC)),
_num_workers(num_workers),
_flushed(false),
@ -719,7 +725,6 @@ G1ParScanThreadStateSet::G1ParScanThreadStateSet(G1CollectedHeap* g1h,
{
for (uint i = 0; i < num_workers; ++i) {
_states[i] = nullptr;
_rdc_buffers[i] = BufferNodeList();
}
memset(_surviving_young_words_total, 0, (collection_set->young_region_length() + 1) * sizeof(size_t));
}
@ -728,7 +733,6 @@ G1ParScanThreadStateSet::~G1ParScanThreadStateSet() {
assert(_flushed, "thread local state from the per thread states should have been flushed");
FREE_C_HEAP_ARRAY(G1ParScanThreadState*, _states);
FREE_C_HEAP_ARRAY(size_t, _surviving_young_words_total);
FREE_C_HEAP_ARRAY(BufferNodeList, _rdc_buffers);
}
#if TASKQUEUE_STATS


@ -1,5 +1,5 @@
/*
* Copyright (c) 2014, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -27,7 +27,6 @@
#include "gc/g1/g1CollectedHeap.hpp"
#include "gc/g1/g1OopClosures.hpp"
#include "gc/g1/g1RedirtyCardsQueue.hpp"
#include "gc/g1/g1YoungGCAllocationFailureInjector.hpp"
#include "gc/shared/ageTable.hpp"
#include "gc/shared/copyFailedInfo.hpp"
@ -52,7 +51,6 @@ class outputStream;
class G1ParScanThreadState : public CHeapObj<mtGC> {
G1CollectedHeap* _g1h;
G1ScannerTasksQueue* _task_queue;
G1RedirtyCardsLocalQueueSet _rdc_local_qset;
G1CardTable* _ct;
G1EvacuationRootClosures* _closures;
@ -65,9 +63,8 @@ class G1ParScanThreadState : public CHeapObj<mtGC> {
uint _worker_id;
// Remember the last enqueued card to avoid enqueuing the same card over and over;
// since we only ever scan a card once, this is sufficient.
size_t _last_enqueued_card;
size_t _num_cards_marked_dirty;
size_t _num_cards_marked_to_cset;
// Upper and lower threshold to start and end work queue draining.
uint const _stack_trim_upper_threshold;
@ -104,22 +101,19 @@ class G1ParScanThreadState : public CHeapObj<mtGC> {
EvacuationFailedInfo _evacuation_failed_info;
G1EvacFailureRegions* _evac_failure_regions;
// Number of additional cards into evacuation failed regions enqueued into
// the local DCQS. This is an approximation, as cards that would be added later
// outside of evacuation failure will not be subtracted again.
size_t _evac_failure_enqueued_cards;
// Number of additional cards into evacuation failed regions.
size_t _num_cards_from_evac_failure;
// Enqueue the card if not already in the set; this is a best-effort attempt on
// Mark the card if it is not already marked; this is a best-effort attempt at
// detecting duplicates.
template <class T> bool enqueue_if_new(T* p);
// Enqueue the card of p into the (evacuation failed) region.
template <class T> void enqueue_card_into_evac_fail_region(T* p, oop obj);
template <class T> bool mark_if_new(T* p, bool into_survivor);
// Mark the card of p into the (evacuation failed) region.
template <class T> void mark_card_into_evac_fail_region(T* p, oop obj);
bool inject_allocation_failure(uint region_idx) ALLOCATION_FAILURE_INJECTOR_RETURN_( return false; );
public:
G1ParScanThreadState(G1CollectedHeap* g1h,
G1RedirtyCardsQueueSet* rdcqs,
uint worker_id,
uint num_workers,
G1CollectionSet* collection_set,
@ -139,16 +133,16 @@ public:
void push_on_queue(ScannerTask task);
// Apply the post barrier to the given reference field. Enqueues the card of p
// Apply the post barrier to the given reference field. Marks the card of p
// if the barrier does not filter out the reference for some reason (e.g.
// p and obj are in the same region, p is in survivor, p is in collection set).
// To be called during GC if nothing particular about p and obj are known.
template <class T> void write_ref_field_post(T* p, oop obj);
// Enqueue the card if the reference's target region's remembered set is tracked.
// Mark the card if the reference's target region's remembered set is tracked.
// Assumes that a significant amount of pre-filtering (like done by
// write_ref_field_post() above) has already been performed.
template <class T> void enqueue_card_if_tracked(G1HeapRegionAttr region_attr, T* p, oop o);
template <class T> void mark_card_if_tracked(G1HeapRegionAttr region_attr, T* p, oop o);
G1EvacuationRootClosures* closures() { return _closures; }
uint worker_id() { return _worker_id; }
@ -156,11 +150,22 @@ public:
size_t lab_waste_words() const;
size_t lab_undo_waste_words() const;
size_t evac_failure_enqueued_cards() const;
// Newly marked cards during this garbage collection, to be refined concurrently
// later. Contains both cards marked because of new cross-region references and
// cards marked because of references into evacuation failed regions.
// Does not contain cards into the next collection set (e.g. survivors); they will not
// be refined concurrently. Calculation is done on a best-effort basis.
size_t num_cards_pending() const;
// Number of cards newly generated by references into evacuation failed regions.
// Calculation is done on a best-effort basis.
size_t num_cards_from_evac_failure() const;
// Sum of cards marked by evacuation. Contains both pending cards and cards into
// the next collection set (e.g. survivors).
size_t num_cards_marked() const;
// Pass locally gathered statistics to global state. Returns the total number of
// HeapWords copied.
size_t flush_stats(size_t* surviving_young_words, uint num_workers, BufferNodeList* buffer_log);
size_t flush_stats(size_t* surviving_young_words, uint num_workers);
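
A minimal, self-contained sketch of the accounting these accessors imply, and which flush_stats() relies on when it reports to-young-gen cards as marked minus pending; the names in the comments refer to the fields above, and the numbers are invented:

    #include <cassert>
    #include <cstddef>

    int main() {
      std::size_t marked_dirty   = 120; // _num_cards_marked_dirty
      std::size_t from_evac_fail = 30;  // _num_cards_from_evac_failure
      std::size_t marked_to_cset = 50;  // _num_cards_marked_to_cset

      std::size_t pending = marked_dirty + from_evac_fail; // num_cards_pending()
      std::size_t marked  = pending + marked_to_cset;      // num_cards_marked()

      // Consumers derive the number of to-young-gen cards as marked - pending.
      assert(marked - pending == marked_to_cset);
      return 0;
    }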
#if TASKQUEUE_STATS
PartialArrayTaskStats* partial_array_task_stats();
@ -249,9 +254,7 @@ public:
class G1ParScanThreadStateSet : public StackObj {
G1CollectedHeap* _g1h;
G1CollectionSet* _collection_set;
G1RedirtyCardsQueueSet _rdcqs;
G1ParScanThreadState** _states;
BufferNodeList* _rdc_buffers;
size_t* _surviving_young_words_total;
uint _num_workers;
bool _flushed;
@ -264,9 +267,6 @@ class G1ParScanThreadStateSet : public StackObj {
G1EvacFailureRegions* evac_failure_regions);
~G1ParScanThreadStateSet();
G1RedirtyCardsQueueSet* rdcqs() { return &_rdcqs; }
BufferNodeList* rdc_buffers() { return _rdc_buffers; }
void flush_stats();
void record_unused_optional_region(G1HeapRegion* hr);
#if TASKQUEUE_STATS


@ -96,25 +96,24 @@ G1OopStarChunkedList* G1ParScanThreadState::oops_into_optional_region(const G1He
return &_oops_into_optional_regions[hr->index_in_opt_cset()];
}
template <class T> bool G1ParScanThreadState::enqueue_if_new(T* p) {
size_t card_index = ct()->index_for(p);
// If the card hasn't been added to the buffer, do it.
if (_last_enqueued_card != card_index) {
_rdc_local_qset.enqueue(ct()->byte_for_index(card_index));
_last_enqueued_card = card_index;
template <class T> bool G1ParScanThreadState::mark_if_new(T* p, bool into_new_survivor) {
G1CardTable::CardValue* card = ct()->byte_for(p);
G1CardTable::CardValue value = *card;
if (value == G1CardTable::clean_card_val()) {
*card = into_new_survivor ? G1CardTable::g1_to_cset_card : G1CardTable::g1_dirty_card;
return true;
} else {
return false;
}
}
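
A standalone sketch of the conditional marking scheme above: a card is written only if it is still clean, and the first marking decides between the to-collection-set and the dirty value. The card values used here are stand-ins, not the actual G1CardTable constants:

    #include <cstdint>
    #include <cstdio>

    enum : std::uint8_t { kClean = 0xFF, kDirty = 0x00, kToCset = 0x02 }; // illustrative values

    static bool mark_if_new(std::uint8_t* card, bool into_new_survivor) {
      if (*card != kClean) {
        return false;                                // already marked, nothing to count
      }
      *card = into_new_survivor ? kToCset : kDirty;  // first marking wins
      return true;                                   // caller updates its statistics
    }

    int main() {
      std::uint8_t card = kClean;
      bool first  = mark_if_new(&card, false);
      bool second = mark_if_new(&card, false);
      std::printf("%d %d\n", first, second); // prints "1 0"
      return 0;
    }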
template <class T> void G1ParScanThreadState::enqueue_card_into_evac_fail_region(T* p, oop obj) {
template <class T> void G1ParScanThreadState::mark_card_into_evac_fail_region(T* p, oop obj) {
assert(!G1HeapRegion::is_in_same_region(p, obj), "Should have filtered out cross-region references already.");
assert(!_g1h->heap_region_containing(p)->is_survivor(), "Should have filtered out from-newly allocated survivor references already.");
assert(_g1h->heap_region_containing(obj)->in_collection_set(), "Only for marking references into collection set regions");
if (enqueue_if_new(p)) {
_evac_failure_enqueued_cards++;
if (mark_if_new(p, false /* into_new_survivor */)) { // The reference is never into survivor regions.
_num_cards_from_evac_failure++;
}
}
@ -137,18 +136,18 @@ template <class T> void G1ParScanThreadState::write_ref_field_post(T* p, oop obj
if (dest_attr.is_in_cset()) {
assert(obj->is_forwarded(), "evac-failed but not forwarded: " PTR_FORMAT, p2i(obj));
assert(obj->forwardee() == obj, "evac-failed but not self-forwarded: " PTR_FORMAT, p2i(obj));
enqueue_card_into_evac_fail_region(p, obj);
mark_card_into_evac_fail_region(p, obj);
return;
}
enqueue_card_if_tracked(dest_attr, p, obj);
mark_card_if_tracked(dest_attr, p, obj);
}
template <class T> void G1ParScanThreadState::enqueue_card_if_tracked(G1HeapRegionAttr region_attr, T* p, oop o) {
template <class T> void G1ParScanThreadState::mark_card_if_tracked(G1HeapRegionAttr region_attr, T* p, oop o) {
assert(!G1HeapRegion::is_in_same_region(p, o), "Should have filtered out cross-region references already.");
assert(!_g1h->heap_region_containing(p)->is_survivor(), "Should have filtered out from-newly allocated survivor references already.");
// We relabel all regions that failed evacuation as old gen without remembered sets,
// and so pre-filter them out in the caller.
assert(!_g1h->heap_region_containing(o)->in_collection_set(), "Should not try to enqueue reference into collection set region");
assert(!_g1h->heap_region_containing(o)->in_collection_set(), "Should not try to mark reference into collection set region");
#ifdef ASSERT
G1HeapRegion* const hr_obj = _g1h->heap_region_containing(o);
@ -161,7 +160,14 @@ template <class T> void G1ParScanThreadState::enqueue_card_if_tracked(G1HeapRegi
if (!region_attr.remset_is_tracked()) {
return;
}
enqueue_if_new(p);
bool into_survivor = region_attr.is_new_survivor();
if (mark_if_new(p, into_survivor)) {
if (into_survivor) {
_num_cards_marked_to_cset++;
} else {
_num_cards_marked_dirty++;
}
}
}
#endif // SHARE_GC_G1_G1PARSCANTHREADSTATE_INLINE_HPP


@ -67,8 +67,7 @@ G1Policy::G1Policy(STWGCTimer* gc_timer) :
_reserve_regions(0),
_young_gen_sizer(),
_free_regions_at_end_of_collection(0),
_card_rs_length(0),
_pending_cards_at_gc_start(0),
_pending_cards_from_gc(0),
_concurrent_start_to_mixed(),
_collection_set(nullptr),
_g1h(nullptr),
@ -553,12 +552,9 @@ G1GCPhaseTimes* G1Policy::phase_times() const {
return _phase_times;
}
void G1Policy::revise_young_list_target_length(size_t card_rs_length, size_t code_root_rs_length) {
void G1Policy::revise_young_list_target_length(size_t pending_cards, size_t card_rs_length, size_t code_root_rs_length) {
guarantee(use_adaptive_young_list_length(), "should not call this otherwise" );
size_t thread_buffer_cards = _analytics->predict_dirtied_cards_in_thread_buffers();
G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set();
size_t pending_cards = dcqs.num_cards() + thread_buffer_cards;
update_young_length_bounds(pending_cards, card_rs_length, code_root_rs_length);
}
@ -567,7 +563,7 @@ void G1Policy::record_full_collection_start() {
// Release the future to-space so that it is available for compaction into.
collector_state()->set_in_young_only_phase(false);
collector_state()->set_in_full_gc(true);
_pending_cards_at_gc_start = 0;
_collection_set->abandon_all_candidates();
}
void G1Policy::record_full_collection_end() {
@ -600,59 +596,70 @@ void G1Policy::record_full_collection_end() {
record_pause(G1GCPauseType::FullGC, start_time_sec, end_sec);
}
static void log_refinement_stats(const char* kind, const G1ConcurrentRefineStats& stats) {
static void log_refinement_stats(const G1ConcurrentRefineStats& stats) {
log_debug(gc, refine, stats)
("%s refinement: %.2fms, refined: %zu"
", precleaned: %zu, dirtied: %zu",
kind,
stats.refinement_time().seconds() * MILLIUNITS,
("Refinement: sweep: %.2fms, yield: %.2fms refined: %zu, dirtied: %zu",
TimeHelper::counter_to_millis(stats.sweep_duration()),
TimeHelper::counter_to_millis(stats.yield_during_sweep_duration()),
stats.refined_cards(),
stats.precleaned_cards(),
stats.dirtied_cards());
stats.cards_pending());
}
void G1Policy::record_concurrent_refinement_stats(size_t pending_cards,
size_t thread_buffer_cards) {
_pending_cards_at_gc_start = pending_cards;
_analytics->report_dirtied_cards_in_thread_buffers(thread_buffer_cards);
// Collect per-thread stats, mostly from mutator activity.
G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set();
G1ConcurrentRefineStats mut_stats = dcqs.concatenated_refinement_stats();
// Collect specialized concurrent refinement thread stats.
G1ConcurrentRefine* cr = _g1h->concurrent_refine();
G1ConcurrentRefineStats cr_stats = cr->get_and_reset_refinement_stats();
G1ConcurrentRefineStats total_stats = mut_stats + cr_stats;
log_refinement_stats("Mutator", mut_stats);
log_refinement_stats("Concurrent", cr_stats);
log_refinement_stats("Total", total_stats);
void G1Policy::record_refinement_stats(G1ConcurrentRefineStats* refine_stats) {
log_refinement_stats(*refine_stats);
// Record the rate at which cards were refined.
// Don't update the rate if the current sample is empty or time is zero.
Tickspan refinement_time = total_stats.refinement_time();
size_t refined_cards = total_stats.refined_cards();
if ((refined_cards > 0) && (refinement_time > Tickspan())) {
double rate = refined_cards / (refinement_time.seconds() * MILLIUNITS);
// Don't update the rate if the current sample is empty or time is zero (which is
// the case during GC).
double refinement_time = TimeHelper::counter_to_millis(refine_stats->sweep_duration());
size_t refined_cards = refine_stats->refined_cards();
if ((refined_cards > 0) && (refinement_time > 0)) {
double rate = refined_cards / refinement_time;
_analytics->report_concurrent_refine_rate_ms(rate);
log_debug(gc, refine, stats)("Concurrent refinement rate: %.2f cards/ms", rate);
log_debug(gc, refine, stats)("Concurrent refinement rate: %.2f cards/ms predicted: %.2f cards/ms", rate, _analytics->predict_concurrent_refine_rate_ms());
}
}
template<typename T>
static T saturated_sub(T x, T y) {
return (x < y) ? T() : (x - y);
}
void G1Policy::record_dirtying_stats(double last_mutator_start_dirty_ms,
double last_mutator_end_dirty_ms,
size_t pending_cards,
double yield_duration_ms,
size_t next_pending_cards_from_gc,
size_t next_to_collection_set_cards) {
assert(SafepointSynchronize::is_at_safepoint() || G1ReviseYoungLength_lock->is_locked(),
"must be (at safepoint %s locked %s)",
BOOL_TO_STR(SafepointSynchronize::is_at_safepoint()), BOOL_TO_STR(G1ReviseYoungLength_lock->is_locked()));
// Record mutator's card logging rate.
double mut_start_time = _analytics->prev_collection_pause_end_ms();
double mut_end_time = cur_pause_start_sec() * MILLIUNITS;
double mut_time = mut_end_time - mut_start_time;
// Unlike above for conc-refine rate, here we should not require a
// non-empty sample, since an application could go some time with only
// young-gen or filtered out writes. But we'll ignore unusually short
// sample periods, as they may just pollute the predictions.
if (mut_time > 1.0) { // Require > 1ms sample time.
double dirtied_rate = total_stats.dirtied_cards() / mut_time;
double const mutator_dirty_time_ms = (last_mutator_end_dirty_ms - last_mutator_start_dirty_ms) - yield_duration_ms;
assert(mutator_dirty_time_ms >= 0.0,
"must be (start: %.2f end: %.2f yield: %.2f)",
last_mutator_start_dirty_ms, last_mutator_end_dirty_ms, yield_duration_ms);
if (mutator_dirty_time_ms > 1.0) { // Require > 1ms sample time.
// The subtracted term, pending_cards_from_gc(), includes both dirtied and dirty-as-young cards,
// so it can be larger than what is actually considered "pending" (dirty cards only).
size_t dirtied_cards = saturated_sub(pending_cards, pending_cards_from_gc());
double dirtied_rate = dirtied_cards / mutator_dirty_time_ms;
_analytics->report_dirtied_cards_rate_ms(dirtied_rate);
log_debug(gc, refine, stats)("Generate dirty cards rate: %.2f cards/ms", dirtied_rate);
log_debug(gc, refine, stats)("Generate dirty cards rate: %.2f cards/ms dirtying time %.2f (start %.2f end %.2f yield %.2f) dirtied %zu (pending %zu during_gc %zu)",
dirtied_rate,
mutator_dirty_time_ms,
last_mutator_start_dirty_ms, last_mutator_end_dirty_ms, yield_duration_ms,
dirtied_cards, pending_cards, pending_cards_from_gc());
}
_pending_cards_from_gc = next_pending_cards_from_gc;
_to_collection_set_cards = next_to_collection_set_cards;
}
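
A worked example of the dirtying rate computed above, as a small standalone program; all numbers are invented and only illustrate the formula (mutator dirtying time excludes yields, and the GC-generated cards are subtracted from the pending cards):

    #include <cstddef>
    #include <cstdio>

    int main() {
      double start_ms = 1000.0, end_ms = 1105.0, yield_ms = 5.0;
      std::size_t pending_cards = 2200, pending_cards_from_gc = 200;

      double dirty_time_ms = (end_ms - start_ms) - yield_ms;        // 100 ms of mutator time
      std::size_t dirtied  = pending_cards - pending_cards_from_gc; // saturated_sub in the real code
      double rate = dirtied / dirty_time_ms;                        // 20 cards/ms

      std::printf("Generate dirty cards rate: %.2f cards/ms\n", rate);
      return 0;
    }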
bool G1Policy::should_retain_evac_failed_region(uint index) const {
@ -761,27 +768,27 @@ bool G1Policy::concurrent_operation_is_full_mark(const char* msg) {
((_g1h->gc_cause() != GCCause::_g1_humongous_allocation) || need_to_start_conc_mark(msg));
}
double G1Policy::logged_cards_processing_time() const {
double G1Policy::pending_cards_processing_time() const {
double all_cards_processing_time = average_time_ms(G1GCPhaseTimes::ScanHR) + average_time_ms(G1GCPhaseTimes::OptScanHR);
size_t logged_dirty_cards = phase_times()->sum_thread_work_items(G1GCPhaseTimes::MergeLB, G1GCPhaseTimes::MergeLBDirtyCards);
size_t pending_cards = phase_times()->sum_thread_work_items(G1GCPhaseTimes::ScanHR, G1GCPhaseTimes::ScanHRPendingCards) +
phase_times()->sum_thread_work_items(G1GCPhaseTimes::OptScanHR, G1GCPhaseTimes::ScanHRPendingCards);
size_t scan_heap_roots_cards = phase_times()->sum_thread_work_items(G1GCPhaseTimes::ScanHR, G1GCPhaseTimes::ScanHRScannedCards) +
phase_times()->sum_thread_work_items(G1GCPhaseTimes::OptScanHR, G1GCPhaseTimes::ScanHRScannedCards);
double merge_logged_cards_time = average_time_ms(G1GCPhaseTimes::MergeLB) +
phase_times()->cur_distribute_log_buffers_time_ms();
double merge_pending_cards_time = phase_times()->cur_merge_refinement_table_time();
// Approximate the time spent processing cards from log buffers by scaling
// the total processing time by the ratio of logged cards to total cards
// Approximate the time spent processing pending cards by scaling
// the total processing time by the ratio of pending cards to total cards
// processed. There might be duplicate cards in different log buffers,
// leading to an overestimate. That effect should be relatively small
// unless there are few cards to process, because cards in buffers are
// dirtied to limit duplication. Also need to avoid scaling when both
// counts are zero, which happens especially during early GCs. So ascribe
// all of the time to the logged cards unless there are more total cards.
if (logged_dirty_cards >= scan_heap_roots_cards) {
return all_cards_processing_time + merge_logged_cards_time;
// all of the time to the pending cards unless there are more total cards.
if (pending_cards >= scan_heap_roots_cards) {
return all_cards_processing_time + merge_pending_cards_time;
}
return (all_cards_processing_time * logged_dirty_cards / scan_heap_roots_cards) + merge_logged_cards_time;
return (all_cards_processing_time * pending_cards / scan_heap_roots_cards) + merge_pending_cards_time;
}
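
A worked example of the approximation above (invented numbers): the total scan time is attributed to pending cards in proportion to their share of all scanned cards, and the refinement table merge time is added on top; if there are at least as many pending cards as scanned cards, the whole scan time is ascribed to them.

    #include <cstddef>
    #include <cstdio>

    int main() {
      double all_cards_processing_time = 10.0; // Scan HR (+ Opt Scan HR) time, ms
      double merge_pending_cards_time  = 2.0;  // Merge Refinement Table time, ms
      std::size_t pending_cards        = 5000;
      std::size_t scanned_cards        = 50000;

      double result = (pending_cards >= scanned_cards)
        ? all_cards_processing_time + merge_pending_cards_time
        : all_cards_processing_time * pending_cards / scanned_cards + merge_pending_cards_time;

      std::printf("pending card processing time: %.2f ms\n", result); // 3.00 ms here
      return 0;
    }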
// Anything below that is considered to be zero
@ -815,6 +822,22 @@ void G1Policy::record_young_collection_end(bool concurrent_operation_is_full_mar
// We make the assumption that these are rare.
bool update_stats = !allocation_failure;
size_t const total_cards_scanned = p->sum_thread_work_items(G1GCPhaseTimes::ScanHR, G1GCPhaseTimes::ScanHRScannedCards) +
p->sum_thread_work_items(G1GCPhaseTimes::OptScanHR, G1GCPhaseTimes::ScanHRScannedCards);
// Number of scanned cards with "Dirty" value (and nothing else).
size_t const pending_cards_from_refinement_table = p->sum_thread_work_items(G1GCPhaseTimes::ScanHR, G1GCPhaseTimes::ScanHRPendingCards) +
p->sum_thread_work_items(G1GCPhaseTimes::OptScanHR, G1GCPhaseTimes::ScanHRPendingCards);
// Number of cards actually merged in the Merge RS phase. MergeRSFromRemSetCards below includes the cards from the Eager Reclaim phase.
size_t const merged_cards_from_card_rs = p->sum_thread_work_items(G1GCPhaseTimes::MergeRS, G1GCPhaseTimes::MergeRSFromRemSetCards) +
p->sum_thread_work_items(G1GCPhaseTimes::OptMergeRS, G1GCPhaseTimes::MergeRSFromRemSetCards);
// Number of cards attempted to merge in the Merge RS phase.
size_t const total_cards_from_rs = p->sum_thread_work_items(G1GCPhaseTimes::MergeRS, G1GCPhaseTimes::MergeRSTotalCards) +
p->sum_thread_work_items(G1GCPhaseTimes::OptMergeRS, G1GCPhaseTimes::MergeRSTotalCards);
// Cards marked as being to collection set. May be inaccurate due to races.
size_t const total_non_young_rs_cards = MIN2(pending_cards_from_refinement_table + merged_cards_from_card_rs, total_cards_scanned);
if (update_stats) {
// We maintain the invariant that all objects allocated by mutator
// threads will be allocated out of eden regions. So, we can use
@ -827,6 +850,98 @@ void G1Policy::record_young_collection_end(bool concurrent_operation_is_full_mar
uint regions_allocated = _collection_set->eden_region_length();
double alloc_rate_ms = (double) regions_allocated / app_time_ms;
_analytics->report_alloc_rate_ms(alloc_rate_ms);
double merge_refinement_table_time = p->cur_merge_refinement_table_time();
if (merge_refinement_table_time != 0.0) {
_analytics->report_merge_refinement_table_time_ms(merge_refinement_table_time);
}
if (merged_cards_from_card_rs >= G1NumCardsCostSampleThreshold) {
double avg_time_merge_cards = average_time_ms(G1GCPhaseTimes::MergeER) +
average_time_ms(G1GCPhaseTimes::MergeRS) +
average_time_ms(G1GCPhaseTimes::OptMergeRS);
_analytics->report_cost_per_card_merge_ms(avg_time_merge_cards / merged_cards_from_card_rs, is_young_only_pause);
log_debug(gc, ergo, cset)("cost per card merge (young %s): avg time %.2f merged cards %zu cost(1m) %.2f pred_cost(1m-yo) %.2f pred_cost(1m-old) %.2f",
BOOL_TO_STR(is_young_only_pause),
avg_time_merge_cards, merged_cards_from_card_rs, 1e6 * avg_time_merge_cards / merged_cards_from_card_rs, _analytics->predict_card_merge_time_ms(1e6, true), _analytics->predict_card_merge_time_ms(1e6, false));
} else {
log_debug(gc, ergo, cset)("cost per card merge (young: %s): skipped, total cards %zu", BOOL_TO_STR(is_young_only_pause), total_non_young_rs_cards);
}
// Update prediction for card scan
if (total_cards_scanned >= G1NumCardsCostSampleThreshold) {
double avg_card_scan_time = average_time_ms(G1GCPhaseTimes::ScanHR) +
average_time_ms(G1GCPhaseTimes::OptScanHR);
_analytics->report_cost_per_card_scan_ms(avg_card_scan_time / total_cards_scanned, is_young_only_pause);
log_debug(gc, ergo, cset)("cost per card scan (young: %s): avg time %.2f total cards %zu cost(1m) %.2f pred_cost(1m-yo) %.2f pred_cost(1m-old) %.2f",
BOOL_TO_STR(is_young_only_pause),
avg_card_scan_time, total_cards_scanned, 1e6 * avg_card_scan_time / total_cards_scanned, _analytics->predict_card_scan_time_ms(1e6, true), _analytics->predict_card_scan_time_ms(1e6, false));
} else {
log_debug(gc, ergo, cset)("cost per card scan (young: %s): skipped, total cards %zu", BOOL_TO_STR(is_young_only_pause), total_cards_scanned);
}
// Update prediction for the ratio between cards actually merged onto the card
// table from the remembered sets and the total number of cards attempted to
// merge.
double merge_to_scan_ratio = 1.0;
if (total_cards_from_rs > 0) {
merge_to_scan_ratio = (double)merged_cards_from_card_rs / total_cards_from_rs;
}
_analytics->report_card_merge_to_scan_ratio(merge_to_scan_ratio, is_young_only_pause);
// Update prediction for code root scan
size_t const total_code_roots_scanned = p->sum_thread_work_items(G1GCPhaseTimes::CodeRoots, G1GCPhaseTimes::CodeRootsScannedNMethods) +
p->sum_thread_work_items(G1GCPhaseTimes::OptCodeRoots, G1GCPhaseTimes::CodeRootsScannedNMethods);
if (total_code_roots_scanned >= G1NumCodeRootsCostSampleThreshold) {
double avg_time_code_root_scan = average_time_ms(G1GCPhaseTimes::CodeRoots) +
average_time_ms(G1GCPhaseTimes::OptCodeRoots);
_analytics->report_cost_per_code_root_scan_ms(avg_time_code_root_scan / total_code_roots_scanned, is_young_only_pause);
}
// Update prediction for copy cost per byte
size_t copied_bytes = p->sum_thread_work_items(G1GCPhaseTimes::MergePSS, G1GCPhaseTimes::MergePSSCopiedBytes);
if (copied_bytes > 0) {
double avg_copy_time = average_time_ms(G1GCPhaseTimes::ObjCopy) + average_time_ms(G1GCPhaseTimes::OptObjCopy);
double cost_per_byte_ms = avg_copy_time / copied_bytes;
_analytics->report_cost_per_byte_ms(cost_per_byte_ms, is_young_only_pause);
}
if (_collection_set->young_region_length() > 0) {
_analytics->report_young_other_cost_per_region_ms(young_other_time_ms() /
_collection_set->young_region_length());
}
if (_collection_set->initial_old_region_length() > 0) {
_analytics->report_non_young_other_cost_per_region_ms(non_young_other_time_ms() /
_collection_set->initial_old_region_length());
}
_analytics->report_constant_other_time_ms(constant_other_time_ms(pause_time_ms));
_analytics->report_pending_cards(pending_cards_from_refinement_table, is_young_only_pause);
_analytics->report_card_rs_length(total_cards_scanned - total_non_young_rs_cards, is_young_only_pause);
_analytics->report_code_root_rs_length((double)total_code_roots_scanned, is_young_only_pause);
}
{
double mutator_end_time = cur_pause_start_sec() * MILLIUNITS;
G1ConcurrentRefineStats* stats = _g1h->concurrent_refine()->sweep_state().stats();
// Record any available refinement statistics.
record_refinement_stats(stats);
double yield_duration_ms = TimeHelper::counter_to_millis(_g1h->yield_duration_in_refinement_epoch());
record_dirtying_stats(TimeHelper::counter_to_millis(_g1h->last_refinement_epoch_start()),
mutator_end_time,
pending_cards_from_refinement_table,
yield_duration_ms,
phase_times()->sum_thread_work_items(G1GCPhaseTimes::MergePSS, G1GCPhaseTimes::MergePSSPendingCards),
phase_times()->sum_thread_work_items(G1GCPhaseTimes::MergePSS, G1GCPhaseTimes::MergePSSToYoungGenCards));
}
record_pause(this_pause, start_time_sec, end_time_sec, allocation_failure);
@ -857,82 +972,6 @@ void G1Policy::record_young_collection_end(bool concurrent_operation_is_full_mar
_eden_surv_rate_group->start_adding_regions();
if (update_stats) {
// Update prediction for card merge.
size_t const merged_cards_from_log_buffers = p->sum_thread_work_items(G1GCPhaseTimes::MergeLB, G1GCPhaseTimes::MergeLBDirtyCards);
// MergeRSCards includes the cards from the Eager Reclaim phase.
size_t const merged_cards_from_rs = p->sum_thread_work_items(G1GCPhaseTimes::MergeRS, G1GCPhaseTimes::MergeRSCards) +
p->sum_thread_work_items(G1GCPhaseTimes::OptMergeRS, G1GCPhaseTimes::MergeRSCards);
size_t const total_cards_merged = merged_cards_from_rs +
merged_cards_from_log_buffers;
if (total_cards_merged >= G1NumCardsCostSampleThreshold) {
double avg_time_merge_cards = average_time_ms(G1GCPhaseTimes::MergeER) +
average_time_ms(G1GCPhaseTimes::MergeRS) +
average_time_ms(G1GCPhaseTimes::MergeLB) +
p->cur_distribute_log_buffers_time_ms() +
average_time_ms(G1GCPhaseTimes::OptMergeRS);
_analytics->report_cost_per_card_merge_ms(avg_time_merge_cards / total_cards_merged, is_young_only_pause);
}
// Update prediction for card scan
size_t const total_cards_scanned = p->sum_thread_work_items(G1GCPhaseTimes::ScanHR, G1GCPhaseTimes::ScanHRScannedCards) +
p->sum_thread_work_items(G1GCPhaseTimes::OptScanHR, G1GCPhaseTimes::ScanHRScannedCards);
if (total_cards_scanned >= G1NumCardsCostSampleThreshold) {
double avg_time_dirty_card_scan = average_time_ms(G1GCPhaseTimes::ScanHR) +
average_time_ms(G1GCPhaseTimes::OptScanHR);
_analytics->report_cost_per_card_scan_ms(avg_time_dirty_card_scan / total_cards_scanned, is_young_only_pause);
}
// Update prediction for the ratio between cards from the remembered
// sets and actually scanned cards from the remembered sets.
// Due to duplicates in the log buffers, the number of scanned cards
// can be smaller than the cards in the log buffers.
const size_t scanned_cards_from_rs = (total_cards_scanned > merged_cards_from_log_buffers) ? total_cards_scanned - merged_cards_from_log_buffers : 0;
double scan_to_merge_ratio = 0.0;
if (merged_cards_from_rs > 0) {
scan_to_merge_ratio = (double)scanned_cards_from_rs / merged_cards_from_rs;
}
_analytics->report_card_scan_to_merge_ratio(scan_to_merge_ratio, is_young_only_pause);
// Update prediction for code root scan
size_t const total_code_roots_scanned = p->sum_thread_work_items(G1GCPhaseTimes::CodeRoots, G1GCPhaseTimes::CodeRootsScannedNMethods) +
p->sum_thread_work_items(G1GCPhaseTimes::OptCodeRoots, G1GCPhaseTimes::CodeRootsScannedNMethods);
if (total_code_roots_scanned >= G1NumCodeRootsCostSampleThreshold) {
double avg_time_code_root_scan = average_time_ms(G1GCPhaseTimes::CodeRoots) +
average_time_ms(G1GCPhaseTimes::OptCodeRoots);
_analytics->report_cost_per_code_root_scan_ms(avg_time_code_root_scan / total_code_roots_scanned, is_young_only_pause);
}
// Update prediction for copy cost per byte
size_t copied_bytes = p->sum_thread_work_items(G1GCPhaseTimes::MergePSS, G1GCPhaseTimes::MergePSSCopiedBytes);
if (copied_bytes > 0) {
double cost_per_byte_ms = (average_time_ms(G1GCPhaseTimes::ObjCopy) + average_time_ms(G1GCPhaseTimes::OptObjCopy)) / copied_bytes;
_analytics->report_cost_per_byte_ms(cost_per_byte_ms, is_young_only_pause);
}
if (_collection_set->young_region_length() > 0) {
_analytics->report_young_other_cost_per_region_ms(young_other_time_ms() /
_collection_set->young_region_length());
}
if (_collection_set->initial_old_region_length() > 0) {
_analytics->report_non_young_other_cost_per_region_ms(non_young_other_time_ms() /
_collection_set->initial_old_region_length());
}
_analytics->report_constant_other_time_ms(constant_other_time_ms(pause_time_ms));
_analytics->report_pending_cards((double)pending_cards_at_gc_start(), is_young_only_pause);
_analytics->report_card_rs_length((double)_card_rs_length, is_young_only_pause);
_analytics->report_code_root_rs_length((double)total_code_roots_scanned, is_young_only_pause);
}
assert(!(G1GCPauseTypeHelper::is_concurrent_start_pause(this_pause) && collector_state()->mark_or_rebuild_in_progress()),
"If the last pause has been concurrent start, we should not have been in the marking window");
if (G1GCPauseTypeHelper::is_concurrent_start_pause(this_pause)) {
@ -963,29 +1002,26 @@ void G1Policy::record_young_collection_end(bool concurrent_operation_is_full_mar
}
// Note that _mmu_tracker->max_gc_time() returns the time in seconds.
double logged_cards_time_goal_ms = _mmu_tracker->max_gc_time() * MILLIUNITS * G1RSetUpdatingPauseTimePercent / 100.0;
double pending_cards_time_goal_ms = _mmu_tracker->max_gc_time() * MILLIUNITS * G1RSetUpdatingPauseTimePercent / 100.0;
double const logged_cards_time_ms = logged_cards_processing_time();
size_t logged_cards =
phase_times()->sum_thread_work_items(G1GCPhaseTimes::MergeLB,
G1GCPhaseTimes::MergeLBDirtyCards);
bool exceeded_goal = logged_cards_time_goal_ms < logged_cards_time_ms;
size_t predicted_thread_buffer_cards = _analytics->predict_dirtied_cards_in_thread_buffers();
double const pending_cards_time_ms = pending_cards_processing_time();
size_t pending_cards = phase_times()->sum_thread_work_items(G1GCPhaseTimes::ScanHR, G1GCPhaseTimes::ScanHRPendingCards) +
phase_times()->sum_thread_work_items(G1GCPhaseTimes::OptScanHR, G1GCPhaseTimes::ScanHRPendingCards);
bool exceeded_goal = pending_cards_time_goal_ms < pending_cards_time_ms;
G1ConcurrentRefine* cr = _g1h->concurrent_refine();
log_debug(gc, ergo, refine)
("GC refinement: goal: %zu + %zu / %1.2fms, actual: %zu / %1.2fms, %s",
("GC refinement: goal: %zu / %1.2fms, actual: %zu / %1.2fms, %s",
cr->pending_cards_target(),
predicted_thread_buffer_cards,
logged_cards_time_goal_ms,
logged_cards,
logged_cards_time_ms,
pending_cards_time_goal_ms,
pending_cards,
pending_cards_time_ms,
(exceeded_goal ? " (exceeded goal)" : ""));
cr->adjust_after_gc(logged_cards_time_ms,
logged_cards,
predicted_thread_buffer_cards,
logged_cards_time_goal_ms);
cr->adjust_after_gc(pending_cards_time_ms,
pending_cards,
pending_cards_time_goal_ms);
}
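For a rough feel of the goal computation above (numbers invented): with a 200 ms pause time goal and G1RSetUpdatingPauseTimePercent at its default of 10, the pending-card budget is 200 ms * 10 / 100 = 20 ms; a measured pending-card processing time of 25 ms would be logged as exceeding the goal, and adjust_after_gc() is then expected to steer towards fewer pending cards for the next pause.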
G1IHOPControl* G1Policy::create_ihop_control(const G1OldGenAllocationTracker* old_gen_alloc_tracker,
@ -1057,34 +1093,27 @@ double G1Policy::predict_base_time_ms(size_t pending_cards,
size_t code_root_rs_length) const {
bool in_young_only_phase = collector_state()->in_young_only_phase();
size_t unique_cards_from_rs = _analytics->predict_scan_card_num(card_rs_length, in_young_only_phase);
// Assume that all cards from the log buffers will be scanned, i.e. there are no
// duplicates in that set.
size_t effective_scanned_cards = unique_cards_from_rs + pending_cards;
// Cards from the refinement table and the cards from the young gen remset are
// unique to each other as they are located on the card table.
size_t effective_scanned_cards = card_rs_length + pending_cards;
double card_merge_time = _analytics->predict_card_merge_time_ms(pending_cards + card_rs_length, in_young_only_phase);
double refinement_table_merge_time = _analytics->predict_merge_refinement_table_time_ms();
double card_scan_time = _analytics->predict_card_scan_time_ms(effective_scanned_cards, in_young_only_phase);
double code_root_scan_time = _analytics->predict_code_root_scan_time_ms(code_root_rs_length, in_young_only_phase);
double constant_other_time = _analytics->predict_constant_other_time_ms();
double survivor_evac_time = predict_survivor_regions_evac_time();
double total_time = card_merge_time + card_scan_time + code_root_scan_time + constant_other_time + survivor_evac_time;
double total_time = refinement_table_merge_time + card_scan_time + code_root_scan_time + constant_other_time + survivor_evac_time;
log_trace(gc, ergo, heap)("Predicted base time: total %f lb_cards %zu card_rs_length %zu effective_scanned_cards %zu "
"card_merge_time %f card_scan_time %f code_root_rs_length %zu code_root_scan_time %f "
"refinement_table_merge_time %f card_scan_time %f code_root_rs_length %zu code_root_scan_time %f "
"constant_other_time %f survivor_evac_time %f",
total_time, pending_cards, card_rs_length, effective_scanned_cards,
card_merge_time, card_scan_time, code_root_rs_length, code_root_scan_time,
refinement_table_merge_time, card_scan_time, code_root_rs_length, code_root_scan_time,
constant_other_time, survivor_evac_time);
return total_time;
}
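To make the prediction above concrete (every number is invented): with card_rs_length = 15,000 and pending_cards = 5,000, effective_scanned_cards is 20,000; at an assumed 0.0002 ms per scanned card that is about 4 ms of card scan time, and adding, say, a 1.5 ms refinement table merge, 0.5 ms of code root scanning, 2 ms of constant other time and 6 ms of predicted survivor evacuation gives a predicted base time of roughly 14 ms.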
double G1Policy::predict_base_time_ms(size_t pending_cards) const {
bool for_young_only_phase = collector_state()->in_young_only_phase();
size_t card_rs_length = _analytics->predict_card_rs_length(for_young_only_phase);
return predict_base_time_ms(pending_cards, card_rs_length);
}
double G1Policy::predict_base_time_ms(size_t pending_cards, size_t card_rs_length) const {
bool for_young_only_phase = collector_state()->in_young_only_phase();
size_t code_root_rs_length = _analytics->predict_code_root_rs_length(for_young_only_phase);
@ -1428,6 +1457,64 @@ size_t G1Policy::allowed_waste_in_collection_set() const {
return G1HeapWastePercent * _g1h->capacity() / 100;
}
bool G1Policy::try_get_available_bytes_estimate(size_t& available_bytes) const {
// Getting used young bytes requires holding Heap_lock. But we cannot take the
// lock normally and block until it is available. Blocking on the lock could
// deadlock with a GC VMOp that is holding the lock and requesting a
// safepoint. Instead try to lock, and return the result of that attempt,
// and the estimate if successful.
if (Heap_lock->try_lock()) {
size_t used_bytes = estimate_used_young_bytes_locked();
Heap_lock->unlock();
size_t young_bytes = young_list_target_length() * G1HeapRegion::GrainBytes;
available_bytes = young_bytes - MIN2(young_bytes, used_bytes);
return true;
} else {
available_bytes = 0;
return false;
}
}
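Outside of HotSpot, the same low-priority pattern boils down to a try_lock whose failure is simply reported to the caller instead of blocking; a minimal standalone C++ sketch (the lock, the estimate function and all values are placeholders, not HotSpot APIs):

#include <algorithm>
#include <cstddef>
#include <mutex>

static std::mutex heap_lock;                 // stand-in for Heap_lock
static size_t estimate_used_young_bytes() {  // stand-in for the locked estimate
  return 8u * 1024 * 1024;                   // dummy value
}

bool try_get_available_bytes_estimate(size_t target_bytes, size_t& available_bytes) {
  // Never block here: a blocked caller could hold up (or deadlock with) a
  // higher-priority owner of the lock that is waiting for a safepoint.
  if (!heap_lock.try_lock()) {
    available_bytes = 0;
    return false;                            // caller should simply retry later
  }
  size_t used = estimate_used_young_bytes();
  heap_lock.unlock();
  available_bytes = target_bytes - std::min(target_bytes, used);
  return true;
}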
double G1Policy::predict_time_to_next_gc_ms(size_t available_bytes) const {
double alloc_region_rate = _analytics->predict_alloc_rate_ms();
double alloc_bytes_rate = alloc_region_rate * G1HeapRegion::GrainBytes;
if (alloc_bytes_rate == 0.0) {
// A zero rate indicates we don't yet have data to use for predictions.
// Since we don't have any idea how long until the next GC, use a time of
// zero.
return 0.0;
} else {
// If the heap size is large and the allocation rate is small, we can get
// a predicted time until next GC that is so large it can cause problems
// (such as overflow) in other calculations. Limit the prediction to one
// hour, which is still large in this context.
const double one_hour_ms = 60.0 * 60.0 * MILLIUNITS;
double raw_time_ms = available_bytes / alloc_bytes_rate;
return MIN2(raw_time_ms, one_hour_ms);
}
}
uint64_t G1Policy::adjust_wait_time_ms(double wait_time_ms, uint64_t min_time_ms) {
return MAX2(static_cast<uint64_t>(sqrt(wait_time_ms) * 4.0), min_time_ms);
}
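Putting the two helpers above together, the delay shaping can be reproduced in a few lines of standalone C++ (the constants mirror the ones used here, everything else is made up for illustration):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Cap the raw prediction at one hour, as above.
static double predict_time_to_next_gc_ms(double available_bytes, double alloc_bytes_per_ms) {
  if (alloc_bytes_per_ms == 0.0) return 0.0;   // no data yet
  const double one_hour_ms = 60.0 * 60.0 * 1000.0;
  return std::min(available_bytes / alloc_bytes_per_ms, one_hour_ms);
}

// Grow the wait time with the square root of the predicted distance to the
// next GC, but never drop below min_time_ms.
static uint64_t adjust_wait_time_ms(double wait_time_ms, uint64_t min_time_ms) {
  return std::max(static_cast<uint64_t>(std::sqrt(wait_time_ms) * 4.0), min_time_ms);
}

int main() {
  double t = predict_time_to_next_gc_ms(512.0 * 1024 * 1024, 64.0 * 1024);        // ~8192 ms until the next GC
  std::printf("wait %llu ms\n", (unsigned long long)adjust_wait_time_ms(t, 47));  // prints "wait 362 ms"
}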
double G1Policy::last_mutator_dirty_start_time_ms() {
return TimeHelper::counter_to_millis(_g1h->last_refinement_epoch_start());
}
size_t G1Policy::current_pending_cards() {
double now = os::elapsedTime() * MILLIUNITS;
return _pending_cards_from_gc + _analytics->predict_dirtied_cards_rate_ms() * (now - last_mutator_dirty_start_time_ms());
}
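For instance (invented numbers): with 10,000 cards left dirty by the last GC, a predicted dirtying rate of 50 cards/ms and 200 ms elapsed since the refinement epoch started, the estimate comes out to 10,000 + 50 * 200 = 20,000 pending cards.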
size_t G1Policy::current_to_collection_set_cards() {
// The incremental part is covered by the dirtied_cards_rate, i.e. pending cards
// cover both to collection set cards and other interesting cards because we do not
// know which is which until we look.
return _to_collection_set_cards;
}
uint G1Policy::min_retained_old_cset_length() const {
// Guarantee some progress with retained regions regardless of available time by
// taking at least one region.

View File

@ -48,6 +48,7 @@ class G1HeapRegion;
class G1CollectionSet;
class G1CollectionSetCandidates;
class G1CollectionSetChooser;
class G1ConcurrentRefineStats;
class G1IHOPControl;
class G1Analytics;
class G1SurvivorRegions;
@ -101,9 +102,18 @@ class G1Policy: public CHeapObj<mtGC> {
uint _free_regions_at_end_of_collection;
size_t _card_rs_length;
size_t _pending_cards_at_gc_start;
// Tracks the number of cards marked as dirty (only) during garbage collection
// (evacuation) on the card table.
// This is needed to properly account for those cards in the heuristics that decide
// when to start refinement, which need to know approximately how many cards are
// currently on the card table.
// After the first completed refinement sweep of the refinement table between two
// garbage collections this value is reset to zero as that refinement processed all
// those cards.
size_t _pending_cards_from_gc;
// Tracks the approximate number of cards found as to-collection-set by either the
// garbage collection or the most recent refinement sweep.
size_t _to_collection_set_cards;
G1ConcurrentStartToMixedTimeTracker _concurrent_start_to_mixed;
@ -111,7 +121,7 @@ class G1Policy: public CHeapObj<mtGC> {
return collector_state()->in_young_only_phase() && !collector_state()->mark_or_rebuild_in_progress();
}
double logged_cards_processing_time() const;
double pending_cards_processing_time() const;
public:
const G1Predictions& predictor() const { return _predictor; }
const G1Analytics* analytics() const { return const_cast<const G1Analytics*>(_analytics); }
@ -129,16 +139,10 @@ public:
hr->install_surv_rate_group(_survivor_surv_rate_group);
}
void record_card_rs_length(size_t num_cards) {
_card_rs_length = num_cards;
}
double cur_pause_start_sec() const {
return _cur_pause_start_sec;
}
double predict_base_time_ms(size_t pending_cards) const;
double predict_base_time_ms(size_t pending_cards, size_t card_rs_length) const;
// Base time contains handling remembered sets and constant other time of the
@ -239,7 +243,13 @@ private:
public:
size_t predict_bytes_to_copy(G1HeapRegion* hr) const;
size_t pending_cards_at_gc_start() const { return _pending_cards_at_gc_start; }
double last_mutator_dirty_start_time_ms();
size_t pending_cards_from_gc() const { return _pending_cards_from_gc; }
size_t current_pending_cards();
size_t current_to_collection_set_cards();
// GC efficiency for collecting the region based on the time estimate for
// merging and scanning incoming references.
@ -286,7 +296,7 @@ public:
// Check the current value of the young list RSet length and
// compare it against the last prediction. If the current value is
// higher, recalculate the young list target length prediction.
void revise_young_list_target_length(size_t card_rs_length, size_t code_root_rs_length);
void revise_young_list_target_length(size_t pending_cards, size_t card_rs_length, size_t code_root_rs_length);
// This should be called after the heap is resized.
void record_new_heap_size(uint new_number_of_regions);
@ -325,7 +335,6 @@ public:
// Amount of allowed waste in bytes in the collection set.
size_t allowed_waste_in_collection_set() const;
private:
// Predict the number of bytes of surviving objects from survivor and old
@ -359,17 +368,39 @@ public:
bool use_adaptive_young_list_length() const;
// Try to get an estimate of the currently available bytes in the young gen. This
// operation considers itself low-priority: if other threads need the resources
// required to get the information, return false to indicate that the caller
// should retry "soon".
bool try_get_available_bytes_estimate(size_t& bytes) const;
// Estimate time until next GC, based on remaining bytes available for
// allocation and the allocation rate.
double predict_time_to_next_gc_ms(size_t available_bytes) const;
// Adjust wait times so that wakeups become less frequent the longer the next GC is away.
// But don't increase the wait time too rapidly, and bound it from below by min_time_ms.
// This reduces the number of thread wakeups that just immediately
// go back to waiting, while still being responsive to behavior changes.
uint64_t adjust_wait_time_ms(double wait_time_ms, uint64_t min_time_ms);
private:
// Return an estimate of the number of bytes used in young gen.
// precondition: holding Heap_lock
size_t estimate_used_young_bytes_locked() const;
public:
void transfer_survivors_to_cset(const G1SurvivorRegions* survivors);
// Record and log stats and pending cards before not-full collection.
// thread_buffer_cards is the number of cards that were in per-thread
// buffers. pending_cards includes thread_buffer_cards.
void record_concurrent_refinement_stats(size_t pending_cards,
size_t thread_buffer_cards);
// Record and log stats and pending cards to update predictors.
void record_refinement_stats(G1ConcurrentRefineStats* stats);
void record_dirtying_stats(double last_mutator_start_dirty_ms,
double last_mutator_end_dirty_ms,
size_t pending_cards,
double yield_duration,
size_t next_pending_cards_from_gc,
size_t next_to_collection_set_cards);
bool should_retain_evac_failed_region(G1HeapRegion* r) const {
return should_retain_evac_failed_region(r->hrm_index());

View File

@ -1,148 +0,0 @@
/*
* Copyright (c) 2019, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "gc/g1/g1RedirtyCardsQueue.hpp"
#include "gc/shared/bufferNode.hpp"
#include "runtime/atomicAccess.hpp"
#include "utilities/debug.hpp"
#include "utilities/macros.hpp"
// G1RedirtyCardsLocalQueueSet
G1RedirtyCardsLocalQueueSet::G1RedirtyCardsLocalQueueSet(G1RedirtyCardsQueueSet* shared_qset) :
PtrQueueSet(shared_qset->allocator()),
_shared_qset(shared_qset),
_buffers(),
_queue(this)
{}
#ifdef ASSERT
G1RedirtyCardsLocalQueueSet::~G1RedirtyCardsLocalQueueSet() {
assert(_buffers._head == nullptr, "unflushed qset");
assert(_buffers._tail == nullptr, "invariant");
assert(_buffers._entry_count == 0, "invariant");
}
#endif // ASSERT
void G1RedirtyCardsLocalQueueSet::enqueue_completed_buffer(BufferNode* node) {
_buffers._entry_count += node->size();
node->set_next(_buffers._head);
_buffers._head = node;
if (_buffers._tail == nullptr) {
_buffers._tail = node;
}
}
void G1RedirtyCardsLocalQueueSet::enqueue(void* value) {
if (!try_enqueue(_queue, value)) {
BufferNode* old_node = exchange_buffer_with_new(_queue);
if (old_node != nullptr) {
enqueue_completed_buffer(old_node);
}
retry_enqueue(_queue, value);
}
}
BufferNodeList G1RedirtyCardsLocalQueueSet::flush() {
flush_queue(_queue);
BufferNodeList cur_buffers = _buffers;
_shared_qset->add_bufferlist(_buffers);
_buffers = BufferNodeList();
return cur_buffers;
}
// G1RedirtyCardsLocalQueueSet::Queue
G1RedirtyCardsLocalQueueSet::Queue::Queue(G1RedirtyCardsLocalQueueSet* qset) :
PtrQueue(qset)
{}
#ifdef ASSERT
G1RedirtyCardsLocalQueueSet::Queue::~Queue() {
assert(buffer() == nullptr, "unflushed queue");
}
#endif // ASSERT
// G1RedirtyCardsQueueSet
G1RedirtyCardsQueueSet::G1RedirtyCardsQueueSet(BufferNode::Allocator* allocator) :
PtrQueueSet(allocator),
_list(),
_entry_count(0),
_tail(nullptr)
DEBUG_ONLY(COMMA _collecting(true))
{}
G1RedirtyCardsQueueSet::~G1RedirtyCardsQueueSet() {
verify_empty();
}
#ifdef ASSERT
void G1RedirtyCardsQueueSet::verify_empty() const {
assert(_list.empty(), "precondition");
assert(_tail == nullptr, "invariant");
assert(_entry_count == 0, "invariant");
}
#endif // ASSERT
BufferNode* G1RedirtyCardsQueueSet::all_completed_buffers() const {
DEBUG_ONLY(_collecting = false;)
return _list.top();
}
BufferNodeList G1RedirtyCardsQueueSet::take_all_completed_buffers() {
DEBUG_ONLY(_collecting = false;)
BufferNodeList result(_list.pop_all(), _tail, _entry_count);
_tail = nullptr;
_entry_count = 0;
DEBUG_ONLY(_collecting = true;)
return result;
}
void G1RedirtyCardsQueueSet::update_tail(BufferNode* node) {
// Node is the tail of a (possibly single element) list just prepended to
// _list. If, after that prepend, node's follower is null, then node is
// also the tail of _list, so record it as such.
if (node->next() == nullptr) {
assert(_tail == nullptr, "invariant");
_tail = node;
}
}
void G1RedirtyCardsQueueSet::enqueue_completed_buffer(BufferNode* node) {
assert(_collecting, "precondition");
AtomicAccess::add(&_entry_count, node->size());
_list.push(*node);
update_tail(node);
}
void G1RedirtyCardsQueueSet::add_bufferlist(const BufferNodeList& buffers) {
assert(_collecting, "precondition");
if (buffers._head != nullptr) {
assert(buffers._tail != nullptr, "invariant");
AtomicAccess::add(&_entry_count, buffers._entry_count);
_list.prepend(*buffers._head, *buffers._tail);
update_tail(buffers._tail);
}
}

View File

@ -1,98 +0,0 @@
/*
* Copyright (c) 2019, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#ifndef SHARE_GC_G1_G1REDIRTYCARDSQUEUE_HPP
#define SHARE_GC_G1_G1REDIRTYCARDSQUEUE_HPP
#include "gc/shared/bufferNode.hpp"
#include "gc/shared/bufferNodeList.hpp"
#include "gc/shared/ptrQueue.hpp"
#include "memory/padded.hpp"
#include "utilities/macros.hpp"
class G1RedirtyCardsQueueSet;
// A thread-local qset and queue. It provides an uncontended staging
// area for completed buffers, to be flushed to the shared qset en masse.
class G1RedirtyCardsLocalQueueSet : private PtrQueueSet {
class Queue : public PtrQueue {
public:
Queue(G1RedirtyCardsLocalQueueSet* qset);
~Queue() NOT_DEBUG(= default);
};
G1RedirtyCardsQueueSet* _shared_qset;
BufferNodeList _buffers;
Queue _queue;
// Add the buffer to the local list.
virtual void enqueue_completed_buffer(BufferNode* node);
public:
G1RedirtyCardsLocalQueueSet(G1RedirtyCardsQueueSet* shared_qset);
~G1RedirtyCardsLocalQueueSet() NOT_DEBUG(= default);
void enqueue(void* value);
// Transfer all completed buffers to the shared qset.
// Returns the flushed BufferNodeList which is later used
// as a shortcut into the shared qset.
BufferNodeList flush();
};
// Card table entries to be redirtied and the cards reprocessed later.
// Has two phases, collecting and processing. During the collecting
// phase buffers are added to the set. Once collecting is complete and
// processing starts, buffers can no longer be added. Taking all the
// collected (and processed) buffers reverts back to collecting, allowing
// the set to be reused for another round of redirtying.
class G1RedirtyCardsQueueSet : public PtrQueueSet {
DEFINE_PAD_MINUS_SIZE(1, DEFAULT_PADDING_SIZE, 0);
BufferNode::Stack _list;
DEFINE_PAD_MINUS_SIZE(2, DEFAULT_PADDING_SIZE, sizeof(size_t));
volatile size_t _entry_count;
DEFINE_PAD_MINUS_SIZE(3, DEFAULT_PADDING_SIZE, sizeof(BufferNode*));
BufferNode* _tail;
DEBUG_ONLY(mutable bool _collecting;)
void update_tail(BufferNode* node);
public:
G1RedirtyCardsQueueSet(BufferNode::Allocator* allocator);
~G1RedirtyCardsQueueSet();
void verify_empty() const NOT_DEBUG_RETURN;
// Collect buffers. These functions are thread-safe.
// precondition: Must not be concurrent with buffer processing.
virtual void enqueue_completed_buffer(BufferNode* node);
void add_bufferlist(const BufferNodeList& buffers);
// Processing phase operations.
// precondition: Must not be concurrent with buffer collection.
BufferNode* all_completed_buffers() const;
BufferNodeList take_all_completed_buffers();
};
#endif // SHARE_GC_G1_G1REDIRTYCARDSQUEUE_HPP

File diff suppressed because it is too large

View File

@ -26,6 +26,7 @@
#define SHARE_GC_G1_G1REMSET_HPP
#include "gc/g1/g1CardTable.hpp"
#include "gc/g1/g1CardTableClaimTable.hpp"
#include "gc/g1/g1GCPhaseTimes.hpp"
#include "gc/g1/g1HeapRegion.hpp"
#include "gc/g1/g1OopClosures.hpp"
@ -65,20 +66,15 @@ private:
G1CollectedHeap* _g1h;
G1CardTable* _ct;
G1Policy* _g1p;
void print_merge_heap_roots_stats();
G1Policy* _g1p;
void assert_scan_top_is_null(uint hrm_index) NOT_DEBUG_RETURN;
void enqueue_for_reprocessing(CardValue* card_ptr);
public:
// Initialize data that depends on the heap size being known.
void initialize(uint max_num_regions);
G1RemSet(G1CollectedHeap* g1h, G1CardTable* ct);
G1RemSet(G1CollectedHeap* g1h);
~G1RemSet();
// Scan all cards in the non-collection set regions that potentially contain
@ -101,7 +97,7 @@ public:
// Print coarsening stats.
void print_coarsen_stats();
// Creates a task for cleaining up temporary data structures and the
// Creates a task for cleaning up temporary data structures and the
// card table, removing temporary duplicate detection information.
G1AbstractSubTask* create_cleanup_after_scan_heap_roots_task();
// Excludes the given region from heap root scanning.
@ -122,16 +118,19 @@ public:
G1GCPhaseTimes::GCParPhases scan_phase,
G1GCPhaseTimes::GCParPhases objcopy_phase);
// Two methods for concurrent refinement support, executed concurrently to
// the mutator:
// Cleans the card at "*card_ptr_addr" before refinement, returns true iff the
// card needs later refinement.
bool clean_card_before_refine(CardValue** const card_ptr_addr);
enum RefineResult {
HasRefToCSet, // The (dirty) card has a reference to the collection set.
AlreadyToCSet, // The card has already been marked as having a reference to the collection set.
HasRefToOld, // The dirty card contains references to other old regions (not the collection set).
NoCrossRegion, // There is no interesting reference in the card any more. The mutator changed
// all references to uninteresting ones after dirtying the card.
CouldNotParse // The card is unparsable; it needs to be retried later.
};
// Refine the region corresponding to "card_ptr". Must be called after
// being filtered by clean_card_before_refine(), and after proper
// fence/synchronization.
void refine_card_concurrently(CardValue* const card_ptr,
const uint worker_id);
RefineResult refine_card_concurrently(CardValue* const card_ptr,
const uint worker_id);
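A hypothetical caller could dispatch on the refinement result along the following lines; this is only a sketch that tallies outcomes and flags unparsable cards for a retry, not the actual refinement sweep (the enum below mirrors the one declared above):

#include <cstddef>

enum class RefineResult {                    // mirrors G1RemSet::RefineResult
  HasRefToCSet, AlreadyToCSet, HasRefToOld, NoCrossRegion, CouldNotParse
};

struct SweepStats {
  size_t to_cset = 0;
  size_t to_old = 0;
  size_t clean = 0;
  size_t retried = 0;
};

// Placeholder dispatch: count each outcome and remember whether the card has
// to be looked at again later.
inline void handle_refine_result(RefineResult r, SweepStats& stats, bool& retry_later) {
  retry_later = false;
  switch (r) {
    case RefineResult::HasRefToCSet:  stats.to_cset++;  break;
    case RefineResult::AlreadyToCSet: stats.to_cset++;  break;
    case RefineResult::HasRefToOld:   stats.to_old++;   break;
    case RefineResult::NoCrossRegion: stats.clean++;    break;
    case RefineResult::CouldNotParse: stats.retried++;  retry_later = true; break;
  }
}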
// Print accumulated summary info from the start of the VM.
void print_summary_info();

View File

@ -27,7 +27,6 @@
#include "gc/g1/g1CollectedHeap.inline.hpp"
#include "gc/g1/g1ConcurrentRefine.hpp"
#include "gc/g1/g1ConcurrentRefineThread.hpp"
#include "gc/g1/g1DirtyCardQueue.hpp"
#include "gc/g1/g1HeapRegion.hpp"
#include "gc/g1/g1HeapRegionRemSet.inline.hpp"
#include "gc/g1/g1RemSet.hpp"
@ -37,39 +36,61 @@
#include "runtime/javaThread.hpp"
void G1RemSetSummary::update() {
class CollectData : public ThreadClosure {
G1ConcurrentRefine* refine = G1CollectedHeap::heap()->concurrent_refine();
class CollectWorkerData : public ThreadClosure {
G1RemSetSummary* _summary;
uint _counter;
public:
CollectData(G1RemSetSummary * summary) : _summary(summary), _counter(0) {}
CollectWorkerData(G1RemSetSummary* summary) : _summary(summary), _counter(0) {}
virtual void do_thread(Thread* t) {
G1ConcurrentRefineThread* crt = static_cast<G1ConcurrentRefineThread*>(t);
_summary->set_refine_thread_cpu_time(_counter, crt->cpu_time());
_summary->set_worker_thread_cpu_time(_counter, crt->cpu_time());
_counter++;
}
} collector(this);
G1CollectedHeap* g1h = G1CollectedHeap::heap();
g1h->concurrent_refine()->threads_do(&collector);
refine->worker_threads_do(&collector);
class CollectControlData : public ThreadClosure {
G1RemSetSummary* _summary;
public:
CollectControlData(G1RemSetSummary* summary) : _summary(summary) {}
virtual void do_thread(Thread* t) {
G1ConcurrentRefineThread* crt = static_cast<G1ConcurrentRefineThread*>(t);
_summary->set_control_thread_cpu_time(crt->cpu_time());
}
} control(this);
refine->control_thread_do(&control);
}
void G1RemSetSummary::set_refine_thread_cpu_time(uint thread, jlong value) {
assert(_refine_threads_cpu_times != nullptr, "just checking");
assert(thread < _num_refine_threads, "just checking");
_refine_threads_cpu_times[thread] = value;
void G1RemSetSummary::set_worker_thread_cpu_time(uint thread, jlong value) {
assert(_worker_threads_cpu_times != nullptr, "just checking");
assert(thread < _num_worker_threads, "just checking");
_worker_threads_cpu_times[thread] = value;
}
jlong G1RemSetSummary::refine_thread_cpu_time(uint thread) const {
assert(_refine_threads_cpu_times != nullptr, "just checking");
assert(thread < _num_refine_threads, "just checking");
return _refine_threads_cpu_times[thread];
void G1RemSetSummary::set_control_thread_cpu_time(jlong value) {
_control_thread_cpu_time = value;
}
jlong G1RemSetSummary::worker_thread_cpu_time(uint thread) const {
assert(_worker_threads_cpu_times != nullptr, "just checking");
assert(thread < _num_worker_threads, "just checking");
return _worker_threads_cpu_times[thread];
}
jlong G1RemSetSummary::control_thread_cpu_time() const {
return _control_thread_cpu_time;
}
G1RemSetSummary::G1RemSetSummary(bool should_update) :
_num_refine_threads(G1ConcRefinementThreads),
_refine_threads_cpu_times(NEW_C_HEAP_ARRAY(jlong, _num_refine_threads, mtGC)) {
_num_worker_threads(G1ConcRefinementThreads),
_worker_threads_cpu_times(NEW_C_HEAP_ARRAY(jlong, _num_worker_threads, mtGC)),
_control_thread_cpu_time(0) {
memset(_refine_threads_cpu_times, 0, sizeof(jlong) * _num_refine_threads);
memset(_worker_threads_cpu_times, 0, sizeof(jlong) * _num_worker_threads);
if (should_update) {
update();
@ -77,23 +98,25 @@ G1RemSetSummary::G1RemSetSummary(bool should_update) :
}
G1RemSetSummary::~G1RemSetSummary() {
FREE_C_HEAP_ARRAY(jlong, _refine_threads_cpu_times);
FREE_C_HEAP_ARRAY(jlong, _worker_threads_cpu_times);
}
void G1RemSetSummary::set(G1RemSetSummary* other) {
assert(other != nullptr, "just checking");
assert(_num_refine_threads == other->_num_refine_threads, "just checking");
assert(_num_worker_threads == other->_num_worker_threads, "just checking");
memcpy(_refine_threads_cpu_times, other->_refine_threads_cpu_times, sizeof(jlong) * _num_refine_threads);
memcpy(_worker_threads_cpu_times, other->_worker_threads_cpu_times, sizeof(jlong) * _num_worker_threads);
_control_thread_cpu_time = other->_control_thread_cpu_time;
}
void G1RemSetSummary::subtract_from(G1RemSetSummary* other) {
assert(other != nullptr, "just checking");
assert(_num_refine_threads == other->_num_refine_threads, "just checking");
assert(_num_worker_threads == other->_num_worker_threads, "just checking");
for (uint i = 0; i < _num_refine_threads; i++) {
set_refine_thread_cpu_time(i, other->refine_thread_cpu_time(i) - refine_thread_cpu_time(i));
for (uint i = 0; i < _num_worker_threads; i++) {
set_worker_thread_cpu_time(i, other->worker_thread_cpu_time(i) - worker_thread_cpu_time(i));
}
_control_thread_cpu_time = other->_control_thread_cpu_time - _control_thread_cpu_time;
}
class G1PerRegionTypeRemSetCounters {
@ -376,9 +399,10 @@ public:
void G1RemSetSummary::print_on(outputStream* out, bool show_thread_times) {
if (show_thread_times) {
out->print_cr(" Concurrent refinement threads times (s)");
out->print_cr(" Control %5.2f Workers", (double)control_thread_cpu_time() / NANOSECS_PER_SEC);
out->print(" ");
for (uint i = 0; i < _num_refine_threads; i++) {
out->print(" %5.2f", (double)refine_thread_cpu_time(i) / NANOSECS_PER_SEC);
for (uint i = 0; i < _num_worker_threads; i++) {
out->print(" %5.2f", (double)worker_thread_cpu_time(i) / NANOSECS_PER_SEC);
}
out->cr();
}

View File

@ -33,10 +33,12 @@ class G1RemSet;
// A G1RemSetSummary manages statistical information about the remembered set.
class G1RemSetSummary {
size_t _num_refine_threads;
jlong* _refine_threads_cpu_times;
size_t _num_worker_threads;
jlong* _worker_threads_cpu_times;
jlong _control_thread_cpu_time;
void set_refine_thread_cpu_time(uint thread, jlong value);
void set_worker_thread_cpu_time(uint thread, jlong value);
void set_control_thread_cpu_time(jlong value);
// Update this summary with current data from various places.
void update();
@ -53,7 +55,8 @@ public:
void print_on(outputStream* out, bool show_thread_times);
jlong refine_thread_cpu_time(uint thread) const;
jlong worker_thread_cpu_time(uint thread) const;
jlong control_thread_cpu_time() const;
};
#endif // SHARE_GC_G1_G1REMSETSUMMARY_HPP

View File

@ -0,0 +1,96 @@
/*
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "gc/g1/g1CollectedHeap.hpp"
#include "gc/g1/g1Policy.hpp"
#include "gc/g1/g1ReviseYoungLengthTask.hpp"
#include "gc/g1/g1ServiceThread.hpp"
#include "gc/shared/suspendibleThreadSet.hpp"
jlong G1ReviseYoungLengthTask::reschedule_delay_ms() const {
G1Policy* policy = G1CollectedHeap::heap()->policy();
size_t available_bytes;
if (policy->try_get_available_bytes_estimate(available_bytes)) {
double predicted_time_to_next_gc_ms = policy->predict_time_to_next_gc_ms(available_bytes);
// Use a prime number close to 50ms as the minimum time, different from the values used
// by other components that derive their wait time from the try_get_available_bytes_estimate()
// call, to minimize interference.
uint64_t const min_wait_time_ms = 47;
return policy->adjust_wait_time_ms(predicted_time_to_next_gc_ms, min_wait_time_ms);
} else {
// Failed to get estimate of available bytes. Try again asap.
return 1;
}
}
class G1ReviseYoungLengthTask::RemSetSamplingClosure : public G1HeapRegionClosure {
size_t _sampled_code_root_rs_length;
public:
RemSetSamplingClosure() : _sampled_code_root_rs_length(0) { }
bool do_heap_region(G1HeapRegion* r) override {
G1HeapRegionRemSet* rem_set = r->rem_set();
_sampled_code_root_rs_length += rem_set->code_roots_list_length();
return false;
}
size_t sampled_code_root_rs_length() const { return _sampled_code_root_rs_length; }
};
void G1ReviseYoungLengthTask::adjust_young_list_target_length() {
G1CollectedHeap* g1h = G1CollectedHeap::heap();
G1Policy* policy = g1h->policy();
assert(policy->use_adaptive_young_list_length(), "should not call otherwise");
size_t pending_cards;
size_t current_to_collection_set_cards;
{
MutexLocker x(G1ReviseYoungLength_lock, Mutex::_no_safepoint_check_flag);
pending_cards = policy->current_pending_cards();
current_to_collection_set_cards = policy->current_to_collection_set_cards();
}
RemSetSamplingClosure cl;
g1h->collection_set()->iterate(&cl);
policy->revise_young_list_target_length(pending_cards,
current_to_collection_set_cards,
cl.sampled_code_root_rs_length());
}
G1ReviseYoungLengthTask::G1ReviseYoungLengthTask(const char* name) :
G1ServiceTask(name) { }
void G1ReviseYoungLengthTask::execute() {
SuspendibleThreadSetJoiner sts;
adjust_young_list_target_length();
schedule(reschedule_delay_ms());
}

View File

@ -0,0 +1,63 @@
/*
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#ifndef SHARE_GC_G1_G1REVISEYOUNGLENGTHTASK_HPP
#define SHARE_GC_G1_G1REVISEYOUNGLENGTHTASK_HPP
#include "gc/g1/g1CardSetMemory.hpp"
#include "gc/g1/g1HeapRegionRemSet.hpp"
#include "gc/g1/g1MonotonicArenaFreePool.hpp"
#include "gc/g1/g1ServiceThread.hpp"
#include "utilities/growableArray.hpp"
#include "utilities/ticks.hpp"
// ServiceTask to revise the young generation target length.
class G1ReviseYoungLengthTask : public G1ServiceTask {
// The delay used to reschedule this task.
jlong reschedule_delay_ms() const;
class RemSetSamplingClosure; // Helper class for calculating remembered set summary.
// Adjust the target length (in regions) of the young gen, based on the
// current length of the remembered sets.
//
// At the end of the GC G1 determines the length of the young gen based on
// how much time the next GC can take, and when the next GC may occur
// according to the MMU.
//
// The assumption is that a significant part of the GC is spent on scanning
// the remembered sets (and many other components), so this thread constantly
// reevaluates the prediction for the remembered set scanning costs, and potentially
// resizes the young gen. This may trigger a premature GC or even increase the young
// gen size to keep the pause time goal.
void adjust_young_list_target_length();
public:
explicit G1ReviseYoungLengthTask(const char* name);
void execute() override;
};
#endif // SHARE_GC_G1_G1REVISEYOUNGLENGTHTASK_HPP

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -25,7 +25,7 @@
#define SHARE_GC_G1_G1THREADLOCALDATA_HPP
#include "gc/g1/g1BarrierSet.hpp"
#include "gc/g1/g1DirtyCardQueue.hpp"
#include "gc/g1/g1CardTable.hpp"
#include "gc/g1/g1RegionPinCache.hpp"
#include "gc/shared/gc_globals.hpp"
#include "gc/shared/satbMarkQueue.hpp"
@ -36,7 +36,7 @@
class G1ThreadLocalData {
private:
SATBMarkQueue _satb_mark_queue;
G1DirtyCardQueue _dirty_card_queue;
G1CardTable::CardValue* _byte_map_base;
// Per-thread cache of pinned object count to reduce atomic operation traffic
// due to region pinning. Holds the last region where the mutator pinned an
@ -45,8 +45,8 @@ private:
G1ThreadLocalData() :
_satb_mark_queue(&G1BarrierSet::satb_mark_queue_set()),
_dirty_card_queue(&G1BarrierSet::dirty_card_queue_set()),
_pin_cache() {}
_byte_map_base(nullptr),
_pin_cache() { }
static G1ThreadLocalData* data(Thread* thread) {
assert(UseG1GC, "Sanity");
@ -57,10 +57,6 @@ private:
return Thread::gc_data_offset() + byte_offset_of(G1ThreadLocalData, _satb_mark_queue);
}
static ByteSize dirty_card_queue_offset() {
return Thread::gc_data_offset() + byte_offset_of(G1ThreadLocalData, _dirty_card_queue);
}
public:
static void create(Thread* thread) {
new (data(thread)) G1ThreadLocalData();
@ -74,10 +70,6 @@ public:
return data(thread)->_satb_mark_queue;
}
static G1DirtyCardQueue& dirty_card_queue(Thread* thread) {
return data(thread)->_dirty_card_queue;
}
static ByteSize satb_mark_queue_active_offset() {
return satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active();
}
@ -90,14 +82,20 @@ public:
return satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_buf();
}
static ByteSize dirty_card_queue_index_offset() {
return dirty_card_queue_offset() + G1DirtyCardQueue::byte_offset_of_index();
static ByteSize card_table_base_offset() {
return Thread::gc_data_offset() + byte_offset_of(G1ThreadLocalData, _byte_map_base);
}
static ByteSize dirty_card_queue_buffer_offset() {
return dirty_card_queue_offset() + G1DirtyCardQueue::byte_offset_of_buf();
static void set_byte_map_base(Thread* thread, G1CardTable::CardValue* new_byte_map_base) {
data(thread)->_byte_map_base = new_byte_map_base;
}
#ifndef PRODUCT
static G1CardTable::CardValue* get_byte_map_base(Thread* thread) {
return data(thread)->_byte_map_base;
}
#endif
static G1RegionPinCache& pin_count_cache(Thread* thread) {
return data(thread)->_pin_cache;
}
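The _byte_map_base cached above is what barrier code reads through card_table_base_offset(); the address arithmetic it enables is the standard card table mapping, sketched standalone below (512-byte cards and a dirty value of zero match G1, everything else is a placeholder):

#include <cstdint>

typedef uint8_t CardValue;

static const int       kCardShift = 9;  // 512-byte cards
static const CardValue kDirty     = 0;  // dirty_card_val()

// Per-thread view of the current card table, as cached in G1ThreadLocalData.
struct ThreadLocalGCData {
  CardValue* byte_map_base;              // corresponds to _byte_map_base
};

// Post-barrier card mark, sketched: index the thread's byte_map_base with the
// store address shifted by the card size and write the dirty value.
inline void mark_card(ThreadLocalGCData* tld, void* store_addr) {
  CardValue* card = tld->byte_map_base + (reinterpret_cast<uintptr_t>(store_addr) >> kCardShift);
  if (*card != kDirty) {                 // conditional mark, as with UseCondCardMark
    *card = kDirty;
  }
}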

View File

@ -39,7 +39,6 @@
#include "gc/g1/g1MonitoringSupport.hpp"
#include "gc/g1/g1ParScanThreadState.inline.hpp"
#include "gc/g1/g1Policy.hpp"
#include "gc/g1/g1RedirtyCardsQueue.hpp"
#include "gc/g1/g1RegionPinCache.inline.hpp"
#include "gc/g1/g1RemSet.hpp"
#include "gc/g1/g1RootProcessor.hpp"
@ -914,13 +913,8 @@ class G1STWRefProcProxyTask : public RefProcProxyTask {
TaskTerminator _terminator;
G1ScannerTasksQueueSet& _task_queues;
// Special closure for enqueuing discovered fields: during enqueue the card table
// may not be in shape to properly handle normal barrier calls (e.g. card marks
// in regions that failed evacuation, scribbling of various values by card table
// scan code). Additionally the regular barrier enqueues into the "global"
// DCQS, but during GC we need these to-be-refined entries in the GC local queue
// so that after clearing the card table, the redirty cards phase will properly
// mark all dirty cards to be picked up by refinement.
// G1-specific closure for marking discovered fields. Need to mark the card in the
// refinement table as the card table is in use by garbage collection.
class G1EnqueueDiscoveredFieldClosure : public EnqueueDiscoveredFieldClosure {
G1CollectedHeap* _g1h;
G1ParScanThreadState* _pss;

View File

@ -45,7 +45,6 @@ class G1MonotonicArenaMemoryStats;
class G1NewTracer;
class G1ParScanThreadStateSet;
class G1Policy;
class G1RedirtyCardsQueueSet;
class G1RemSet;
class G1SurvivorRegions;
class G1YoungGCAllocationFailureInjector;

View File

@ -287,7 +287,7 @@ public:
_chunk_bitmap(mtGC) {
_num_evac_fail_regions = _evac_failure_regions->num_regions_evac_failed();
_num_chunks_per_region = G1CollectedHeap::get_chunks_per_region();
_num_chunks_per_region = G1CollectedHeap::get_chunks_per_region_for_scan();
_chunk_size = static_cast<uint>(G1HeapRegion::GrainWords / _num_chunks_per_region);
@ -300,7 +300,7 @@ public:
double worker_cost() const override {
assert(_evac_failure_regions->has_regions_evac_failed(), "Should not call this if there were no evacuation failures");
double workers_per_region = (double)G1CollectedHeap::get_chunks_per_region() / G1RestoreRetainedRegionChunksPerWorker;
double workers_per_region = (double)G1CollectedHeap::get_chunks_per_region_for_scan() / G1RestoreRetainedRegionChunksPerWorker;
return workers_per_region * _evac_failure_regions->num_regions_evac_failed();
}
@ -480,43 +480,6 @@ public:
}
};
class RedirtyLoggedCardTableEntryClosure : public G1CardTableEntryClosure {
size_t _num_dirtied;
G1CollectedHeap* _g1h;
G1CardTable* _g1_ct;
G1EvacFailureRegions* _evac_failure_regions;
G1HeapRegion* region_for_card(CardValue* card_ptr) const {
return _g1h->heap_region_containing(_g1_ct->addr_for(card_ptr));
}
bool will_become_free(G1HeapRegion* hr) const {
// A region will be freed during the FreeCollectionSet phase if the region is in the
// collection set and has not had an evacuation failure.
return _g1h->is_in_cset(hr) && !_evac_failure_regions->contains(hr->hrm_index());
}
public:
RedirtyLoggedCardTableEntryClosure(G1CollectedHeap* g1h, G1EvacFailureRegions* evac_failure_regions) :
G1CardTableEntryClosure(),
_num_dirtied(0),
_g1h(g1h),
_g1_ct(g1h->card_table()),
_evac_failure_regions(evac_failure_regions) { }
void do_card_ptr(CardValue* card_ptr) override {
G1HeapRegion* hr = region_for_card(card_ptr);
// Should only dirty cards in regions that won't be freed.
if (!will_become_free(hr)) {
*card_ptr = G1CardTable::dirty_card_val();
_num_dirtied++;
}
}
size_t num_dirtied() const { return _num_dirtied; }
};
class G1PostEvacuateCollectionSetCleanupTask2::ProcessEvacuationFailedRegionsTask : public G1AbstractSubTask {
G1EvacFailureRegions* _evac_failure_regions;
G1HeapRegionClaimer _claimer;
@ -572,48 +535,6 @@ public:
}
};
class G1PostEvacuateCollectionSetCleanupTask2::RedirtyLoggedCardsTask : public G1AbstractSubTask {
BufferNodeList* _rdc_buffers;
uint _num_buffer_lists;
G1EvacFailureRegions* _evac_failure_regions;
public:
RedirtyLoggedCardsTask(G1EvacFailureRegions* evac_failure_regions, BufferNodeList* rdc_buffers, uint num_buffer_lists) :
G1AbstractSubTask(G1GCPhaseTimes::RedirtyCards),
_rdc_buffers(rdc_buffers),
_num_buffer_lists(num_buffer_lists),
_evac_failure_regions(evac_failure_regions) { }
double worker_cost() const override {
// Needs more investigation.
return G1CollectedHeap::heap()->workers()->active_workers();
}
void do_work(uint worker_id) override {
RedirtyLoggedCardTableEntryClosure cl(G1CollectedHeap::heap(), _evac_failure_regions);
uint start = worker_id;
for (uint i = 0; i < _num_buffer_lists; i++) {
uint index = (start + i) % _num_buffer_lists;
BufferNode* next = AtomicAccess::load(&_rdc_buffers[index]._head);
BufferNode* tail = AtomicAccess::load(&_rdc_buffers[index]._tail);
while (next != nullptr) {
BufferNode* node = next;
next = AtomicAccess::cmpxchg(&_rdc_buffers[index]._head, node, (node != tail ) ? node->next() : nullptr);
if (next == node) {
cl.apply_to_buffer(node, worker_id);
next = (node != tail ) ? node->next() : nullptr;
} else {
break; // If there is contention, move to the next BufferNodeList
}
}
}
record_work_item(worker_id, 0, cl.num_dirtied());
}
};
// Helper class to keep statistics for the collection set freeing
class FreeCSetStats {
size_t _before_used_bytes; // Usage in regions successfully evacuated
@ -797,7 +718,6 @@ public:
JFREventForRegion event(r, _worker_id);
TimerForRegion timer(timer_for_region(r));
if (r->is_young()) {
assert_tracks_surviving_words(r);
r->record_surv_words_in_group(_surviving_young_words[r->young_index_in_cset()]);
@ -908,24 +828,34 @@ public:
}
};
class G1PostEvacuateCollectionSetCleanupTask2::ResizeTLABsTask : public G1AbstractSubTask {
class G1PostEvacuateCollectionSetCleanupTask2::ResizeTLABsAndSwapCardTableTask : public G1AbstractSubTask {
G1JavaThreadsListClaimer _claimer;
// There is not much work per thread so the number of threads per worker is high.
static const uint ThreadsPerWorker = 250;
public:
ResizeTLABsTask() : G1AbstractSubTask(G1GCPhaseTimes::ResizeThreadLABs), _claimer(ThreadsPerWorker) { }
ResizeTLABsAndSwapCardTableTask()
: G1AbstractSubTask(G1GCPhaseTimes::ResizeThreadLABs), _claimer(ThreadsPerWorker)
{
G1BarrierSet::g1_barrier_set()->swap_global_card_table();
}
void do_work(uint worker_id) override {
class ResizeClosure : public ThreadClosure {
class ResizeAndSwapCardTableClosure : public ThreadClosure {
public:
void do_thread(Thread* thread) {
static_cast<JavaThread*>(thread)->tlab().resize();
if (UseTLAB && ResizeTLAB) {
static_cast<JavaThread*>(thread)->tlab().resize();
}
G1BarrierSet::g1_barrier_set()->update_card_table_base(thread);
}
} cl;
_claimer.apply(&cl);
} resize_and_swap_cl;
_claimer.apply(&resize_and_swap_cl);
}
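The swap-then-republish structure of this task can be modeled standalone: one global pointer is flipped between two tables once (serial, in the constructor), and every thread's cached pointer is refreshed afterwards (parallel, in do_work()). The real swap happens during the pause, so there are no racing mutators; the sketch below only mirrors the shape, and all names are placeholders:

#include <atomic>

struct CardTableStorage { /* byte map etc. */ };

static CardTableStorage table_a, table_b;
static std::atomic<CardTableStorage*> current_table{&table_a};

struct PerThreadData {
  CardTableStorage* cached_table = nullptr;  // stands in for the per-thread base
};

// Serial step: flip the global pointer to the other table.
void swap_global_card_table() {
  CardTableStorage* cur = current_table.load(std::memory_order_relaxed);
  current_table.store(cur == &table_a ? &table_b : &table_a, std::memory_order_release);
}

// Per-thread step: make the thread's cached view point at the new table.
void update_card_table_base(PerThreadData& t) {
  t.cached_table = current_table.load(std::memory_order_acquire);
}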
double worker_cost() const override {
@ -968,13 +898,8 @@ G1PostEvacuateCollectionSetCleanupTask2::G1PostEvacuateCollectionSetCleanupTask2
if (evac_failure_regions->has_regions_evac_failed()) {
add_parallel_task(new ProcessEvacuationFailedRegionsTask(evac_failure_regions));
}
add_parallel_task(new RedirtyLoggedCardsTask(evac_failure_regions,
per_thread_states->rdc_buffers(),
per_thread_states->num_workers()));
if (UseTLAB && ResizeTLAB) {
add_parallel_task(new ResizeTLABsTask());
}
add_parallel_task(new ResizeTLABsAndSwapCardTableTask());
add_parallel_task(new FreeCollectionSetTask(evacuation_info,
per_thread_states->surviving_young_words(),
evac_failure_regions));

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2021, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -55,9 +55,8 @@ public:
// - Eagerly Reclaim Humongous Objects (s)
// - Update Derived Pointers (s)
// - Clear Retained Region Data (on evacuation failure)
// - Redirty Logged Cards
// - Free Collection Set
// - Resize TLABs
// - Resize TLABs and Swap Card Table
// - Reset the reusable PartialArrayStateManager.
class G1PostEvacuateCollectionSetCleanupTask2 : public G1BatchedTask {
class EagerlyReclaimHumongousObjectsTask;
@ -66,9 +65,8 @@ class G1PostEvacuateCollectionSetCleanupTask2 : public G1BatchedTask {
#endif
class ProcessEvacuationFailedRegionsTask;
class RedirtyLoggedCardsTask;
class FreeCollectionSetTask;
class ResizeTLABsTask;
class ResizeTLABsAndSwapCardTableTask;
class ResetPartialArrayStateManagerTask;
public:

View File

@ -24,7 +24,6 @@
#include "gc/g1/g1CollectedHeap.inline.hpp"
#include "gc/g1/g1ConcurrentRefineStats.hpp"
#include "gc/g1/g1DirtyCardQueue.hpp"
#include "gc/g1/g1RegionPinCache.inline.hpp"
#include "gc/g1/g1ThreadLocalData.hpp"
#include "gc/g1/g1YoungGCPreEvacuateTasks.hpp"
@ -35,23 +34,21 @@
#include "runtime/thread.inline.hpp"
#include "runtime/threads.hpp"
class G1PreEvacuateCollectionSetBatchTask::JavaThreadRetireTLABAndFlushLogs : public G1AbstractSubTask {
class G1PreEvacuateCollectionSetBatchTask::JavaThreadRetireTLABs : public G1AbstractSubTask {
G1JavaThreadsListClaimer _claimer;
// Per worker thread statistics.
ThreadLocalAllocStats* _local_tlab_stats;
G1ConcurrentRefineStats* _local_refinement_stats;
uint _num_workers;
// There is relatively little work to do per thread.
static const uint ThreadsPerWorker = 250;
struct RetireTLABAndFlushLogsClosure : public ThreadClosure {
struct RetireTLABClosure : public ThreadClosure {
ThreadLocalAllocStats _tlab_stats;
G1ConcurrentRefineStats _refinement_stats;
RetireTLABAndFlushLogsClosure() : _tlab_stats(), _refinement_stats() { }
RetireTLABClosure() : _tlab_stats() { }
void do_thread(Thread* thread) override {
assert(thread->is_Java_thread(), "must be");
@ -61,37 +58,29 @@ class G1PreEvacuateCollectionSetBatchTask::JavaThreadRetireTLABAndFlushLogs : pu
if (UseTLAB) {
thread->retire_tlab(&_tlab_stats);
}
// Concatenate logs.
G1DirtyCardQueueSet& qset = G1BarrierSet::dirty_card_queue_set();
_refinement_stats += qset.concatenate_log_and_stats(thread);
// Flush region pin count cache.
G1ThreadLocalData::pin_count_cache(thread).flush();
}
};
public:
JavaThreadRetireTLABAndFlushLogs() :
G1AbstractSubTask(G1GCPhaseTimes::RetireTLABsAndFlushLogs),
JavaThreadRetireTLABs() :
G1AbstractSubTask(G1GCPhaseTimes::RetireTLABs),
_claimer(ThreadsPerWorker),
_local_tlab_stats(nullptr),
_local_refinement_stats(nullptr),
_num_workers(0) {
}
~JavaThreadRetireTLABAndFlushLogs() {
static_assert(std::is_trivially_destructible<G1ConcurrentRefineStats>::value, "must be");
FREE_C_HEAP_ARRAY(G1ConcurrentRefineStats, _local_refinement_stats);
~JavaThreadRetireTLABs() {
static_assert(std::is_trivially_destructible<ThreadLocalAllocStats>::value, "must be");
FREE_C_HEAP_ARRAY(ThreadLocalAllocStats, _local_tlab_stats);
}
void do_work(uint worker_id) override {
RetireTLABAndFlushLogsClosure tc;
RetireTLABClosure tc;
_claimer.apply(&tc);
_local_tlab_stats[worker_id] = tc._tlab_stats;
_local_refinement_stats[worker_id] = tc._refinement_stats;
}
double worker_cost() const override {
@ -101,11 +90,9 @@ public:
void set_max_workers(uint max_workers) override {
_num_workers = max_workers;
_local_tlab_stats = NEW_C_HEAP_ARRAY(ThreadLocalAllocStats, _num_workers, mtGC);
_local_refinement_stats = NEW_C_HEAP_ARRAY(G1ConcurrentRefineStats, _num_workers, mtGC);
for (uint i = 0; i < _num_workers; i++) {
::new (&_local_tlab_stats[i]) ThreadLocalAllocStats();
::new (&_local_refinement_stats[i]) G1ConcurrentRefineStats();
}
}
@ -116,85 +103,15 @@ public:
}
return result;
}
G1ConcurrentRefineStats refinement_stats() const {
G1ConcurrentRefineStats result;
for (uint i = 0; i < _num_workers; i++) {
result += _local_refinement_stats[i];
}
return result;
}
};
class G1PreEvacuateCollectionSetBatchTask::NonJavaThreadFlushLogs : public G1AbstractSubTask {
struct FlushLogsClosure : public ThreadClosure {
G1ConcurrentRefineStats _refinement_stats;
FlushLogsClosure() : _refinement_stats() { }
void do_thread(Thread* thread) override {
G1DirtyCardQueueSet& qset = G1BarrierSet::dirty_card_queue_set();
_refinement_stats += qset.concatenate_log_and_stats(thread);
assert(G1ThreadLocalData::pin_count_cache(thread).count() == 0, "NonJava thread has pinned Java objects");
}
} _tc;
public:
NonJavaThreadFlushLogs() : G1AbstractSubTask(G1GCPhaseTimes::NonJavaThreadFlushLogs), _tc() { }
void do_work(uint worker_id) override {
Threads::non_java_threads_do(&_tc);
}
double worker_cost() const override {
return 1.0;
}
G1ConcurrentRefineStats refinement_stats() const { return _tc._refinement_stats; }
};
G1PreEvacuateCollectionSetBatchTask::G1PreEvacuateCollectionSetBatchTask() :
G1BatchedTask("Pre Evacuate Prepare", G1CollectedHeap::heap()->phase_times()),
_old_pending_cards(G1BarrierSet::dirty_card_queue_set().num_cards()),
_java_retire_task(new JavaThreadRetireTLABAndFlushLogs()),
_non_java_retire_task(new NonJavaThreadFlushLogs()) {
_java_retire_task(new JavaThreadRetireTLABs()) {
// Disable mutator refinement until concurrent refinement decides otherwise.
G1BarrierSet::dirty_card_queue_set().set_mutator_refinement_threshold(SIZE_MAX);
add_serial_task(_non_java_retire_task);
add_parallel_task(_java_retire_task);
}
static void verify_empty_dirty_card_logs() {
#ifdef ASSERT
ResourceMark rm;
struct Verifier : public ThreadClosure {
Verifier() {}
void do_thread(Thread* t) override {
G1DirtyCardQueue& queue = G1ThreadLocalData::dirty_card_queue(t);
assert(queue.is_empty(), "non-empty dirty card queue for thread %s", t->name());
}
} verifier;
Threads::threads_do(&verifier);
#endif
}
G1PreEvacuateCollectionSetBatchTask::~G1PreEvacuateCollectionSetBatchTask() {
_java_retire_task->tlab_stats().publish();
G1DirtyCardQueueSet& qset = G1BarrierSet::dirty_card_queue_set();
G1ConcurrentRefineStats total_refinement_stats;
total_refinement_stats += _java_retire_task->refinement_stats();
total_refinement_stats += _non_java_retire_task->refinement_stats();
qset.update_refinement_stats(total_refinement_stats);
verify_empty_dirty_card_logs();
size_t pending_cards = qset.num_cards();
size_t thread_buffer_cards = pending_cards - _old_pending_cards;
G1CollectedHeap::heap()->policy()->record_concurrent_refinement_stats(pending_cards, thread_buffer_cards);
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2023, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2023, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -28,18 +28,13 @@
#include "gc/g1/g1BatchedTask.hpp"
// Set of pre evacuate collection set tasks containing ("s" means serial):
// - Retire TLAB and Flush Logs (Java threads)
// - Retire TLABs (Java threads)
// - Flush pin count cache (Java threads)
// - Flush Logs (s) (Non-Java threads)
class G1PreEvacuateCollectionSetBatchTask : public G1BatchedTask {
class JavaThreadRetireTLABAndFlushLogs;
class NonJavaThreadFlushLogs;
size_t _old_pending_cards;
class JavaThreadRetireTLABs;
// References to the tasks to retain access to statistics.
JavaThreadRetireTLABAndFlushLogs* _java_retire_task;
NonJavaThreadFlushLogs* _non_java_retire_task;
JavaThreadRetireTLABs* _java_retire_task;
public:
G1PreEvacuateCollectionSetBatchTask();

View File

@ -162,6 +162,11 @@
"a single expand attempt.") \
range(0, 100) \
\
product(size_t, G1PerThreadPendingCardThreshold, 256, DIAGNOSTIC, \
"Number of pending cards allowed on the card table per GC " \
"worker thread before considering starting refinement.") \
range(0, UINT_MAX) \
\
product(uint, G1ShrinkByPercentOfAvailable, 50, DIAGNOSTIC, \
"When shrinking, maximum % of free space to free for a single " \
"shrink attempt.") \
@ -188,10 +193,6 @@
"bound of acceptable deviation range.") \
constraint(G1CPUUsageShrinkConstraintFunc, AfterErgo) \
\
product(size_t, G1UpdateBufferSize, 256, \
"Size of an update buffer") \
constraint(G1UpdateBufferSizeConstraintFunc, AfterErgo) \
\
product(uint, G1RSetUpdatingPauseTimePercent, 10, \
"A target percentage of time that is allowed to be spend on " \
"processing remembered set update buffers during the collection " \

View File

@ -206,12 +206,6 @@ JVMFlag::Error G1SATBBufferSizeConstraintFunc(size_t value, bool verbose) {
verbose);
}
JVMFlag::Error G1UpdateBufferSizeConstraintFunc(size_t value, bool verbose) {
return buffer_size_constraint_helper(FLAG_MEMBER_ENUM(G1UpdateBufferSize),
value,
verbose);
}
JVMFlag::Error gc_cpu_usage_threshold_helper(JVMFlagsEnum flagid,
uint value,
bool verbose) {

View File

@ -47,7 +47,6 @@
\
/* G1 PtrQueue buffer size constraints */ \
f(size_t, G1SATBBufferSizeConstraintFunc) \
f(size_t, G1UpdateBufferSizeConstraintFunc) \
\
/* G1 GC deviation counter threshold constraints */ \
f(uint, G1CPUUsageExpandConstraintFunc) \

View File

@ -82,8 +82,7 @@
declare_constant(G1HeapRegionType::StartsHumongousTag) \
declare_constant(G1HeapRegionType::ContinuesHumongousTag) \
declare_constant(G1HeapRegionType::OldMask) \
declare_constant(BarrierSet::G1BarrierSet) \
declare_constant(G1CardTable::g1_young_gen)
declare_constant(BarrierSet::G1BarrierSet)
#define VM_TYPES_G1GC(declare_type, \
declare_toplevel_type, \
@ -100,7 +99,6 @@
declare_toplevel_type(PtrQueue) \
declare_toplevel_type(G1HeapRegionType) \
declare_toplevel_type(SATBMarkQueue) \
declare_toplevel_type(G1DirtyCardQueue) \
\
declare_toplevel_type(G1CollectedHeap*) \
declare_toplevel_type(G1HeapRegion*) \

View File

@ -1,38 +0,0 @@
/*
* Copyright (c) 2019, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "gc/shared/bufferNodeList.hpp"
#include "utilities/debug.hpp"
BufferNodeList::BufferNodeList() :
_head(nullptr), _tail(nullptr), _entry_count(0) {}
BufferNodeList::BufferNodeList(BufferNode* head,
BufferNode* tail,
size_t entry_count) :
_head(head), _tail(tail), _entry_count(entry_count)
{
assert((_head == nullptr) == (_tail == nullptr), "invariant");
assert((_head == nullptr) == (_entry_count == 0), "invariant");
}

View File

@ -225,6 +225,9 @@ uintx CardTable::ct_max_alignment_constraint() {
#ifndef PRODUCT
void CardTable::verify_region(MemRegion mr, CardValue val, bool val_equals) {
if (mr.is_empty()) {
return;
}
CardValue* start = byte_for(mr.start());
CardValue* end = byte_for(mr.last());
bool failures = false;
@ -255,7 +258,8 @@ void CardTable::verify_dirty_region(MemRegion mr) {
}
#endif
void CardTable::print_on(outputStream* st) const {
st->print_cr("Card table byte_map: [" PTR_FORMAT "," PTR_FORMAT "] _byte_map_base: " PTR_FORMAT,
void CardTable::print_on(outputStream* st, const char* description) const {
st->print_cr("%s table byte_map: [" PTR_FORMAT "," PTR_FORMAT "] _byte_map_base: " PTR_FORMAT,
description,
p2i(_byte_map), p2i(_byte_map + _byte_map_size), p2i(_byte_map_base));
}
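The added description parameter makes it possible to label several card tables in the output while keeping the old wording as the default, presumably useful now that G1 maintains more than one table. A hypothetical usage sketch; the variable names and the "Refinement" label are assumptions, not taken from this hunk:

// ct and refinement_ct are hypothetical CardTable pointers, st an outputStream*.
ct->print_on(st);                          // "Card table byte_map: [...]"
refinement_ct->print_on(st, "Refinement"); // "Refinement table byte_map: [...]"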

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2000, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2000, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -203,12 +203,12 @@ public:
virtual bool is_in_young(const void* p) const = 0;
// Print a description of the memory for the card table
virtual void print_on(outputStream* st) const;
// Print card table information.
void print_on(outputStream* st, const char* description = "Card") const;
// val_equals -> it will check that all cards covered by mr equal val
// !val_equals -> it will check that all cards covered by mr do not equal val
void verify_region(MemRegion mr, CardValue val, bool val_equals) PRODUCT_RETURN;
virtual void verify_region(MemRegion mr, CardValue val, bool val_equals) PRODUCT_RETURN;
void verify_not_dirty_region(MemRegion mr) PRODUCT_RETURN;
void verify_dirty_region(MemRegion mr) PRODUCT_RETURN;
};
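Since the val_equals flag is easy to misread, a short usage sketch may help; the table pointer and MemRegion mr are assumptions for illustration, and the calls compile to nothing in product builds via PRODUCT_RETURN:

// All cards covering mr must be dirty:
table->verify_region(mr, CardTable::dirty_card_val(), /*val_equals=*/true);
// No card covering mr may be dirty:
table->verify_region(mr, CardTable::dirty_card_val(), /*val_equals=*/false);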

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, 2019, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2015, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -34,7 +34,7 @@ template <class T>
class WorkerDataArray : public CHeapObj<mtGC> {
friend class WDAPrinter;
public:
static const uint MaxThreadWorkItems = 9;
static const uint MaxThreadWorkItems = 10;
private:
T* _data;
uint _length;

View File

@ -589,10 +589,6 @@ void JVMCIRuntime::write_barrier_pre(JavaThread* thread, oopDesc* obj) {
G1BarrierSetRuntime::write_ref_field_pre_entry(obj, thread);
}
void JVMCIRuntime::write_barrier_post(JavaThread* thread, volatile CardValue* card_addr) {
G1BarrierSetRuntime::write_ref_field_post_entry(card_addr, thread);
}
#endif // INCLUDE_G1GC
JRT_LEAF(jboolean, JVMCIRuntime::validate_object(JavaThread* thread, oopDesc* parent, oopDesc* child))

Some files were not shown because too many files have changed in this diff.