From 8d5c0056420731cbbd83f2d23837bbb5cdc9e4cc Mon Sep 17 00:00:00 2001
From: Thomas Schatzl
Date: Mon, 22 Sep 2025 13:47:45 +0000
Subject: [PATCH] 8342382: Implement JEP 522: G1 GC: Improve Throughput by
 Reducing Synchronization
Co-authored-by: Amit Kumar
Co-authored-by: Martin Doerr
Co-authored-by: Carlo Refice
Co-authored-by: Fei Yang
Reviewed-by: iwalulya, rcastanedalo, aph, ayang
---
 .../gc/g1/g1BarrierSetAssembler_aarch64.cpp | 239 ++--
 .../gc/g1/g1BarrierSetAssembler_aarch64.hpp | 17 +-
 src/hotspot/cpu/aarch64/gc/g1/g1_aarch64.ad | 8 +-
 .../arm/gc/g1/g1BarrierSetAssembler_arm.cpp | 239 +-----
 .../arm/gc/g1/g1BarrierSetAssembler_arm.hpp | 17 +-
 src/hotspot/cpu/arm/gc/g1/g1_arm.ad | 8 +-
 .../ppc/gc/g1/g1BarrierSetAssembler_ppc.cpp | 270 ++--
 .../ppc/gc/g1/g1BarrierSetAssembler_ppc.hpp | 21 +-
 src/hotspot/cpu/ppc/gc/g1/g1_ppc.ad | 8 +-
 .../gc/g1/g1BarrierSetAssembler_riscv.cpp | 263 ++--
 .../gc/g1/g1BarrierSetAssembler_riscv.hpp | 18 +-
 src/hotspot/cpu/riscv/gc/g1/g1_riscv.ad | 8 +-
 .../s390/gc/g1/g1BarrierSetAssembler_s390.cpp | 351 ++------
 .../s390/gc/g1/g1BarrierSetAssembler_s390.hpp | 18 +-
 src/hotspot/cpu/s390/gc/g1/g1_s390.ad | 8 +-
 .../x86/gc/g1/g1BarrierSetAssembler_x86.cpp | 265 ++--
 .../x86/gc/g1/g1BarrierSetAssembler_x86.hpp | 31 +-
 src/hotspot/cpu/x86/gc/g1/g1_x86_64.ad | 38 +-
 src/hotspot/share/code/aotCodeCache.cpp | 1 -
 src/hotspot/share/gc/g1/c1/g1BarrierSetC1.cpp | 130 ++-
 src/hotspot/share/gc/g1/c1/g1BarrierSetC1.hpp | 33 +-
 src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp | 69 +-
 src/hotspot/share/gc/g1/c2/g1BarrierSetC2.hpp | 25 +-
 src/hotspot/share/gc/g1/g1Allocator.cpp | 3 -
 src/hotspot/share/gc/g1/g1Analytics.cpp | 40 +-
 src/hotspot/share/gc/g1/g1Analytics.hpp | 19 +-
 src/hotspot/share/gc/g1/g1Arguments.cpp | 9 +-
 src/hotspot/share/gc/g1/g1BarrierSet.cpp | 91 +-
 src/hotspot/share/gc/g1/g1BarrierSet.hpp | 60 +-
 .../share/gc/g1/g1BarrierSet.inline.hpp | 7 +-
 .../share/gc/g1/g1BarrierSetRuntime.cpp | 14 +-
 .../share/gc/g1/g1BarrierSetRuntime.hpp | 3 +-
 src/hotspot/share/gc/g1/g1CardTable.cpp | 40 +-
 src/hotspot/share/gc/g1/g1CardTable.hpp | 59 +-
 .../share/gc/g1/g1CardTable.inline.hpp | 52 +-
 .../share/gc/g1/g1CardTableClaimTable.cpp | 97 +++
 .../share/gc/g1/g1CardTableClaimTable.hpp | 137 ++++
 .../gc/g1/g1CardTableClaimTable.inline.hpp | 128 +++
 src/hotspot/share/gc/g1/g1CollectedHeap.cpp | 125 ++-
 src/hotspot/share/gc/g1/g1CollectedHeap.hpp | 46 +-
 .../share/gc/g1/g1CollectedHeap.inline.hpp | 24 -
 src/hotspot/share/gc/g1/g1CollectionSet.cpp | 9 +-
 src/hotspot/share/gc/g1/g1ConcurrentMark.cpp | 7 +-
 src/hotspot/share/gc/g1/g1ConcurrentMark.hpp | 2 +
 .../gc/g1/g1ConcurrentMarkRemarkTasks.cpp | 10 +-
 .../gc/g1/g1ConcurrentRebuildAndScrub.cpp | 2 +-
 .../share/gc/g1/g1ConcurrentRefine.cpp | 674 +++++++------
 .../share/gc/g1/g1ConcurrentRefine.hpp | 247 ++++--
 .../share/gc/g1/g1ConcurrentRefineStats.cpp | 50 +-
 .../share/gc/g1/g1ConcurrentRefineStats.hpp | 71 +-
 .../gc/g1/g1ConcurrentRefineSweepTask.cpp | 191 +++++
 .../g1ConcurrentRefineSweepTask.hpp} | 31 +-
 .../share/gc/g1/g1ConcurrentRefineThread.cpp | 270 +++---
 .../share/gc/g1/g1ConcurrentRefineThread.hpp | 42 +-
 .../gc/g1/g1ConcurrentRefineThreadsNeeded.cpp | 52 +-
 src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp | 599 --------------
 src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp | 302 -------
 src/hotspot/share/gc/g1/g1FromCardCache.cpp | 4 +-
 .../share/gc/g1/g1FullGCCompactTask.cpp | 4 +
 .../gc/g1/g1FullGCPrepareTask.inline.hpp | 4 +
 .../share/gc/g1/g1FullGCResetMetadataTask.cpp | 2 +-
 src/hotspot/share/gc/g1/g1GCPhaseTimes.cpp | 34 +-
 src/hotspot/share/gc/g1/g1GCPhaseTimes.hpp | 43 +-
 src/hotspot/share/gc/g1/g1HeapRegion.cpp | 38 +-
 src/hotspot/share/gc/g1/g1HeapRegion.hpp | 6 +-
 .../share/gc/g1/g1HeapRegionManager.cpp | 25 +-
 .../share/gc/g1/g1HeapRegionManager.hpp | 6 +-
 src/hotspot/share/gc/g1/g1HeapVerifier.cpp | 106 ++-
 src/hotspot/share/gc/g1/g1HeapVerifier.hpp | 15 +-
 src/hotspot/share/gc/g1/g1OopClosures.hpp | 36 +-
 .../share/gc/g1/g1OopClosures.inline.hpp | 31 +-
 .../share/gc/g1/g1ParScanThreadState.cpp | 56 +-
 .../share/gc/g1/g1ParScanThreadState.hpp | 50 +-
 .../gc/g1/g1ParScanThreadState.inline.hpp | 34 +-
 src/hotspot/share/gc/g1/g1Policy.cpp | 407 +++++----
 src/hotspot/share/gc/g1/g1Policy.hpp | 67 +-
 .../share/gc/g1/g1RedirtyCardsQueue.cpp | 148 ----
 .../share/gc/g1/g1RedirtyCardsQueue.hpp | 98 ---
 src/hotspot/share/gc/g1/g1RemSet.cpp | 776 ++++++------
 src/hotspot/share/gc/g1/g1RemSet.hpp | 29 +-
 src/hotspot/share/gc/g1/g1RemSetSummary.cpp | 74 +-
 src/hotspot/share/gc/g1/g1RemSetSummary.hpp | 11 +-
 .../share/gc/g1/g1ReviseYoungLengthTask.cpp | 96 +++
 .../share/gc/g1/g1ReviseYoungLengthTask.hpp | 63 ++
 src/hotspot/share/gc/g1/g1ThreadLocalData.hpp | 32 +-
 src/hotspot/share/gc/g1/g1YoungCollector.cpp | 10 +-
 src/hotspot/share/gc/g1/g1YoungCollector.hpp | 1 -
 .../gc/g1/g1YoungGCPostEvacuateTasks.cpp | 113 +--
 .../gc/g1/g1YoungGCPostEvacuateTasks.hpp | 8 +-
 .../share/gc/g1/g1YoungGCPreEvacuateTasks.cpp | 99 +--
 .../share/gc/g1/g1YoungGCPreEvacuateTasks.hpp | 13 +-
 src/hotspot/share/gc/g1/g1_globals.hpp | 9 +-
 .../share/gc/g1/jvmFlagConstraintsG1.cpp | 6 -
 .../share/gc/g1/jvmFlagConstraintsG1.hpp | 1 -
 src/hotspot/share/gc/g1/vmStructs_g1.hpp | 4 +-
 .../share/gc/shared/bufferNodeList.cpp | 38 -
 src/hotspot/share/gc/shared/cardTable.cpp | 8 +-
 src/hotspot/share/gc/shared/cardTable.hpp | 8 +-
 .../share/gc/shared/workerDataArray.hpp | 4 +-
 src/hotspot/share/jvmci/jvmciRuntime.cpp | 4 -
 src/hotspot/share/jvmci/vmStructs_jvmci.cpp | 6 +-
 src/hotspot/share/oops/oop.cpp | 11 +-
 src/hotspot/share/runtime/arguments.cpp | 1 +
 src/hotspot/share/runtime/cpuTimeCounters.cpp | 3 +
 src/hotspot/share/runtime/cpuTimeCounters.hpp | 1 +
 src/hotspot/share/runtime/mutexLocker.cpp | 10 +-
 src/hotspot/share/runtime/mutexLocker.hpp | 2 +-
 src/hotspot/share/runtime/vmOperation.hpp | 3 +-
 .../gcbarriers/TestG1BarrierGeneration.java | 4 +-
 .../jtreg/gc/g1/TestGCLogMessages.java | 25 +-
 .../TestOptionsWithRanges.java | 1 -
 .../ir_framework/tests/TestIRMatching.java | 2 +-
 .../vmTestbase/gc/ArrayJuggle/Juggle2.java | 7 +-
 .../gc/collection/TestG1ParallelPhases.java | 11 +-
 114 files changed, 3625 insertions(+), 4681 deletions(-)
 create mode 100644 src/hotspot/share/gc/g1/g1CardTableClaimTable.cpp
 create mode 100644 src/hotspot/share/gc/g1/g1CardTableClaimTable.hpp
 create mode 100644 src/hotspot/share/gc/g1/g1CardTableClaimTable.inline.hpp
 create mode 100644 src/hotspot/share/gc/g1/g1ConcurrentRefineSweepTask.cpp
 rename src/hotspot/share/gc/{shared/bufferNodeList.hpp => g1/g1ConcurrentRefineSweepTask.hpp} (57%)
 delete mode 100644 src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp
 delete mode 100644 src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp
 delete mode 100644 src/hotspot/share/gc/g1/g1RedirtyCardsQueue.cpp
 delete mode 100644 src/hotspot/share/gc/g1/g1RedirtyCardsQueue.hpp
 delete mode 100644 src/hotspot/share/gc/shared/bufferNodeList.cpp
diff --git
a/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.cpp index 42f3c4a015a..9950feb7470 100644 --- a/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.cpp @@ -86,15 +86,48 @@ void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm } } -void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm, DecoratorSet decorators, - Register start, Register count, Register scratch, RegSet saved_regs) { - __ push(saved_regs, sp); - assert_different_registers(start, count, scratch); - assert_different_registers(c_rarg0, count); - __ mov(c_rarg0, start); - __ mov(c_rarg1, count); - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_array_post_entry), 2); - __ pop(saved_regs, sp); +void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm, + DecoratorSet decorators, + Register start, + Register count, + Register scratch, + RegSet saved_regs) { + + Label done; + Label loop; + Label next; + + __ cbz(count, done); + + // Calculate the number of card marks to set. Since the object might start and + // end within a card, we need to calculate this via the card table indexes of + // the actual start and last addresses covered by the object. + // Temporarily use the count register for the last element address. + __ lea(count, Address(start, count, Address::lsl(LogBytesPerHeapOop))); // end = start + count << LogBytesPerHeapOop + __ sub(count, count, BytesPerHeapOop); // Use last element address for end. + + __ lsr(start, start, CardTable::card_shift()); + __ lsr(count, count, CardTable::card_shift()); + __ sub(count, count, start); // Number of bytes to mark - 1. + + // Add card table base offset to start. + __ ldr(scratch, Address(rthread, in_bytes(G1ThreadLocalData::card_table_base_offset()))); + __ add(start, start, scratch); + + __ bind(loop); + if (UseCondCardMark) { + __ ldrb(scratch, Address(start, count)); + // Instead of loading clean_card_val and comparing, we exploit the fact that + // the LSB of non-clean cards is always 0, and the LSB of clean cards 1. + __ tbz(scratch, 0, next); + } + static_assert(G1CardTable::dirty_card_val() == 0, "must be to use zr"); + __ strb(zr, Address(start, count)); + __ bind(next); + __ subs(count, count, 1); + __ br(Assembler::GE, loop); + + __ bind(done); } static void generate_queue_test_and_insertion(MacroAssembler* masm, ByteSize index_offset, ByteSize buffer_offset, Label& runtime, @@ -202,10 +235,14 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, static void generate_post_barrier_fast_path(MacroAssembler* masm, const Register store_addr, const Register new_val, + const Register thread, const Register tmp1, const Register tmp2, Label& done, bool new_val_may_be_null) { + assert(thread == rthread, "must be"); + assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, noreg, rscratch1); + // Does store cross heap regions? __ eor(tmp1, store_addr, new_val); // tmp1 := store address ^ new value __ lsr(tmp1, tmp1, G1HeapRegion::LogOfHRGrainBytes); // tmp1 := ((store address ^ new value) >> LogOfHRGrainBytes) @@ -214,33 +251,19 @@ static void generate_post_barrier_fast_path(MacroAssembler* masm, if (new_val_may_be_null) { __ cbz(new_val, done); } - // Storing region crossing non-null, is card young? + // Storing region crossing non-null. 
__ lsr(tmp1, store_addr, CardTable::card_shift()); // tmp1 := card address relative to card table base - __ load_byte_map_base(tmp2); // tmp2 := card table base address - __ add(tmp1, tmp1, tmp2); // tmp1 := card address - __ ldrb(tmp2, Address(tmp1)); // tmp2 := card - __ cmpw(tmp2, (int)G1CardTable::g1_young_card_val()); // tmp2 := card == young_card_val? -} -static void generate_post_barrier_slow_path(MacroAssembler* masm, - const Register thread, - const Register tmp1, - const Register tmp2, - Label& done, - Label& runtime) { - __ membar(Assembler::StoreLoad); // StoreLoad membar - __ ldrb(tmp2, Address(tmp1)); // tmp2 := card - __ cbzw(tmp2, done); - // Storing a region crossing, non-null oop, card is clean. - // Dirty card and log. - STATIC_ASSERT(CardTable::dirty_card_val() == 0); - __ strb(zr, Address(tmp1)); // *(card address) := dirty_card_val - generate_queue_test_and_insertion(masm, - G1ThreadLocalData::dirty_card_queue_index_offset(), - G1ThreadLocalData::dirty_card_queue_buffer_offset(), - runtime, - thread, tmp1, tmp2, rscratch1); - __ b(done); + Address card_table_addr(thread, in_bytes(G1ThreadLocalData::card_table_base_offset())); + __ ldr(tmp2, card_table_addr); // tmp2 := card table base address + if (UseCondCardMark) { + __ ldrb(rscratch1, Address(tmp1, tmp2)); // rscratch1 := card + // Instead of loading clean_card_val and comparing, we exploit the fact that + // the LSB of non-clean cards is always 0, and the LSB of clean cards 1. + __ tbz(rscratch1, 0, done); + } + static_assert(G1CardTable::dirty_card_val() == 0, "must be to use zr"); + __ strb(zr, Address(tmp1, tmp2)); // *(card address) := dirty_card_val } void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, @@ -249,27 +272,8 @@ void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, Register thread, Register tmp1, Register tmp2) { - assert(thread == rthread, "must be"); - assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, - rscratch1); - assert(store_addr != noreg && new_val != noreg && tmp1 != noreg - && tmp2 != noreg, "expecting a register"); - Label done; - Label runtime; - - generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, done, true /* new_val_may_be_null */); - // If card is young, jump to done - __ br(Assembler::EQ, done); - generate_post_barrier_slow_path(masm, thread, tmp1, tmp2, done, runtime); - - __ bind(runtime); - // save the live input values - RegSet saved = RegSet::of(store_addr); - __ push(saved, sp); - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), tmp1, thread); - __ pop(saved, sp); - + generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, false /* new_val_may_be_null */); __ bind(done); } @@ -329,38 +333,10 @@ void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm, Register thread, Register tmp1, Register tmp2, - G1PostBarrierStubC2* stub) { - assert(thread == rthread, "must be"); - assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, - rscratch1); - assert(store_addr != noreg && new_val != noreg && tmp1 != noreg - && tmp2 != noreg, "expecting a register"); - - stub->initialize_registers(thread, tmp1, tmp2); - - bool new_val_may_be_null = (stub->barrier_data() & G1C2BarrierPostNotNull) == 0; - generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, *stub->continuation(), new_val_may_be_null); - // If card is not young, jump to stub (slow path) - __ br(Assembler::NE, *stub->entry()); - - __ 
bind(*stub->continuation()); -} - -void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm, - G1PostBarrierStubC2* stub) const { - Assembler::InlineSkippedInstructionsCounter skip_counter(masm); - Label runtime; - Register thread = stub->thread(); - Register tmp1 = stub->tmp1(); // tmp1 holds the card address. - Register tmp2 = stub->tmp2(); - assert(stub->tmp3() == noreg, "not needed in this platform"); - - __ bind(*stub->entry()); - generate_post_barrier_slow_path(masm, thread, tmp1, tmp2, *stub->continuation(), runtime); - - __ bind(runtime); - generate_c2_barrier_runtime_call(masm, stub, tmp1, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry)); - __ b(*stub->continuation()); + bool new_val_may_be_null) { + Label done; + generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, new_val_may_be_null); + __ bind(done); } #endif // COMPILER2 @@ -456,20 +432,19 @@ void G1BarrierSetAssembler::gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrier __ b(*stub->continuation()); } -void G1BarrierSetAssembler::gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub) { - G1BarrierSetC1* bs = (G1BarrierSetC1*)BarrierSet::barrier_set()->barrier_set_c1(); - __ bind(*stub->entry()); - assert(stub->addr()->is_register(), "Precondition."); - assert(stub->new_val()->is_register(), "Precondition."); - Register new_val_reg = stub->new_val()->as_register(); - __ cbz(new_val_reg, *stub->continuation()); - ce->store_parameter(stub->addr()->as_pointer_register(), 0); - __ far_call(RuntimeAddress(bs->post_barrier_c1_runtime_code_blob()->code_begin())); - __ b(*stub->continuation()); -} - #undef __ +void G1BarrierSetAssembler::g1_write_barrier_post_c1(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2) { + Label done; + generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, true /* new_val_may_be_null */); + masm->bind(done); +} + #define __ sasm-> void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm) { @@ -521,74 +496,6 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* __ epilogue(); } -void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* sasm) { - __ prologue("g1_post_barrier", false); - - // arg0: store_address - Address store_addr(rfp, 2*BytesPerWord); - - BarrierSet* bs = BarrierSet::barrier_set(); - CardTableBarrierSet* ctbs = barrier_set_cast(bs); - CardTable* ct = ctbs->card_table(); - - Label done; - Label runtime; - - // At this point we know new_value is non-null and the new_value crosses regions. - // Must check to see if card is already dirty - - const Register thread = rthread; - - Address queue_index(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset())); - Address buffer(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset())); - - const Register card_offset = rscratch2; - // LR is free here, so we can use it to hold the byte_map_base. 
- const Register byte_map_base = lr; - - assert_different_registers(card_offset, byte_map_base, rscratch1); - - __ load_parameter(0, card_offset); - __ lsr(card_offset, card_offset, CardTable::card_shift()); - __ load_byte_map_base(byte_map_base); - __ ldrb(rscratch1, Address(byte_map_base, card_offset)); - __ cmpw(rscratch1, (int)G1CardTable::g1_young_card_val()); - __ br(Assembler::EQ, done); - - assert((int)CardTable::dirty_card_val() == 0, "must be 0"); - - __ membar(Assembler::StoreLoad); - __ ldrb(rscratch1, Address(byte_map_base, card_offset)); - __ cbzw(rscratch1, done); - - // storing region crossing non-null, card is clean. - // dirty card and log. - __ strb(zr, Address(byte_map_base, card_offset)); - - // Convert card offset into an address in card_addr - Register card_addr = card_offset; - __ add(card_addr, byte_map_base, card_addr); - - __ ldr(rscratch1, queue_index); - __ cbz(rscratch1, runtime); - __ sub(rscratch1, rscratch1, wordSize); - __ str(rscratch1, queue_index); - - // Reuse LR to hold buffer_addr - const Register buffer_addr = lr; - - __ ldr(buffer_addr, buffer); - __ str(card_addr, Address(buffer_addr, rscratch1)); - __ b(done); - - __ bind(runtime); - __ push_call_clobbered_registers(); - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), card_addr, thread); - __ pop_call_clobbered_registers(); - __ bind(done); - __ epilogue(); -} - #undef __ #endif // COMPILER1 diff --git a/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.hpp index 04ac2096096..72040cd7ad2 100644 --- a/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -32,9 +32,7 @@ class LIR_Assembler; class StubAssembler; class G1PreBarrierStub; -class G1PostBarrierStub; class G1PreBarrierStubC2; -class G1PostBarrierStubC2; class G1BarrierSetAssembler: public ModRefBarrierSetAssembler { protected: @@ -65,10 +63,15 @@ protected: public: #ifdef COMPILER1 void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub); - void gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub); void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm); - void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm); + + void g1_write_barrier_post_c1(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2); #endif #ifdef COMPILER2 @@ -87,9 +90,7 @@ public: Register thread, Register tmp1, Register tmp2, - G1PostBarrierStubC2* c2_stub); - void generate_c2_post_barrier_stub(MacroAssembler* masm, - G1PostBarrierStubC2* stub) const; + bool new_val_may_be_null); #endif void load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, diff --git a/src/hotspot/cpu/aarch64/gc/g1/g1_aarch64.ad b/src/hotspot/cpu/aarch64/gc/g1/g1_aarch64.ad index 081a67d6880..18fc27a4af4 100644 --- a/src/hotspot/cpu/aarch64/gc/g1/g1_aarch64.ad +++ b/src/hotspot/cpu/aarch64/gc/g1/g1_aarch64.ad @@ -1,5 +1,5 @@ // -// Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2024, 2025, Oracle and/or its affiliates. 
All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // // This code is free software; you can redistribute it and/or modify it @@ -62,13 +62,13 @@ static void write_barrier_post(MacroAssembler* masm, Register new_val, Register tmp1, Register tmp2) { - if (!G1PostBarrierStubC2::needs_barrier(node)) { + if (!G1BarrierStubC2::needs_post_barrier(node)) { return; } Assembler::InlineSkippedInstructionsCounter skip_counter(masm); G1BarrierSetAssembler* g1_asm = static_cast(BarrierSet::barrier_set()->barrier_set_assembler()); - G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node); - g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, rthread, tmp1, tmp2, stub); + bool new_val_may_be_null = G1BarrierStubC2::post_new_val_may_be_null(node); + g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, rthread, tmp1, tmp2, new_val_may_be_null); } %} diff --git a/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.cpp b/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.cpp index 049477cda76..71f8931eb5f 100644 --- a/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.cpp +++ b/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.cpp @@ -201,12 +201,15 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, static void generate_post_barrier_fast_path(MacroAssembler* masm, const Register store_addr, const Register new_val, + const Register thread, const Register tmp1, const Register tmp2, Label& done, bool new_val_may_be_null) { - // Does store cross heap regions? + assert(thread == Rthread, "must be"); + assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, noreg); + // Does store cross heap regions? __ eor(tmp1, store_addr, new_val); __ movs(tmp1, AsmOperand(tmp1, lsr, G1HeapRegion::LogOfHRGrainBytes)); __ b(done, eq); @@ -215,76 +218,34 @@ static void generate_post_barrier_fast_path(MacroAssembler* masm, if (new_val_may_be_null) { __ cbz(new_val, done); } - // storing region crossing non-null, is card already dirty? - const Register card_addr = tmp1; - CardTableBarrierSet* ct = barrier_set_cast(BarrierSet::barrier_set()); - __ mov_address(tmp2, (address)ct->card_table()->byte_map_base()); - __ add(card_addr, tmp2, AsmOperand(store_addr, lsr, CardTable::card_shift())); + // storing region crossing non-null, is card already non-clean? + Address card_table_addr(thread, in_bytes(G1ThreadLocalData::card_table_base_offset())); + __ ldr(tmp2, card_table_addr); + __ add(tmp1, tmp2, AsmOperand(store_addr, lsr, CardTable::card_shift())); - __ ldrb(tmp2, Address(card_addr)); - __ cmp(tmp2, (int)G1CardTable::g1_young_card_val()); + if (UseCondCardMark) { + __ ldrb(tmp2, Address(tmp1)); + // Instead of loading clean_card_val and comparing, we exploit the fact that + // the LSB of non-clean cards is always 0, and the LSB of clean cards 1. 
+ __ tbz(tmp2, 0, done); + } + + static_assert(G1CardTable::dirty_card_val() == 0, "must be to use zero_register()"); + __ zero_register(tmp2); + __ strb(tmp2, Address(tmp1)); // *(card address) := dirty_card_val } -static void generate_post_barrier_slow_path(MacroAssembler* masm, - const Register thread, - const Register tmp1, - const Register tmp2, - const Register tmp3, - Label& done, - Label& runtime) { - __ membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad), tmp2); - assert(CardTable::dirty_card_val() == 0, "adjust this code"); - // card_addr is loaded by generate_post_barrier_fast_path - const Register card_addr = tmp1; - __ ldrb(tmp2, Address(card_addr)); - __ cbz(tmp2, done); - - // storing a region crossing, non-null oop, card is clean. - // dirty card and log. - - __ strb(__ zero_register(tmp2), Address(card_addr)); - generate_queue_test_and_insertion(masm, - G1ThreadLocalData::dirty_card_queue_index_offset(), - G1ThreadLocalData::dirty_card_queue_buffer_offset(), - runtime, - thread, card_addr, tmp2, tmp3); - __ b(done); -} - - // G1 post-barrier. // Blows all volatile registers R0-R3, LR). void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, - Register store_addr, - Register new_val, - Register tmp1, - Register tmp2, - Register tmp3) { + Register store_addr, + Register new_val, + Register tmp1, + Register tmp2, + Register tmp3) { Label done; - Label runtime; - - generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, done, true /* new_val_may_be_null */); - // If card is young, jump to done - // card_addr and card are loaded by generate_post_barrier_fast_path - const Register card = tmp2; - const Register card_addr = tmp1; - __ b(done, eq); - generate_post_barrier_slow_path(masm, Rthread, card_addr, tmp2, tmp3, done, runtime); - - __ bind(runtime); - - RegisterSet set = RegisterSet(store_addr) | RegisterSet(R0, R3) | RegisterSet(R12); - __ push(set); - - if (card_addr != R0) { - __ mov(R0, card_addr); - } - __ mov(R1, Rthread); - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), R0, R1); - - __ pop(set); - + generate_post_barrier_fast_path(masm, store_addr, new_val, Rthread, tmp1, tmp2, done, true /* new_val_may_be_null */); __ bind(done); } @@ -344,35 +305,10 @@ void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm, Register tmp1, Register tmp2, Register tmp3, - G1PostBarrierStubC2* stub) { - assert(thread == Rthread, "must be"); - assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, noreg); - - stub->initialize_registers(thread, tmp1, tmp2, tmp3); - - bool new_val_may_be_null = (stub->barrier_data() & G1C2BarrierPostNotNull) == 0; - generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, *stub->continuation(), new_val_may_be_null); - // If card is not young, jump to stub (slow path) - __ b(*stub->entry(), ne); - - __ bind(*stub->continuation()); -} - -void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm, - G1PostBarrierStubC2* stub) const { - Assembler::InlineSkippedInstructionsCounter skip_counter(masm); - Label runtime; - Register thread = stub->thread(); - Register tmp1 = stub->tmp1(); // tmp1 holds the card address. 
- Register tmp2 = stub->tmp2(); - Register tmp3 = stub->tmp3(); - - __ bind(*stub->entry()); - generate_post_barrier_slow_path(masm, thread, tmp1, tmp2, tmp3, *stub->continuation(), runtime); - - __ bind(runtime); - generate_c2_barrier_runtime_call(masm, stub, tmp1, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), tmp2); - __ b(*stub->continuation()); + bool new_val_may_be_null) { + Label done; + generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, new_val_may_be_null); + __ bind(done); } #endif // COMPILER2 @@ -463,20 +399,19 @@ void G1BarrierSetAssembler::gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrier __ b(*stub->continuation()); } -void G1BarrierSetAssembler::gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub) { - G1BarrierSetC1* bs = (G1BarrierSetC1*)BarrierSet::barrier_set()->barrier_set_c1(); - __ bind(*stub->entry()); - assert(stub->addr()->is_register(), "Precondition."); - assert(stub->new_val()->is_register(), "Precondition."); - Register new_val_reg = stub->new_val()->as_register(); - __ cbz(new_val_reg, *stub->continuation()); - ce->verify_reserved_argument_area_size(1); - __ str(stub->addr()->as_pointer_register(), Address(SP)); - __ call(bs->post_barrier_c1_runtime_code_blob()->code_begin(), relocInfo::runtime_call_type); - __ b(*stub->continuation()); +#undef __ + +void G1BarrierSetAssembler::g1_write_barrier_post_c1(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2) { + Label done; + generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, true /* new_val_may_be_null */); + masm->bind(done); } -#undef __ #define __ sasm-> void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm) { @@ -536,102 +471,6 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* __ b(done); } -void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* sasm) { - // Input: - // - store_addr, pushed on the stack - - __ set_info("g1_post_barrier_slow_id", false); - - Label done; - Label recheck; - Label runtime; - - Address queue_index(Rthread, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset())); - Address buffer(Rthread, in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset())); - - AddressLiteral cardtable(ci_card_table_address_as
(), relocInfo::none); - - // save at least the registers that need saving if the runtime is called - const RegisterSet saved_regs = RegisterSet(R0,R3) | RegisterSet(R12) | RegisterSet(LR); - const int nb_saved_regs = 6; - assert(nb_saved_regs == saved_regs.size(), "fix nb_saved_regs"); - __ push(saved_regs); - - const Register r_card_addr_0 = R0; // must be R0 for the slow case - const Register r_obj_0 = R0; - const Register r_card_base_1 = R1; - const Register r_tmp2 = R2; - const Register r_index_2 = R2; - const Register r_buffer_3 = R3; - const Register tmp1 = Rtemp; - - __ ldr(r_obj_0, Address(SP, nb_saved_regs*wordSize)); - // Note: there is a comment in x86 code about not using - // ExternalAddress / lea, due to relocation not working - // properly for that address. Should be OK for arm, where we - // explicitly specify that 'cardtable' has a relocInfo::none - // type. - __ lea(r_card_base_1, cardtable); - __ add(r_card_addr_0, r_card_base_1, AsmOperand(r_obj_0, lsr, CardTable::card_shift())); - - // first quick check without barrier - __ ldrb(r_tmp2, Address(r_card_addr_0)); - - __ cmp(r_tmp2, (int)G1CardTable::g1_young_card_val()); - __ b(recheck, ne); - - __ bind(done); - - __ pop(saved_regs); - - __ ret(); - - __ bind(recheck); - - __ membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad), tmp1); - - // reload card state after the barrier that ensures the stored oop was visible - __ ldrb(r_tmp2, Address(r_card_addr_0)); - - assert(CardTable::dirty_card_val() == 0, "adjust this code"); - __ cbz(r_tmp2, done); - - // storing region crossing non-null, card is clean. - // dirty card and log. - - assert(0 == (int)CardTable::dirty_card_val(), "adjust this code"); - if ((ci_card_table_address_as() & 0xff) == 0) { - // Card table is aligned so the lowest byte of the table address base is zero. - __ strb(r_card_base_1, Address(r_card_addr_0)); - } else { - __ strb(__ zero_register(r_tmp2), Address(r_card_addr_0)); - } - - __ ldr(r_index_2, queue_index); - __ ldr(r_buffer_3, buffer); - - __ subs(r_index_2, r_index_2, wordSize); - __ b(runtime, lt); // go to runtime if now negative - - __ str(r_index_2, queue_index); - - __ str(r_card_addr_0, Address(r_buffer_3, r_index_2)); - - __ b(done); - - __ bind(runtime); - - __ save_live_registers(); - - assert(r_card_addr_0 == c_rarg0, "card_addr should be in R0"); - __ mov(c_rarg1, Rthread); - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), c_rarg0, c_rarg1); - - __ restore_live_registers_without_return(); - - __ b(done); -} - #undef __ #endif // COMPILER1 diff --git a/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.hpp b/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.hpp index 4e49e655e3e..9e0eff4601b 100644 --- a/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.hpp +++ b/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -32,9 +32,7 @@ class LIR_Assembler; class StubAssembler; class G1PreBarrierStub; -class G1PostBarrierStub; class G1PreBarrierStubC2; -class G1PostBarrierStubC2; class G1BarrierSetAssembler: public ModRefBarrierSetAssembler { protected: @@ -66,10 +64,15 @@ public: #ifdef COMPILER1 public: void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub); - void gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub); void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm); - void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm); + + void g1_write_barrier_post_c1(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2); #endif #ifdef COMPILER2 @@ -89,9 +92,7 @@ public: Register tmp1, Register tmp2, Register tmp3, - G1PostBarrierStubC2* c2_stub); - void generate_c2_post_barrier_stub(MacroAssembler* masm, - G1PostBarrierStubC2* stub) const; + bool new_val_may_be_null); #endif }; diff --git a/src/hotspot/cpu/arm/gc/g1/g1_arm.ad b/src/hotspot/cpu/arm/gc/g1/g1_arm.ad index 8a0a9e1aa53..e905ba9ff67 100644 --- a/src/hotspot/cpu/arm/gc/g1/g1_arm.ad +++ b/src/hotspot/cpu/arm/gc/g1/g1_arm.ad @@ -1,5 +1,5 @@ // -// Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // // This code is free software; you can redistribute it and/or modify it @@ -63,13 +63,13 @@ static void write_barrier_post(MacroAssembler* masm, Register tmp1, Register tmp2, Register tmp3) { - if (!G1PostBarrierStubC2::needs_barrier(node)) { + if (!G1BarrierStubC2::needs_post_barrier(node)) { return; } Assembler::InlineSkippedInstructionsCounter skip_counter(masm); G1BarrierSetAssembler* g1_asm = static_cast(BarrierSet::barrier_set()->barrier_set_assembler()); - G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node); - g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, Rthread, tmp1, tmp2, tmp3, stub); + bool new_val_may_be_null = G1BarrierStubC2::post_new_val_may_be_null(node); + g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, Rthread, tmp1, tmp2, tmp3, new_val_may_be_null); } %} diff --git a/src/hotspot/cpu/ppc/gc/g1/g1BarrierSetAssembler_ppc.cpp b/src/hotspot/cpu/ppc/gc/g1/g1BarrierSetAssembler_ppc.cpp index 4fb13422f59..262bb1eae89 100644 --- a/src/hotspot/cpu/ppc/gc/g1/g1BarrierSetAssembler_ppc.cpp +++ b/src/hotspot/cpu/ppc/gc/g1/g1BarrierSetAssembler_ppc.cpp @@ -28,7 +28,6 @@ #include "gc/g1/g1BarrierSetAssembler.hpp" #include "gc/g1/g1BarrierSetRuntime.hpp" #include "gc/g1/g1CardTable.hpp" -#include "gc/g1/g1DirtyCardQueue.hpp" #include "gc/g1/g1HeapRegion.hpp" #include "gc/g1/g1SATBMarkQueueSet.hpp" #include "gc/g1/g1ThreadLocalData.hpp" @@ -230,78 +229,52 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, Decorator __ bind(filtered); } -static void generate_region_crossing_test(MacroAssembler* masm, const Register store_addr, const Register new_val) { - __ xorr(R0, store_addr, new_val); // tmp1 := store address ^ new value - __ srdi_(R0, R0, G1HeapRegion::LogOfHRGrainBytes); // tmp1 := ((store address ^ new value) >> LogOfHRGrainBytes) -} +static void generate_post_barrier_fast_path(MacroAssembler* masm, + const Register store_addr, + const Register new_val, + const Register thread, + const Register tmp1, + const Register tmp2, + Label& done, + 
bool new_val_may_be_null) { + assert_different_registers(store_addr, new_val, tmp1, R0); + assert_different_registers(store_addr, tmp1, tmp2, R0); -static Address generate_card_young_test(MacroAssembler* masm, const Register store_addr, const Register tmp1, const Register tmp2) { - CardTableBarrierSet* ct = barrier_set_cast(BarrierSet::barrier_set()); - __ load_const_optimized(tmp1, (address)(ct->card_table()->byte_map_base()), tmp2); - __ srdi(tmp2, store_addr, CardTable::card_shift()); // tmp1 := card address relative to card table base - __ lbzx(R0, tmp1, tmp2); // tmp1 := card address - __ cmpwi(CR0, R0, (int)G1CardTable::g1_young_card_val()); - return Address(tmp1, tmp2); // return card address -} + __ xorr(R0, store_addr, new_val); // R0 := store address ^ new value + __ srdi_(R0, R0, G1HeapRegion::LogOfHRGrainBytes); // R0 := ((store address ^ new value) >> LogOfHRGrainBytes) + __ beq(CR0, done); -static void generate_card_dirty_test(MacroAssembler* masm, Address card_addr) { - __ membar(Assembler::StoreLoad); // Must reload after StoreLoad membar due to concurrent refinement - __ lbzx(R0, card_addr.base(), card_addr.index()); // tmp2 := card - __ cmpwi(CR0, R0, (int)G1CardTable::dirty_card_val()); // tmp2 := card == dirty_card_val? + // Crosses regions, storing null? + if (!new_val_may_be_null) { +#ifdef ASSERT + __ cmpdi(CR0, new_val, 0); + __ asm_assert_ne("null oop not allowed (G1 post)"); // Checked by caller. +#endif + } else { + __ cmpdi(CR0, new_val, 0); + __ beq(CR0, done); + } + + __ ld(tmp1, G1ThreadLocalData::card_table_base_offset(), thread); + __ srdi(tmp2, store_addr, CardTable::card_shift()); // tmp2 := card address relative to card table base + if (UseCondCardMark) { + __ lbzx(R0, tmp1, tmp2); + __ cmpwi(CR0, R0, (int)G1CardTable::clean_card_val()); + __ bne(CR0, done); + } + + __ li(R0, G1CardTable::dirty_card_val()); + __ stbx(R0, tmp1, tmp2); } void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, DecoratorSet decorators, Register store_addr, Register new_val, - Register tmp1, Register tmp2, Register tmp3, - MacroAssembler::PreservationLevel preservation_level) { + Register tmp1, Register tmp2) { bool not_null = (decorators & IS_NOT_NULL) != 0; - Label runtime, filtered; - assert_different_registers(store_addr, new_val, tmp1, tmp2); - - CardTableBarrierSet* ct = barrier_set_cast(BarrierSet::barrier_set()); - - generate_region_crossing_test(masm, store_addr, new_val); - __ beq(CR0, filtered); - - // Crosses regions, storing null? - if (not_null) { -#ifdef ASSERT - __ cmpdi(CR0, new_val, 0); - __ asm_assert_ne("null oop not allowed (G1 post)"); // Checked by caller. -#endif - } else { - __ cmpdi(CR0, new_val, 0); - __ beq(CR0, filtered); - } - - Address card_addr = generate_card_young_test(masm, store_addr, tmp1, tmp2); - __ beq(CR0, filtered); - - generate_card_dirty_test(masm, card_addr); - __ beq(CR0, filtered); - - __ li(R0, (int)G1CardTable::dirty_card_val()); - __ stbx(R0, card_addr.base(), card_addr.index()); // *(card address) := dirty_card_val - - Register Rcard_addr = tmp3; - __ add(Rcard_addr, card_addr.base(), card_addr.index()); // This is the address which needs to get enqueued. 
- - generate_queue_insertion(masm, - G1ThreadLocalData::dirty_card_queue_index_offset(), - G1ThreadLocalData::dirty_card_queue_buffer_offset(), - runtime, Rcard_addr, tmp1); - __ b(filtered); - - __ bind(runtime); - - assert(preservation_level == MacroAssembler::PRESERVATION_NONE, - "g1_write_barrier_post doesn't support preservation levels higher than PRESERVATION_NONE"); - - // Save the live input values. - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), Rcard_addr, R16_thread); - - __ bind(filtered); + Label done; + generate_post_barrier_fast_path(masm, store_addr, new_val, R16_thread, tmp1, tmp2, done, !not_null); + __ bind(done); } void G1BarrierSetAssembler::oop_store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, @@ -333,8 +306,7 @@ void G1BarrierSetAssembler::oop_store_at(MacroAssembler* masm, DecoratorSet deco } g1_write_barrier_post(masm, decorators, base, val, - tmp1, tmp2, tmp3, - preservation_level); + tmp1, tmp2); } } @@ -457,70 +429,29 @@ void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm, Register new_val, Register tmp1, Register tmp2, - G1PostBarrierStubC2* stub, + bool new_val_may_be_null, bool decode_new_val) { assert_different_registers(store_addr, new_val, tmp1, R0); assert_different_registers(store_addr, tmp1, tmp2, R0); - stub->initialize_registers(R16_thread, tmp1, tmp2); + Label done; - bool null_check_required = (stub->barrier_data() & G1C2BarrierPostNotNull) == 0; Register new_val_decoded = new_val; if (decode_new_val) { assert(UseCompressedOops, "or should not be here"); - if (null_check_required && CompressedOops::base() != nullptr) { + if (new_val_may_be_null && CompressedOops::base() != nullptr) { // We prefer doing the null check after the region crossing check. // Only compressed oop modes with base != null require a null check here. __ cmpwi(CR0, new_val, 0); - __ beq(CR0, *stub->continuation()); - null_check_required = false; + __ beq(CR0, done); + new_val_may_be_null = false; } new_val_decoded = __ decode_heap_oop_not_null(tmp2, new_val); } - generate_region_crossing_test(masm, store_addr, new_val_decoded); - __ beq(CR0, *stub->continuation()); - - // crosses regions, storing null? - if (null_check_required) { - __ cmpdi(CR0, new_val_decoded, 0); - __ beq(CR0, *stub->continuation()); - } - - Address card_addr = generate_card_young_test(masm, store_addr, tmp1, tmp2); - assert(card_addr.base() == tmp1 && card_addr.index() == tmp2, "needed by post barrier stub"); - __ bc_far_optimized(Assembler::bcondCRbiIs0, __ bi0(CR0, Assembler::equal), *stub->entry()); - - __ bind(*stub->continuation()); -} - -void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm, - G1PostBarrierStubC2* stub) const { - Assembler::InlineSkippedInstructionsCounter skip_counter(masm); - Label runtime; - Address card_addr(stub->tmp1(), stub->tmp2()); // See above. - - __ bind(*stub->entry()); - - generate_card_dirty_test(masm, card_addr); - __ bc_far_optimized(Assembler::bcondCRbiIs1, __ bi0(CR0, Assembler::equal), *stub->continuation()); - - __ li(R0, (int)G1CardTable::dirty_card_val()); - __ stbx(R0, card_addr.base(), card_addr.index()); // *(card address) := dirty_card_val - - Register Rcard_addr = stub->tmp1(); - __ add(Rcard_addr, card_addr.base(), card_addr.index()); // This is the address which needs to get enqueued. 
- - generate_queue_insertion(masm, - G1ThreadLocalData::dirty_card_queue_index_offset(), - G1ThreadLocalData::dirty_card_queue_buffer_offset(), - runtime, Rcard_addr, stub->tmp2()); - __ b(*stub->continuation()); - - __ bind(runtime); - generate_c2_barrier_runtime_call(masm, stub, Rcard_addr, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry)); - __ b(*stub->continuation()); + generate_post_barrier_fast_path(masm, store_addr, new_val_decoded, R16_thread, tmp1, tmp2, done, new_val_may_be_null); + __ bind(done); } #endif // COMPILER2 @@ -558,28 +489,19 @@ void G1BarrierSetAssembler::gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrier __ b(*stub->continuation()); } -void G1BarrierSetAssembler::gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub) { - G1BarrierSetC1* bs = (G1BarrierSetC1*)BarrierSet::barrier_set()->barrier_set_c1(); - __ bind(*stub->entry()); +#undef __ - assert(stub->addr()->is_register(), "Precondition."); - assert(stub->new_val()->is_register(), "Precondition."); - Register addr_reg = stub->addr()->as_pointer_register(); - Register new_val_reg = stub->new_val()->as_register(); - - __ cmpdi(CR0, new_val_reg, 0); - __ bc_far_optimized(Assembler::bcondCRbiIs1, __ bi0(CR0, Assembler::equal), *stub->continuation()); - - address c_code = bs->post_barrier_c1_runtime_code_blob()->code_begin(); - //__ load_const_optimized(R0, c_code); - __ add_const_optimized(R0, R29_TOC, MacroAssembler::offset_to_global_toc(c_code)); - __ mtctr(R0); - __ mr(R0, addr_reg); // Pass addr in R0. - __ bctrl(); - __ b(*stub->continuation()); +void G1BarrierSetAssembler::g1_write_barrier_post_c1(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2) { + Label done; + generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, true /* new_val_may_be_null */); + masm->bind(done); } -#undef __ #define __ sasm-> void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm) { @@ -642,86 +564,6 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* __ b(restart); } -void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* sasm) { - G1BarrierSet* bs = barrier_set_cast(BarrierSet::barrier_set()); - - __ set_info("g1_post_barrier_slow_id", false); - - // Using stack slots: spill addr, spill tmp2 - const int stack_slots = 2; - Register tmp = R0; - Register addr = R14; - Register tmp2 = R15; - CardTable::CardValue* byte_map_base = bs->card_table()->byte_map_base(); - - Label restart, refill, ret; - - // Spill - __ std(addr, -8, R1_SP); - __ std(tmp2, -16, R1_SP); - - __ srdi(addr, R0, CardTable::card_shift()); // Addr is passed in R0. - __ load_const_optimized(/*cardtable*/ tmp2, byte_map_base, tmp); - __ add(addr, tmp2, addr); - __ lbz(tmp, 0, addr); // tmp := [addr + cardtable] - - // Return if young card. - __ cmpwi(CR0, tmp, G1CardTable::g1_young_card_val()); - __ beq(CR0, ret); - - // Return if sequential consistent value is already dirty. - __ membar(Assembler::StoreLoad); - __ lbz(tmp, 0, addr); // tmp := [addr + cardtable] - - __ cmpwi(CR0, tmp, G1CardTable::dirty_card_val()); - __ beq(CR0, ret); - - // Not dirty. - - // First, dirty it. 
- __ li(tmp, G1CardTable::dirty_card_val()); - __ stb(tmp, 0, addr); - - int dirty_card_q_index_byte_offset = in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset()); - int dirty_card_q_buf_byte_offset = in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset()); - - __ bind(restart); - - // Get the index into the update buffer. G1DirtyCardQueue::_index is - // a size_t so ld_ptr is appropriate here. - __ ld(tmp2, dirty_card_q_index_byte_offset, R16_thread); - - // index == 0? - __ cmpdi(CR0, tmp2, 0); - __ beq(CR0, refill); - - __ ld(tmp, dirty_card_q_buf_byte_offset, R16_thread); - __ addi(tmp2, tmp2, -oopSize); - - __ std(tmp2, dirty_card_q_index_byte_offset, R16_thread); - __ add(tmp2, tmp, tmp2); - __ std(addr, 0, tmp2); // [_buf + index] := - - // Restore temp registers and return-from-leaf. - __ bind(ret); - __ ld(tmp2, -16, R1_SP); - __ ld(addr, -8, R1_SP); - __ blr(); - - __ bind(refill); - const int nbytes_save = (MacroAssembler::num_volatile_regs + stack_slots) * BytesPerWord; - __ save_volatile_gprs(R1_SP, -nbytes_save); // except R0 - __ mflr(R0); - __ std(R0, _abi0(lr), R1_SP); - __ push_frame_reg_args(nbytes_save, R0); // dummy frame for C call - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1DirtyCardQueueSet::handle_zero_index_for_thread), R16_thread); - __ pop_frame(); - __ ld(R0, _abi0(lr), R1_SP); - __ mtlr(R0); - __ restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 - __ b(restart); -} - #undef __ #endif // COMPILER1 diff --git a/src/hotspot/cpu/ppc/gc/g1/g1BarrierSetAssembler_ppc.hpp b/src/hotspot/cpu/ppc/gc/g1/g1BarrierSetAssembler_ppc.hpp index 33cb89dacc6..e059cc661af 100644 --- a/src/hotspot/cpu/ppc/gc/g1/g1BarrierSetAssembler_ppc.hpp +++ b/src/hotspot/cpu/ppc/gc/g1/g1BarrierSetAssembler_ppc.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018, 2021 SAP SE. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* @@ -37,9 +37,7 @@ class LIR_Assembler; class StubAssembler; class G1PreBarrierStub; -class G1PostBarrierStub; class G1PreBarrierStubC2; -class G1PostBarrierStubC2; class G1BarrierSetAssembler: public ModRefBarrierSetAssembler { protected: @@ -56,8 +54,7 @@ protected: MacroAssembler::PreservationLevel preservation_level); void g1_write_barrier_post(MacroAssembler* masm, DecoratorSet decorators, Register store_addr, Register new_val, - Register tmp1, Register tmp2, Register tmp3, - MacroAssembler::PreservationLevel preservation_level); + Register tmp1, Register tmp2); virtual void oop_store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, Register base, RegisterOrConstant ind_or_offs, Register val, @@ -79,17 +76,21 @@ public: Register new_val, Register tmp1, Register tmp2, - G1PostBarrierStubC2* c2_stub, + bool new_val_may_be_null, bool decode_new_val); - void generate_c2_post_barrier_stub(MacroAssembler* masm, - G1PostBarrierStubC2* stub) const; #endif #ifdef COMPILER1 void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub); - void gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub); void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm); - void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm); + + void g1_write_barrier_post_c1(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2); + #endif virtual void load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, diff --git a/src/hotspot/cpu/ppc/gc/g1/g1_ppc.ad b/src/hotspot/cpu/ppc/gc/g1/g1_ppc.ad index 4f24efe872b..0a4a9442855 100644 --- a/src/hotspot/cpu/ppc/gc/g1/g1_ppc.ad +++ b/src/hotspot/cpu/ppc/gc/g1/g1_ppc.ad @@ -1,5 +1,5 @@ // -// Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. // Copyright (c) 2025 SAP SE. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
// @@ -64,13 +64,13 @@ static void post_write_barrier(MacroAssembler* masm, Register tmp1, Register tmp2, bool decode_new_val = false) { - if (!G1PostBarrierStubC2::needs_barrier(node)) { + if (!G1BarrierStubC2::needs_post_barrier(node)) { return; } Assembler::InlineSkippedInstructionsCounter skip_counter(masm); G1BarrierSetAssembler* g1_asm = static_cast(BarrierSet::barrier_set()->barrier_set_assembler()); - G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node); - g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, tmp1, tmp2, stub, decode_new_val); + bool new_val_may_be_null = G1BarrierStubC2::post_new_val_may_be_null(node); + g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, tmp1, tmp2, new_val_may_be_null, decode_new_val); } %} diff --git a/src/hotspot/cpu/riscv/gc/g1/g1BarrierSetAssembler_riscv.cpp b/src/hotspot/cpu/riscv/gc/g1/g1BarrierSetAssembler_riscv.cpp index ef5dcdd8074..9c3bd93f8a6 100644 --- a/src/hotspot/cpu/riscv/gc/g1/g1BarrierSetAssembler_riscv.cpp +++ b/src/hotspot/cpu/riscv/gc/g1/g1BarrierSetAssembler_riscv.cpp @@ -87,15 +87,54 @@ void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm } } -void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm, DecoratorSet decorators, - Register start, Register count, Register tmp, RegSet saved_regs) { - __ push_reg(saved_regs, sp); +void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm, + DecoratorSet decorators, + Register start, + Register count, + Register tmp, + RegSet saved_regs) { assert_different_registers(start, count, tmp); - assert_different_registers(c_rarg0, count); - __ mv(c_rarg0, start); - __ mv(c_rarg1, count); - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_array_post_entry), 2); - __ pop_reg(saved_regs, sp); + + Label loop, next, done; + + // Zero count? Nothing to do. + __ beqz(count, done); + + // Calculate the number of card marks to set. Since the object might start and + // end within a card, we need to calculate this via the card table indexes of + // the actual start and last addresses covered by the object. + // Temporarily use the count register for the last element address. + __ shadd(count, count, start, tmp, LogBytesPerHeapOop); // end = start + count << LogBytesPerHeapOop + __ subi(count, count, BytesPerHeapOop); // Use last element address for end. + + __ srli(start, start, CardTable::card_shift()); + __ srli(count, count, CardTable::card_shift()); + __ sub(count, count, start); // Number of bytes to mark - 1. + + // Add card table base offset to start. + Address card_table_address(xthread, G1ThreadLocalData::card_table_base_offset()); + __ ld(tmp, card_table_address); + __ add(start, start, tmp); + + __ bind(loop); + if (UseCondCardMark) { + __ add(tmp, start, count); + __ lbu(tmp, Address(tmp, 0)); + static_assert((uint)G1CardTable::clean_card_val() == 0xff, "must be"); + __ subi(tmp, tmp, G1CardTable::clean_card_val()); // Convert to clean_card_value() to a comparison + // against zero to avoid use of an extra temp. 
+ __ bnez(tmp, next); + } + + __ add(tmp, start, count); + static_assert(G1CardTable::dirty_card_val() == 0, "must be to use zr"); + __ sb(zr, Address(tmp, 0)); + + __ bind(next); + __ subi(count, count, 1); + __ bgez(count, loop); + + __ bind(done); } static void generate_queue_test_and_insertion(MacroAssembler* masm, ByteSize index_offset, ByteSize buffer_offset, Label& runtime, @@ -192,44 +231,37 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, static void generate_post_barrier_fast_path(MacroAssembler* masm, const Register store_addr, const Register new_val, - const Register tmp1, - const Register tmp2, - Label& done, - bool new_val_may_be_null) { - // Does store cross heap regions? - __ xorr(tmp1, store_addr, new_val); // tmp1 := store address ^ new value - __ srli(tmp1, tmp1, G1HeapRegion::LogOfHRGrainBytes); // tmp1 := ((store address ^ new value) >> LogOfHRGrainBytes) - __ beqz(tmp1, done); - // Crosses regions, storing null? - if (new_val_may_be_null) { - __ beqz(new_val, done); - } - // Storing region crossing non-null, is card young? - __ srli(tmp1, store_addr, CardTable::card_shift()); // tmp1 := card address relative to card table base - __ load_byte_map_base(tmp2); // tmp2 := card table base address - __ add(tmp1, tmp1, tmp2); // tmp1 := card address - __ lbu(tmp2, Address(tmp1)); // tmp2 := card -} - -static void generate_post_barrier_slow_path(MacroAssembler* masm, const Register thread, const Register tmp1, const Register tmp2, Label& done, - Label& runtime) { - __ membar(MacroAssembler::StoreLoad); // StoreLoad membar - __ lbu(tmp2, Address(tmp1)); // tmp2 := card - __ beqz(tmp2, done, true); - // Storing a region crossing, non-null oop, card is clean. - // Dirty card and log. - STATIC_ASSERT(CardTable::dirty_card_val() == 0); - __ sb(zr, Address(tmp1)); // *(card address) := dirty_card_val - generate_queue_test_and_insertion(masm, - G1ThreadLocalData::dirty_card_queue_index_offset(), - G1ThreadLocalData::dirty_card_queue_buffer_offset(), - runtime, - thread, tmp1, tmp2, t0); - __ j(done); + bool new_val_may_be_null) { + assert(thread == xthread, "must be"); + assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, noreg); + // Does store cross heap regions? + __ xorr(tmp1, store_addr, new_val); // tmp1 := store address ^ new value + __ srli(tmp1, tmp1, G1HeapRegion::LogOfHRGrainBytes); // tmp1 := ((store address ^ new value) >> LogOfHRGrainBytes) + __ beqz(tmp1, done); + + // Crosses regions, storing null? + if (new_val_may_be_null) { + __ beqz(new_val, done); + } + // Storing region crossing non-null, is card clean? + __ srli(tmp1, store_addr, CardTable::card_shift()); // tmp1 := card address relative to card table base + + Address card_table_address(xthread, G1ThreadLocalData::card_table_base_offset()); + __ ld(tmp2, card_table_address); // tmp2 := card table base address + __ add(tmp1, tmp1, tmp2); // tmp1 := card address + if (UseCondCardMark) { + static_assert((uint)G1CardTable::clean_card_val() == 0xff, "must be"); + __ lbu(tmp2, Address(tmp1, 0)); // tmp2 := card + __ subi(tmp2, tmp2, G1CardTable::clean_card_val()); // Convert to clean_card_value() to a comparison + // against zero to avoid use of an extra temp. 
+ __ bnez(tmp2, done); + } + static_assert((uint)G1CardTable::dirty_card_val() == 0, "must be to use zr"); + __ sb(zr, Address(tmp1, 0)); } void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, @@ -238,27 +270,8 @@ void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, Register thread, Register tmp1, Register tmp2) { - assert(thread == xthread, "must be"); - assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, t0); - assert(store_addr != noreg && new_val != noreg && tmp1 != noreg && tmp2 != noreg, - "expecting a register"); - Label done; - Label runtime; - - generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, done, true /* new_val_may_be_null */); - // If card is young, jump to done (tmp2 holds the card value) - __ mv(t0, (int)G1CardTable::g1_young_card_val()); - __ beq(tmp2, t0, done); // card == young_card_val? - generate_post_barrier_slow_path(masm, thread, tmp1, tmp2, done, runtime); - - __ bind(runtime); - // save the live input values - RegSet saved = RegSet::of(store_addr); - __ push_reg(saved, sp); - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), tmp1, thread); - __ pop_reg(saved, sp); - + generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, true /* new_val_may_be_null */); __ bind(done); } @@ -318,37 +331,10 @@ void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm, Register thread, Register tmp1, Register tmp2, - G1PostBarrierStubC2* stub) { - assert(thread == xthread, "must be"); - assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, t0); - assert(store_addr != noreg && new_val != noreg && tmp1 != noreg && tmp2 != noreg, - "expecting a register"); - - stub->initialize_registers(thread, tmp1, tmp2); - - bool new_val_may_be_null = (stub->barrier_data() & G1C2BarrierPostNotNull) == 0; - generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, *stub->continuation(), new_val_may_be_null); - // If card is not young, jump to stub (slow path) (tmp2 holds the card value) - __ mv(t0, (int)G1CardTable::g1_young_card_val()); - __ bne(tmp2, t0, *stub->entry(), true); - - __ bind(*stub->continuation()); -} - -void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm, - G1PostBarrierStubC2* stub) const { - Assembler::InlineSkippedInstructionsCounter skip_counter(masm); - Label runtime; - Register thread = stub->thread(); - Register tmp1 = stub->tmp1(); // tmp1 holds the card address. 
- Register tmp2 = stub->tmp2(); - - __ bind(*stub->entry()); - generate_post_barrier_slow_path(masm, thread, tmp1, tmp2, *stub->continuation(), runtime); - - __ bind(runtime); - generate_c2_barrier_runtime_call(masm, stub, tmp1, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry)); - __ j(*stub->continuation()); + bool new_val_may_be_null) { + Label done; + generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, new_val_may_be_null); + __ bind(done); } #endif // COMPILER2 @@ -443,20 +429,19 @@ void G1BarrierSetAssembler::gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrier __ j(*stub->continuation()); } -void G1BarrierSetAssembler::gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub) { - G1BarrierSetC1* bs = (G1BarrierSetC1*)BarrierSet::barrier_set()->barrier_set_c1(); - __ bind(*stub->entry()); - assert(stub->addr()->is_register(), "Precondition"); - assert(stub->new_val()->is_register(), "Precondition"); - Register new_val_reg = stub->new_val()->as_register(); - __ beqz(new_val_reg, *stub->continuation(), /* is_far */ true); - ce->store_parameter(stub->addr()->as_pointer_register(), 0); - __ far_call(RuntimeAddress(bs->post_barrier_c1_runtime_code_blob()->code_begin())); - __ j(*stub->continuation()); -} - #undef __ +void G1BarrierSetAssembler::g1_write_barrier_post_c1(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2) { + Label done; + generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, true /* new_val_may_be_null */); + masm->bind(done); +} + #define __ sasm-> void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm) { @@ -507,74 +492,6 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* __ epilogue(); } -void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* sasm) { - __ prologue("g1_post_barrier", false); - - // arg0 : store_address - Address store_addr(fp, 2 * BytesPerWord); // 2 BytesPerWord from fp - - BarrierSet* bs = BarrierSet::barrier_set(); - CardTableBarrierSet* ctbs = barrier_set_cast(bs); - - Label done; - Label runtime; - - // At this point we know new_value is non-null and the new_value crosses regions. - // Must check to see if card is already dirty - const Register thread = xthread; - - Address queue_index(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset())); - Address buffer(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset())); - - const Register card_offset = t1; - // RA is free here, so we can use it to hold the byte_map_base. - const Register byte_map_base = ra; - - assert_different_registers(card_offset, byte_map_base, t0); - - __ load_parameter(0, card_offset); - __ srli(card_offset, card_offset, CardTable::card_shift()); - __ load_byte_map_base(byte_map_base); - - // Convert card offset into an address in card_addr - Register card_addr = card_offset; - __ add(card_addr, byte_map_base, card_addr); - - __ lbu(t0, Address(card_addr, 0)); - __ sub(t0, t0, (int)G1CardTable::g1_young_card_val()); - __ beqz(t0, done); - - assert((int)CardTable::dirty_card_val() == 0, "must be 0"); - - __ membar(MacroAssembler::StoreLoad); - __ lbu(t0, Address(card_addr, 0)); - __ beqz(t0, done); - - // storing region crossing non-null, card is clean. - // dirty card and log. 
- __ sb(zr, Address(card_addr, 0)); - - __ ld(t0, queue_index); - __ beqz(t0, runtime); - __ subi(t0, t0, wordSize); - __ sd(t0, queue_index); - - // Reuse RA to hold buffer_addr - const Register buffer_addr = ra; - - __ ld(buffer_addr, buffer); - __ add(t0, buffer_addr, t0); - __ sd(card_addr, Address(t0, 0)); - __ j(done); - - __ bind(runtime); - __ push_call_clobbered_registers(); - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), card_addr, thread); - __ pop_call_clobbered_registers(); - __ bind(done); - __ epilogue(); -} - #undef __ #endif // COMPILER1 diff --git a/src/hotspot/cpu/riscv/gc/g1/g1BarrierSetAssembler_riscv.hpp b/src/hotspot/cpu/riscv/gc/g1/g1BarrierSetAssembler_riscv.hpp index 26310231362..654ba934242 100644 --- a/src/hotspot/cpu/riscv/gc/g1/g1BarrierSetAssembler_riscv.hpp +++ b/src/hotspot/cpu/riscv/gc/g1/g1BarrierSetAssembler_riscv.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2020, 2024, Huawei Technologies Co., Ltd. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * @@ -35,9 +35,7 @@ class LIR_Assembler; #endif class StubAssembler; class G1PreBarrierStub; -class G1PostBarrierStub; class G1PreBarrierStubC2; -class G1PostBarrierStubC2; class G1BarrierSetAssembler: public ModRefBarrierSetAssembler { protected: @@ -68,10 +66,16 @@ protected: public: #ifdef COMPILER1 void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub); - void gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub); void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm); - void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm); + + void g1_write_barrier_post_c1(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2); + #endif #ifdef COMPILER2 @@ -90,9 +94,7 @@ public: Register thread, Register tmp1, Register tmp2, - G1PostBarrierStubC2* c2_stub); - void generate_c2_post_barrier_stub(MacroAssembler* masm, - G1PostBarrierStubC2* stub) const; + bool new_val_may_be_null); #endif void load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, diff --git a/src/hotspot/cpu/riscv/gc/g1/g1_riscv.ad b/src/hotspot/cpu/riscv/gc/g1/g1_riscv.ad index 7a525323021..8461a36e68c 100644 --- a/src/hotspot/cpu/riscv/gc/g1/g1_riscv.ad +++ b/src/hotspot/cpu/riscv/gc/g1/g1_riscv.ad @@ -1,5 +1,5 @@ // -// Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. // Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
// @@ -63,13 +63,13 @@ static void write_barrier_post(MacroAssembler* masm, Register new_val, Register tmp1, Register tmp2) { - if (!G1PostBarrierStubC2::needs_barrier(node)) { + if (!G1BarrierStubC2::needs_post_barrier(node)) { return; } Assembler::InlineSkippedInstructionsCounter skip_counter(masm); G1BarrierSetAssembler* g1_asm = static_cast(BarrierSet::barrier_set()->barrier_set_assembler()); - G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node); - g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, xthread, tmp1, tmp2, stub); + bool new_val_may_be_null = G1BarrierStubC2::post_new_val_may_be_null(node); + g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, xthread, tmp1, tmp2, new_val_may_be_null); } %} diff --git a/src/hotspot/cpu/s390/gc/g1/g1BarrierSetAssembler_s390.cpp b/src/hotspot/cpu/s390/gc/g1/g1BarrierSetAssembler_s390.cpp index dea3317270e..3e176309c27 100644 --- a/src/hotspot/cpu/s390/gc/g1/g1BarrierSetAssembler_s390.cpp +++ b/src/hotspot/cpu/s390/gc/g1/g1BarrierSetAssembler_s390.cpp @@ -28,7 +28,6 @@ #include "gc/g1/g1BarrierSetAssembler.hpp" #include "gc/g1/g1BarrierSetRuntime.hpp" #include "gc/g1/g1CardTable.hpp" -#include "gc/g1/g1DirtyCardQueue.hpp" #include "gc/g1/g1HeapRegion.hpp" #include "gc/g1/g1SATBMarkQueueSet.hpp" #include "gc/g1/g1ThreadLocalData.hpp" @@ -205,104 +204,71 @@ void G1BarrierSetAssembler::generate_c2_pre_barrier_stub(MacroAssembler* masm, BLOCK_COMMENT("} generate_c2_pre_barrier_stub"); } +static void generate_post_barrier_fast_path(MacroAssembler* masm, + const Register store_addr, + const Register new_val, + const Register thread, + const Register tmp1, + const Register tmp2, + Label& done, + bool new_val_may_be_null) { + + __ block_comment("generate_post_barrier_fast_path {"); + + assert(thread == Z_thread, "must be"); + assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, noreg); + + // Does store cross heap regions? + if (VM_Version::has_DistinctOpnds()) { + __ z_xgrk(tmp1, store_addr, new_val); // tmp1 := store address ^ new value + } else { + __ z_lgr(tmp1, store_addr); + __ z_xgr(tmp1, new_val); + } + __ z_srag(tmp1, tmp1, G1HeapRegion::LogOfHRGrainBytes); // tmp1 := ((store address ^ new value) >> LogOfHRGrainBytes) + __ branch_optimized(Assembler::bcondEqual, done); + + // Crosses regions, storing null? + if (new_val_may_be_null) { + __ z_ltgr(new_val, new_val); + __ z_bre(done); + } else { +#ifdef ASSERT + __ z_ltgr(new_val, new_val); + __ asm_assert(Assembler::bcondNotZero, "null oop not allowed (G1 post)", 0x322); // Checked by caller. 
+#endif + } + + __ z_srag(tmp1, store_addr, CardTable::card_shift()); + + Address card_table_addr(thread, in_bytes(G1ThreadLocalData::card_table_base_offset())); + __ z_alg(tmp1, card_table_addr); // tmp1 := card address + + if(UseCondCardMark) { + __ z_cli(0, tmp1, G1CardTable::clean_card_val()); + __ branch_optimized(Assembler::bcondNotEqual, done); + } + + static_assert(G1CardTable::dirty_card_val() == 0, "must be to use z_mvi"); + __ z_mvi(0, tmp1, G1CardTable::dirty_card_val()); // *(card address) := dirty_card_val + + __ block_comment("} generate_post_barrier_fast_path"); +} + void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm, Register store_addr, Register new_val, Register thread, Register tmp1, Register tmp2, - G1PostBarrierStubC2* stub) { + bool new_val_may_be_null) { BLOCK_COMMENT("g1_write_barrier_post_c2 {"); - - assert(thread == Z_thread, "must be"); - assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, Z_R1_scratch); - - assert(store_addr != noreg && new_val != noreg && tmp1 != noreg && tmp2 != noreg, "expecting a register"); - - stub->initialize_registers(thread, tmp1, tmp2); - - BLOCK_COMMENT("generate_region_crossing_test {"); - if (VM_Version::has_DistinctOpnds()) { - __ z_xgrk(tmp1, store_addr, new_val); - } else { - __ z_lgr(tmp1, store_addr); - __ z_xgr(tmp1, new_val); - } - __ z_srag(tmp1, tmp1, G1HeapRegion::LogOfHRGrainBytes); - __ branch_optimized(Assembler::bcondEqual, *stub->continuation()); - BLOCK_COMMENT("} generate_region_crossing_test"); - - // crosses regions, storing null? - if ((stub->barrier_data() & G1C2BarrierPostNotNull) == 0) { - __ z_ltgr(new_val, new_val); - __ branch_optimized(Assembler::bcondEqual, *stub->continuation()); - } - - BLOCK_COMMENT("generate_card_young_test {"); - CardTableBarrierSet* ct = barrier_set_cast(BarrierSet::barrier_set()); - // calculate address of card - __ load_const_optimized(tmp2, (address)ct->card_table()->byte_map_base()); // Card table base. - __ z_srlg(tmp1, store_addr, CardTable::card_shift()); // Index into card table. - __ z_algr(tmp1, tmp2); // Explicit calculation needed for cli. - - // Filter young. - __ z_cli(0, tmp1, G1CardTable::g1_young_card_val()); - - BLOCK_COMMENT("} generate_card_young_test"); - - // From here on, tmp1 holds the card address. - __ branch_optimized(Assembler::bcondNotEqual, *stub->entry()); - - __ bind(*stub->continuation()); - + Label done; + generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, new_val_may_be_null); + __ bind(done); BLOCK_COMMENT("} g1_write_barrier_post_c2"); } -void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm, - G1PostBarrierStubC2* stub) const { - - BLOCK_COMMENT("generate_c2_post_barrier_stub {"); - - Assembler::InlineSkippedInstructionsCounter skip_counter(masm); - Label runtime; - - Register thread = stub->thread(); - Register tmp1 = stub->tmp1(); // tmp1 holds the card address. - Register tmp2 = stub->tmp2(); - Register Rcard_addr = tmp1; - - __ bind(*stub->entry()); - - BLOCK_COMMENT("generate_card_clean_test {"); - __ z_sync(); // Required to support concurrent cleaning. - __ z_cli(0, Rcard_addr, 0); // Reload after membar. - __ branch_optimized(Assembler::bcondEqual, *stub->continuation()); - BLOCK_COMMENT("} generate_card_clean_test"); - - BLOCK_COMMENT("generate_dirty_card {"); - // Storing a region crossing, non-null oop, card is clean. - // Dirty card and log. 
- STATIC_ASSERT(CardTable::dirty_card_val() == 0); - __ z_mvi(0, Rcard_addr, CardTable::dirty_card_val()); - BLOCK_COMMENT("} generate_dirty_card"); - - generate_queue_test_and_insertion(masm, - G1ThreadLocalData::dirty_card_queue_index_offset(), - G1ThreadLocalData::dirty_card_queue_buffer_offset(), - runtime, - Z_thread, tmp1, tmp2); - - __ branch_optimized(Assembler::bcondAlways, *stub->continuation()); - - __ bind(runtime); - - generate_c2_barrier_runtime_call(masm, stub, tmp1, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry)); - - __ branch_optimized(Assembler::bcondAlways, *stub->continuation()); - - BLOCK_COMMENT("} generate_c2_post_barrier_stub"); -} - #endif //COMPILER2 void G1BarrierSetAssembler::load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, @@ -451,99 +417,9 @@ void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, Decorato Register Rtmp1, Register Rtmp2, Register Rtmp3) { bool not_null = (decorators & IS_NOT_NULL) != 0; - assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2); // Most probably, Rnew_val == Rtmp3. - - Label callRuntime, filtered; - - CardTableBarrierSet* ct = barrier_set_cast(BarrierSet::barrier_set()); - - BLOCK_COMMENT("g1_write_barrier_post {"); - - // Does store cross heap regions? - // It does if the two addresses specify different grain addresses. - if (VM_Version::has_DistinctOpnds()) { - __ z_xgrk(Rtmp1, Rstore_addr, Rnew_val); - } else { - __ z_lgr(Rtmp1, Rstore_addr); - __ z_xgr(Rtmp1, Rnew_val); - } - __ z_srag(Rtmp1, Rtmp1, G1HeapRegion::LogOfHRGrainBytes); - __ z_bre(filtered); - - // Crosses regions, storing null? - if (not_null) { -#ifdef ASSERT - __ z_ltgr(Rnew_val, Rnew_val); - __ asm_assert(Assembler::bcondNotZero, "null oop not allowed (G1 post)", 0x322); // Checked by caller. -#endif - } else { - __ z_ltgr(Rnew_val, Rnew_val); - __ z_bre(filtered); - } - - Rnew_val = noreg; // end of lifetime - - // Storing region crossing non-null, is card already dirty? - assert_different_registers(Rtmp1, Rtmp2, Rtmp3); - // Make sure not to use Z_R0 for any of these registers. - Register Rcard_addr = (Rtmp1 != Z_R0_scratch) ? Rtmp1 : Rtmp3; - Register Rbase = (Rtmp2 != Z_R0_scratch) ? Rtmp2 : Rtmp3; - - // calculate address of card - __ load_const_optimized(Rbase, (address)ct->card_table()->byte_map_base()); // Card table base. - __ z_srlg(Rcard_addr, Rstore_addr, CardTable::card_shift()); // Index into card table. - __ z_algr(Rcard_addr, Rbase); // Explicit calculation needed for cli. - Rbase = noreg; // end of lifetime - - // Filter young. - __ z_cli(0, Rcard_addr, G1CardTable::g1_young_card_val()); - __ z_bre(filtered); - - // Check the card value. If dirty, we're done. - // This also avoids false sharing of the (already dirty) card. - __ z_sync(); // Required to support concurrent cleaning. - __ z_cli(0, Rcard_addr, G1CardTable::dirty_card_val()); // Reload after membar. - __ z_bre(filtered); - - // Storing a region crossing, non-null oop, card is clean. - // Dirty card and log. - __ z_mvi(0, Rcard_addr, G1CardTable::dirty_card_val()); - - Register Rcard_addr_x = Rcard_addr; - Register Rqueue_index = (Rtmp2 != Z_R0_scratch) ? Rtmp2 : Rtmp1; - if (Rcard_addr == Rqueue_index) { - Rcard_addr_x = Z_R0_scratch; // Register shortage. We have to use Z_R0. 
- } - __ lgr_if_needed(Rcard_addr_x, Rcard_addr); - - generate_queue_test_and_insertion(masm, - G1ThreadLocalData::dirty_card_queue_index_offset(), - G1ThreadLocalData::dirty_card_queue_buffer_offset(), - callRuntime, - Z_thread, Rcard_addr_x, Rqueue_index); - __ z_bru(filtered); - - __ bind(callRuntime); - - // TODO: do we need a frame? Introduced to be on the safe side. - bool needs_frame = true; - __ lgr_if_needed(Rcard_addr, Rcard_addr_x); // copy back asap. push_frame will destroy Z_R0_scratch! - - // VM call need frame to access(write) O register. - if (needs_frame) { - __ save_return_pc(); - __ push_frame_abi160(0); // Will use Z_R0 as tmp on old CPUs. - } - - // Save the live input values. - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), Rcard_addr, Z_thread); - - if (needs_frame) { - __ pop_frame(); - __ restore_return_pc(); - } - - __ bind(filtered); + Label done; + generate_post_barrier_fast_path(masm, Rstore_addr, Rnew_val, Z_thread, Rtmp1, Rtmp2, done, !not_null); + __ bind(done); BLOCK_COMMENT("} g1_write_barrier_post"); } @@ -615,22 +491,19 @@ void G1BarrierSetAssembler::gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrier __ branch_optimized(Assembler::bcondAlways, *stub->continuation()); } -void G1BarrierSetAssembler::gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub) { - G1BarrierSetC1* bs = (G1BarrierSetC1*)BarrierSet::barrier_set()->barrier_set_c1(); - __ bind(*stub->entry()); - ce->check_reserved_argument_area(16); // RT stub needs 2 spill slots. - assert(stub->addr()->is_register(), "Precondition."); - assert(stub->new_val()->is_register(), "Precondition."); - Register new_val_reg = stub->new_val()->as_register(); - __ z_ltgr(new_val_reg, new_val_reg); - __ branch_optimized(Assembler::bcondZero, *stub->continuation()); - __ z_lgr(Z_R1_scratch, stub->addr()->as_pointer_register()); - ce->emit_call_c(bs->post_barrier_c1_runtime_code_blob()->code_begin()); - __ branch_optimized(Assembler::bcondAlways, *stub->continuation()); -} - #undef __ +void G1BarrierSetAssembler::g1_write_barrier_post_c1(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2) { + Label done; + generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, true /* new_val_may_be_null */); + masm->bind(done); +} + #define __ sasm-> static OopMap* save_volatile_registers(StubAssembler* sasm, Register return_pc = Z_R14) { @@ -705,92 +578,6 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* __ z_bru(restart); } -void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* sasm) { - // Z_R1_scratch: oop address, address of updated memory slot - - BarrierSet* bs = BarrierSet::barrier_set(); - __ set_info("g1_post_barrier_slow_id", false); - - Register addr_oop = Z_R1_scratch; - Register addr_card = Z_R1_scratch; - Register r1 = Z_R6; // Must be saved/restored. - Register r2 = Z_R7; // Must be saved/restored. - Register cardtable = r1; // Must be non-volatile, because it is used to save addr_card. - CardTableBarrierSet* ctbs = barrier_set_cast(bs); - CardTable* ct = ctbs->card_table(); - CardTable::CardValue* byte_map_base = ct->byte_map_base(); - - // Save registers used below (see assertion in G1PreBarrierStub::emit_code()). 
- __ z_stg(r1, 0*BytesPerWord + FrameMap::first_available_sp_in_frame, Z_SP); - - Label not_already_dirty, restart, refill, young_card; - - // Calculate address of card corresponding to the updated oop slot. - AddressLiteral rs(byte_map_base); - __ z_srlg(addr_card, addr_oop, CardTable::card_shift()); - addr_oop = noreg; // dead now - __ load_const_optimized(cardtable, rs); // cardtable := - __ z_agr(addr_card, cardtable); // addr_card := addr_oop>>card_shift + cardtable - - __ z_cli(0, addr_card, (int)G1CardTable::g1_young_card_val()); - __ z_bre(young_card); - - __ z_sync(); // Required to support concurrent cleaning. - - __ z_cli(0, addr_card, (int)CardTable::dirty_card_val()); - __ z_brne(not_already_dirty); - - __ bind(young_card); - // We didn't take the branch, so we're already dirty: restore - // used registers and return. - __ z_lg(r1, 0*BytesPerWord + FrameMap::first_available_sp_in_frame, Z_SP); - __ z_br(Z_R14); - - // Not dirty. - __ bind(not_already_dirty); - - // First, dirty it: [addr_card] := 0 - __ z_mvi(0, addr_card, CardTable::dirty_card_val()); - - Register idx = cardtable; // Must be non-volatile, because it is used to save addr_card. - Register buf = r2; - cardtable = noreg; // now dead - - // Save registers used below (see assertion in G1PreBarrierStub::emit_code()). - __ z_stg(r2, 1*BytesPerWord + FrameMap::first_available_sp_in_frame, Z_SP); - - ByteSize dirty_card_q_index_byte_offset = G1ThreadLocalData::dirty_card_queue_index_offset(); - ByteSize dirty_card_q_buf_byte_offset = G1ThreadLocalData::dirty_card_queue_buffer_offset(); - - __ bind(restart); - - // Get the index into the update buffer. G1DirtyCardQueue::_index is - // a size_t so z_ltg is appropriate here. - __ z_ltg(idx, Address(Z_thread, dirty_card_q_index_byte_offset)); - - // index == 0? - __ z_brz(refill); - - __ z_lg(buf, Address(Z_thread, dirty_card_q_buf_byte_offset)); - __ add2reg(idx, -oopSize); - - __ z_stg(addr_card, 0, idx, buf); // [_buf + index] := - __ z_stg(idx, Address(Z_thread, dirty_card_q_index_byte_offset)); - // Restore killed registers and return. - __ z_lg(r1, 0*BytesPerWord + FrameMap::first_available_sp_in_frame, Z_SP); - __ z_lg(r2, 1*BytesPerWord + FrameMap::first_available_sp_in_frame, Z_SP); - __ z_br(Z_R14); - - __ bind(refill); - save_volatile_registers(sasm); - __ z_lgr(idx, addr_card); // Save addr_card, tmp3 must be non-volatile. - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1DirtyCardQueueSet::handle_zero_index_for_thread), - Z_thread); - __ z_lgr(addr_card, idx); - restore_volatile_registers(sasm); // Restore addr_card. - __ z_bru(restart); -} - #undef __ #endif // COMPILER1 diff --git a/src/hotspot/cpu/s390/gc/g1/g1BarrierSetAssembler_s390.hpp b/src/hotspot/cpu/s390/gc/g1/g1BarrierSetAssembler_s390.hpp index 0f0bdd8b83c..fdec751c43b 100644 --- a/src/hotspot/cpu/s390/gc/g1/g1BarrierSetAssembler_s390.hpp +++ b/src/hotspot/cpu/s390/gc/g1/g1BarrierSetAssembler_s390.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018, 2024 SAP SE. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* @@ -33,9 +33,7 @@ class LIR_Assembler; class StubAssembler; class G1PreBarrierStub; -class G1PostBarrierStub; class G1PreBarrierStubC2; -class G1PostBarrierStubC2; class G1BarrierSetAssembler: public ModRefBarrierSetAssembler { protected: @@ -60,10 +58,16 @@ class G1BarrierSetAssembler: public ModRefBarrierSetAssembler { public: #ifdef COMPILER1 void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub); - void gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub); void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm); - void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm); + + void g1_write_barrier_post_c1(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2); + #endif // COMPILER1 #ifdef COMPILER2 @@ -81,9 +85,7 @@ class G1BarrierSetAssembler: public ModRefBarrierSetAssembler { Register thread, Register tmp1, Register tmp2, - G1PostBarrierStubC2* c2_stub); - void generate_c2_post_barrier_stub(MacroAssembler* masm, - G1PostBarrierStubC2* stub) const; + bool new_val_may_be_null); #endif // COMPILER2 virtual void load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, diff --git a/src/hotspot/cpu/s390/gc/g1/g1_s390.ad b/src/hotspot/cpu/s390/gc/g1/g1_s390.ad index 31f60c4aeff..7aed374fdae 100644 --- a/src/hotspot/cpu/s390/gc/g1/g1_s390.ad +++ b/src/hotspot/cpu/s390/gc/g1/g1_s390.ad @@ -1,5 +1,5 @@ // -// Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. // Copyright 2024 IBM Corporation. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // @@ -62,13 +62,13 @@ static void write_barrier_post(MacroAssembler* masm, Register new_val, Register tmp1, Register tmp2) { - if (!G1PostBarrierStubC2::needs_barrier(node)) { + if (!G1BarrierStubC2::needs_post_barrier(node)) { return; } Assembler::InlineSkippedInstructionsCounter skip_counter(masm); G1BarrierSetAssembler* g1_asm = static_cast(BarrierSet::barrier_set()->barrier_set_assembler()); - G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node); - g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, Z_thread, tmp1, tmp2, stub); + bool new_val_may_be_null = G1BarrierStubC2::post_new_val_may_be_null(node); + g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, Z_thread, tmp1, tmp2, new_val_may_be_null); } %} // source diff --git a/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.cpp b/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.cpp index c1920b52837..31f27e140e0 100644 --- a/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.cpp @@ -89,19 +89,53 @@ void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm, DecoratorSet decorators, Register addr, Register count, Register tmp) { - __ push_call_clobbered_registers(false /* save_fpu */); - if (c_rarg0 == count) { // On win64 c_rarg0 == rcx - assert_different_registers(c_rarg1, addr); - __ mov(c_rarg1, count); - __ mov(c_rarg0, addr); - } else { - assert_different_registers(c_rarg0, count); - __ mov(c_rarg0, addr); - __ mov(c_rarg1, count); - } - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_array_post_entry), 2); - __ pop_call_clobbered_registers(false /* save_fpu */); + Label done; + __ testptr(count, count); + __ 
jcc(Assembler::zero, done); + + // Calculate end address in "count". + Address::ScaleFactor scale = UseCompressedOops ? Address::times_4 : Address::times_8; + __ leaq(count, Address(addr, count, scale)); + + // Calculate start card address in "addr". + __ shrptr(addr, CardTable::card_shift()); + + Register thread = r15_thread; + + __ movptr(tmp, Address(thread, in_bytes(G1ThreadLocalData::card_table_base_offset()))); + __ addptr(addr, tmp); + + // Calculate address of card of last word in the array. + __ subptr(count, 1); + __ shrptr(count, CardTable::card_shift()); + __ addptr(count, tmp); + + Label loop; + // Iterate from start card to end card (inclusive). + __ bind(loop); + + Label is_clean_card; + if (UseCondCardMark) { + __ cmpb(Address(addr, 0), G1CardTable::clean_card_val()); + __ jcc(Assembler::equal, is_clean_card); + } else { + __ movb(Address(addr, 0), G1CardTable::dirty_card_val()); + } + + Label next_card; + __ bind(next_card); + __ addptr(addr, sizeof(CardTable::CardValue)); + __ cmpptr(addr, count); + __ jcc(Assembler::belowEqual, loop); + __ jmp(done); + + __ bind(is_clean_card); + // Card was clean. Dirty card and go to next.. + __ movb(Address(addr, 0), G1CardTable::dirty_card_val()); + __ jmp(next_card); + + __ bind(done); } void G1BarrierSetAssembler::load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, @@ -182,7 +216,6 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, // If expand_call is true then we expand the call_VM_leaf macro // directly to skip generating the check by // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp. - const Register thread = r15_thread; Label done; @@ -238,73 +271,46 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, static void generate_post_barrier_fast_path(MacroAssembler* masm, const Register store_addr, const Register new_val, - const Register tmp, - const Register tmp2, + const Register tmp1, Label& done, bool new_val_may_be_null) { - CardTableBarrierSet* ct = barrier_set_cast(BarrierSet::barrier_set()); + + assert_different_registers(store_addr, new_val, tmp1, noreg); + + Register thread = r15_thread; + // Does store cross heap regions? - __ movptr(tmp, store_addr); // tmp := store address - __ xorptr(tmp, new_val); // tmp := store address ^ new value - __ shrptr(tmp, G1HeapRegion::LogOfHRGrainBytes); // ((store address ^ new value) >> LogOfHRGrainBytes) == 0? + __ movptr(tmp1, store_addr); // tmp1 := store address + __ xorptr(tmp1, new_val); // tmp1 := store address ^ new value + __ shrptr(tmp1, G1HeapRegion::LogOfHRGrainBytes); // ((store address ^ new value) >> LogOfHRGrainBytes) == 0? __ jcc(Assembler::equal, done); + // Crosses regions, storing null? if (new_val_may_be_null) { - __ cmpptr(new_val, NULL_WORD); // new value == null? + __ cmpptr(new_val, NULL_WORD); // new value == null? __ jcc(Assembler::equal, done); } - // Storing region crossing non-null, is card young? - __ movptr(tmp, store_addr); // tmp := store address - __ shrptr(tmp, CardTable::card_shift()); // tmp := card address relative to card table base - // Do not use ExternalAddress to load 'byte_map_base', since 'byte_map_base' is NOT - // a valid address and therefore is not properly handled by the relocation code. - __ movptr(tmp2, (intptr_t)ct->card_table()->byte_map_base()); // tmp2 := card table base address - __ addptr(tmp, tmp2); // tmp := card address - __ cmpb(Address(tmp, 0), G1CardTable::g1_young_card_val()); // *(card address) == young_card_val? 
-} -static void generate_post_barrier_slow_path(MacroAssembler* masm, - const Register thread, - const Register tmp, - const Register tmp2, - Label& done, - Label& runtime) { - __ membar(Assembler::Membar_mask_bits(Assembler::StoreLoad)); // StoreLoad membar - __ cmpb(Address(tmp, 0), G1CardTable::dirty_card_val()); // *(card address) == dirty_card_val? - __ jcc(Assembler::equal, done); + __ movptr(tmp1, store_addr); // tmp1 := store address + __ shrptr(tmp1, CardTable::card_shift()); // tmp1 := card address relative to card table base + + Address card_table_addr(thread, in_bytes(G1ThreadLocalData::card_table_base_offset())); + __ addptr(tmp1, card_table_addr); // tmp1 := card address + if (UseCondCardMark) { + __ cmpb(Address(tmp1, 0), G1CardTable::clean_card_val()); // *(card address) == clean_card_val? + __ jcc(Assembler::notEqual, done); + } // Storing a region crossing, non-null oop, card is clean. - // Dirty card and log. - __ movb(Address(tmp, 0), G1CardTable::dirty_card_val()); // *(card address) := dirty_card_val - generate_queue_insertion(masm, - G1ThreadLocalData::dirty_card_queue_index_offset(), - G1ThreadLocalData::dirty_card_queue_buffer_offset(), - runtime, - thread, tmp, tmp2); - __ jmp(done); + // Dirty card. + __ movb(Address(tmp1, 0), G1CardTable::dirty_card_val()); // *(card address) := dirty_card_val } void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, Register store_addr, Register new_val, - Register tmp, - Register tmp2) { - const Register thread = r15_thread; - + Register tmp) { Label done; - Label runtime; - - generate_post_barrier_fast_path(masm, store_addr, new_val, tmp, tmp2, done, true /* new_val_may_be_null */); - // If card is young, jump to done - __ jcc(Assembler::equal, done); - generate_post_barrier_slow_path(masm, thread, tmp, tmp2, done, runtime); - - __ bind(runtime); - // save the live input values - RegSet saved = RegSet::of(store_addr); - __ push_set(saved); - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), tmp, thread); - __ pop_set(saved); - + generate_post_barrier_fast_path(masm, store_addr, new_val, tmp, done, true /* new_val_may_be_null */); __ bind(done); } @@ -367,34 +373,10 @@ void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm, Register store_addr, Register new_val, Register tmp, - Register tmp2, - G1PostBarrierStubC2* stub) { - const Register thread = r15_thread; - stub->initialize_registers(thread, tmp, tmp2); - - bool new_val_may_be_null = (stub->barrier_data() & G1C2BarrierPostNotNull) == 0; - generate_post_barrier_fast_path(masm, store_addr, new_val, tmp, tmp2, *stub->continuation(), new_val_may_be_null); - // If card is not young, jump to stub (slow path) - __ jcc(Assembler::notEqual, *stub->entry()); - - __ bind(*stub->continuation()); -} - -void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm, - G1PostBarrierStubC2* stub) const { - Assembler::InlineSkippedInstructionsCounter skip_counter(masm); - Label runtime; - Register thread = stub->thread(); - Register tmp = stub->tmp1(); // tmp holds the card address. 
- Register tmp2 = stub->tmp2(); - assert(stub->tmp3() == noreg, "not needed in this platform"); - - __ bind(*stub->entry()); - generate_post_barrier_slow_path(masm, thread, tmp, tmp2, *stub->continuation(), runtime); - - __ bind(runtime); - generate_c2_barrier_runtime_call(masm, stub, tmp, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry)); - __ jmp(*stub->continuation()); + bool new_val_may_be_null) { + Label done; + generate_post_barrier_fast_path(masm, store_addr, new_val, tmp, done, new_val_may_be_null); + __ bind(done); } #endif // COMPILER2 @@ -441,8 +423,7 @@ void G1BarrierSetAssembler::oop_store_at(MacroAssembler* masm, DecoratorSet deco g1_write_barrier_post(masm /*masm*/, tmp1 /* store_adr */, new_val /* new_val */, - tmp3 /* tmp */, - tmp2 /* tmp2 */); + tmp3 /* tmp */); } } } @@ -476,21 +457,19 @@ void G1BarrierSetAssembler::gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrier } -void G1BarrierSetAssembler::gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub) { - G1BarrierSetC1* bs = (G1BarrierSetC1*)BarrierSet::barrier_set()->barrier_set_c1(); - __ bind(*stub->entry()); - assert(stub->addr()->is_register(), "Precondition."); - assert(stub->new_val()->is_register(), "Precondition."); - Register new_val_reg = stub->new_val()->as_register(); - __ cmpptr(new_val_reg, NULL_WORD); - __ jcc(Assembler::equal, *stub->continuation()); - ce->store_parameter(stub->addr()->as_pointer_register(), 0); - __ call(RuntimeAddress(bs->post_barrier_c1_runtime_code_blob()->code_begin())); - __ jmp(*stub->continuation()); -} - #undef __ +void G1BarrierSetAssembler::g1_write_barrier_post_c1(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2 /* unused on x86 */) { + Label done; + generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, done, true /* new_val_may_be_null */); + masm->bind(done); +} + #define __ sasm-> void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm) { @@ -555,78 +534,6 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* __ epilogue(); } -void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* sasm) { - __ prologue("g1_post_barrier", false); - - CardTableBarrierSet* ct = - barrier_set_cast(BarrierSet::barrier_set()); - - Label done; - Label enqueued; - Label runtime; - - // At this point we know new_value is non-null and the new_value crosses regions. - // Must check to see if card is already dirty - - const Register thread = r15_thread; - - Address queue_index(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset())); - Address buffer(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset())); - - __ push_ppx(rax); - __ push_ppx(rcx); - - const Register cardtable = rax; - const Register card_addr = rcx; - - __ load_parameter(0, card_addr); - __ shrptr(card_addr, CardTable::card_shift()); - // Do not use ExternalAddress to load 'byte_map_base', since 'byte_map_base' is NOT - // a valid address and therefore is not properly handled by the relocation code. 
- __ movptr(cardtable, (intptr_t)ct->card_table()->byte_map_base()); - __ addptr(card_addr, cardtable); - - __ cmpb(Address(card_addr, 0), G1CardTable::g1_young_card_val()); - __ jcc(Assembler::equal, done); - - __ membar(Assembler::Membar_mask_bits(Assembler::StoreLoad)); - __ cmpb(Address(card_addr, 0), CardTable::dirty_card_val()); - __ jcc(Assembler::equal, done); - - // storing region crossing non-null, card is clean. - // dirty card and log. - - __ movb(Address(card_addr, 0), CardTable::dirty_card_val()); - - const Register tmp = rdx; - __ push_ppx(rdx); - - __ movptr(tmp, queue_index); - __ testptr(tmp, tmp); - __ jcc(Assembler::zero, runtime); - __ subptr(tmp, wordSize); - __ movptr(queue_index, tmp); - __ addptr(tmp, buffer); - __ movptr(Address(tmp, 0), card_addr); - __ jmp(enqueued); - - __ bind(runtime); - __ push_call_clobbered_registers(); - - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), card_addr, thread); - - __ pop_call_clobbered_registers(); - - __ bind(enqueued); - __ pop_ppx(rdx); - - __ bind(done); - __ pop_ppx(rcx); - __ pop_ppx(rax); - - __ epilogue(); -} - #undef __ #endif // COMPILER1 diff --git a/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.hpp b/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.hpp index 774e87b916c..4b2de41de69 100644 --- a/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -31,10 +31,8 @@ class LIR_Assembler; class StubAssembler; class G1PreBarrierStub; -class G1PostBarrierStub; class G1BarrierStubC2; class G1PreBarrierStubC2; -class G1PostBarrierStubC2; class G1BarrierSetAssembler: public ModRefBarrierSetAssembler { protected: @@ -51,22 +49,28 @@ class G1BarrierSetAssembler: public ModRefBarrierSetAssembler { void g1_write_barrier_post(MacroAssembler* masm, Register store_addr, Register new_val, - Register tmp, - Register tmp2); + Register tmp); virtual void oop_store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, Address dst, Register val, Register tmp1, Register tmp2, Register tmp3); public: - void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub); - void gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub); - - void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm); - void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm); - virtual void load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, Register dst, Address src, Register tmp1); +#ifdef COMPILER1 + void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub); + + void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm); + + void g1_write_barrier_post_c1(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2); +#endif + #ifdef COMPILER2 void g1_write_barrier_pre_c2(MacroAssembler* masm, Register obj, @@ -79,10 +83,7 @@ class G1BarrierSetAssembler: public ModRefBarrierSetAssembler { Register store_addr, Register new_val, Register tmp, - Register tmp2, - G1PostBarrierStubC2* c2_stub); - void generate_c2_post_barrier_stub(MacroAssembler* masm, - G1PostBarrierStubC2* stub) const; + bool 
new_val_may_be_null); #endif // COMPILER2 }; diff --git a/src/hotspot/cpu/x86/gc/g1/g1_x86_64.ad b/src/hotspot/cpu/x86/gc/g1/g1_x86_64.ad index 819cd97696c..94607cd6796 100644 --- a/src/hotspot/cpu/x86/gc/g1/g1_x86_64.ad +++ b/src/hotspot/cpu/x86/gc/g1/g1_x86_64.ad @@ -1,5 +1,5 @@ // -// Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // // This code is free software; you can redistribute it and/or modify it @@ -59,15 +59,14 @@ static void write_barrier_post(MacroAssembler* masm, const MachNode* node, Register store_addr, Register new_val, - Register tmp1, - Register tmp2) { - if (!G1PostBarrierStubC2::needs_barrier(node)) { + Register tmp1) { + if (!G1BarrierStubC2::needs_post_barrier(node)) { return; } Assembler::InlineSkippedInstructionsCounter skip_counter(masm); G1BarrierSetAssembler* g1_asm = static_cast(BarrierSet::barrier_set()->barrier_set_assembler()); - G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node); - g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, tmp1, tmp2, stub); + bool new_val_may_be_null = G1BarrierStubC2::post_new_val_may_be_null(node); + g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, tmp1, new_val_may_be_null); } %} @@ -95,8 +94,7 @@ instruct g1StoreP(memory mem, any_RegP src, rRegP tmp1, rRegP tmp2, rRegP tmp3, write_barrier_post(masm, this, $tmp1$$Register /* store_addr */, $src$$Register /* new_val */, - $tmp3$$Register /* tmp1 */, - $tmp2$$Register /* tmp2 */); + $tmp3$$Register /* tmp1 */); %} ins_pipe(ialu_mem_reg); %} @@ -127,8 +125,7 @@ instruct g1StoreN(memory mem, rRegN src, rRegP tmp1, rRegP tmp2, rRegP tmp3, rFl write_barrier_post(masm, this, $tmp1$$Register /* store_addr */, $tmp2$$Register /* new_val */, - $tmp3$$Register /* tmp1 */, - $tmp2$$Register /* tmp2 */); + $tmp3$$Register /* tmp1 */); %} ins_pipe(ialu_mem_reg); %} @@ -158,8 +155,7 @@ instruct g1EncodePAndStoreN(memory mem, any_RegP src, rRegP tmp1, rRegP tmp2, rR write_barrier_post(masm, this, $tmp1$$Register /* store_addr */, $src$$Register /* new_val */, - $tmp3$$Register /* tmp1 */, - $tmp2$$Register /* tmp2 */); + $tmp3$$Register /* tmp1 */); %} ins_pipe(ialu_mem_reg); %} @@ -187,8 +183,7 @@ instruct g1CompareAndExchangeP(indirect mem, rRegP newval, rRegP tmp1, rRegP tmp write_barrier_post(masm, this, $mem$$Register /* store_addr */, $tmp1$$Register /* new_val */, - $tmp2$$Register /* tmp1 */, - $tmp3$$Register /* tmp2 */); + $tmp2$$Register /* tmp1 */); %} ins_pipe(pipe_cmpxchg); %} @@ -214,8 +209,7 @@ instruct g1CompareAndExchangeN(indirect mem, rRegN newval, rRegP tmp1, rRegP tmp write_barrier_post(masm, this, $mem$$Register /* store_addr */, $tmp1$$Register /* new_val */, - $tmp2$$Register /* tmp1 */, - $tmp3$$Register /* tmp2 */); + $tmp2$$Register /* tmp1 */); %} ins_pipe(pipe_cmpxchg); %} @@ -246,8 +240,7 @@ instruct g1CompareAndSwapP(rRegI res, indirect mem, rRegP newval, rRegP tmp1, rR write_barrier_post(masm, this, $mem$$Register /* store_addr */, $tmp1$$Register /* new_val */, - $tmp2$$Register /* tmp1 */, - $tmp3$$Register /* tmp2 */); + $tmp2$$Register /* tmp1 */); %} ins_pipe(pipe_cmpxchg); %} @@ -279,8 +272,7 @@ instruct g1CompareAndSwapN(rRegI res, indirect mem, rRegN newval, rRegP tmp1, rR write_barrier_post(masm, this, $mem$$Register /* store_addr */, $tmp1$$Register /* new_val */, - $tmp2$$Register /* tmp1 */, - $tmp3$$Register /* tmp2 */); + $tmp2$$Register /* 
tmp1 */); %} ins_pipe(pipe_cmpxchg); %} @@ -303,8 +295,7 @@ instruct g1GetAndSetP(indirect mem, rRegP newval, rRegP tmp1, rRegP tmp2, rRegP write_barrier_post(masm, this, $mem$$Register /* store_addr */, $tmp1$$Register /* new_val */, - $tmp2$$Register /* tmp1 */, - $tmp3$$Register /* tmp2 */); + $tmp2$$Register /* tmp1 */); %} ins_pipe(pipe_cmpxchg); %} @@ -328,8 +319,7 @@ instruct g1GetAndSetN(indirect mem, rRegN newval, rRegP tmp1, rRegP tmp2, rRegP write_barrier_post(masm, this, $mem$$Register /* store_addr */, $tmp1$$Register /* new_val */, - $tmp2$$Register /* tmp1 */, - $tmp3$$Register /* tmp2 */); + $tmp2$$Register /* tmp1 */); %} ins_pipe(pipe_cmpxchg); %} diff --git a/src/hotspot/share/code/aotCodeCache.cpp b/src/hotspot/share/code/aotCodeCache.cpp index a24bae03137..04776f4c16c 100644 --- a/src/hotspot/share/code/aotCodeCache.cpp +++ b/src/hotspot/share/code/aotCodeCache.cpp @@ -1365,7 +1365,6 @@ void AOTCodeAddressTable::init_extrs() { #endif // COMPILER2 #if INCLUDE_G1GC - SET_ADDRESS(_extrs, G1BarrierSetRuntime::write_ref_field_post_entry); SET_ADDRESS(_extrs, G1BarrierSetRuntime::write_ref_field_pre_entry); #endif #if INCLUDE_SHENANDOAHGC diff --git a/src/hotspot/share/gc/g1/c1/g1BarrierSetC1.cpp b/src/hotspot/share/gc/g1/c1/g1BarrierSetC1.cpp index 425be474602..51c8a53b54a 100644 --- a/src/hotspot/share/gc/g1/c1/g1BarrierSetC1.cpp +++ b/src/hotspot/share/gc/g1/c1/g1BarrierSetC1.cpp @@ -23,12 +23,15 @@ */ #include "c1/c1_CodeStubs.hpp" +#include "c1/c1_LIRAssembler.hpp" #include "c1/c1_LIRGenerator.hpp" +#include "c1/c1_MacroAssembler.hpp" #include "gc/g1/c1/g1BarrierSetC1.hpp" #include "gc/g1/g1BarrierSet.hpp" #include "gc/g1/g1BarrierSetAssembler.hpp" #include "gc/g1/g1HeapRegion.hpp" #include "gc/g1/g1ThreadLocalData.hpp" +#include "utilities/formatBuffer.hpp" #include "utilities/macros.hpp" #ifdef ASSERT @@ -42,11 +45,6 @@ void G1PreBarrierStub::emit_code(LIR_Assembler* ce) { bs->gen_pre_barrier_stub(ce, this); } -void G1PostBarrierStub::emit_code(LIR_Assembler* ce) { - G1BarrierSetAssembler* bs = (G1BarrierSetAssembler*)BarrierSet::barrier_set()->barrier_set_assembler(); - bs->gen_post_barrier_stub(ce, this); -} - void G1BarrierSetC1::pre_barrier(LIRAccess& access, LIR_Opr addr_opr, LIR_Opr pre_val, CodeEmitInfo* info) { LIRGenerator* gen = access.gen(); @@ -114,6 +112,87 @@ void G1BarrierSetC1::pre_barrier(LIRAccess& access, LIR_Opr addr_opr, __ branch_destination(slow->continuation()); } +class LIR_OpG1PostBarrier : public LIR_Op { + friend class LIR_OpVisitState; + +private: + LIR_Opr _addr; + LIR_Opr _new_val; + LIR_Opr _thread; + LIR_Opr _tmp1; + LIR_Opr _tmp2; + +public: + LIR_OpG1PostBarrier(LIR_Opr addr, + LIR_Opr new_val, + LIR_Opr thread, + LIR_Opr tmp1, + LIR_Opr tmp2) + : LIR_Op(lir_none, lir_none, nullptr), + _addr(addr), + _new_val(new_val), + _thread(thread), + _tmp1(tmp1), + _tmp2(tmp2) + {} + + virtual void visit(LIR_OpVisitState* state) { + state->do_input(_addr); + state->do_input(_new_val); + state->do_input(_thread); + + // Use temps to enforce different registers. 
+ state->do_temp(_addr); + state->do_temp(_new_val); + state->do_temp(_thread); + state->do_temp(_tmp1); + state->do_temp(_tmp2); + + if (_info != nullptr) { + state->do_info(_info); + } + } + + virtual void emit_code(LIR_Assembler* ce) { + if (_info != nullptr) { + ce->add_debug_info_for_null_check_here(_info); + } + + Register addr = _addr->as_pointer_register(); + Register new_val = _new_val->as_pointer_register(); + Register thread = _thread->as_pointer_register(); + Register tmp1 = _tmp1->as_pointer_register(); + Register tmp2 = _tmp2->as_pointer_register(); + + // This may happen for a store of x.a = x - we do not need a post barrier for those + // as the cross-region test will always exit early anyway. + // The post barrier implementations can assume that addr and new_val are different + // then. + if (addr == new_val) { + ce->masm()->block_comment(err_msg("same addr/new_val due to self-referential store with imprecise card mark %s", addr->name())); + return; + } + + G1BarrierSetAssembler* bs_asm = static_cast(BarrierSet::barrier_set()->barrier_set_assembler()); + bs_asm->g1_write_barrier_post_c1(ce->masm(), addr, new_val, thread, tmp1, tmp2); + } + + virtual void print_instr(outputStream* out) const { + _addr->print(out); out->print(" "); + _new_val->print(out); out->print(" "); + _thread->print(out); out->print(" "); + _tmp1->print(out); out->print(" "); + _tmp2->print(out); out->print(" "); + out->cr(); + } + +#ifndef PRODUCT + virtual const char* name() const { + return "lir_g1_post_barrier"; + } +#endif // PRODUCT +}; + void G1BarrierSetC1::post_barrier(LIRAccess& access, LIR_Opr addr, LIR_Opr new_val) { LIRGenerator* gen = access.gen(); DecoratorSet decorators = access.decorators(); @@ -150,29 +229,11 @@ void G1BarrierSetC1::post_barrier(LIRAccess& access, LIR_Opr addr, LIR_Opr new_v } assert(addr->is_register(), "must be a register at this point"); - LIR_Opr xor_res = gen->new_pointer_register(); - LIR_Opr xor_shift_res = gen->new_pointer_register(); - if (two_operand_lir_form) { - __ move(addr, xor_res); - __ logical_xor(xor_res, new_val, xor_res); - __ move(xor_res, xor_shift_res); - __ unsigned_shift_right(xor_shift_res, - LIR_OprFact::intConst(checked_cast(G1HeapRegion::LogOfHRGrainBytes)), - xor_shift_res, - LIR_Opr::illegalOpr()); - } else { - __ logical_xor(addr, new_val, xor_res); - __ unsigned_shift_right(xor_res, - LIR_OprFact::intConst(checked_cast(G1HeapRegion::LogOfHRGrainBytes)), - xor_shift_res, - LIR_Opr::illegalOpr()); - } - - __ cmp(lir_cond_notEqual, xor_shift_res, LIR_OprFact::intptrConst(NULL_WORD)); - - CodeStub* slow = new G1PostBarrierStub(addr, new_val); - __ branch(lir_cond_notEqual, slow); - __ branch_destination(slow->continuation()); + __ append(new LIR_OpG1PostBarrier(addr, + new_val, + gen->getThreadPointer() /* thread */, + gen->new_pointer_register() /* tmp1 */, + gen->new_pointer_register() /* tmp2 */)); } void G1BarrierSetC1::load_at_resolved(LIRAccess& access, LIR_Opr result) { @@ -207,20 +268,9 @@ class C1G1PreBarrierCodeGenClosure : public StubAssemblerCodeGenClosure { } }; -class C1G1PostBarrierCodeGenClosure : public StubAssemblerCodeGenClosure { - virtual OopMapSet* generate_code(StubAssembler* sasm) { - G1BarrierSetAssembler* bs = (G1BarrierSetAssembler*)BarrierSet::barrier_set()->barrier_set_assembler(); - bs->generate_c1_post_barrier_runtime_stub(sasm); - return nullptr; - } -}; - bool G1BarrierSetC1::generate_c1_runtime_stubs(BufferBlob* buffer_blob) { C1G1PreBarrierCodeGenClosure pre_code_gen_cl; - C1G1PostBarrierCodeGenClosure 
post_code_gen_cl; _pre_barrier_c1_runtime_code_blob = Runtime1::generate_blob(buffer_blob, StubId::NO_STUBID, "g1_pre_barrier_slow", false, &pre_code_gen_cl); - _post_barrier_c1_runtime_code_blob = Runtime1::generate_blob(buffer_blob, StubId::NO_STUBID, "g1_post_barrier_slow", - false, &post_code_gen_cl); - return _pre_barrier_c1_runtime_code_blob != nullptr && _post_barrier_c1_runtime_code_blob != nullptr; + return _pre_barrier_c1_runtime_code_blob != nullptr; } diff --git a/src/hotspot/share/gc/g1/c1/g1BarrierSetC1.hpp b/src/hotspot/share/gc/g1/c1/g1BarrierSetC1.hpp index 4baaf8ac58c..89f5676a2d2 100644 --- a/src/hotspot/share/gc/g1/c1/g1BarrierSetC1.hpp +++ b/src/hotspot/share/gc/g1/c1/g1BarrierSetC1.hpp @@ -91,40 +91,11 @@ class G1PreBarrierStub: public CodeStub { #endif // PRODUCT }; -class G1PostBarrierStub: public CodeStub { - friend class G1BarrierSetC1; - private: - LIR_Opr _addr; - LIR_Opr _new_val; - - public: - // addr (the address of the object head) and new_val must be registers. - G1PostBarrierStub(LIR_Opr addr, LIR_Opr new_val): _addr(addr), _new_val(new_val) { - FrameMap* f = Compilation::current()->frame_map(); - f->update_reserved_argument_area_size(2 * BytesPerWord); - } - - LIR_Opr addr() const { return _addr; } - LIR_Opr new_val() const { return _new_val; } - - virtual void emit_code(LIR_Assembler* e); - virtual void visit(LIR_OpVisitState* visitor) { - // don't pass in the code emit info since it's processed in the fast path - visitor->do_slow_case(); - visitor->do_input(_addr); - visitor->do_input(_new_val); - } -#ifndef PRODUCT - virtual void print_name(outputStream* out) const { out->print("G1PostBarrierStub"); } -#endif // PRODUCT -}; - class CodeBlob; class G1BarrierSetC1 : public ModRefBarrierSetC1 { protected: CodeBlob* _pre_barrier_c1_runtime_code_blob; - CodeBlob* _post_barrier_c1_runtime_code_blob; virtual void pre_barrier(LIRAccess& access, LIR_Opr addr_opr, LIR_Opr pre_val, CodeEmitInfo* info); @@ -134,11 +105,9 @@ class G1BarrierSetC1 : public ModRefBarrierSetC1 { public: G1BarrierSetC1() - : _pre_barrier_c1_runtime_code_blob(nullptr), - _post_barrier_c1_runtime_code_blob(nullptr) {} + : _pre_barrier_c1_runtime_code_blob(nullptr) {} CodeBlob* pre_barrier_c1_runtime_code_blob() { return _pre_barrier_c1_runtime_code_blob; } - CodeBlob* post_barrier_c1_runtime_code_blob() { return _post_barrier_c1_runtime_code_blob; } virtual bool generate_c1_runtime_stubs(BufferBlob* buffer_blob); }; diff --git a/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp b/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp index bca2255479b..61402301eb1 100644 --- a/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp +++ b/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp @@ -298,7 +298,13 @@ uint G1BarrierSetC2::estimated_barrier_size(const Node* node) const { nodes += 6; } if ((barrier_data & G1C2BarrierPost) != 0) { - nodes += 60; + // Approximate the number of nodes needed; an if costs 4 nodes (Cmp, Bool, + // If, If projection), any other (Assembly) instruction is approximated with + // a cost of 1. + nodes += 4 // base cost for the card write containing getting base offset, address calculation and the card write; + + 6 // same region check: Uncompress (new_val) oop, xor, shr, (cmp), jmp + + 4 // new_val is null check + + (UseCondCardMark ? 4 : 0); // card not clean check. 
} return nodes; } @@ -386,8 +392,9 @@ public: } bool needs_liveness_data(const MachNode* mach) const { - return G1PreBarrierStubC2::needs_barrier(mach) || - G1PostBarrierStubC2::needs_barrier(mach); + // Liveness data is only required to compute registers that must be preserved + // across the runtime call in the pre-barrier stub. + return G1BarrierStubC2::needs_pre_barrier(mach); } bool needs_livein_data() const { @@ -401,10 +408,22 @@ static G1BarrierSetC2State* barrier_set_state() { G1BarrierStubC2::G1BarrierStubC2(const MachNode* node) : BarrierStubC2(node) {} +bool G1BarrierStubC2::needs_pre_barrier(const MachNode* node) { + return (node->barrier_data() & G1C2BarrierPre) != 0; +} + +bool G1BarrierStubC2::needs_post_barrier(const MachNode* node) { + return (node->barrier_data() & G1C2BarrierPost) != 0; +} + +bool G1BarrierStubC2::post_new_val_may_be_null(const MachNode* node) { + return (node->barrier_data() & G1C2BarrierPostNotNull) == 0; +} + G1PreBarrierStubC2::G1PreBarrierStubC2(const MachNode* node) : G1BarrierStubC2(node) {} bool G1PreBarrierStubC2::needs_barrier(const MachNode* node) { - return (node->barrier_data() & G1C2BarrierPre) != 0; + return needs_pre_barrier(node); } G1PreBarrierStubC2* G1PreBarrierStubC2::create(const MachNode* node) { @@ -448,48 +467,6 @@ void G1PreBarrierStubC2::emit_code(MacroAssembler& masm) { bs->generate_c2_pre_barrier_stub(&masm, this); } -G1PostBarrierStubC2::G1PostBarrierStubC2(const MachNode* node) : G1BarrierStubC2(node) {} - -bool G1PostBarrierStubC2::needs_barrier(const MachNode* node) { - return (node->barrier_data() & G1C2BarrierPost) != 0; -} - -G1PostBarrierStubC2* G1PostBarrierStubC2::create(const MachNode* node) { - G1PostBarrierStubC2* const stub = new (Compile::current()->comp_arena()) G1PostBarrierStubC2(node); - if (!Compile::current()->output()->in_scratch_emit_size()) { - barrier_set_state()->stubs()->append(stub); - } - return stub; -} - -void G1PostBarrierStubC2::initialize_registers(Register thread, Register tmp1, Register tmp2, Register tmp3) { - _thread = thread; - _tmp1 = tmp1; - _tmp2 = tmp2; - _tmp3 = tmp3; -} - -Register G1PostBarrierStubC2::thread() const { - return _thread; -} - -Register G1PostBarrierStubC2::tmp1() const { - return _tmp1; -} - -Register G1PostBarrierStubC2::tmp2() const { - return _tmp2; -} - -Register G1PostBarrierStubC2::tmp3() const { - return _tmp3; -} - -void G1PostBarrierStubC2::emit_code(MacroAssembler& masm) { - G1BarrierSetAssembler* bs = static_cast(BarrierSet::barrier_set()->barrier_set_assembler()); - bs->generate_c2_post_barrier_stub(&masm, this); -} - void* G1BarrierSetC2::create_barrier_state(Arena* comp_arena) const { return new (comp_arena) G1BarrierSetC2State(comp_arena); } diff --git a/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.hpp b/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.hpp index 5f85714d889..601d0f1138e 100644 --- a/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.hpp +++ b/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.hpp @@ -37,6 +37,10 @@ const int G1C2BarrierPostNotNull = 4; class G1BarrierStubC2 : public BarrierStubC2 { public: + static bool needs_pre_barrier(const MachNode* node); + static bool needs_post_barrier(const MachNode* node); + static bool post_new_val_may_be_null(const MachNode* node); + G1BarrierStubC2(const MachNode* node); virtual void emit_code(MacroAssembler& masm) = 0; }; @@ -64,27 +68,6 @@ public: virtual void emit_code(MacroAssembler& masm); }; -class G1PostBarrierStubC2 : public G1BarrierStubC2 { -private: - Register _thread; - Register _tmp1; - Register _tmp2; 
- Register _tmp3; - -protected: - G1PostBarrierStubC2(const MachNode* node); - -public: - static bool needs_barrier(const MachNode* node); - static G1PostBarrierStubC2* create(const MachNode* node); - void initialize_registers(Register thread, Register tmp1 = noreg, Register tmp2 = noreg, Register tmp3 = noreg); - Register thread() const; - Register tmp1() const; - Register tmp2() const; - Register tmp3() const; - virtual void emit_code(MacroAssembler& masm); -}; - class G1BarrierSetC2: public CardTableBarrierSetC2 { private: void analyze_dominating_barriers() const; diff --git a/src/hotspot/share/gc/g1/g1Allocator.cpp b/src/hotspot/share/gc/g1/g1Allocator.cpp index 7f2916ae895..713bafd4782 100644 --- a/src/hotspot/share/gc/g1/g1Allocator.cpp +++ b/src/hotspot/share/gc/g1/g1Allocator.cpp @@ -262,9 +262,6 @@ HeapWord* G1Allocator::survivor_attempt_allocation(uint node_index, } } } - if (result != nullptr) { - _g1h->dirty_young_block(result, *actual_word_size); - } return result; } diff --git a/src/hotspot/share/gc/g1/g1Analytics.cpp b/src/hotspot/share/gc/g1/g1Analytics.cpp index 8fe0b25ceb7..6e7f46ca1d1 100644 --- a/src/hotspot/share/gc/g1/g1Analytics.cpp +++ b/src/hotspot/share/gc/g1/g1Analytics.cpp @@ -37,12 +37,10 @@ // They were chosen by running GCOld and SPECjbb on debris with different // numbers of GC threads and choosing them based on the results -static double cost_per_logged_card_ms_defaults[] = { - 0.01, 0.005, 0.005, 0.003, 0.003, 0.002, 0.002, 0.0015 -}; +static double cost_per_pending_card_ms_default = 0.01; // all the same -static double young_card_scan_to_merge_ratio_defaults[] = { +static double young_card_merge_to_scan_ratio_defaults[] = { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 }; @@ -78,8 +76,7 @@ G1Analytics::G1Analytics(const G1Predictions* predictor) : _concurrent_gc_cpu_time_ms(), _concurrent_refine_rate_ms_seq(TruncatedSeqLength), _dirtied_cards_rate_ms_seq(TruncatedSeqLength), - _dirtied_cards_in_thread_buffers_seq(TruncatedSeqLength), - _card_scan_to_merge_ratio_seq(TruncatedSeqLength), + _card_merge_to_scan_ratio_seq(TruncatedSeqLength), _cost_per_card_scan_ms_seq(TruncatedSeqLength), _cost_per_card_merge_ms_seq(TruncatedSeqLength), _cost_per_code_root_ms_seq(TruncatedSeqLength), @@ -87,6 +84,7 @@ G1Analytics::G1Analytics(const G1Predictions* predictor) : _pending_cards_seq(TruncatedSeqLength), _card_rs_length_seq(TruncatedSeqLength), _code_root_rs_length_seq(TruncatedSeqLength), + _merge_refinement_table_ms_seq(TruncatedSeqLength), _constant_other_time_ms_seq(TruncatedSeqLength), _young_other_cost_per_region_ms_seq(TruncatedSeqLength), _non_young_other_cost_per_region_ms_seq(TruncatedSeqLength), @@ -100,17 +98,17 @@ G1Analytics::G1Analytics(const G1Predictions* predictor) : uint index = MIN2(ParallelGCThreads - 1, 7u); - // Start with inverse of maximum STW cost. - _concurrent_refine_rate_ms_seq.add(1/cost_per_logged_card_ms_defaults[0]); - // Some applications have very low rates for logging cards. + _concurrent_refine_rate_ms_seq.add(1 / cost_per_pending_card_ms_default); + // Some applications have very low rates for dirtying cards. 
_dirtied_cards_rate_ms_seq.add(0.0); - _card_scan_to_merge_ratio_seq.set_initial(young_card_scan_to_merge_ratio_defaults[index]); + _card_merge_to_scan_ratio_seq.set_initial(young_card_merge_to_scan_ratio_defaults[index]); _cost_per_card_scan_ms_seq.set_initial(young_only_cost_per_card_scan_ms_defaults[index]); _card_rs_length_seq.set_initial(0); _code_root_rs_length_seq.set_initial(0); _cost_per_byte_copied_ms_seq.set_initial(cost_per_byte_ms_defaults[index]); + _merge_refinement_table_ms_seq.add(0); _constant_other_time_ms_seq.add(constant_other_time_ms_defaults[index]); _young_other_cost_per_region_ms_seq.add(young_other_cost_per_region_ms_defaults[index]); _non_young_other_cost_per_region_ms_seq.add(non_young_other_cost_per_region_ms_defaults[index]); @@ -196,10 +194,6 @@ void G1Analytics::report_dirtied_cards_rate_ms(double cards_per_ms) { _dirtied_cards_rate_ms_seq.add(cards_per_ms); } -void G1Analytics::report_dirtied_cards_in_thread_buffers(size_t cards) { - _dirtied_cards_in_thread_buffers_seq.add(double(cards)); -} - void G1Analytics::report_cost_per_card_scan_ms(double cost_per_card_ms, bool for_young_only_phase) { _cost_per_card_scan_ms_seq.add(cost_per_card_ms, for_young_only_phase); } @@ -212,8 +206,8 @@ void G1Analytics::report_cost_per_code_root_scan_ms(double cost_per_code_root_ms _cost_per_code_root_ms_seq.add(cost_per_code_root_ms, for_young_only_phase); } -void G1Analytics::report_card_scan_to_merge_ratio(double merge_to_scan_ratio, bool for_young_only_phase) { - _card_scan_to_merge_ratio_seq.add(merge_to_scan_ratio, for_young_only_phase); +void G1Analytics::report_card_merge_to_scan_ratio(double merge_to_scan_ratio, bool for_young_only_phase) { + _card_merge_to_scan_ratio_seq.add(merge_to_scan_ratio, for_young_only_phase); } void G1Analytics::report_cost_per_byte_ms(double cost_per_byte_ms, bool for_young_only_phase) { @@ -228,6 +222,10 @@ void G1Analytics::report_non_young_other_cost_per_region_ms(double other_cost_pe _non_young_other_cost_per_region_ms_seq.add(other_cost_per_region_ms); } +void G1Analytics::report_merge_refinement_table_time_ms(double merge_refinement_table_time_ms) { + _merge_refinement_table_ms_seq.add(merge_refinement_table_time_ms); +} + void G1Analytics::report_constant_other_time_ms(double constant_other_time_ms) { _constant_other_time_ms_seq.add(constant_other_time_ms); } @@ -260,12 +258,8 @@ double G1Analytics::predict_dirtied_cards_rate_ms() const { return predict_zero_bounded(&_dirtied_cards_rate_ms_seq); } -size_t G1Analytics::predict_dirtied_cards_in_thread_buffers() const { - return predict_size(&_dirtied_cards_in_thread_buffers_seq); -} - size_t G1Analytics::predict_scan_card_num(size_t card_rs_length, bool for_young_only_phase) const { - return card_rs_length * predict_in_unit_interval(&_card_scan_to_merge_ratio_seq, for_young_only_phase); + return card_rs_length * predict_in_unit_interval(&_card_merge_to_scan_ratio_seq, for_young_only_phase); } double G1Analytics::predict_card_merge_time_ms(size_t card_num, bool for_young_only_phase) const { @@ -284,6 +278,10 @@ double G1Analytics::predict_object_copy_time_ms(size_t bytes_to_copy, bool for_y return bytes_to_copy * predict_zero_bounded(&_cost_per_byte_copied_ms_seq, for_young_only_phase); } +double G1Analytics::predict_merge_refinement_table_time_ms() const { + return predict_zero_bounded(&_merge_refinement_table_ms_seq); +} + double G1Analytics::predict_constant_other_time_ms() const { return predict_zero_bounded(&_constant_other_time_ms_seq); } diff --git 
a/src/hotspot/share/gc/g1/g1Analytics.hpp b/src/hotspot/share/gc/g1/g1Analytics.hpp index e5e2dd74101..1f609815632 100644 --- a/src/hotspot/share/gc/g1/g1Analytics.hpp +++ b/src/hotspot/share/gc/g1/g1Analytics.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2016, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -56,14 +56,13 @@ class G1Analytics: public CHeapObj { TruncatedSeq _concurrent_refine_rate_ms_seq; TruncatedSeq _dirtied_cards_rate_ms_seq; - TruncatedSeq _dirtied_cards_in_thread_buffers_seq; - // The ratio between the number of scanned cards and actually merged cards, for - // young-only and mixed gcs. - G1PhaseDependentSeq _card_scan_to_merge_ratio_seq; + // The ratio between the number of merged cards to actually scanned cards for + // card based remembered sets, for young-only and mixed gcs. + G1PhaseDependentSeq _card_merge_to_scan_ratio_seq; // The cost to scan a card during young-only and mixed gcs in ms. G1PhaseDependentSeq _cost_per_card_scan_ms_seq; - // The cost to merge a card during young-only and mixed gcs in ms. + // The cost to merge a card from the remembered sets for non-young regions in ms. G1PhaseDependentSeq _cost_per_card_merge_ms_seq; // The cost to scan entries in the code root remembered set in ms. G1PhaseDependentSeq _cost_per_code_root_ms_seq; @@ -74,6 +73,8 @@ class G1Analytics: public CHeapObj { G1PhaseDependentSeq _card_rs_length_seq; G1PhaseDependentSeq _code_root_rs_length_seq; + // Prediction for merging the refinement table to the card table during GC. + TruncatedSeq _merge_refinement_table_ms_seq; TruncatedSeq _constant_other_time_ms_seq; TruncatedSeq _young_other_cost_per_region_ms_seq; TruncatedSeq _non_young_other_cost_per_region_ms_seq; @@ -149,14 +150,14 @@ public: void report_alloc_rate_ms(double alloc_rate); void report_concurrent_refine_rate_ms(double cards_per_ms); void report_dirtied_cards_rate_ms(double cards_per_ms); - void report_dirtied_cards_in_thread_buffers(size_t num_cards); void report_cost_per_card_scan_ms(double cost_per_remset_card_ms, bool for_young_only_phase); void report_cost_per_card_merge_ms(double cost_per_card_ms, bool for_young_only_phase); void report_cost_per_code_root_scan_ms(double cost_per_code_root_ms, bool for_young_only_phase); - void report_card_scan_to_merge_ratio(double cards_per_entry_ratio, bool for_young_only_phase); + void report_card_merge_to_scan_ratio(double merge_to_scan_ratio, bool for_young_only_phase); void report_cost_per_byte_ms(double cost_per_byte_ms, bool for_young_only_phase); void report_young_other_cost_per_region_ms(double other_cost_per_region_ms); void report_non_young_other_cost_per_region_ms(double other_cost_per_region_ms); + void report_merge_refinement_table_time_ms(double pending_card_merge_time_ms); void report_constant_other_time_ms(double constant_other_time_ms); void report_pending_cards(double pending_cards, bool for_young_only_phase); void report_card_rs_length(double card_rs_length, bool for_young_only_phase); @@ -167,7 +168,6 @@ public: double predict_concurrent_refine_rate_ms() const; double predict_dirtied_cards_rate_ms() const; - size_t predict_dirtied_cards_in_thread_buffers() const; // Predict how many of the given remembered set of length card_rs_length will add to // the number of total cards scanned. 
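For orientation, the renamed _card_merge_to_scan_ratio_seq above feeds predict_scan_card_num(), which scales the remembered-set length by a ratio bounded to the unit interval. The following is a minimal standalone sketch of that kind of bounded-ratio prediction, not HotSpot code; RatioSeq and predict_scan_cards are made-up names and a plain running average stands in for the real predictor.

#include <algorithm>
#include <cstddef>
#include <cstdio>

class RatioSeq {
  double _sum = 0.0;
  size_t _num = 0;
public:
  void add(double v) { _sum += v; _num++; }
  // Average clamped to [0,1], mirroring the idea of predict_in_unit_interval().
  double predict_in_unit_interval() const {
    double avg = (_num == 0) ? 1.0 : _sum / _num;
    return std::clamp(avg, 0.0, 1.0);
  }
};

static size_t predict_scan_cards(size_t card_rs_length, const RatioSeq& ratio) {
  // Predicted number of scanned cards = remembered set length * merge-to-scan ratio.
  return (size_t)((double)card_rs_length * ratio.predict_in_unit_interval());
}

int main() {
  RatioSeq r;
  r.add(0.7);  // in one GC, 70% of merged cards were actually scanned
  r.add(0.9);
  std::printf("%zu\n", predict_scan_cards(10000, r));  // roughly 8000
  return 0;
}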
@@ -180,6 +180,7 @@ public: double predict_object_copy_time_ms(size_t bytes_to_copy, bool for_young_only_phase) const; + double predict_merge_refinement_table_time_ms() const; double predict_constant_other_time_ms() const; double predict_young_other_time_ms(size_t young_num) const; diff --git a/src/hotspot/share/gc/g1/g1Arguments.cpp b/src/hotspot/share/gc/g1/g1Arguments.cpp index ee91c327337..5cbafd2ae94 100644 --- a/src/hotspot/share/gc/g1/g1Arguments.cpp +++ b/src/hotspot/share/gc/g1/g1Arguments.cpp @@ -68,6 +68,12 @@ void G1Arguments::initialize_alignments() { if (FLAG_IS_DEFAULT(G1EagerReclaimRemSetThreshold)) { FLAG_SET_ERGO(G1EagerReclaimRemSetThreshold, G1RemSetArrayOfCardsEntries); } + // G1 prefers to use conditional card marking to avoid overwriting cards that + // have already been found to contain a to-collection set reference. This reduces + // refinement effort. + if (FLAG_IS_DEFAULT(UseCondCardMark)) { + FLAG_SET_ERGO(UseCondCardMark, true); + } } size_t G1Arguments::conservative_max_heap_alignment() { @@ -241,9 +247,8 @@ void G1Arguments::initialize() { // Verify that the maximum parallelism isn't too high to eventually overflow // the refcount in G1CardSetContainer. - uint max_parallel_refinement_threads = G1ConcRefinementThreads + G1DirtyCardQueueSet::num_par_ids(); uint const divisor = 3; // Safe divisor; we increment by 2 for each claim, but there is a small initial value. - if (max_parallel_refinement_threads > UINT_MAX / divisor) { + if (G1ConcRefinementThreads > UINT_MAX / divisor) { vm_exit_during_initialization("Too large parallelism for remembered sets."); } diff --git a/src/hotspot/share/gc/g1/g1BarrierSet.cpp b/src/hotspot/share/gc/g1/g1BarrierSet.cpp index c56434340cd..ab7d6febf4c 100644 --- a/src/hotspot/share/gc/g1/g1BarrierSet.cpp +++ b/src/hotspot/share/gc/g1/g1BarrierSet.cpp @@ -32,12 +32,14 @@ #include "gc/g1/g1ThreadLocalData.hpp" #include "gc/shared/satbMarkQueue.hpp" #include "logging/log.hpp" +#include "memory/iterator.hpp" #include "oops/access.inline.hpp" #include "oops/compressedOops.inline.hpp" #include "oops/oop.inline.hpp" #include "runtime/interfaceSupport.inline.hpp" #include "runtime/javaThread.hpp" #include "runtime/orderAccess.hpp" +#include "runtime/threads.hpp" #include "utilities/macros.hpp" #ifdef COMPILER1 #include "gc/g1/c1/g1BarrierSetC1.hpp" @@ -49,18 +51,38 @@ class G1BarrierSetC1; class G1BarrierSetC2; -G1BarrierSet::G1BarrierSet(G1CardTable* card_table) : +G1BarrierSet::G1BarrierSet(G1CardTable* card_table, + G1CardTable* refinement_table) : CardTableBarrierSet(make_barrier_set_assembler(), make_barrier_set_c1(), make_barrier_set_c2(), card_table, BarrierSet::FakeRtti(BarrierSet::G1BarrierSet)), _satb_mark_queue_buffer_allocator("SATB Buffer Allocator", G1SATBBufferSize), - _dirty_card_queue_buffer_allocator("DC Buffer Allocator", G1UpdateBufferSize), _satb_mark_queue_set(&_satb_mark_queue_buffer_allocator), - _dirty_card_queue_set(&_dirty_card_queue_buffer_allocator) + _refinement_table(refinement_table) {} +G1BarrierSet::~G1BarrierSet() { + delete _refinement_table; +} + +void G1BarrierSet::swap_global_card_table() { + G1CardTable* temp = static_cast(_card_table); + _card_table = _refinement_table; + _refinement_table = temp; +} + +void G1BarrierSet::update_card_table_base(Thread* thread) { +#ifdef ASSERT + { + ResourceMark rm; + assert(thread->is_Java_thread(), "may only update card table base of JavaThreads, not %s", thread->name()); + } +#endif + G1ThreadLocalData::set_byte_map_base(thread, _card_table->byte_map_base()); +} 
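The swap_global_card_table() and update_card_table_base() changes above amount to flipping a pointer pair and then re-pointing each Java thread's cached table base. A minimal standalone model of that scheme follows; MockThread and TwoTableBarrierSet are illustrative stand-ins, not the HotSpot types, and the per-thread update is done here with a simple loop where HotSpot uses a handshake or safepoint.

#include <cstdint>
#include <vector>

using CardValue = uint8_t;

struct MockThread {
  CardValue* byte_map_base;   // per-thread cached base used by the write barrier
};

struct TwoTableBarrierSet {
  CardValue* _card_table;        // table the mutator currently dirties
  CardValue* _refinement_table;  // table the refinement threads sweep

  void swap_global_card_table() {
    CardValue* tmp = _card_table;
    _card_table = _refinement_table;
    _refinement_table = tmp;
  }

  // Model of re-pointing every thread's cached base after a swap.
  void update_card_table_bases(std::vector<MockThread>& threads) {
    for (MockThread& t : threads) {
      t.byte_map_base = _card_table;
    }
  }
};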
+ template void G1BarrierSet::write_ref_array_pre_work(T* dst, size_t count) { G1SATBMarkQueueSet& queue_set = G1BarrierSet::satb_mark_queue_set(); @@ -89,28 +111,14 @@ void G1BarrierSet::write_ref_array_pre(narrowOop* dst, size_t count, bool dest_u } } -void G1BarrierSet::write_ref_field_post_slow(volatile CardValue* byte) { - // In the slow path, we know a card is not young - assert(*byte != G1CardTable::g1_young_card_val(), "slow path invoked without filtering"); - OrderAccess::storeload(); - if (*byte != G1CardTable::dirty_card_val()) { - *byte = G1CardTable::dirty_card_val(); - Thread* thr = Thread::current(); - G1DirtyCardQueue& queue = G1ThreadLocalData::dirty_card_queue(thr); - G1BarrierSet::dirty_card_queue_set().enqueue(queue, byte); - } -} - void G1BarrierSet::write_region(JavaThread* thread, MemRegion mr) { if (mr.is_empty()) { return; } - volatile CardValue* byte = _card_table->byte_for(mr.start()); - CardValue* last_byte = _card_table->byte_for(mr.last()); - // skip young gen cards - if (*byte == G1CardTable::g1_young_card_val()) { - // MemRegion should not span multiple regions for the young gen. + // Skip writes to young gen. + if (G1CollectedHeap::heap()->heap_region_containing(mr.start())->is_young()) { + // MemRegion should not span multiple regions for arrays in young gen. DEBUG_ONLY(G1HeapRegion* containing_hr = G1CollectedHeap::heap()->heap_region_containing(mr.start());) assert(containing_hr->is_young(), "it should be young"); assert(containing_hr->is_in(mr.start()), "it should contain start"); @@ -118,16 +126,25 @@ void G1BarrierSet::write_region(JavaThread* thread, MemRegion mr) { return; } - OrderAccess::storeload(); - // Enqueue if necessary. - G1DirtyCardQueueSet& qset = G1BarrierSet::dirty_card_queue_set(); - G1DirtyCardQueue& queue = G1ThreadLocalData::dirty_card_queue(thread); + // We need to make sure that we get the start/end byte information for the area + // to mark from the same card table to avoid getting confused in the mark loop + // further below - we might execute while the global card table is being switched. + // + // It does not matter which card table we write to: at worst we may write to the + // new card table (after the switching), which means that we will catch the + // marks next time. + // If we write to the old card table (after the switching, then the refinement + // table) the oncoming handshake will do the memory synchronization. + CardTable* card_table = AtomicAccess::load(&_card_table); + + volatile CardValue* byte = card_table->byte_for(mr.start()); + CardValue* last_byte = card_table->byte_for(mr.last()); + + // Dirty cards only if necessary. 
for (; byte <= last_byte; byte++) { CardValue bv = *byte; - assert(bv != G1CardTable::g1_young_card_val(), "Invalid card"); - if (bv != G1CardTable::dirty_card_val()) { + if (bv == G1CardTable::clean_card_val()) { *byte = G1CardTable::dirty_card_val(); - qset.enqueue(queue, byte); } } } @@ -148,14 +165,15 @@ void G1BarrierSet::on_thread_attach(Thread* thread) { assert(!satbq.is_active(), "SATB queue should not be active"); assert(satbq.buffer() == nullptr, "SATB queue should not have a buffer"); assert(satbq.index() == 0, "SATB queue index should be zero"); - G1DirtyCardQueue& dirtyq = G1ThreadLocalData::dirty_card_queue(thread); - assert(dirtyq.buffer() == nullptr, "Dirty Card queue should not have a buffer"); - assert(dirtyq.index() == 0, "Dirty Card queue index should be zero"); - // If we are creating the thread during a marking cycle, we should // set the active field of the SATB queue to true. That involves // copying the global is_active value to this thread's queue. satbq.set_active(_satb_mark_queue_set.is_active()); + + if (thread->is_Java_thread()) { + assert(Threads_lock->is_locked(), "must be, synchronization with refinement."); + update_card_table_base(thread); + } } void G1BarrierSet::on_thread_detach(Thread* thread) { @@ -165,14 +183,13 @@ void G1BarrierSet::on_thread_detach(Thread* thread) { SATBMarkQueue& queue = G1ThreadLocalData::satb_mark_queue(thread); G1BarrierSet::satb_mark_queue_set().flush_queue(queue); } - { - G1DirtyCardQueue& queue = G1ThreadLocalData::dirty_card_queue(thread); - G1DirtyCardQueueSet& qset = G1BarrierSet::dirty_card_queue_set(); - qset.flush_queue(queue); - qset.record_detached_refinement_stats(queue.refinement_stats()); - } { G1RegionPinCache& cache = G1ThreadLocalData::pin_count_cache(thread); cache.flush(); } } + +void G1BarrierSet::print_on(outputStream* st) const { + _card_table->print_on(st, "Card"); + _refinement_table->print_on(st, "Refinement"); +} diff --git a/src/hotspot/share/gc/g1/g1BarrierSet.hpp b/src/hotspot/share/gc/g1/g1BarrierSet.hpp index 2b1074fcd7a..40e87c373b7 100644 --- a/src/hotspot/share/gc/g1/g1BarrierSet.hpp +++ b/src/hotspot/share/gc/g1/g1BarrierSet.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -25,32 +25,65 @@ #ifndef SHARE_GC_G1_G1BARRIERSET_HPP #define SHARE_GC_G1_G1BARRIERSET_HPP -#include "gc/g1/g1DirtyCardQueue.hpp" #include "gc/g1/g1SATBMarkQueueSet.hpp" #include "gc/shared/bufferNode.hpp" #include "gc/shared/cardTable.hpp" #include "gc/shared/cardTableBarrierSet.hpp" class G1CardTable; +class Thread; -// This barrier is specialized to use a logging barrier to support -// snapshot-at-the-beginning marking. - +// This barrier set is specialized to manage two card tables: +// * one the mutator is currently working on ("card table") +// * one the refinement threads or GC during pause are working on ("refinement table") +// +// The card table acts like a regular card table where the mutator dirties cards +// containing potentially interesting references. +// +// When the amount of dirty cards on the card table exceeds a threshold, G1 swaps +// the card tables and has the refinement threads reduce them by "refining" +// them. +// I.e. 
refinement looks at all dirty cards on the refinement table, and updates +// the remembered sets accordingly, clearing the cards on the refinement table. +// +// Meanwhile the mutator continues dirtying the now empty card table. +// +// This separation of data the mutator and refinement threads are working on +// removes the need for any fine-grained (per mutator write) synchronization between +// them, keeping the write barrier simple. +// +// The refinement threads mark cards in the current collection set specially on the +// card table - this is fine wrt synchronization with the mutator, because at +// most the mutator will overwrite it again if there is a race, as G1 will scan the +// entire card either way during the GC pause. +// +// During garbage collection, if the refinement table is known to be non-empty, G1 +// merges it back (and cleaning it) to the card table which is scanned for dirty +// cards. +// class G1BarrierSet: public CardTableBarrierSet { friend class VMStructs; private: BufferNode::Allocator _satb_mark_queue_buffer_allocator; - BufferNode::Allocator _dirty_card_queue_buffer_allocator; G1SATBMarkQueueSet _satb_mark_queue_set; - G1DirtyCardQueueSet _dirty_card_queue_set; + + G1CardTable* _refinement_table; + + public: + G1BarrierSet(G1CardTable* card_table, G1CardTable* refinement_table); + virtual ~G1BarrierSet(); static G1BarrierSet* g1_barrier_set() { return barrier_set_cast(BarrierSet::barrier_set()); } - public: - G1BarrierSet(G1CardTable* table); - ~G1BarrierSet() { } + G1CardTable* refinement_table() const { return _refinement_table; } + + // Swap the global card table references, without synchronization. + void swap_global_card_table(); + + // Update the given thread's card table (byte map) base to the current card table's. + void update_card_table_base(Thread* thread); virtual bool card_mark_must_follow_store() const { return true; @@ -74,9 +107,8 @@ class G1BarrierSet: public CardTableBarrierSet { inline void write_region(MemRegion mr); void write_region(JavaThread* thread, MemRegion mr); - template + template void write_ref_field_post(T* field); - void write_ref_field_post_slow(volatile CardValue* byte); virtual void on_thread_create(Thread* thread); virtual void on_thread_destroy(Thread* thread); @@ -87,9 +119,7 @@ class G1BarrierSet: public CardTableBarrierSet { return g1_barrier_set()->_satb_mark_queue_set; } - static G1DirtyCardQueueSet& dirty_card_queue_set() { - return g1_barrier_set()->_dirty_card_queue_set; - } + virtual void print_on(outputStream* st) const; // Callbacks for runtime accesses. template diff --git a/src/hotspot/share/gc/g1/g1BarrierSet.inline.hpp b/src/hotspot/share/gc/g1/g1BarrierSet.inline.hpp index 9678da190af..0888fc58937 100644 --- a/src/hotspot/share/gc/g1/g1BarrierSet.inline.hpp +++ b/src/hotspot/share/gc/g1/g1BarrierSet.inline.hpp @@ -75,9 +75,8 @@ inline void G1BarrierSet::write_region(MemRegion mr) { template inline void G1BarrierSet::write_ref_field_post(T* field) { volatile CardValue* byte = _card_table->byte_for(field); - if (*byte != G1CardTable::g1_young_card_val()) { - // Take a slow path for cards in old - write_ref_field_post_slow(byte); + if (*byte == G1CardTable::clean_card_val()) { + *byte = G1CardTable::dirty_card_val(); } } @@ -127,7 +126,7 @@ inline void G1BarrierSet::AccessBarrier:: oop_store_not_in_heap(T* addr, oop new_value) { // Apply SATB barriers for all non-heap references, to allow // concurrent scanning of such references. 
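With refinement moved to its own table, the inline write_ref_field_post() above shrinks to a conditional card mark, with no StoreLoad fence and no queue enqueue. A rough standalone sketch of that fast path follows; the names, the 512-byte card size, and the clean = 0xFF / dirty = 0x00 encoding are assumptions for illustration, not the exact HotSpot definitions.

#include <cstdint>

using CardValue = uint8_t;
static const CardValue clean_card = 0xFF;  // "all bits set" clean encoding
static const CardValue dirty_card = 0x00;
static const int log_card_size = 9;        // 512-byte cards, a typical default

// byte_map_base is assumed to be biased so that
// byte_map_base[(uintptr_t)addr >> log_card_size] is the card covering addr.
inline void post_write_barrier(CardValue* byte_map_base, const void* field) {
  CardValue* card = byte_map_base + (reinterpret_cast<uintptr_t>(field) >> log_card_size);
  if (*card == clean_card) {   // conditional card mark avoids redundant stores
    *card = dirty_card;
  }
}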
- G1BarrierSet *bs = barrier_set_cast(BarrierSet::barrier_set()); + G1BarrierSet *bs = g1_barrier_set(); bs->write_ref_field_pre(addr); Raw::oop_store(addr, new_value); } diff --git a/src/hotspot/share/gc/g1/g1BarrierSetRuntime.cpp b/src/hotspot/share/gc/g1/g1BarrierSetRuntime.cpp index 205829bba1a..24ade277afe 100644 --- a/src/hotspot/share/gc/g1/g1BarrierSetRuntime.cpp +++ b/src/hotspot/share/gc/g1/g1BarrierSetRuntime.cpp @@ -29,17 +29,17 @@ #include "utilities/macros.hpp" void G1BarrierSetRuntime::write_ref_array_pre_oop_entry(oop* dst, size_t length) { - G1BarrierSet *bs = barrier_set_cast(BarrierSet::barrier_set()); + G1BarrierSet *bs = G1BarrierSet::g1_barrier_set(); bs->write_ref_array_pre(dst, length, false); } void G1BarrierSetRuntime::write_ref_array_pre_narrow_oop_entry(narrowOop* dst, size_t length) { - G1BarrierSet *bs = barrier_set_cast(BarrierSet::barrier_set()); + G1BarrierSet *bs = G1BarrierSet::g1_barrier_set(); bs->write_ref_array_pre(dst, length, false); } void G1BarrierSetRuntime::write_ref_array_post_entry(HeapWord* dst, size_t length) { - G1BarrierSet *bs = barrier_set_cast(BarrierSet::barrier_set()); + G1BarrierSet *bs = G1BarrierSet::g1_barrier_set(); bs->G1BarrierSet::write_ref_array(dst, length); } @@ -53,14 +53,6 @@ JRT_LEAF(void, G1BarrierSetRuntime::write_ref_field_pre_entry(oopDesc* orig, Jav G1BarrierSet::satb_mark_queue_set().enqueue_known_active(queue, orig); JRT_END -// G1 post write barrier slowpath -JRT_LEAF(void, G1BarrierSetRuntime::write_ref_field_post_entry(volatile G1CardTable::CardValue* card_addr, - JavaThread* thread)) - assert(thread == JavaThread::current(), "pre-condition"); - G1DirtyCardQueue& queue = G1ThreadLocalData::dirty_card_queue(thread); - G1BarrierSet::dirty_card_queue_set().enqueue(queue, card_addr); -JRT_END - JRT_LEAF(void, G1BarrierSetRuntime::clone(oopDesc* src, oopDesc* dst, size_t size)) HeapAccess<>::clone(src, dst, size); JRT_END diff --git a/src/hotspot/share/gc/g1/g1BarrierSetRuntime.hpp b/src/hotspot/share/gc/g1/g1BarrierSetRuntime.hpp index 27287a0624b..ba7bc4d90f4 100644 --- a/src/hotspot/share/gc/g1/g1BarrierSetRuntime.hpp +++ b/src/hotspot/share/gc/g1/g1BarrierSetRuntime.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -47,7 +47,6 @@ public: // C2 slow-path runtime calls. 
static void write_ref_field_pre_entry(oopDesc* orig, JavaThread *thread); - static void write_ref_field_post_entry(volatile CardValue* card_addr, JavaThread* thread); static address clone_addr(); }; diff --git a/src/hotspot/share/gc/g1/g1CardTable.cpp b/src/hotspot/share/gc/g1/g1CardTable.cpp index 303b8cda91f..6df178d49c5 100644 --- a/src/hotspot/share/gc/g1/g1CardTable.cpp +++ b/src/hotspot/share/gc/g1/g1CardTable.cpp @@ -28,18 +28,37 @@ #include "logging/log.hpp" #include "runtime/os.hpp" -void G1CardTable::g1_mark_as_young(const MemRegion& mr) { - CardValue *const first = byte_for(mr.start()); - CardValue *const last = byte_after(mr.last()); +void G1CardTable::verify_region(MemRegion mr, CardValue val, bool val_equals) { + if (mr.is_empty()) { + return; + } + CardValue* start = byte_for(mr.start()); + CardValue* end = byte_for(mr.last()); - memset_with_concurrent_readers(first, g1_young_gen, pointer_delta(last, first, sizeof(CardValue))); -} + G1CollectedHeap* g1h = G1CollectedHeap::heap(); + G1HeapRegion* r = g1h->heap_region_containing(mr.start()); -#ifndef PRODUCT -void G1CardTable::verify_g1_young_region(MemRegion mr) { - verify_region(mr, g1_young_gen, true); + assert(r == g1h->heap_region_containing(mr.last()), "MemRegion crosses region"); + + bool failures = false; + for (CardValue* curr = start; curr <= end; ++curr) { + CardValue curr_val = *curr; + bool failed = (val_equals) ? (curr_val != val) : (curr_val == val); + if (failed) { + if (!failures) { + log_error(gc, verify)("== CT verification failed: [" PTR_FORMAT "," PTR_FORMAT "] r: %d (%s) %sexpecting value: %d", + p2i(start), p2i(end), r->hrm_index(), r->get_short_type_str(), + (val_equals) ? "" : "not ", val); + failures = true; + } + log_error(gc, verify)("== card " PTR_FORMAT " [" PTR_FORMAT "," PTR_FORMAT "], val: %d", + p2i(curr), p2i(addr_for(curr)), + p2i((HeapWord*) (((size_t) addr_for(curr)) + _card_size)), + (int) curr_val); + } + } + guarantee(!failures, "there should not have been any failures"); } -#endif void G1CardTableChangedListener::on_commit(uint start_idx, size_t num_regions, bool zero_filled) { // Default value for a clean card on the card table is -1. So we cannot take advantage of the zero_filled parameter. @@ -74,6 +93,5 @@ void G1CardTable::initialize(G1RegionToSpaceMapper* mapper) { } bool G1CardTable::is_in_young(const void* p) const { - volatile CardValue* card = byte_for(p); - return *card == G1CardTable::g1_young_card_val(); + return G1CollectedHeap::heap()->heap_region_containing(p)->is_young(); } diff --git a/src/hotspot/share/gc/g1/g1CardTable.hpp b/src/hotspot/share/gc/g1/g1CardTable.hpp index 16133029a11..060e5459778 100644 --- a/src/hotspot/share/gc/g1/g1CardTable.hpp +++ b/src/hotspot/share/gc/g1/g1CardTable.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -52,8 +52,6 @@ class G1CardTable : public CardTable { public: enum G1CardValues { - g1_young_gen = CT_MR_BS_last_reserved << 1, - // During evacuation we use the card table to consolidate the cards we need to // scan for roots onto the card table from the various sources. 
Further it is // used to record already completely scanned cards to avoid re-scanning them @@ -63,18 +61,43 @@ public: // The merge at the start of each evacuation round simply sets cards to dirty // that are clean; scanned cards are set to 0x1. // - // This means that the LSB determines what to do with the card during evacuation - // given the following possible values: + // This means that the LSB determines whether the card is clean or non-clean + // (LSB is 1 -> clean, LSB is 0 -> non-clean) given the following possible values: // - // 11111111 - clean, do not scan - // 00000001 - already scanned, do not scan + // xxxxxxx1 - clean, already scanned, do not scan again (during GC only). + // 00000100 - dirty, needs to be scanned, dirty from remembered set (during GC only) + // 00000010 - dirty, needs to be scanned, contains reference to collection set. // 00000000 - dirty, needs to be scanned. // - g1_card_already_scanned = 0x1 + // g1_to_cset_card and g1_from_remset_card are both used for optimization and + // needed for more accurate prediction of card generation rate. + // + // g1_to_cset_card allows to separate dirty card generation rate by the mutator + // (which just dirties cards) from cards that will be scanned during next garbage + // collection anyway. + // Further it allows the optimization to not refine them, assuming that their + // references to young gen does not change, and not add this card to any other + // remembered set. + // This color is sticky during mutator time: refinement threads encountering + // this card on the refinement table will just copy it over to the regular card + // table without re-refining this card. This saves on refinement effort spent + // on that card because most of the time already found interesting references + // stay interesting. + // + // g1_from_remset_card allows separation of cards generated by the mutator from + // cards in the remembered set, again to make mutator dirty card generation + // prediction more accurate. + // + // More accurate prediction allow better (less wasteful) refinement control. + g1_dirty_card = dirty_card, + g1_card_already_scanned = 0x1, + g1_to_cset_card = 0x2, + g1_from_remset_card = 0x4 }; static const size_t WordAllClean = SIZE_MAX; static const size_t WordAllDirty = 0; + static const size_t WordAllFromRemset = (SIZE_MAX / 255) * g1_from_remset_card; STATIC_ASSERT(BitsPerByte == 8); static const size_t WordAlreadyScanned = (SIZE_MAX / 255) * g1_card_already_scanned; @@ -83,27 +106,27 @@ public: _listener.set_card_table(this); } - static CardValue g1_young_card_val() { return g1_young_gen; } static CardValue g1_scanned_card_val() { return g1_card_already_scanned; } - void verify_g1_young_region(MemRegion mr) PRODUCT_RETURN; - void g1_mark_as_young(const MemRegion& mr); + void verify_region(MemRegion mr, CardValue val, bool val_equals) override; size_t index_for_cardvalue(CardValue const* p) const { return pointer_delta(p, _byte_map, sizeof(CardValue)); } - // Mark the given card as Dirty if it is Clean. Returns whether the card was + // Mark the given card as From Remset if it is Clean. Returns whether the card was // Clean before this operation. This result may be inaccurate as it does not // perform the dirtying atomically. - inline bool mark_clean_as_dirty(CardValue* card); + inline bool mark_clean_as_from_remset(CardValue* card); - // Change Clean cards in a (large) area on the card table as Dirty, preserving - // already scanned cards. Assumes that most cards in that area are Clean. 
- inline void mark_range_dirty(size_t start_card_index, size_t num_cards); + // Change Clean cards in a (large) area on the card table as From_Remset, preserving + // cards already marked otherwise. Assumes that most cards in that area are Clean. + // Not atomic. + inline size_t mark_clean_range_as_from_remset(size_t start_card_index, size_t num_cards); - // Change the given range of dirty cards to "which". All of these cards must be Dirty. - inline void change_dirty_cards_to(CardValue* start_card, CardValue* end_card, CardValue which); + // Change the given range of dirty cards to "which". All of these cards must be non-clean. + // Returns the number of pending cards found. + inline size_t change_dirty_cards_to(CardValue* start_card, CardValue* end_card, CardValue which); inline uint region_idx_for(CardValue* p); diff --git a/src/hotspot/share/gc/g1/g1CardTable.inline.hpp b/src/hotspot/share/gc/g1/g1CardTable.inline.hpp index 03bce7d50d7..370dc22ded0 100644 --- a/src/hotspot/share/gc/g1/g1CardTable.inline.hpp +++ b/src/hotspot/share/gc/g1/g1CardTable.inline.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -28,25 +28,39 @@ #include "gc/g1/g1CardTable.hpp" #include "gc/g1/g1HeapRegion.hpp" +#include "utilities/population_count.hpp" inline uint G1CardTable::region_idx_for(CardValue* p) { size_t const card_idx = pointer_delta(p, _byte_map, sizeof(CardValue)); return (uint)(card_idx >> G1HeapRegion::LogCardsPerRegion); } -inline bool G1CardTable::mark_clean_as_dirty(CardValue* card) { +inline bool G1CardTable::mark_clean_as_from_remset(CardValue* card) { CardValue value = *card; if (value == clean_card_val()) { - *card = dirty_card_val(); + *card = g1_from_remset_card; return true; } return false; } -inline void G1CardTable::mark_range_dirty(size_t start_card_index, size_t num_cards) { +// Returns bits from a where mask is 0, and bits from b where mask is 1. +// +// Example: +// a = 0xAAAAAAAA +// b = 0xBBBBBBBB +// mask = 0xFF00FF00 +// result = 0xBBAABBAA +inline size_t blend(size_t a, size_t b, size_t mask) { + return (a & ~mask) | (b & mask); +} + +inline size_t G1CardTable::mark_clean_range_as_from_remset(size_t start_card_index, size_t num_cards) { assert(is_aligned(start_card_index, sizeof(size_t)), "Start card index must be aligned."); assert(is_aligned(num_cards, sizeof(size_t)), "Number of cards to change must be evenly divisible."); + size_t result = 0; + size_t const num_chunks = num_cards / sizeof(size_t); size_t* cur_word = (size_t*)&_byte_map[start_card_index]; @@ -54,31 +68,33 @@ inline void G1CardTable::mark_range_dirty(size_t start_card_index, size_t num_ca while (cur_word < end_word_map) { size_t value = *cur_word; if (value == WordAllClean) { - *cur_word = WordAllDirty; - } else if (value == WordAllDirty) { - // do nothing. + *cur_word = WordAllFromRemset; + result += sizeof(size_t); + } else if ((value & WordAlreadyScanned) == 0) { + // Do nothing if there is no "Clean" card in it. } else { - // There is a mix of cards in there. Tread slowly. - CardValue* cur = (CardValue*)cur_word; - for (size_t i = 0; i < sizeof(size_t); i++) { - CardValue value = *cur; - if (value == clean_card_val()) { - *cur = dirty_card_val(); - } - cur++; - } + // There is a mix of cards in there. Tread "slowly". 
+ size_t clean_card_mask = (value & WordAlreadyScanned) * 0xff; // All "Clean" cards have 0xff, all other places 0x00 now. + result += population_count(clean_card_mask) / BitsPerByte; + *cur_word = blend(value, WordAllFromRemset, clean_card_mask); } cur_word++; } + return result; } -inline void G1CardTable::change_dirty_cards_to(CardValue* start_card, CardValue* end_card, CardValue which) { +inline size_t G1CardTable::change_dirty_cards_to(CardValue* start_card, CardValue* end_card, CardValue which) { + size_t result = 0; for (CardValue* i_card = start_card; i_card < end_card; ++i_card) { CardValue value = *i_card; - assert(value == dirty_card_val(), + assert((value & g1_card_already_scanned) == 0, "Must have been dirty %d start " PTR_FORMAT " " PTR_FORMAT, value, p2i(start_card), p2i(end_card)); + if (value == g1_dirty_card) { + result++; + } *i_card = which; } + return result; } #endif /* SHARE_GC_G1_G1CARDTABLE_INLINE_HPP */ diff --git a/src/hotspot/share/gc/g1/g1CardTableClaimTable.cpp b/src/hotspot/share/gc/g1/g1CardTableClaimTable.cpp new file mode 100644 index 00000000000..e0cadbdd907 --- /dev/null +++ b/src/hotspot/share/gc/g1/g1CardTableClaimTable.cpp @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +#include "gc/g1/g1CardTableClaimTable.inline.hpp" +#include "gc/g1/g1CollectedHeap.inline.hpp" +#include "gc/g1/g1HeapRegion.inline.hpp" +#include "gc/shared/workerThread.hpp" +#include "memory/allocation.hpp" +#include "utilities/checkedCast.hpp" +#include "utilities/powerOfTwo.hpp" + +G1CardTableClaimTable::G1CardTableClaimTable(uint chunks_per_region) : + _max_reserved_regions(0), + _card_claims(nullptr), + _cards_per_chunk(checked_cast(G1HeapRegion::CardsPerRegion / chunks_per_region)) +{ + guarantee(chunks_per_region > 0, "%u chunks per region", chunks_per_region); +} + +G1CardTableClaimTable::~G1CardTableClaimTable() { + FREE_C_HEAP_ARRAY(uint, _card_claims); +} + +void G1CardTableClaimTable::initialize(uint max_reserved_regions) { + assert(_card_claims == nullptr, "Must not be initialized twice"); + _card_claims = NEW_C_HEAP_ARRAY(uint, max_reserved_regions, mtGC); + _max_reserved_regions = max_reserved_regions; + reset_all_to_unclaimed(); +} + +void G1CardTableClaimTable::reset_all_to_unclaimed() { + for (uint i = 0; i < _max_reserved_regions; i++) { + _card_claims[i] = 0; + } +} + +void G1CardTableClaimTable::reset_all_to_claimed() { + for (uint i = 0; i < _max_reserved_regions; i++) { + _card_claims[i] = (uint)G1HeapRegion::CardsPerRegion; + } +} + +void G1CardTableClaimTable::heap_region_iterate_from_worker_offset(G1HeapRegionClosure* cl, uint worker_id, uint max_workers) { + // Every worker will actually look at all regions, skipping over regions that + // are completed. + const size_t n_regions = _max_reserved_regions; + const uint start_index = (uint)(worker_id * n_regions / max_workers); + + for (uint count = 0; count < n_regions; count++) { + const uint index = (start_index + count) % n_regions; + assert(index < n_regions, "sanity"); + // Skip over fully processed regions + if (!has_unclaimed_cards(index)) { + continue; + } + G1HeapRegion* r = G1CollectedHeap::heap()->region_at(index); + bool res = cl->do_heap_region(r); + if (res) { + return; + } + } +} + +G1CardTableChunkClaimer::G1CardTableChunkClaimer(G1CardTableClaimTable* scan_state, uint region_idx) : + _claim_values(scan_state), + _region_idx(region_idx), + _cur_claim(0) { + guarantee(size() <= G1HeapRegion::CardsPerRegion, "Should not claim more space than possible."); +} + +G1ChunkScanner::G1ChunkScanner(CardValue* const start_card, CardValue* const end_card) : + _start_card(start_card), + _end_card(end_card) { + assert(is_word_aligned(start_card), "precondition"); + assert(is_word_aligned(end_card), "precondition"); +} diff --git a/src/hotspot/share/gc/g1/g1CardTableClaimTable.hpp b/src/hotspot/share/gc/g1/g1CardTableClaimTable.hpp new file mode 100644 index 00000000000..4f524b83f97 --- /dev/null +++ b/src/hotspot/share/gc/g1/g1CardTableClaimTable.hpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef SHARE_GC_G1_G1CARDTABLECLAIMTABLE_HPP +#define SHARE_GC_G1_G1CARDTABLECLAIMTABLE_HPP + +#include "gc/g1/g1CardTable.hpp" +#include "memory/allocation.hpp" + +class G1HeapRegionClosure; + +// Helper class representing claim values for the cards in the card table corresponding +// to a region. +// I.e. for every region this class stores an atomic counter that represents the +// number of cards from 0 to the number of cards per region already claimed for +// this region. +// If the claimed value is >= the number of cards of a region, the region can be +// considered fully claimed. +// +// Claiming works on full region (all cards in region) or a range of contiguous cards +// (chunk). Chunk size is given at construction time. +class G1CardTableClaimTable : public CHeapObj { + uint _max_reserved_regions; + + // Card table iteration claim values for every heap region, from 0 (completely unclaimed) + // to (>=) G1HeapRegion::CardsPerRegion (completely claimed). + uint volatile* _card_claims; + + uint _cards_per_chunk; // For conversion between card index and chunk index. + + // Claim increment number of cards, returning the previous claim value. + inline uint claim_cards(uint region, uint increment); + +public: + G1CardTableClaimTable(uint chunks_per_region); + ~G1CardTableClaimTable(); + + // Allocates the data structure and initializes the claims to unclaimed. + void initialize(uint max_reserved_regions); + + void reset_all_to_unclaimed(); + void reset_all_to_claimed(); + + inline bool has_unclaimed_cards(uint region); + inline void reset_to_unclaimed(uint region); + + // Claims all cards in that region, returning the previous claim value. + inline uint claim_all_cards(uint region); + + // Claim a single chunk in that region, returning the previous claim value. + inline uint claim_chunk(uint region); + inline uint cards_per_chunk() const; + + size_t max_reserved_regions() { return _max_reserved_regions; } + + void heap_region_iterate_from_worker_offset(G1HeapRegionClosure* cl, uint worker_id, uint max_workers); +}; + +// Helper class to claim dirty chunks within the card table for a given region. +class G1CardTableChunkClaimer { + G1CardTableClaimTable* _claim_values; + + uint _region_idx; + uint _cur_claim; + +public: + G1CardTableChunkClaimer(G1CardTableClaimTable* claim_table, uint region_idx); + + inline bool has_next(); + + inline uint value() const; + inline uint size() const; +}; + +// Helper class to locate consecutive dirty cards inside a range of cards. 
+class G1ChunkScanner { + using Word = size_t; + using CardValue = G1CardTable::CardValue; + + CardValue* const _start_card; + CardValue* const _end_card; + + static const size_t ExpandedToScanMask = G1CardTable::WordAlreadyScanned; + static const size_t ToScanMask = G1CardTable::g1_card_already_scanned; + + inline bool is_card_dirty(const CardValue* const card) const; + + inline bool is_word_aligned(const void* const addr) const; + + inline CardValue* find_first_dirty_card(CardValue* i_card) const; + inline CardValue* find_first_non_dirty_card(CardValue* i_card) const; + +public: + G1ChunkScanner(CardValue* const start_card, CardValue* const end_card); + + template + void on_dirty_cards(Func&& f) { + for (CardValue* cur_card = _start_card; cur_card < _end_card; /* empty */) { + CardValue* dirty_l = find_first_dirty_card(cur_card); + CardValue* dirty_r = find_first_non_dirty_card(dirty_l); + + assert(dirty_l <= dirty_r, "inv"); + + if (dirty_l == dirty_r) { + assert(dirty_r == _end_card, "finished the entire chunk"); + return; + } + + f(dirty_l, dirty_r); + + cur_card = dirty_r + 1; + } + } +}; + +#endif // SHARE_GC_G1_G1CARDTABLECLAIMTABLE_HPP diff --git a/src/hotspot/share/gc/g1/g1CardTableClaimTable.inline.hpp b/src/hotspot/share/gc/g1/g1CardTableClaimTable.inline.hpp new file mode 100644 index 00000000000..d682f0d17ae --- /dev/null +++ b/src/hotspot/share/gc/g1/g1CardTableClaimTable.inline.hpp @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +#ifndef SHARE_GC_G1_G1CARDTABLECLAIMTABLE_INLINE_HPP +#define SHARE_GC_G1_G1CARDTABLECLAIMTABLE_INLINE_HPP + +#include "gc/g1/g1CardTableClaimTable.hpp" + +#include "gc/g1/g1CollectedHeap.inline.hpp" +#include "gc/g1/g1HeapRegion.inline.hpp" +#include "runtime/atomicAccess.hpp" + +bool G1CardTableClaimTable::has_unclaimed_cards(uint region) { + assert(region < _max_reserved_regions, "Tried to access invalid region %u", region); + return AtomicAccess::load(&_card_claims[region]) < G1HeapRegion::CardsPerRegion; +} + +void G1CardTableClaimTable::reset_to_unclaimed(uint region) { + assert(region < _max_reserved_regions, "Tried to access invalid region %u", region); + AtomicAccess::store(&_card_claims[region], 0u); +} + +uint G1CardTableClaimTable::claim_cards(uint region, uint increment) { + assert(region < _max_reserved_regions, "Tried to access invalid region %u", region); + return AtomicAccess::fetch_then_add(&_card_claims[region], increment, memory_order_relaxed); +} + +uint G1CardTableClaimTable::claim_chunk(uint region) { + assert(region < _max_reserved_regions, "Tried to access invalid region %u", region); + return AtomicAccess::fetch_then_add(&_card_claims[region], cards_per_chunk(), memory_order_relaxed); +} + +uint G1CardTableClaimTable::claim_all_cards(uint region) { + return claim_cards(region, (uint)G1HeapRegion::CardsPerRegion); +} + +uint G1CardTableClaimTable::cards_per_chunk() const { return _cards_per_chunk; } + +bool G1CardTableChunkClaimer::has_next() { + _cur_claim = _claim_values->claim_chunk(_region_idx); + return (_cur_claim < G1HeapRegion::CardsPerRegion); +} + +uint G1CardTableChunkClaimer::value() const { return _cur_claim; } +uint G1CardTableChunkClaimer::size() const { return _claim_values->cards_per_chunk(); } + +bool G1ChunkScanner::is_card_dirty(const CardValue* const card) const { + return (*card & ToScanMask) == 0; +} + +bool G1ChunkScanner::is_word_aligned(const void* const addr) const { + return ((uintptr_t)addr) % sizeof(Word) == 0; +} + +G1CardTable::CardValue* G1ChunkScanner::find_first_dirty_card(CardValue* i_card) const { + while (!is_word_aligned(i_card)) { + if (is_card_dirty(i_card)) { + return i_card; + } + i_card++; + } + + for (/* empty */; i_card < _end_card; i_card += sizeof(Word)) { + Word word_value = *reinterpret_cast(i_card); + bool has_dirty_cards_in_word = (~word_value & ExpandedToScanMask) != 0; + + if (has_dirty_cards_in_word) { + for (uint i = 0; i < sizeof(Word); ++i) { + if (is_card_dirty(i_card)) { + return i_card; + } + i_card++; + } + ShouldNotReachHere(); + } + } + + return _end_card; +} + +G1CardTable::CardValue* G1ChunkScanner::find_first_non_dirty_card(CardValue* i_card) const { + while (!is_word_aligned(i_card)) { + if (!is_card_dirty(i_card)) { + return i_card; + } + i_card++; + } + + for (/* empty */; i_card < _end_card; i_card += sizeof(Word)) { + Word word_value = *reinterpret_cast(i_card); + bool all_cards_dirty = (word_value & ExpandedToScanMask) == 0; + + if (!all_cards_dirty) { + for (uint i = 0; i < sizeof(Word); ++i) { + if (!is_card_dirty(i_card)) { + return i_card; + } + i_card++; + } + ShouldNotReachHere(); + } + } + + return _end_card; +} + +#endif // SHARE_GC_G1_G1CARDTABLECLAIMTABLE_INLINE_HPP diff --git a/src/hotspot/share/gc/g1/g1CollectedHeap.cpp b/src/hotspot/share/gc/g1/g1CollectedHeap.cpp index 4a257265931..ed21c9aa370 100644 --- a/src/hotspot/share/gc/g1/g1CollectedHeap.cpp +++ b/src/hotspot/share/gc/g1/g1CollectedHeap.cpp @@ -38,7 +38,6 @@ #include "gc/g1/g1ConcurrentMarkThread.inline.hpp" 
#include "gc/g1/g1ConcurrentRefine.hpp" #include "gc/g1/g1ConcurrentRefineThread.hpp" -#include "gc/g1/g1DirtyCardQueue.hpp" #include "gc/g1/g1EvacStats.inline.hpp" #include "gc/g1/g1FullCollector.hpp" #include "gc/g1/g1GCCounters.hpp" @@ -60,10 +59,10 @@ #include "gc/g1/g1ParScanThreadState.inline.hpp" #include "gc/g1/g1PeriodicGCTask.hpp" #include "gc/g1/g1Policy.hpp" -#include "gc/g1/g1RedirtyCardsQueue.hpp" #include "gc/g1/g1RegionPinCache.inline.hpp" #include "gc/g1/g1RegionToSpaceMapper.hpp" #include "gc/g1/g1RemSet.hpp" +#include "gc/g1/g1ReviseYoungLengthTask.hpp" #include "gc/g1/g1RootClosures.hpp" #include "gc/g1/g1RootProcessor.hpp" #include "gc/g1/g1SATBMarkQueueSet.hpp" @@ -111,6 +110,7 @@ #include "runtime/init.hpp" #include "runtime/java.hpp" #include "runtime/orderAccess.hpp" +#include "runtime/threads.hpp" #include "runtime/threadSMR.hpp" #include "runtime/vmThread.hpp" #include "utilities/align.hpp" @@ -146,7 +146,7 @@ void G1CollectedHeap::run_batch_task(G1BatchedTask* cl) { workers()->run_task(cl, num_workers); } -uint G1CollectedHeap::get_chunks_per_region() { +uint G1CollectedHeap::get_chunks_per_region_for_scan() { uint log_region_size = G1HeapRegion::LogOfHRGrainBytes; // Limit the expected input values to current known possible values of the // (log) region size. Adjust as necessary after testing if changing the permissible @@ -156,6 +156,18 @@ uint G1CollectedHeap::get_chunks_per_region() { return 1u << (log_region_size / 2 - 4); } +uint G1CollectedHeap::get_chunks_per_region_for_merge() { + uint log_region_size = G1HeapRegion::LogOfHRGrainBytes; + // Limit the expected input values to current known possible values of the + // (log) region size. Adjust as necessary after testing if changing the permissible + // values for region size. + assert(log_region_size >= 20 && log_region_size <= 29, + "expected value in [20,29], but got %u", log_region_size); + + uint half_log_region_size = (log_region_size + 1) / 2; + return 1 << (half_log_region_size - 9); +} + G1HeapRegion* G1CollectedHeap::new_heap_region(uint hrs_index, MemRegion mr) { return new G1HeapRegion(hrs_index, bot(), mr, &_card_set_config); @@ -614,7 +626,6 @@ inline HeapWord* G1CollectedHeap::attempt_allocation(size_t min_word_size, assert_heap_not_locked(); if (result != nullptr) { assert(*actual_word_size != 0, "Actual size must have been set here"); - dirty_young_block(result, *actual_word_size); } else { *actual_word_size = 0; } @@ -809,11 +820,27 @@ void G1CollectedHeap::prepare_for_mutator_after_full_collection(size_t allocatio } void G1CollectedHeap::abort_refinement() { - // Discard all remembered set updates and reset refinement statistics. - G1BarrierSet::dirty_card_queue_set().abandon_logs_and_stats(); - assert(G1BarrierSet::dirty_card_queue_set().num_cards() == 0, - "DCQS should be empty"); - concurrent_refine()->get_and_reset_refinement_stats(); + G1ConcurrentRefineSweepState& sweep_state = concurrent_refine()->sweep_state(); + if (sweep_state.is_in_progress()) { + + if (!sweep_state.are_java_threads_synched()) { + // Synchronize Java threads with global card table that has already been swapped. + class SwapThreadCardTableClosure : public ThreadClosure { + public: + + virtual void do_thread(Thread* t) { + G1BarrierSet* bs = G1BarrierSet::g1_barrier_set(); + bs->update_card_table_base(t); + } + } cl; + Threads::java_threads_do(&cl); + } + + // Record any available refinement statistics. 
+ policy()->record_refinement_stats(sweep_state.stats()); + sweep_state.complete_work(false /* concurrent */, false /* print_log */); + } + sweep_state.reset_stats(); } void G1CollectedHeap::verify_after_full_collection() { @@ -825,6 +852,7 @@ void G1CollectedHeap::verify_after_full_collection() { } _hrm.verify_optional(); _verifier->verify_region_sets_optional(); + _verifier->verify_card_tables_clean(true /* both_card_tables */); _verifier->verify_after_gc(); _verifier->verify_bitmap_clear(false /* above_tams_only */); @@ -1168,8 +1196,13 @@ G1CollectedHeap::G1CollectedHeap() : _service_thread(nullptr), _periodic_gc_task(nullptr), _free_arena_memory_task(nullptr), + _revise_young_length_task(nullptr), _workers(nullptr), - _card_table(nullptr), + _refinement_epoch(0), + _last_synchronized_start(0), + _last_refinement_epoch_start(0), + _yield_duration_in_refinement_epoch(0), + _last_safepoint_refinement_epoch(0), _collection_pause_end(Ticks::now()), _old_set("Old Region Set", new OldRegionSetChecker()), _humongous_set("Humongous Region Set", new HumongousRegionSetChecker()), @@ -1289,7 +1322,7 @@ G1RegionToSpaceMapper* G1CollectedHeap::create_aux_memory_mapper(const char* des jint G1CollectedHeap::initialize_concurrent_refinement() { jint ecode = JNI_OK; - _cr = G1ConcurrentRefine::create(policy(), &ecode); + _cr = G1ConcurrentRefine::create(this, &ecode); return ecode; } @@ -1345,18 +1378,12 @@ jint G1CollectedHeap::initialize() { initialize_reserved_region(heap_rs); // Create the barrier set for the entire reserved region. - G1CardTable* ct = new G1CardTable(_reserved); - G1BarrierSet* bs = new G1BarrierSet(ct); + G1CardTable* card_table = new G1CardTable(_reserved); + G1CardTable* refinement_table = new G1CardTable(_reserved); + + G1BarrierSet* bs = new G1BarrierSet(card_table, refinement_table); bs->initialize(); assert(bs->is_a(BarrierSet::G1BarrierSet), "sanity"); - BarrierSet::set_barrier_set(bs); - _card_table = ct; - - { - G1SATBMarkQueueSet& satbqs = bs->satb_mark_queue_set(); - satbqs.set_process_completed_buffers_threshold(G1SATBProcessCompletedThreshold); - satbqs.set_buffer_enqueue_threshold_percentage(G1SATBBufferEnqueueingThresholdPercent); - } // Create space mappers. size_t page_size = heap_rs.page_size(); @@ -1391,12 +1418,26 @@ jint G1CollectedHeap::initialize() { G1CardTable::compute_size(heap_rs.size() / HeapWordSize), G1CardTable::heap_map_factor()); + G1RegionToSpaceMapper* refinement_cards_storage = + create_aux_memory_mapper("Refinement Card Table", + G1CardTable::compute_size(heap_rs.size() / HeapWordSize), + G1CardTable::heap_map_factor()); + size_t bitmap_size = G1CMBitMap::compute_size(heap_rs.size()); G1RegionToSpaceMapper* bitmap_storage = create_aux_memory_mapper("Mark Bitmap", bitmap_size, G1CMBitMap::heap_map_factor()); - _hrm.initialize(heap_storage, bitmap_storage, bot_storage, cardtable_storage); - _card_table->initialize(cardtable_storage); + _hrm.initialize(heap_storage, bitmap_storage, bot_storage, cardtable_storage, refinement_cards_storage); + card_table->initialize(cardtable_storage); + refinement_table->initialize(refinement_cards_storage); + + BarrierSet::set_barrier_set(bs); + + { + G1SATBMarkQueueSet& satbqs = bs->satb_mark_queue_set(); + satbqs.set_process_completed_buffers_threshold(G1SATBProcessCompletedThreshold); + satbqs.set_buffer_enqueue_threshold_percentage(G1SATBBufferEnqueueingThresholdPercent); + } // 6843694 - ensure that the maximum region index can fit // in the remembered set structures. 
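For reference, the claiming scheme of the G1CardTableClaimTable introduced earlier in this patch reduces to one atomic counter per region that workers advance by a chunk of cards at a time. Below is a simplified standalone model under that assumption; ClaimTable is an illustrative name, not the HotSpot class.

#include <atomic>
#include <cstddef>
#include <cstdint>
#include <vector>

class ClaimTable {
  std::vector<std::atomic<uint32_t>> _claims;   // one claim counter per region
  uint32_t _cards_per_region;
  uint32_t _cards_per_chunk;

public:
  ClaimTable(size_t num_regions, uint32_t cards_per_region, uint32_t chunks_per_region)
    : _claims(num_regions),
      _cards_per_region(cards_per_region),
      _cards_per_chunk(cards_per_region / chunks_per_region) {
    for (auto& c : _claims) { c.store(0, std::memory_order_relaxed); }
  }

  // Returns the previous claim value; the caller owns cards
  // [result, result + cards_per_chunk()) if result < cards_per_region.
  uint32_t claim_chunk(size_t region) {
    return _claims[region].fetch_add(_cards_per_chunk, std::memory_order_relaxed);
  }

  bool has_unclaimed_cards(size_t region) const {
    return _claims[region].load(std::memory_order_relaxed) < _cards_per_region;
  }

  uint32_t cards_per_chunk() const { return _cards_per_chunk; }
};

// Typical worker loop over one region, in the spirit of G1CardTableChunkClaimer::has_next():
//   uint32_t claim;
//   while ((claim = table.claim_chunk(region)) < cards_per_region) {
//     process_cards(region, claim, claim + table.cards_per_chunk());
//   }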
@@ -1408,7 +1449,7 @@ jint G1CollectedHeap::initialize() { guarantee((uintptr_t)(heap_rs.base()) >= G1CardTable::card_size(), "Java heap must not start within the first card."); G1FromCardCache::initialize(max_num_regions()); // Also create a G1 rem set. - _rem_set = new G1RemSet(this, _card_table); + _rem_set = new G1RemSet(this); _rem_set->initialize(max_num_regions()); size_t max_cards_per_region = ((size_t)1 << (sizeof(CardIdx_t)*BitsPerByte-1)) - 1; @@ -1467,6 +1508,11 @@ jint G1CollectedHeap::initialize() { _free_arena_memory_task = new G1MonotonicArenaFreeMemoryTask("Card Set Free Memory Task"); _service_thread->register_task(_free_arena_memory_task); + if (policy()->use_adaptive_young_list_length()) { + _revise_young_length_task = new G1ReviseYoungLengthTask("Revise Young Length List Task"); + _service_thread->register_task(_revise_young_length_task); + } + // Here we allocate the dummy G1HeapRegion that is required by the // G1AllocRegion class. G1HeapRegion* dummy_region = _hrm.get_dummy_region(); @@ -1495,6 +1541,7 @@ jint G1CollectedHeap::initialize() { CPUTimeCounters::create_counter(CPUTimeGroups::CPUTimeType::gc_parallel_workers); CPUTimeCounters::create_counter(CPUTimeGroups::CPUTimeType::gc_conc_mark); CPUTimeCounters::create_counter(CPUTimeGroups::CPUTimeType::gc_conc_refine); + CPUTimeCounters::create_counter(CPUTimeGroups::CPUTimeType::gc_conc_refine_control); CPUTimeCounters::create_counter(CPUTimeGroups::CPUTimeType::gc_service); G1InitLogger::print(); @@ -1519,12 +1566,35 @@ void G1CollectedHeap::stop() { void G1CollectedHeap::safepoint_synchronize_begin() { SuspendibleThreadSet::synchronize(); + + _last_synchronized_start = os::elapsed_counter(); } void G1CollectedHeap::safepoint_synchronize_end() { + jlong now = os::elapsed_counter(); + jlong synchronize_duration = now - _last_synchronized_start; + + if (_last_safepoint_refinement_epoch == _refinement_epoch) { + _yield_duration_in_refinement_epoch += synchronize_duration; + } else { + _last_refinement_epoch_start = now; + _last_safepoint_refinement_epoch = _refinement_epoch; + _yield_duration_in_refinement_epoch = 0; + } + SuspendibleThreadSet::desynchronize(); } +void G1CollectedHeap::set_last_refinement_epoch_start(jlong epoch_start, jlong last_yield_duration) { + _last_refinement_epoch_start = epoch_start; + guarantee(_yield_duration_in_refinement_epoch >= last_yield_duration, "should be"); + _yield_duration_in_refinement_epoch -= last_yield_duration; +} + +jlong G1CollectedHeap::yield_duration_in_refinement_epoch() { + return _yield_duration_in_refinement_epoch; +} + void G1CollectedHeap::post_initialize() { CollectedHeap::post_initialize(); ref_processing_init(); @@ -2336,6 +2406,7 @@ void G1CollectedHeap::gc_epilogue(bool full) { &_collection_set_candidates_card_set_stats); update_perf_counter_cpu_time(); + _refinement_epoch++; } uint G1CollectedHeap::uncommit_regions(uint region_limit) { @@ -2468,7 +2539,6 @@ void G1CollectedHeap::verify_before_young_collection(G1HeapVerifier::G1VerifyTyp Ticks start = Ticks::now(); _verifier->prepare_for_verify(); _verifier->verify_region_sets_optional(); - _verifier->verify_dirty_young_regions(); _verifier->verify_before_gc(); verify_numa_regions("GC Start"); phase_times()->record_verify_before_time_ms((Ticks::now() - start).seconds() * MILLIUNITS); @@ -2734,6 +2804,11 @@ void G1CollectedHeap::free_region(G1HeapRegion* hr, G1FreeRegionList* free_list) if (free_list != nullptr) { free_list->add_ordered(hr); } + if (VerifyDuringGC) { + // Card and refinement table must be 
clear for freed regions. + card_table()->verify_region(MemRegion(hr->bottom(), hr->end()), G1CardTable::clean_card_val(), true); + refinement_table()->verify_region(MemRegion(hr->bottom(), hr->end()), G1CardTable::clean_card_val(), true); + } } void G1CollectedHeap::retain_region(G1HeapRegion* hr) { diff --git a/src/hotspot/share/gc/g1/g1CollectedHeap.hpp b/src/hotspot/share/gc/g1/g1CollectedHeap.hpp index 8d26bcb1c0b..43839cc48d5 100644 --- a/src/hotspot/share/gc/g1/g1CollectedHeap.hpp +++ b/src/hotspot/share/gc/g1/g1CollectedHeap.hpp @@ -75,6 +75,7 @@ class G1GCPhaseTimes; class G1HeapSizingPolicy; class G1NewTracer; class G1RemSet; +class G1ReviseYoungLengthTask; class G1ServiceTask; class G1ServiceThread; class GCMemoryManager; @@ -171,9 +172,23 @@ private: G1ServiceThread* _service_thread; G1ServiceTask* _periodic_gc_task; G1MonotonicArenaFreeMemoryTask* _free_arena_memory_task; + G1ReviseYoungLengthTask* _revise_young_length_task; WorkerThreads* _workers; - G1CardTable* _card_table; + + // The current epoch for refinement, i.e. the number of times the card tables + // have been swapped by a garbage collection. + // Used for detecting whether concurrent refinement has been interrupted by a + // garbage collection. + size_t _refinement_epoch; + + // The following members are for tracking safepoint durations between garbage + // collections. + jlong _last_synchronized_start; + + jlong _last_refinement_epoch_start; + jlong _yield_duration_in_refinement_epoch; // Time spent in safepoints since beginning of last refinement epoch. + size_t _last_safepoint_refinement_epoch; // Refinement epoch before last safepoint. Ticks _collection_pause_end; @@ -541,12 +556,17 @@ public: void run_batch_task(G1BatchedTask* cl); // Return "optimal" number of chunks per region we want to use for claiming areas - // within a region to claim. + // within a region to claim during card table scanning. // The returned value is a trade-off between granularity of work distribution and // memory usage and maintenance costs of that table. // Testing showed that 64 for 1M/2M region, 128 for 4M/8M regions, 256 for 16/32M regions, // and so on seems to be such a good trade-off. - static uint get_chunks_per_region(); + static uint get_chunks_per_region_for_scan(); + // Return "optimal" number of chunks per region we want to use for claiming areas + // within a region to claim during card table merging. + // This is much smaller than for scanning as the merge work is much smaller. + // Currently 1 for 1M regions, 2 for 2/4M regions, 4 for 8/16M regions and so on. + static uint get_chunks_per_region_for_merge(); G1Allocator* allocator() { return _allocator; @@ -687,11 +707,6 @@ public: // Add the given region to the retained regions collection set candidates. void retain_region(G1HeapRegion* hr); - // It dirties the cards that cover the block so that the post - // write barrier never queues anything when updating objects on this - // block. It is assumed (and in fact we assert) that the block - // belongs to a young region. - inline void dirty_young_block(HeapWord* start, size_t word_size); // Frees a humongous region by collapsing it into individual regions // and calling free_region() for each of them. 
The freed regions @@ -905,6 +920,10 @@ public: void safepoint_synchronize_begin() override; void safepoint_synchronize_end() override; + jlong last_refinement_epoch_start() const { return _last_refinement_epoch_start; } + void set_last_refinement_epoch_start(jlong epoch_start, jlong last_yield_duration); + jlong yield_duration_in_refinement_epoch(); + // Does operations required after initialization has been done. void post_initialize() override; @@ -1069,7 +1088,16 @@ public: } G1CardTable* card_table() const { - return _card_table; + return static_cast(G1BarrierSet::g1_barrier_set()->card_table()); + } + + G1CardTable* refinement_table() const { + return G1BarrierSet::g1_barrier_set()->refinement_table(); + } + + G1CardTable::CardValue* card_table_base() const { + assert(card_table() != nullptr, "must be"); + return card_table()->byte_map_base(); } // Iteration functions. diff --git a/src/hotspot/share/gc/g1/g1CollectedHeap.inline.hpp b/src/hotspot/share/gc/g1/g1CollectedHeap.inline.hpp index 3370ff9938f..fdc8585dbc0 100644 --- a/src/hotspot/share/gc/g1/g1CollectedHeap.inline.hpp +++ b/src/hotspot/share/gc/g1/g1CollectedHeap.inline.hpp @@ -149,30 +149,6 @@ inline void G1CollectedHeap::old_set_remove(G1HeapRegion* hr) { _old_set.remove(hr); } -// It dirties the cards that cover the block so that the post -// write barrier never queues anything when updating objects on this -// block. It is assumed (and in fact we assert) that the block -// belongs to a young region. -inline void -G1CollectedHeap::dirty_young_block(HeapWord* start, size_t word_size) { - assert_heap_not_locked(); - - // Assign the containing region to containing_hr so that we don't - // have to keep calling heap_region_containing() in the - // asserts below. - DEBUG_ONLY(G1HeapRegion* containing_hr = heap_region_containing(start);) - assert(word_size > 0, "pre-condition"); - assert(containing_hr->is_in(start), "it should contain start"); - assert(containing_hr->is_young(), "it should be young"); - assert(!containing_hr->is_humongous(), "it should not be humongous"); - - HeapWord* end = start + word_size; - assert(containing_hr->is_in(end - 1), "it should also contain end - 1"); - - MemRegion mr(start, end); - card_table()->g1_mark_as_young(mr); -} - inline G1ScannerTasksQueueSet* G1CollectedHeap::task_queues() const { return _task_queues; } diff --git a/src/hotspot/share/gc/g1/g1CollectionSet.cpp b/src/hotspot/share/gc/g1/g1CollectionSet.cpp index d501ee5b47b..abfb620d626 100644 --- a/src/hotspot/share/gc/g1/g1CollectionSet.cpp +++ b/src/hotspot/share/gc/g1/g1CollectionSet.cpp @@ -308,7 +308,8 @@ double G1CollectionSet::finalize_young_part(double target_pause_time_ms, G1Survi guarantee(target_pause_time_ms > 0.0, "target_pause_time_ms = %1.6lf should be positive", target_pause_time_ms); - size_t pending_cards = _policy->pending_cards_at_gc_start(); + bool in_young_only_phase = _policy->collector_state()->in_young_only_phase(); + size_t pending_cards = _policy->analytics()->predict_pending_cards(in_young_only_phase); log_trace(gc, ergo, cset)("Start choosing CSet. 
Pending cards: %zu target pause time: %1.2fms", pending_cards, target_pause_time_ms); @@ -323,10 +324,8 @@ double G1CollectionSet::finalize_young_part(double target_pause_time_ms, G1Survi verify_young_cset_indices(); - size_t num_young_cards = _g1h->young_regions_cardset()->occupied(); - _policy->record_card_rs_length(num_young_cards); - - double predicted_base_time_ms = _policy->predict_base_time_ms(pending_cards, num_young_cards); + size_t card_rs_length = _policy->analytics()->predict_card_rs_length(in_young_only_phase); + double predicted_base_time_ms = _policy->predict_base_time_ms(pending_cards, card_rs_length); // Base time already includes the whole remembered set related time, so do not add that here // again. double predicted_eden_time = _policy->predict_young_region_other_time_ms(eden_region_length) + diff --git a/src/hotspot/share/gc/g1/g1ConcurrentMark.cpp b/src/hotspot/share/gc/g1/g1ConcurrentMark.cpp index e52d380e26b..97386cb9720 100644 --- a/src/hotspot/share/gc/g1/g1ConcurrentMark.cpp +++ b/src/hotspot/share/gc/g1/g1ConcurrentMark.cpp @@ -27,6 +27,7 @@ #include "gc/g1/g1BarrierSet.hpp" #include "gc/g1/g1BatchedTask.hpp" #include "gc/g1/g1CardSetMemory.hpp" +#include "gc/g1/g1CardTableClaimTable.inline.hpp" #include "gc/g1/g1CollectedHeap.inline.hpp" #include "gc/g1/g1CollectionSetChooser.hpp" #include "gc/g1/g1CollectorState.hpp" @@ -34,7 +35,7 @@ #include "gc/g1/g1ConcurrentMarkRemarkTasks.hpp" #include "gc/g1/g1ConcurrentMarkThread.inline.hpp" #include "gc/g1/g1ConcurrentRebuildAndScrub.hpp" -#include "gc/g1/g1DirtyCardQueue.hpp" +#include "gc/g1/g1ConcurrentRefine.hpp" #include "gc/g1/g1HeapRegion.inline.hpp" #include "gc/g1/g1HeapRegionManager.hpp" #include "gc/g1/g1HeapRegionPrinter.hpp" @@ -483,7 +484,7 @@ G1ConcurrentMark::G1ConcurrentMark(G1CollectedHeap* g1h, // _finger set in set_non_marking_state - _worker_id_offset(G1DirtyCardQueueSet::num_par_ids() + G1ConcRefinementThreads), + _worker_id_offset(G1ConcRefinementThreads), // The refinement control thread does not refine cards, so it's just the worker threads. _max_num_tasks(MAX2(ConcGCThreads, ParallelGCThreads)), // _num_active_tasks set in set_non_marking_state() // _tasks set inside the constructor @@ -1141,7 +1142,7 @@ void G1ConcurrentMark::mark_from_roots() { // worker threads may currently exist and more may not be // available. active_workers = _concurrent_workers->set_active_workers(active_workers); - log_info(gc, task)("Using %u workers of %u for marking", active_workers, _concurrent_workers->max_workers()); + log_info(gc, task)("Concurrent Mark Using %u of %u Workers", active_workers, _concurrent_workers->max_workers()); _num_concurrent_workers = active_workers; diff --git a/src/hotspot/share/gc/g1/g1ConcurrentMark.hpp b/src/hotspot/share/gc/g1/g1ConcurrentMark.hpp index 4977da4729d..752082ce629 100644 --- a/src/hotspot/share/gc/g1/g1ConcurrentMark.hpp +++ b/src/hotspot/share/gc/g1/g1ConcurrentMark.hpp @@ -580,6 +580,8 @@ public: // TARS for the given region during remembered set rebuilding. inline HeapWord* top_at_rebuild_start(G1HeapRegion* r) const; + uint worker_id_offset() const { return _worker_id_offset; } + // Clear statistics gathered during the concurrent cycle for the given region after // it has been reclaimed. 
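The get_chunks_per_region_for_scan() / get_chunks_per_region_for_merge() declarations in the g1CollectedHeap.hpp hunk above describe their sizing rule only in comments. The sketch below spells that rule out; the helper names and the region-size-in-megabytes parameter are illustrative, not the actual implementation, which works from G1HeapRegion::GrainBytes.

#include <cstddef>

// Illustrative-only restatement of the chunk sizing comments above:
// scanning uses 64 chunks for 1M/2M regions, 128 for 4M/8M, 256 for 16M/32M;
// merging uses 1 chunk for 1M regions, 2 for 2M/4M, 4 for 8M/16M, and so on.
static unsigned chunks_per_region_for_scan(size_t region_size_mb) {
  unsigned chunks = 64;                       // 1M and 2M regions
  for (size_t sz = 4; sz <= region_size_mb; sz *= 4) {
    chunks *= 2;                              // doubles every second power of two
  }
  return chunks;
}

static unsigned chunks_per_region_for_merge(size_t region_size_mb) {
  unsigned chunks = 1;                        // 1M regions
  for (size_t sz = 2; sz <= region_size_mb; sz *= 4) {
    chunks *= 2;
  }
  return chunks;
}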
void clear_statistics(G1HeapRegion* r); diff --git a/src/hotspot/share/gc/g1/g1ConcurrentMarkRemarkTasks.cpp b/src/hotspot/share/gc/g1/g1ConcurrentMarkRemarkTasks.cpp index 02afc443d68..fdef4214622 100644 --- a/src/hotspot/share/gc/g1/g1ConcurrentMarkRemarkTasks.cpp +++ b/src/hotspot/share/gc/g1/g1ConcurrentMarkRemarkTasks.cpp @@ -25,6 +25,7 @@ #include "gc/g1/g1CollectedHeap.inline.hpp" #include "gc/g1/g1ConcurrentMark.inline.hpp" #include "gc/g1/g1ConcurrentMarkRemarkTasks.hpp" +#include "gc/g1/g1ConcurrentRefine.hpp" #include "gc/g1/g1HeapRegion.inline.hpp" #include "gc/g1/g1HeapRegionPrinter.hpp" #include "gc/g1/g1RemSetTrackingPolicy.hpp" @@ -54,15 +55,16 @@ struct G1UpdateRegionLivenessAndSelectForRebuildTask::G1OnRegionClosure : public _num_humongous_regions_removed(0), _local_cleanup_list(local_cleanup_list) {} - void reclaim_empty_region(G1HeapRegion* hr) { + void reclaim_empty_region_common(G1HeapRegion* hr) { assert(!hr->has_pinned_objects(), "precondition"); assert(hr->used() > 0, "precondition"); _freed_bytes += hr->used(); hr->set_containing_set(nullptr); - hr->clear_cardtable(); + hr->clear_both_card_tables(); _cm->clear_statistics(hr); G1HeapRegionPrinter::mark_reclaim(hr); + _g1h->concurrent_refine()->notify_region_reclaimed(hr); } void reclaim_empty_humongous_region(G1HeapRegion* hr) { @@ -71,8 +73,8 @@ struct G1UpdateRegionLivenessAndSelectForRebuildTask::G1OnRegionClosure : public auto on_humongous_region = [&] (G1HeapRegion* hr) { assert(hr->is_humongous(), "precondition"); - reclaim_empty_region(hr); _num_humongous_regions_removed++; + reclaim_empty_region_common(hr); _g1h->free_humongous_region(hr, _local_cleanup_list); }; @@ -82,8 +84,8 @@ struct G1UpdateRegionLivenessAndSelectForRebuildTask::G1OnRegionClosure : public void reclaim_empty_old_region(G1HeapRegion* hr) { assert(hr->is_old(), "precondition"); - reclaim_empty_region(hr); _num_old_regions_removed++; + reclaim_empty_region_common(hr); _g1h->free_region(hr, _local_cleanup_list); } diff --git a/src/hotspot/share/gc/g1/g1ConcurrentRebuildAndScrub.cpp b/src/hotspot/share/gc/g1/g1ConcurrentRebuildAndScrub.cpp index 0633e18411d..cd560a41333 100644 --- a/src/hotspot/share/gc/g1/g1ConcurrentRebuildAndScrub.cpp +++ b/src/hotspot/share/gc/g1/g1ConcurrentRebuildAndScrub.cpp @@ -245,7 +245,7 @@ class G1RebuildRSAndScrubTask : public WorkerTask { G1RebuildRSAndScrubRegionClosure(G1ConcurrentMark* cm, bool should_rebuild_remset, uint worker_id) : _cm(cm), _bitmap(_cm->mark_bitmap()), - _rebuild_closure(G1CollectedHeap::heap(), worker_id), + _rebuild_closure(G1CollectedHeap::heap(), worker_id + cm->worker_id_offset()), _should_rebuild_remset(should_rebuild_remset), _processed_words(0) { } diff --git a/src/hotspot/share/gc/g1/g1ConcurrentRefine.cpp b/src/hotspot/share/gc/g1/g1ConcurrentRefine.cpp index 84776b7a4b1..ed6a9ad4292 100644 --- a/src/hotspot/share/gc/g1/g1ConcurrentRefine.cpp +++ b/src/hotspot/share/gc/g1/g1ConcurrentRefine.cpp @@ -22,15 +22,20 @@ * */ +#include "gc/g1/g1Analytics.hpp" #include "gc/g1/g1BarrierSet.hpp" +#include "gc/g1/g1CardTableClaimTable.inline.hpp" +#include "gc/g1/g1CollectedHeap.inline.hpp" #include "gc/g1/g1CollectionSet.hpp" #include "gc/g1/g1ConcurrentRefine.hpp" +#include "gc/g1/g1ConcurrentRefineSweepTask.hpp" #include "gc/g1/g1ConcurrentRefineThread.hpp" -#include "gc/g1/g1DirtyCardQueue.hpp" #include "gc/g1/g1HeapRegion.inline.hpp" #include "gc/g1/g1HeapRegionRemSet.inline.hpp" #include "gc/g1/g1Policy.hpp" #include "gc/shared/gc_globals.hpp" +#include 
"gc/shared/gcTraceTime.inline.hpp" +#include "gc/shared/workerThread.hpp" #include "logging/log.hpp" #include "memory/allocation.inline.hpp" #include "memory/iterator.hpp" @@ -38,17 +43,15 @@ #include "runtime/mutexLocker.hpp" #include "utilities/debug.hpp" #include "utilities/globalDefinitions.hpp" +#include "utilities/ticks.hpp" #include -G1ConcurrentRefineThread* G1ConcurrentRefineThreadControl::create_refinement_thread(uint worker_id, bool initializing) { +G1ConcurrentRefineThread* G1ConcurrentRefineThreadControl::create_refinement_thread() { G1ConcurrentRefineThread* result = nullptr; - if (initializing || !InjectGCWorkerCreationFailure) { - result = G1ConcurrentRefineThread::create(_cr, worker_id); - } + result = G1ConcurrentRefineThread::create(_cr); if (result == nullptr || result->osthread() == nullptr) { - log_warning(gc)("Failed to create refinement thread %u, no more %s", - worker_id, + log_warning(gc)("Failed to create refinement control thread, no more %s", result == nullptr ? "memory" : "OS threads"); if (result != nullptr) { delete result; @@ -60,106 +63,392 @@ G1ConcurrentRefineThread* G1ConcurrentRefineThreadControl::create_refinement_thr G1ConcurrentRefineThreadControl::G1ConcurrentRefineThreadControl(uint max_num_threads) : _cr(nullptr), - _threads(max_num_threads) + _control_thread(nullptr), + _workers(nullptr), + _max_num_threads(max_num_threads) {} G1ConcurrentRefineThreadControl::~G1ConcurrentRefineThreadControl() { - while (_threads.is_nonempty()) { - delete _threads.pop(); - } -} - -bool G1ConcurrentRefineThreadControl::ensure_threads_created(uint worker_id, bool initializing) { - assert(worker_id < max_num_threads(), "precondition"); - - while ((uint)_threads.length() <= worker_id) { - G1ConcurrentRefineThread* rt = create_refinement_thread(_threads.length(), initializing); - if (rt == nullptr) { - return false; - } - _threads.push(rt); - } - - return true; + delete _control_thread; + delete _workers; } jint G1ConcurrentRefineThreadControl::initialize(G1ConcurrentRefine* cr) { assert(cr != nullptr, "G1ConcurrentRefine must not be null"); _cr = cr; - if (max_num_threads() > 0) { - _threads.push(create_refinement_thread(0, true)); - if (_threads.at(0) == nullptr) { - vm_shutdown_during_initialization("Could not allocate primary refinement thread"); + if (is_refinement_enabled()) { + _control_thread = create_refinement_thread(); + if (_control_thread == nullptr) { + vm_shutdown_during_initialization("Could not allocate refinement control thread"); return JNI_ENOMEM; } - - if (!UseDynamicNumberOfGCThreads) { - if (!ensure_threads_created(max_num_threads() - 1, true)) { - vm_shutdown_during_initialization("Could not allocate refinement threads"); - return JNI_ENOMEM; - } - } + _workers = new WorkerThreads("G1 Refinement Workers", max_num_threads()); + _workers->initialize_workers(); } - return JNI_OK; } #ifdef ASSERT -void G1ConcurrentRefineThreadControl::assert_current_thread_is_primary_refinement_thread() const { - assert(Thread::current() == _threads.at(0), "Not primary thread"); +void G1ConcurrentRefineThreadControl::assert_current_thread_is_control_refinement_thread() const { + assert(Thread::current() == _control_thread, "Not refinement control thread"); } #endif // ASSERT -bool G1ConcurrentRefineThreadControl::activate(uint worker_id) { - if (ensure_threads_created(worker_id, false)) { - _threads.at(worker_id)->activate(); - return true; - } +void G1ConcurrentRefineThreadControl::activate() { + _control_thread->activate(); +} - return false; +void 
G1ConcurrentRefineThreadControl::run_task(WorkerTask* task, uint num_workers) { + assert(num_workers >= 1, "must be"); + + WithActiveWorkers w(_workers, num_workers); + _workers->run_task(task); +} + +void G1ConcurrentRefineThreadControl::control_thread_do(ThreadClosure* tc) { + if (is_refinement_enabled()) { + tc->do_thread(_control_thread); + } } void G1ConcurrentRefineThreadControl::worker_threads_do(ThreadClosure* tc) { - for (G1ConcurrentRefineThread* t : _threads) { - tc->do_thread(t); + if (is_refinement_enabled()) { + _workers->threads_do(tc); } } void G1ConcurrentRefineThreadControl::stop() { - for (G1ConcurrentRefineThread* t : _threads) { - t->stop(); + if (is_refinement_enabled()) { + _control_thread->stop(); } } +G1ConcurrentRefineSweepState::G1ConcurrentRefineSweepState(uint max_reserved_regions) : + _state(State::Idle), + _sweep_table(new G1CardTableClaimTable(G1CollectedHeap::get_chunks_per_region_for_merge())), + _stats() +{ + _sweep_table->initialize(max_reserved_regions); +} + +G1ConcurrentRefineSweepState::~G1ConcurrentRefineSweepState() { + delete _sweep_table; +} + +void G1ConcurrentRefineSweepState::set_state_start_time() { + _state_start[static_cast(_state)] = Ticks::now(); +} + +Tickspan G1ConcurrentRefineSweepState::get_duration(State start, State end) { + return _state_start[static_cast(end)] - _state_start[static_cast(start)]; +} + +void G1ConcurrentRefineSweepState::reset_stats() { + stats()->reset(); +} + +void G1ConcurrentRefineSweepState::add_yield_during_sweep_duration(jlong duration) { + stats()->inc_yield_during_sweep_duration(duration); +} + +bool G1ConcurrentRefineSweepState::advance_state(State next_state) { + bool result = is_in_progress(); + if (result) { + _state = next_state; + } else { + _state = State::Idle; + } + return result; +} + +void G1ConcurrentRefineSweepState::assert_state(State expected) { + assert(_state == expected, "must be %s but is %s", state_name(expected), state_name(_state)); +} + +void G1ConcurrentRefineSweepState::start_work() { + assert_state(State::Idle); + + set_state_start_time(); + + _stats.reset(); + + _state = State::SwapGlobalCT; +} + +bool G1ConcurrentRefineSweepState::swap_global_card_table() { + assert_state(State::SwapGlobalCT); + + GCTraceTime(Info, gc, refine) tm("Concurrent Refine Global Card Table Swap"); + set_state_start_time(); + + { + // We can't have any new threads being in the process of created while we + // swap the card table because we read the current card table state during + // initialization. + // A safepoint may occur during that time, so leave the STS temporarily. + SuspendibleThreadSetLeaver sts_leave; + + MutexLocker mu(Threads_lock); + // A GC that advanced the epoch might have happened, which already switched + // The global card table. Do nothing. + if (is_in_progress()) { + G1BarrierSet::g1_barrier_set()->swap_global_card_table(); + } + } + + return advance_state(State::SwapJavaThreadsCT); +} + +bool G1ConcurrentRefineSweepState::swap_java_threads_ct() { + assert_state(State::SwapJavaThreadsCT); + + GCTraceTime(Info, gc, refine) tm("Concurrent Refine Java Thread CT swap"); + + set_state_start_time(); + + { + // Need to leave the STS to avoid potential deadlock in the handshake. 
+ SuspendibleThreadSetLeaver sts; + + class G1SwapThreadCardTableClosure : public HandshakeClosure { + public: + G1SwapThreadCardTableClosure() : HandshakeClosure("G1 Java Thread CT swap") { } + + virtual void do_thread(Thread* thread) { + G1BarrierSet* bs = G1BarrierSet::g1_barrier_set(); + bs->update_card_table_base(thread); + } + } cl; + Handshake::execute(&cl); + } + + return advance_state(State::SynchronizeGCThreads); + } + +bool G1ConcurrentRefineSweepState::swap_gc_threads_ct() { + assert_state(State::SynchronizeGCThreads); + + GCTraceTime(Info, gc, refine) tm("Concurrent Refine GC Thread CT swap"); + + set_state_start_time(); + + { + class RendezvousGCThreads: public VM_Operation { + public: + VMOp_Type type() const { return VMOp_G1RendezvousGCThreads; } + + virtual bool evaluate_at_safepoint() const { + // We only care about synchronizing the GC threads. + // Leave the Java threads running. + return false; + } + + virtual bool skip_thread_oop_barriers() const { + fatal("Concurrent VMOps should not call this"); + return true; + } + + void doit() { + // Light weight "handshake" of the GC threads for memory synchronization; + // both changes to the Java heap need to be synchronized as well as the + // previous global card table reference change, so that no GC thread + // accesses the wrong card table. + // For example in the rebuild remset process the marking threads write + // marks into the card table, and that card table reference must be the + // correct one. + SuspendibleThreadSet::synchronize(); + SuspendibleThreadSet::desynchronize(); + }; + } op; + + SuspendibleThreadSetLeaver sts_leave; + VMThread::execute(&op); + } + + return advance_state(State::SnapshotHeap); +} + +void G1ConcurrentRefineSweepState::snapshot_heap(bool concurrent) { + if (concurrent) { + GCTraceTime(Info, gc, refine) tm("Concurrent Refine Snapshot Heap"); + + assert_state(State::SnapshotHeap); + + set_state_start_time(); + + snapshot_heap_inner(); + + advance_state(State::SweepRT); + } else { + assert_state(State::Idle); + assert_at_safepoint(); + + snapshot_heap_inner(); + } +} + +void G1ConcurrentRefineSweepState::sweep_refinement_table_start() { + assert_state(State::SweepRT); + + set_state_start_time(); +} + +bool G1ConcurrentRefineSweepState::sweep_refinement_table_step() { + assert_state(State::SweepRT); + + GCTraceTime(Info, gc, refine) tm("Concurrent Refine Table Step"); + + G1ConcurrentRefine* cr = G1CollectedHeap::heap()->concurrent_refine(); + + G1ConcurrentRefineSweepTask task(_sweep_table, &_stats, cr->num_threads_wanted()); + cr->run_with_refinement_workers(&task); + + if (task.sweep_completed()) { + advance_state(State::CompleteRefineWork); + return true; + } else { + return false; + } +} + +bool G1ConcurrentRefineSweepState::complete_work(bool concurrent, bool print_log) { + if (concurrent) { + assert_state(State::CompleteRefineWork); + } else { + // May have been forced to complete at any other time. 
+ assert(is_in_progress() && _state != State::CompleteRefineWork, "must be but is %s", state_name(_state)); + } + + set_state_start_time(); + + if (print_log) { + G1ConcurrentRefineStats* s = &_stats; + + log_debug(gc, refine)("Refinement took %.2fms (pre-sweep %.2fms card refine %.2f) " + "(scanned %zu clean %zu (%.2f%%) not_clean %zu (%.2f%%) not_parsable %zu " + "refers_to_cset %zu (%.2f%%) still_refers_to_cset %zu (%.2f%%) no_cross_region %zu pending %zu)", + get_duration(State::Idle, _state).seconds() * 1000.0, + get_duration(State::Idle, State::SweepRT).seconds() * 1000.0, + TimeHelper::counter_to_millis(s->refine_duration()), + s->cards_scanned(), + s->cards_clean(), + percent_of(s->cards_clean(), s->cards_scanned()), + s->cards_not_clean(), + percent_of(s->cards_not_clean(), s->cards_scanned()), + s->cards_not_parsable(), + s->cards_refer_to_cset(), + percent_of(s->cards_refer_to_cset(), s->cards_not_clean()), + s->cards_already_refer_to_cset(), + percent_of(s->cards_already_refer_to_cset(), s->cards_not_clean()), + s->cards_no_cross_region(), + s->cards_pending() + ); + } + + bool has_sweep_rt_work = _state == State::SweepRT; + + advance_state(State::Idle); + return has_sweep_rt_work; +} + +void G1ConcurrentRefineSweepState::snapshot_heap_inner() { + // G1CollectedHeap::heap_region_iterate() below will only visit currently committed + // regions. Initialize all entries in the state table here and later in this method + // selectively enable regions that we are interested. This way regions committed + // later will be automatically excluded from iteration. + // Their refinement table must be completely empty anyway. + _sweep_table->reset_all_to_claimed(); + + class SnapshotRegionsClosure : public G1HeapRegionClosure { + G1CardTableClaimTable* _sweep_table; + + public: + SnapshotRegionsClosure(G1CardTableClaimTable* sweep_table) : G1HeapRegionClosure(), _sweep_table(sweep_table) { } + + bool do_heap_region(G1HeapRegion* r) override { + if (!r->is_free()) { + // Need to scan all parts of non-free regions, so reset the claim. + // No need for synchronization: we are only interested in regions + // that were allocated before the handshake; the handshake makes such + // regions' metadata visible to all threads, and we do not care about + // humongous regions that were allocated afterwards. + _sweep_table->reset_to_unclaimed(r->hrm_index()); + } + return false; + } + } cl(_sweep_table); + G1CollectedHeap::heap()->heap_region_iterate(&cl); +} + +bool G1ConcurrentRefineSweepState::is_in_progress() const { + return _state != State::Idle; +} + +bool G1ConcurrentRefineSweepState::are_java_threads_synched() const { + return _state > State::SwapJavaThreadsCT || !is_in_progress(); +} + uint64_t G1ConcurrentRefine::adjust_threads_period_ms() const { // Instead of a fixed value, this could be a command line option. But then // we might also want to allow configuration of adjust_threads_wait_ms(). - return 50; + + // Use a prime number close to 50ms, different to other components that derive + // their wait time from the try_get_available_bytes_estimate() call to minimize + // interference. + return 53; } static size_t minimum_pending_cards_target() { - // One buffer per thread. 
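snapshot_heap_inner() above relies on three claim-table operations of G1CardTableClaimTable: start with every region fully claimed, re-open only the regions that existed and were non-free at snapshot time, and close a region again if it is reclaimed while refinement is running. The sketch below is a schematic model of those operations; the vector-of-cursors layout is an assumption for illustration, not the actual G1CardTableClaimTable implementation.

#include <algorithm>
#include <cstddef>
#include <vector>

// Schematic claim-table model (assumed layout, for illustration only).
class ClaimTableSketch {
  size_t _chunks_per_region;
  std::vector<size_t> _claim;   // claim cursor per region; == _chunks_per_region means fully claimed

public:
  ClaimTableSketch(size_t num_regions, size_t chunks_per_region)
    : _chunks_per_region(chunks_per_region),
      _claim(num_regions, chunks_per_region) {}

  // Mark every region as already claimed; regions committed after the snapshot
  // keep this state and are therefore skipped by the sweep.
  void reset_all_to_claimed() {
    std::fill(_claim.begin(), _claim.end(), _chunks_per_region);
  }

  // Re-open a region that existed (and was not free) at snapshot time.
  void reset_to_unclaimed(size_t region) { _claim[region] = 0; }

  // A region reclaimed during refinement is closed again; see
  // notify_region_reclaimed() earlier in this file.
  void claim_all_cards(size_t region) { _claim[region] = _chunks_per_region; }
};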
- return ParallelGCThreads * G1UpdateBufferSize; + return ParallelGCThreads * G1PerThreadPendingCardThreshold; } -G1ConcurrentRefine::G1ConcurrentRefine(G1Policy* policy) : - _policy(policy), - _threads_wanted(0), +G1ConcurrentRefine::G1ConcurrentRefine(G1CollectedHeap* g1h) : + _policy(g1h->policy()), + _num_threads_wanted(0), _pending_cards_target(PendingCardsTargetUninitialized), _last_adjust(), _needs_adjust(false), - _threads_needed(policy, adjust_threads_period_ms()), + _heap_was_locked(false), + _threads_needed(g1h->policy(), adjust_threads_period_ms()), _thread_control(G1ConcRefinementThreads), - _dcqs(G1BarrierSet::dirty_card_queue_set()) -{} + _sweep_state(g1h->max_num_regions()) +{ } jint G1ConcurrentRefine::initialize() { return _thread_control.initialize(this); } -G1ConcurrentRefine* G1ConcurrentRefine::create(G1Policy* policy, jint* ecode) { - G1ConcurrentRefine* cr = new G1ConcurrentRefine(policy); +G1ConcurrentRefineSweepState& G1ConcurrentRefine::sweep_state_for_merge() { + bool has_sweep_claims = sweep_state().complete_work(false /* concurrent */); + if (has_sweep_claims) { + log_debug(gc, refine)("Continue existing work"); + } else { + // Refinement has been interrupted without having a snapshot. There may + // be a mix of already swapped and not-swapped card tables assigned to threads, + // so they might have already dirtied the swapped card tables. + // Conservatively scan all (non-free, non-committed) region's card tables, + // creating the snapshot right now. + log_debug(gc, refine)("Create work from scratch"); + + sweep_state().snapshot_heap(false /* concurrent */); + } + return sweep_state(); +} + +void G1ConcurrentRefine::run_with_refinement_workers(WorkerTask* task) { + _thread_control.run_task(task, num_threads_wanted()); +} + +void G1ConcurrentRefine::notify_region_reclaimed(G1HeapRegion* r) { + assert_at_safepoint(); + if (_sweep_state.is_in_progress()) { + _sweep_state.sweep_table()->claim_all_cards(r->hrm_index()); + } +} + +G1ConcurrentRefine* G1ConcurrentRefine::create(G1CollectedHeap* g1h, jint* ecode) { + G1ConcurrentRefine* cr = new G1ConcurrentRefine(g1h); *ecode = cr->initialize(); if (*ecode != 0) { delete cr; @@ -176,25 +465,31 @@ G1ConcurrentRefine::~G1ConcurrentRefine() { } void G1ConcurrentRefine::threads_do(ThreadClosure *tc) { + worker_threads_do(tc); + control_thread_do(tc); +} + +void G1ConcurrentRefine::worker_threads_do(ThreadClosure *tc) { _thread_control.worker_threads_do(tc); } -void G1ConcurrentRefine::update_pending_cards_target(double logged_cards_time_ms, - size_t processed_logged_cards, - size_t predicted_thread_buffer_cards, +void G1ConcurrentRefine::control_thread_do(ThreadClosure *tc) { + _thread_control.control_thread_do(tc); +} + +void G1ConcurrentRefine::update_pending_cards_target(double pending_cards_time_ms, + size_t processed_pending_cards, double goal_ms) { size_t minimum = minimum_pending_cards_target(); - if ((processed_logged_cards < minimum) || (logged_cards_time_ms == 0.0)) { - log_debug(gc, ergo, refine)("Unchanged pending cards target: %zu", - _pending_cards_target); + if ((processed_pending_cards < minimum) || (pending_cards_time_ms == 0.0)) { + log_debug(gc, ergo, refine)("Unchanged pending cards target: %zu (processed %zu minimum %zu time %1.2f)", + _pending_cards_target, processed_pending_cards, minimum, pending_cards_time_ms); return; } // Base the pending cards budget on the measured rate. 
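// Worked example for the computation below (numbers are invented; only the
// arithmetic mirrors this hunk): suppose the last cycle refined
// processed_pending_cards = 20000 cards in pending_cards_time_ms = 10.0 ms,
// so rate = 20000 / 10.0 = 2000 cards/ms. With goal_ms = 5.0 ms the new
// budget is 5.0 * 2000 = 10000 cards; with a previous target of 8000 cards
// the hysteresis below averages to (10000 + 8000) / 2 = 9000 pending cards.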
- double rate = processed_logged_cards / logged_cards_time_ms; - size_t budget = static_cast(goal_ms * rate); - // Deduct predicted cards in thread buffers to get target. - size_t new_target = budget - MIN2(budget, predicted_thread_buffer_cards); + double rate = processed_pending_cards / pending_cards_time_ms; + size_t new_target = static_cast(goal_ms * rate); // Add some hysteresis with previous values. if (is_pending_cards_target_initialized()) { new_target = (new_target + _pending_cards_target) / 2; @@ -205,46 +500,36 @@ void G1ConcurrentRefine::update_pending_cards_target(double logged_cards_time_ms log_debug(gc, ergo, refine)("New pending cards target: %zu", new_target); } -void G1ConcurrentRefine::adjust_after_gc(double logged_cards_time_ms, - size_t processed_logged_cards, - size_t predicted_thread_buffer_cards, +void G1ConcurrentRefine::adjust_after_gc(double pending_cards_time_ms, + size_t processed_pending_cards, double goal_ms) { - if (!G1UseConcRefinement) return; + if (!G1UseConcRefinement) { + return; + } - update_pending_cards_target(logged_cards_time_ms, - processed_logged_cards, - predicted_thread_buffer_cards, + update_pending_cards_target(pending_cards_time_ms, + processed_pending_cards, goal_ms); - if (_thread_control.max_num_threads() == 0) { - // If no refinement threads then the mutator threshold is the target. - _dcqs.set_mutator_refinement_threshold(_pending_cards_target); - } else { - // Provisionally make the mutator threshold unlimited, to be updated by - // the next periodic adjustment. Because card state may have changed - // drastically, record that adjustment is needed and kick the primary - // thread, in case it is waiting. - _dcqs.set_mutator_refinement_threshold(SIZE_MAX); + if (_thread_control.is_refinement_enabled()) { _needs_adjust = true; if (is_pending_cards_target_initialized()) { - _thread_control.activate(0); + _thread_control.activate(); } } } -// Wake up the primary thread less frequently when the time available until -// the next GC is longer. But don't increase the wait time too rapidly. -// This reduces the number of primary thread wakeups that just immediately -// go back to waiting, while still being responsive to behavior changes. -static uint64_t compute_adjust_wait_time_ms(double available_ms) { - return static_cast(sqrt(available_ms) * 4.0); -} - uint64_t G1ConcurrentRefine::adjust_threads_wait_ms() const { - assert_current_thread_is_primary_refinement_thread(); + assert_current_thread_is_control_refinement_thread(); if (is_pending_cards_target_initialized()) { - double available_ms = _threads_needed.predicted_time_until_next_gc_ms(); - uint64_t wait_time_ms = compute_adjust_wait_time_ms(available_ms); - return MAX2(wait_time_ms, adjust_threads_period_ms()); + // Retry asap when the cause for not getting a prediction was that we temporarily + // did not get the heap lock. Otherwise we might wait for too long until we get + // back here. + if (_heap_was_locked) { + return 1; + } + double available_time_ms = _threads_needed.predicted_time_until_next_gc_ms(); + + return _policy->adjust_wait_time_ms(available_time_ms, adjust_threads_period_ms()); } else { // If target not yet initialized then wait forever (until explicitly // activated). 
This happens during startup, when we don't bother with @@ -253,185 +538,74 @@ uint64_t G1ConcurrentRefine::adjust_threads_wait_ms() const { } } -class G1ConcurrentRefine::RemSetSamplingClosure : public G1HeapRegionClosure { - size_t _sampled_code_root_rs_length; +bool G1ConcurrentRefine::adjust_num_threads_periodically() { + assert_current_thread_is_control_refinement_thread(); -public: - RemSetSamplingClosure() : - _sampled_code_root_rs_length(0) {} - - bool do_heap_region(G1HeapRegion* r) override { - G1HeapRegionRemSet* rem_set = r->rem_set(); - _sampled_code_root_rs_length += rem_set->code_roots_list_length(); - return false; - } - - size_t sampled_code_root_rs_length() const { return _sampled_code_root_rs_length; } -}; - -// Adjust the target length (in regions) of the young gen, based on the -// current length of the remembered sets. -// -// At the end of the GC G1 determines the length of the young gen based on -// how much time the next GC can take, and when the next GC may occur -// according to the MMU. -// -// The assumption is that a significant part of the GC is spent on scanning -// the remembered sets (and many other components), so this thread constantly -// reevaluates the prediction for the remembered set scanning costs, and potentially -// resizes the young gen. This may do a premature GC or even increase the young -// gen size to keep pause time length goal. -void G1ConcurrentRefine::adjust_young_list_target_length() { - if (_policy->use_adaptive_young_list_length()) { - G1CollectedHeap* g1h = G1CollectedHeap::heap(); - G1CollectionSet* cset = g1h->collection_set(); - RemSetSamplingClosure cl; - cset->iterate(&cl); - - size_t card_rs_length = g1h->young_regions_cardset()->occupied(); - - size_t sampled_code_root_rs_length = cl.sampled_code_root_rs_length(); - _policy->revise_young_list_target_length(card_rs_length, sampled_code_root_rs_length); - } -} - -bool G1ConcurrentRefine::adjust_threads_periodically() { - assert_current_thread_is_primary_refinement_thread(); - - // Check whether it's time to do a periodic adjustment. + _heap_was_locked = false; + // Check whether it's time to do a periodic adjustment if there is no explicit + // request pending. We might have spuriously woken up. if (!_needs_adjust) { Tickspan since_adjust = Ticks::now() - _last_adjust; - if (since_adjust.milliseconds() >= adjust_threads_period_ms()) { - _needs_adjust = true; + if (since_adjust.milliseconds() < adjust_threads_period_ms()) { + _num_threads_wanted = 0; + return false; } } - // If needed, try to adjust threads wanted. - if (_needs_adjust) { - // Getting used young bytes requires holding Heap_lock. But we can't use - // normal lock and block until available. Blocking on the lock could - // deadlock with a GC VMOp that is holding the lock and requesting a - // safepoint. Instead try to lock, and if fail then skip adjustment for - // this iteration of the thread, do some refinement work, and retry the - // adjustment later. - if (Heap_lock->try_lock()) { - size_t used_bytes = _policy->estimate_used_young_bytes_locked(); - Heap_lock->unlock(); - adjust_young_list_target_length(); - size_t young_bytes = _policy->young_list_target_length() * G1HeapRegion::GrainBytes; - size_t available_bytes = young_bytes - MIN2(young_bytes, used_bytes); - adjust_threads_wanted(available_bytes); - _needs_adjust = false; - _last_adjust = Ticks::now(); - return true; - } + // Reset pending request. 
+ _needs_adjust = false; + size_t available_bytes = 0; + if (_policy->try_get_available_bytes_estimate(available_bytes)) { + adjust_threads_wanted(available_bytes); + _last_adjust = Ticks::now(); + } else { + _heap_was_locked = true; + // Defer adjustment to next time. + _needs_adjust = true; } - return false; -} - -bool G1ConcurrentRefine::is_in_last_adjustment_period() const { - return _threads_needed.predicted_time_until_next_gc_ms() <= adjust_threads_period_ms(); + return (_num_threads_wanted > 0) && !heap_was_locked(); } void G1ConcurrentRefine::adjust_threads_wanted(size_t available_bytes) { - assert_current_thread_is_primary_refinement_thread(); - size_t num_cards = _dcqs.num_cards(); - size_t mutator_threshold = SIZE_MAX; - uint old_wanted = AtomicAccess::load(&_threads_wanted); + assert_current_thread_is_control_refinement_thread(); - _threads_needed.update(old_wanted, + G1Policy* policy = G1CollectedHeap::heap()->policy(); + const G1Analytics* analytics = policy->analytics(); + + size_t num_cards = policy->current_pending_cards(); + + _threads_needed.update(_num_threads_wanted, available_bytes, num_cards, _pending_cards_target); uint new_wanted = _threads_needed.threads_needed(); if (new_wanted > _thread_control.max_num_threads()) { - // If running all the threads can't reach goal, turn on refinement by - // mutator threads. Using target as the threshold may be stronger - // than required, but will do the most to get us under goal, and we'll - // reevaluate with the next adjustment. - mutator_threshold = _pending_cards_target; + // Bound the wanted threads by maximum available. new_wanted = _thread_control.max_num_threads(); - } else if (is_in_last_adjustment_period()) { - // If very little time remains until GC, enable mutator refinement. If - // the target has been reached, this keeps the number of pending cards on - // target even if refinement threads deactivate in the meantime. And if - // the target hasn't been reached, this prevents things from getting - // worse. - mutator_threshold = _pending_cards_target; } - AtomicAccess::store(&_threads_wanted, new_wanted); - _dcqs.set_mutator_refinement_threshold(mutator_threshold); - log_debug(gc, refine)("Concurrent refinement: wanted %u, cards: %zu, " - "predicted: %zu, time: %1.2fms", + + _num_threads_wanted = new_wanted; + + log_debug(gc, refine)("Concurrent refinement: wanted %u, pending cards: %zu (pending-from-gc %zu), " + "predicted: %zu, goal %zu, time-until-next-gc: %1.2fms pred-refine-rate %1.2fc/ms log-rate %1.2fc/ms", new_wanted, num_cards, + G1CollectedHeap::heap()->policy()->pending_cards_from_gc(), _threads_needed.predicted_cards_at_next_gc(), - _threads_needed.predicted_time_until_next_gc_ms()); - // Activate newly wanted threads. The current thread is the primary - // refinement thread, so is already active. - for (uint i = MAX2(old_wanted, 1u); i < new_wanted; ++i) { - if (!_thread_control.activate(i)) { - // Failed to allocate and activate thread. Stop trying to activate, and - // instead use mutator threads to make up the gap. - AtomicAccess::store(&_threads_wanted, i); - _dcqs.set_mutator_refinement_threshold(_pending_cards_target); - break; - } - } -} - -void G1ConcurrentRefine::reduce_threads_wanted() { - assert_current_thread_is_primary_refinement_thread(); - if (!_needs_adjust) { // Defer if adjustment request is active. 
- uint wanted = AtomicAccess::load(&_threads_wanted); - if (wanted > 0) { - AtomicAccess::store(&_threads_wanted, --wanted); - } - // If very little time remains until GC, enable mutator refinement. If - // the target has been reached, this keeps the number of pending cards on - // target even as refinement threads deactivate in the meantime. - if (is_in_last_adjustment_period()) { - _dcqs.set_mutator_refinement_threshold(_pending_cards_target); - } - } -} - -bool G1ConcurrentRefine::is_thread_wanted(uint worker_id) const { - return worker_id < AtomicAccess::load(&_threads_wanted); + _pending_cards_target, + _threads_needed.predicted_time_until_next_gc_ms(), + analytics->predict_concurrent_refine_rate_ms(), + analytics->predict_dirtied_cards_rate_ms() + ); } bool G1ConcurrentRefine::is_thread_adjustment_needed() const { - assert_current_thread_is_primary_refinement_thread(); + assert_current_thread_is_control_refinement_thread(); return _needs_adjust; } void G1ConcurrentRefine::record_thread_adjustment_needed() { - assert_current_thread_is_primary_refinement_thread(); + assert_current_thread_is_control_refinement_thread(); _needs_adjust = true; } - -G1ConcurrentRefineStats G1ConcurrentRefine::get_and_reset_refinement_stats() { - struct CollectStats : public ThreadClosure { - G1ConcurrentRefineStats _total_stats; - virtual void do_thread(Thread* t) { - G1ConcurrentRefineThread* crt = static_cast(t); - G1ConcurrentRefineStats& stats = *crt->refinement_stats(); - _total_stats += stats; - stats.reset(); - } - } collector; - threads_do(&collector); - return collector._total_stats; -} - -uint G1ConcurrentRefine::worker_id_offset() { - return G1DirtyCardQueueSet::num_par_ids(); -} - -bool G1ConcurrentRefine::try_refinement_step(uint worker_id, - size_t stop_at, - G1ConcurrentRefineStats* stats) { - uint adjusted_id = worker_id + worker_id_offset(); - return _dcqs.refine_completed_buffer_concurrently(adjusted_id, stop_at, stats); -} diff --git a/src/hotspot/share/gc/g1/g1ConcurrentRefine.hpp b/src/hotspot/share/gc/g1/g1ConcurrentRefine.hpp index dd0b62a22ea..5e96ed738fd 100644 --- a/src/hotspot/share/gc/g1/g1ConcurrentRefine.hpp +++ b/src/hotspot/share/gc/g1/g1ConcurrentRefine.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2023, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -34,23 +34,28 @@ #include "utilities/macros.hpp" // Forward decl +class G1CardTableClaimTable; +class G1CollectedHeap; class G1ConcurrentRefine; class G1ConcurrentRefineThread; -class G1DirtyCardQueueSet; +class G1HeapRegion; class G1Policy; class ThreadClosure; +class WorkerTask; +class WorkerThreads; // Helper class for refinement thread management. Used to start, stop and // iterate over them. class G1ConcurrentRefineThreadControl { G1ConcurrentRefine* _cr; - GrowableArrayCHeap _threads; + G1ConcurrentRefineThread* _control_thread; + + WorkerThreads* _workers; + uint _max_num_threads; // Create the refinement thread for the given worker id. // If initializing is true, ignore InjectGCWorkerCreationFailure. 
- G1ConcurrentRefineThread* create_refinement_thread(uint worker_id, bool initializing); - - bool ensure_threads_created(uint worker_id, bool initializing); + G1ConcurrentRefineThread* create_refinement_thread(); NONCOPYABLE(G1ConcurrentRefineThreadControl); @@ -60,21 +65,119 @@ public: jint initialize(G1ConcurrentRefine* cr); - void assert_current_thread_is_primary_refinement_thread() const NOT_DEBUG_RETURN; + void assert_current_thread_is_control_refinement_thread() const NOT_DEBUG_RETURN; - uint max_num_threads() const { return _threads.capacity(); } + uint max_num_threads() const { return _max_num_threads; } + bool is_refinement_enabled() const { return _max_num_threads > 0; } - // Activate the indicated thread. If the thread has not yet been allocated, - // allocate and then activate. If allocation is needed and fails, return - // false. Otherwise return true. - // precondition: worker_id < max_num_threads(). - // precondition: current thread is not the designated worker. - bool activate(uint worker_id); + // Activate the control thread. + void activate(); + void run_task(WorkerTask* task, uint num_workers); + + void control_thread_do(ThreadClosure* tc); void worker_threads_do(ThreadClosure* tc); void stop(); }; +// Tracks the current state of re-examining the dirty cards from idle to completion +// (and reset back to idle). +// +// The process steps are as follows: +// +// 1) Swap global card table pointers +// +// 2) Swap Java Thread's card table pointers +// +// 3) Synchronize GC Threads +// Ensures memory visibility +// +// After this point mutator threads should not mark the refinement table. +// +// 4) Snapshot the heap +// Determines which regions need to be swept. +// +// 5) Sweep Refinement table +// Examines non-Clean cards on the refinement table. +// +// 6) Completion Work +// Calculates statistics about the process to be used in various parts of +// the garbage collection. +// +// All but step 4 are interruptible by safepoints. In case of a garbage collection, +// the garbage collection will interrupt this process, and go to Idle state. +// +class G1ConcurrentRefineSweepState { + + enum class State : uint { + Idle, // Refinement is doing nothing. + SwapGlobalCT, // Swap global card table. + SwapJavaThreadsCT, // Swap java thread's card tables. + SynchronizeGCThreads, // Synchronize GC thread's memory view. + SnapshotHeap, // Take a snapshot of the region's top() values. + SweepRT, // Sweep the refinement table for pending (dirty) cards. + CompleteRefineWork, // Cleanup of refinement work, reset to idle. + Last + } _state; + + static const char* state_name(State state) { + static const char* _state_names[] = { + "Idle", + "Swap Global Card Table", + "Swap JavaThread Card Table", + "Synchronize GC Threads", + "Snapshot Heap", + "Sweep Refinement Table", + "Complete Sweep Work" + }; + + return _state_names[static_cast(state)]; + } + + // Current heap snapshot. + G1CardTableClaimTable* _sweep_table; + + // Start times for all states. + Ticks _state_start[static_cast(State::Last)]; + + void set_state_start_time(); + Tickspan get_duration(State start, State end); + + G1ConcurrentRefineStats _stats; + + // Advances the state to next_state if not interrupted by a changed epoch. Returns + // to Idle otherwise. 
+ bool advance_state(State next_state); + + void assert_state(State expected); + + void snapshot_heap_inner(); + +public: + G1ConcurrentRefineSweepState(uint max_reserved_regions); + ~G1ConcurrentRefineSweepState(); + + void start_work(); + + bool swap_global_card_table(); + bool swap_java_threads_ct(); + bool swap_gc_threads_ct(); + void snapshot_heap(bool concurrent = true); + void sweep_refinement_table_start(); + bool sweep_refinement_table_step(); + + bool complete_work(bool concurrent, bool print_log = true); + + G1CardTableClaimTable* sweep_table() { return _sweep_table; } + G1ConcurrentRefineStats* stats() { return &_stats; } + void reset_stats(); + + void add_yield_during_sweep_duration(jlong duration); + + bool is_in_progress() const; + bool are_java_threads_synched() const; +}; + // Controls concurrent refinement. // // Mutator threads produce dirty cards, which need to be examined for updates @@ -84,49 +187,43 @@ public: // pending dirty cards at the start of a GC can be processed within that time // budget. // -// Concurrent refinement is performed by a combination of dedicated threads -// and by mutator threads as they produce dirty cards. If configured to not -// have any dedicated threads (-XX:G1ConcRefinementThreads=0) then all -// concurrent refinement work is performed by mutator threads. When there are -// dedicated threads, they generally do most of the concurrent refinement -// work, to minimize throughput impact of refinement work on mutator threads. +// Concurrent refinement is performed by a set of dedicated threads. If configured +// to not have any dedicated threads (-XX:G1ConcRefinementThreads=0) then no +// refinement work is performed at all. // // This class determines the target number of dirty cards pending for the next // GC. It also owns the dedicated refinement threads and controls their // activation in order to achieve that target. // -// There are two kinds of dedicated refinement threads, a single primary -// thread and some number of secondary threads. When active, all refinement -// threads take buffers of dirty cards from the dirty card queue and process -// them. Between buffers they query this owning object to find out whether -// they should continue running, deactivating themselves if not. +// There are two kinds of dedicated refinement threads, a single control +// thread and some number of refinement worker threads. +// The control thread determines whether there is need to do work, and then starts +// an appropriate number of refinement worker threads to get back to the target +// number of pending dirty cards. +// +// The control wakes up periodically whether there is need to do refinement +// work, starting the refinement process as necessary. // -// The primary thread drives the control system that determines how many -// refinement threads should be active. If inactive, it wakes up periodically -// to recalculate the number of active threads needed, and activates -// additional threads as necessary. While active it also periodically -// recalculates the number wanted and activates more threads if needed. It -// also reduces the number of wanted threads when the target has been reached, -// triggering deactivations. 
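The block comment above lists the refinement sweep as a sequence of states, but the loop that drives it lives in the refinement threads, outside this hunk. The following sketch shows how a driver could walk the G1ConcurrentRefineSweepState API declared above; the function name and loop structure are illustrative, not the actual G1ConcurrentRefineThread code, which additionally interleaves the thread-count adjustment and wait logic.

// Illustrative driver for the state machine declared above. Each swap_* step
// returns false when a garbage collection has meanwhile reset the state back
// to Idle, in which case the sweep is abandoned and restarted later.
static void run_one_refinement_sweep(G1ConcurrentRefineSweepState& state) {
  state.start_work();                                   // Idle -> SwapGlobalCT

  if (!state.swap_global_card_table()) return;          // -> SwapJavaThreadsCT
  if (!state.swap_java_threads_ct())   return;          // -> SynchronizeGCThreads
  if (!state.swap_gc_threads_ct())     return;          // -> SnapshotHeap

  state.snapshot_heap(true /* concurrent */);           // -> SweepRT
  state.sweep_refinement_table_start();

  // Sweep in steps so safepoints can interleave; each step runs a
  // G1ConcurrentRefineSweepTask on the refinement workers and returns true
  // once the whole refinement table has been swept.
  while (state.is_in_progress() && !state.sweep_refinement_table_step()) {
    // not done yet, keep sweeping
  }

  if (state.is_in_progress()) {
    state.complete_work(true /* concurrent */);         // -> Idle, logs stats
  }
}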
class G1ConcurrentRefine : public CHeapObj { G1Policy* _policy; - volatile uint _threads_wanted; + volatile uint _num_threads_wanted; size_t _pending_cards_target; Ticks _last_adjust; Ticks _last_deactivate; bool _needs_adjust; + bool _heap_was_locked; // The heap has been locked the last time we tried to adjust the number of refinement threads. + G1ConcurrentRefineThreadsNeeded _threads_needed; G1ConcurrentRefineThreadControl _thread_control; - G1DirtyCardQueueSet& _dcqs; - G1ConcurrentRefine(G1Policy* policy); + G1ConcurrentRefineSweepState _sweep_state; - static uint worker_id_offset(); + G1ConcurrentRefine(G1CollectedHeap* g1h); jint initialize(); - void assert_current_thread_is_primary_refinement_thread() const { - _thread_control.assert_current_thread_is_primary_refinement_thread(); + void assert_current_thread_is_control_refinement_thread() const { + _thread_control.assert_current_thread_is_control_refinement_thread(); } // For the first few collection cycles we don't have a target (and so don't @@ -138,16 +235,11 @@ class G1ConcurrentRefine : public CHeapObj { return _pending_cards_target != PendingCardsTargetUninitialized; } - void update_pending_cards_target(double logged_cards_scan_time_ms, - size_t processed_logged_cards, - size_t predicted_thread_buffer_cards, + void update_pending_cards_target(double pending_cards_scan_time_ms, + size_t processed_pending_cards, double goal_ms); uint64_t adjust_threads_period_ms() const; - bool is_in_last_adjustment_period() const; - - class RemSetSamplingClosure; // Helper class for adjusting young length. - void adjust_young_list_target_length(); void adjust_threads_wanted(size_t available_bytes); @@ -156,67 +248,66 @@ class G1ConcurrentRefine : public CHeapObj { public: ~G1ConcurrentRefine(); + G1ConcurrentRefineSweepState& sweep_state() { return _sweep_state; } + + G1ConcurrentRefineSweepState& sweep_state_for_merge(); + + void run_with_refinement_workers(WorkerTask* task); + + void notify_region_reclaimed(G1HeapRegion* r); + // Returns a G1ConcurrentRefine instance if succeeded to create/initialize the // G1ConcurrentRefine instance. Otherwise, returns null with error code. - static G1ConcurrentRefine* create(G1Policy* policy, jint* ecode); + static G1ConcurrentRefine* create(G1CollectedHeap* g1h, jint* ecode); // Stop all the refinement threads. void stop(); // Called at the end of a GC to prepare for refinement during the next // concurrent phase. Updates the target for the number of pending dirty - // cards. Updates the mutator refinement threshold. Ensures the primary - // refinement thread (if it exists) is active, so it will adjust the number + // cards. Updates the mutator refinement threshold. Ensures the refinement + // control thread (if it exists) is active, so it will adjust the number // of running threads. - void adjust_after_gc(double logged_cards_scan_time_ms, - size_t processed_logged_cards, - size_t predicted_thread_buffer_cards, + void adjust_after_gc(double pending_cards_scan_time_ms, + size_t processed_pending_cards, double goal_ms); // Target number of pending dirty cards at the start of the next GC. size_t pending_cards_target() const { return _pending_cards_target; } - // May recalculate the number of refinement threads that should be active in - // order to meet the pending cards target. Returns true if adjustment was - // performed, and clears any pending request. 
Returns false if the - // adjustment period has not expired, or because a timed or requested - // adjustment could not be performed immediately and so was deferred. - // precondition: current thread is the primary refinement thread. - bool adjust_threads_periodically(); + // Recalculates the number of refinement threads that should be active in + // order to meet the pending cards target. + // Returns true if it could recalculate the number of threads and + // refinement threads should be started. + // Returns false if the adjustment period has not expired, or because a timed + // or requested adjustment could not be performed immediately and so was deferred. + bool adjust_num_threads_periodically(); - // The amount of time (in ms) the primary refinement thread should sleep + // The amount of time (in ms) the refinement control thread should sleep // when it is inactive. It requests adjustment whenever it is reactivated. - // precondition: current thread is the primary refinement thread. + // precondition: current thread is the refinement control thread. uint64_t adjust_threads_wait_ms() const; // Record a request for thread adjustment as soon as possible. - // precondition: current thread is the primary refinement thread. + // precondition: current thread is the refinement control thread. void record_thread_adjustment_needed(); // Test whether there is a pending request for thread adjustment. - // precondition: current thread is the primary refinement thread. + // precondition: current thread is the refinement control thread. bool is_thread_adjustment_needed() const; - // Reduce the number of active threads wanted. - // precondition: current thread is the primary refinement thread. - void reduce_threads_wanted(); + // Indicate that last refinement adjustment had been deferred due to not + // obtaining the heap lock. + bool heap_was_locked() const { return _heap_was_locked; } - // Test whether the thread designated by worker_id should be active. - bool is_thread_wanted(uint worker_id) const; - - // Return total of concurrent refinement stats for the - // ConcurrentRefineThreads. Also reset the stats for the threads. - G1ConcurrentRefineStats get_and_reset_refinement_stats(); - - // Perform a single refinement step; called by the refinement - // threads. Returns true if there was refinement work available. - // Updates stats. - bool try_refinement_step(uint worker_id, - size_t stop_at, - G1ConcurrentRefineStats* stats); + uint num_threads_wanted() const { return _num_threads_wanted; } + uint max_num_threads() const { return _thread_control.max_num_threads(); } // Iterate over all concurrent refinement threads applying the given closure. void threads_do(ThreadClosure *tc); + // Iterate over specific refinement threads applying the given closure. 
+ void worker_threads_do(ThreadClosure *tc); + void control_thread_do(ThreadClosure *tc); }; #endif // SHARE_GC_G1_G1CONCURRENTREFINE_HPP diff --git a/src/hotspot/share/gc/g1/g1ConcurrentRefineStats.cpp b/src/hotspot/share/gc/g1/g1ConcurrentRefineStats.cpp index 7f0bcc5b50f..83a09c55a3f 100644 --- a/src/hotspot/share/gc/g1/g1ConcurrentRefineStats.cpp +++ b/src/hotspot/share/gc/g1/g1ConcurrentRefineStats.cpp @@ -23,41 +23,33 @@ */ #include "gc/g1/g1ConcurrentRefineStats.hpp" +#include "runtime/atomicAccess.hpp" +#include "runtime/timer.hpp" G1ConcurrentRefineStats::G1ConcurrentRefineStats() : - _refinement_time(), - _refined_cards(0), - _precleaned_cards(0), - _dirtied_cards(0) + _sweep_duration(0), + _yield_during_sweep_duration(0), + _cards_scanned(0), + _cards_clean(0), + _cards_not_parsable(0), + _cards_already_refer_to_cset(0), + _cards_refer_to_cset(0), + _cards_no_cross_region(0), + _refine_duration(0) {} -double G1ConcurrentRefineStats::refinement_rate_ms() const { - // Report 0 when no time recorded because no refinement performed. - double secs = refinement_time().seconds(); - return (secs > 0) ? (refined_cards() / (secs * MILLIUNITS)) : 0.0; -} +void G1ConcurrentRefineStats::add_atomic(G1ConcurrentRefineStats* other) { + AtomicAccess::add(&_sweep_duration, other->_sweep_duration, memory_order_relaxed); + AtomicAccess::add(&_yield_during_sweep_duration, other->_yield_during_sweep_duration, memory_order_relaxed); -G1ConcurrentRefineStats& -G1ConcurrentRefineStats::operator+=(const G1ConcurrentRefineStats& other) { - _refinement_time += other._refinement_time; - _refined_cards += other._refined_cards; - _precleaned_cards += other._precleaned_cards; - _dirtied_cards += other._dirtied_cards; - return *this; -} + AtomicAccess::add(&_cards_scanned, other->_cards_scanned, memory_order_relaxed); + AtomicAccess::add(&_cards_clean, other->_cards_clean, memory_order_relaxed); + AtomicAccess::add(&_cards_not_parsable, other->_cards_not_parsable, memory_order_relaxed); + AtomicAccess::add(&_cards_already_refer_to_cset, other->_cards_already_refer_to_cset, memory_order_relaxed); + AtomicAccess::add(&_cards_refer_to_cset, other->_cards_refer_to_cset, memory_order_relaxed); + AtomicAccess::add(&_cards_no_cross_region, other->_cards_no_cross_region, memory_order_relaxed); -template -static T clipped_sub(T x, T y) { - return (x < y) ? T() : (x - y); -} - -G1ConcurrentRefineStats& -G1ConcurrentRefineStats::operator-=(const G1ConcurrentRefineStats& other) { - _refinement_time = clipped_sub(_refinement_time, other._refinement_time); - _refined_cards = clipped_sub(_refined_cards, other._refined_cards); - _precleaned_cards = clipped_sub(_precleaned_cards, other._precleaned_cards); - _dirtied_cards = clipped_sub(_dirtied_cards, other._dirtied_cards); - return *this; + AtomicAccess::add(&_refine_duration, other->_refine_duration, memory_order_relaxed); } void G1ConcurrentRefineStats::reset() { diff --git a/src/hotspot/share/gc/g1/g1ConcurrentRefineStats.hpp b/src/hotspot/share/gc/g1/g1ConcurrentRefineStats.hpp index ae576778a07..ce22f4317df 100644 --- a/src/hotspot/share/gc/g1/g1ConcurrentRefineStats.hpp +++ b/src/hotspot/share/gc/g1/g1ConcurrentRefineStats.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -33,47 +33,56 @@ // Used for collecting per-thread statistics and for summaries over a // collection of threads. class G1ConcurrentRefineStats : public CHeapObj<mtGC> { - Tickspan _refinement_time; - size_t _refined_cards; - size_t _precleaned_cards; - size_t _dirtied_cards; + jlong _sweep_duration; // Time spent sweeping the table finding non-clean cards + // and refining them. + jlong _yield_during_sweep_duration; // Time spent yielding during the sweep (not doing the sweep). + + size_t _cards_scanned; // Total number of cards scanned. + size_t _cards_clean; // Number of cards found clean. + size_t _cards_not_parsable; // Number of cards we could not parse and left unrefined. + size_t _cards_already_refer_to_cset; // Number of cards found to already refer to the collection set. + size_t _cards_refer_to_cset; // Number of dirty cards newly found to contain a to-cset reference. + size_t _cards_no_cross_region; // Number of dirty cards that did not contain a cross-region reference. + + jlong _refine_duration; // Time spent during actual refinement. public: G1ConcurrentRefineStats(); - // Time spent performing concurrent refinement. - Tickspan refinement_time() const { return _refinement_time; } + // Time spent sweeping the refinement table (includes actual refinement, + // but not yield time). + jlong sweep_duration() const { return _sweep_duration - _yield_during_sweep_duration; } + jlong yield_during_sweep_duration() const { return _yield_during_sweep_duration; } + jlong refine_duration() const { return _refine_duration; } // Number of refined cards. - size_t refined_cards() const { return _refined_cards; } + size_t refined_cards() const { return cards_not_clean(); } - // Refinement rate, in cards per ms. - double refinement_rate_ms() const; + size_t cards_scanned() const { return _cards_scanned; } + size_t cards_clean() const { return _cards_clean; } + size_t cards_not_clean() const { return _cards_scanned - _cards_clean; } + size_t cards_not_parsable() const { return _cards_not_parsable; } + size_t cards_already_refer_to_cset() const { return _cards_already_refer_to_cset; } + size_t cards_refer_to_cset() const { return _cards_refer_to_cset; } + size_t cards_no_cross_region() const { return _cards_no_cross_region; } + // Number of cards that were marked dirty and in need of refinement. This includes cards newly + // found to refer to the collection set, since they originally were dirty. + size_t cards_pending() const { return cards_not_clean() - _cards_already_refer_to_cset; } - // Number of cards for which refinement was skipped because some other - // thread had already refined them. - size_t precleaned_cards() const { return _precleaned_cards; } + size_t cards_to_cset() const { return _cards_already_refer_to_cset + _cards_refer_to_cset; } - // Number of cards marked dirty and in need of refinement.
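For illustration only, with made-up numbers rather than measured ones: the derived getters above are simple sums and differences of the raw counters. If a sweep scans 1000 cards of which 900 are clean, then cards_not_clean() and refined_cards() are 1000 - 900 = 100; with 10 of those cards found to already refer to the collection set, cards_pending() is 100 - 10 = 90; and with 25 further cards newly found to refer to the collection set, cards_to_cset() is 10 + 25 = 35.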
- size_t dirtied_cards() const { return _dirtied_cards; } + void inc_sweep_time(jlong t) { _sweep_duration += t; } + void inc_yield_during_sweep_duration(jlong t) { _yield_during_sweep_duration += t; } + void inc_refine_duration(jlong t) { _refine_duration += t; } - void inc_refinement_time(Tickspan t) { _refinement_time += t; } - void inc_refined_cards(size_t cards) { _refined_cards += cards; } - void inc_precleaned_cards(size_t cards) { _precleaned_cards += cards; } - void inc_dirtied_cards(size_t cards) { _dirtied_cards += cards; } + void inc_cards_scanned(size_t increment) { _cards_scanned += increment; } + void inc_cards_clean(size_t increment) { _cards_clean += increment; } + void inc_cards_not_parsable() { _cards_not_parsable++; } + void inc_cards_already_refer_to_cset() { _cards_already_refer_to_cset++; } + void inc_cards_refer_to_cset() { _cards_refer_to_cset++; } + void inc_cards_no_cross_region() { _cards_no_cross_region++; } - G1ConcurrentRefineStats& operator+=(const G1ConcurrentRefineStats& other); - G1ConcurrentRefineStats& operator-=(const G1ConcurrentRefineStats& other); - - friend G1ConcurrentRefineStats operator+(G1ConcurrentRefineStats x, - const G1ConcurrentRefineStats& y) { - return x += y; - } - - friend G1ConcurrentRefineStats operator-(G1ConcurrentRefineStats x, - const G1ConcurrentRefineStats& y) { - return x -= y; - } + void add_atomic(G1ConcurrentRefineStats* other); void reset(); }; diff --git a/src/hotspot/share/gc/g1/g1ConcurrentRefineSweepTask.cpp b/src/hotspot/share/gc/g1/g1ConcurrentRefineSweepTask.cpp new file mode 100644 index 00000000000..ca5bc9ebe5f --- /dev/null +++ b/src/hotspot/share/gc/g1/g1ConcurrentRefineSweepTask.cpp @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +#include "gc/g1/g1CardTableClaimTable.inline.hpp" +#include "gc/g1/g1CollectedHeap.inline.hpp" +#include "gc/g1/g1ConcurrentRefineSweepTask.hpp" + +class G1RefineRegionClosure : public G1HeapRegionClosure { + using CardValue = G1CardTable::CardValue; + + G1RemSet* _rem_set; + G1CardTableClaimTable* _scan_state; + + uint _worker_id; + + size_t _num_collections_at_start; + + bool has_work(G1HeapRegion* r) { + return _scan_state->has_unclaimed_cards(r->hrm_index()); + } + + void verify_card_pair_refers_to_same_card(CardValue* source_card, CardValue* dest_card) { +#ifdef ASSERT + G1CollectedHeap* g1h = G1CollectedHeap::heap(); + G1HeapRegion* refinement_r = g1h->heap_region_containing(g1h->refinement_table()->addr_for(source_card)); + G1HeapRegion* card_r = g1h->heap_region_containing(g1h->card_table()->addr_for(dest_card)); + size_t refinement_i = g1h->refinement_table()->index_for_cardvalue(source_card); + size_t card_i = g1h->card_table()->index_for_cardvalue(dest_card); + + assert(refinement_r == card_r, "not same region source %u (%zu) dest %u (%zu) ", refinement_r->hrm_index(), refinement_i, card_r->hrm_index(), card_i); + assert(refinement_i == card_i, "indexes are not same %zu %zu", refinement_i, card_i); +#endif + } + + void do_dirty_card(CardValue* source_card, CardValue* dest_card) { + verify_card_pair_refers_to_same_card(source_card, dest_card); + + G1RemSet::RefineResult res = _rem_set->refine_card_concurrently(source_card, _worker_id); + // Gather statistics based on the result. + switch (res) { + case G1RemSet::HasRefToCSet: { + *dest_card = G1CardTable::g1_to_cset_card; + _refine_stats.inc_cards_refer_to_cset(); + break; + } + case G1RemSet::AlreadyToCSet: { + *dest_card = G1CardTable::g1_to_cset_card; + _refine_stats.inc_cards_already_refer_to_cset(); + break; + } + case G1RemSet::NoCrossRegion: { + _refine_stats.inc_cards_no_cross_region(); + break; + } + case G1RemSet::CouldNotParse: { + // Could not refine - redirty with the original value. + *dest_card = *source_card; + _refine_stats.inc_cards_not_parsable(); + break; + } + case G1RemSet::HasRefToOld : break; // Nothing special to do. + } + // Clean card on source card table. + *source_card = G1CardTable::clean_card_val(); + } + + void do_claimed_block(CardValue* dirty_l, CardValue* dirty_r, CardValue* dest_card) { + for (CardValue* source = dirty_l; source < dirty_r; ++source, ++dest_card) { + do_dirty_card(source, dest_card); + } + } + +public: + bool _completed; + G1ConcurrentRefineStats _refine_stats; + + G1RefineRegionClosure(uint worker_id, G1CardTableClaimTable* scan_state) : + G1HeapRegionClosure(), + _rem_set(G1CollectedHeap::heap()->rem_set()), + _scan_state(scan_state), + _worker_id(worker_id), + _completed(true), + _refine_stats() { } + + bool do_heap_region(G1HeapRegion* r) override { + + if (!has_work(r)) { + return false; + } + + G1CollectedHeap* g1h = G1CollectedHeap::heap(); + + if (r->is_young()) { + if (_scan_state->claim_all_cards(r->hrm_index()) == 0) { + // Clear the pre-dirtying information. 
+ r->clear_refinement_table(); + } + return false; + } + + G1CardTable* card_table = g1h->card_table(); + G1CardTable* refinement_table = g1h->refinement_table(); + + G1CardTableChunkClaimer claim(_scan_state, r->hrm_index()); + + size_t const region_card_base_idx = (size_t)r->hrm_index() << G1HeapRegion::LogCardsPerRegion; + + while (claim.has_next()) { + size_t const start_idx = region_card_base_idx + claim.value(); + CardValue* const start_card = refinement_table->byte_for_index(start_idx); + CardValue* const end_card = start_card + claim.size(); + + CardValue* dest_card = card_table->byte_for_index(start_idx); + + G1ChunkScanner scanner{start_card, end_card}; + + size_t num_dirty_cards = 0; + scanner.on_dirty_cards([&] (CardValue* dirty_l, CardValue* dirty_r) { + jlong refine_start = os::elapsed_counter(); + + do_claimed_block(dirty_l, dirty_r, dest_card + pointer_delta(dirty_l, start_card, sizeof(CardValue))); + num_dirty_cards += pointer_delta(dirty_r, dirty_l, sizeof(CardValue)); + + _refine_stats.inc_refine_duration(os::elapsed_counter() - refine_start); + }); + + if (VerifyDuringGC) { + for (CardValue* i = start_card; i < end_card; ++i) { + guarantee(*i == G1CardTable::clean_card_val(), "must be"); + } + } + + _refine_stats.inc_cards_scanned(claim.size()); + _refine_stats.inc_cards_clean(claim.size() - num_dirty_cards); + + if (SuspendibleThreadSet::should_yield()) { + _completed = false; + break; + } + } + + return !_completed; + } +}; + +G1ConcurrentRefineSweepTask::G1ConcurrentRefineSweepTask(G1CardTableClaimTable* scan_state, + G1ConcurrentRefineStats* stats, + uint max_workers) : + WorkerTask("G1 Refine Task"), + _scan_state(scan_state), + _stats(stats), + _max_workers(max_workers), + _sweep_completed(true) +{ } + +void G1ConcurrentRefineSweepTask::work(uint worker_id) { + jlong start = os::elapsed_counter(); + + G1RefineRegionClosure sweep_cl(worker_id, _scan_state); + _scan_state->heap_region_iterate_from_worker_offset(&sweep_cl, worker_id, _max_workers); + + if (!sweep_cl._completed) { + _sweep_completed = false; + } + + sweep_cl._refine_stats.inc_sweep_time(os::elapsed_counter() - start); + _stats->add_atomic(&sweep_cl._refine_stats); +} + +bool G1ConcurrentRefineSweepTask::sweep_completed() const { return _sweep_completed; } \ No newline at end of file diff --git a/src/hotspot/share/gc/shared/bufferNodeList.hpp b/src/hotspot/share/gc/g1/g1ConcurrentRefineSweepTask.hpp similarity index 57% rename from src/hotspot/share/gc/shared/bufferNodeList.hpp rename to src/hotspot/share/gc/g1/g1ConcurrentRefineSweepTask.hpp index 55905ec071a..bf24c5ae850 100644 --- a/src/hotspot/share/gc/shared/bufferNodeList.hpp +++ b/src/hotspot/share/gc/g1/g1ConcurrentRefineSweepTask.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, 2023, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -22,20 +22,27 @@ * */ -#ifndef SHARE_GC_SHARED_BUFFERNODELIST_HPP -#define SHARE_GC_SHARED_BUFFERNODELIST_HPP +#ifndef SHARE_GC_G1_G1CONCURRENTREFINESWEEPTASK_HPP +#define SHARE_GC_G1_G1CONCURRENTREFINESWEEPTASK_HPP -#include "utilities/globalDefinitions.hpp" +#include "gc/g1/g1ConcurrentRefineStats.hpp" +#include "gc/shared/workerThread.hpp" -class BufferNode; +class G1CardTableClaimTable; -struct BufferNodeList { - BufferNode* _head; // First node in list or null if empty. 
- BufferNode* _tail; // Last node in list or null if empty. - size_t _entry_count; // Sum of entries in nodes in list. +class G1ConcurrentRefineSweepTask : public WorkerTask { + G1CardTableClaimTable* _scan_state; + G1ConcurrentRefineStats* _stats; + uint _max_workers; + bool _sweep_completed; - BufferNodeList(); - BufferNodeList(BufferNode* head, BufferNode* tail, size_t entry_count); +public: + + G1ConcurrentRefineSweepTask(G1CardTableClaimTable* scan_state, G1ConcurrentRefineStats* stats, uint max_workers); + + void work(uint worker_id) override; + + bool sweep_completed() const; }; -#endif // SHARE_GC_SHARED_BUFFERNODELIST_HPP +#endif /* SHARE_GC_G1_G1CONCURRENTREFINESWEEPTASK_HPP */ diff --git a/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.cpp b/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.cpp index 2fa19d46093..eccfe466d48 100644 --- a/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.cpp +++ b/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.cpp @@ -23,10 +23,13 @@ */ #include "gc/g1/g1BarrierSet.hpp" +#include "gc/g1/g1CardTableClaimTable.inline.hpp" +#include "gc/g1/g1CollectedHeap.inline.hpp" #include "gc/g1/g1ConcurrentRefine.hpp" #include "gc/g1/g1ConcurrentRefineStats.hpp" +#include "gc/g1/g1ConcurrentRefineSweepTask.hpp" #include "gc/g1/g1ConcurrentRefineThread.hpp" -#include "gc/g1/g1DirtyCardQueue.hpp" +#include "gc/shared/gcTraceTime.inline.hpp" #include "gc/shared/suspendibleThreadSet.hpp" #include "logging/log.hpp" #include "runtime/cpuTimeCounters.hpp" @@ -38,60 +41,61 @@ #include "utilities/globalDefinitions.hpp" #include "utilities/ticks.hpp" -G1ConcurrentRefineThread::G1ConcurrentRefineThread(G1ConcurrentRefine* cr, uint worker_id) : +G1ConcurrentRefineThread::G1ConcurrentRefineThread(G1ConcurrentRefine* cr) : ConcurrentGCThread(), - _notifier(Mutex::nosafepoint, FormatBuffer<>("G1 Refine#%d", worker_id), true), + _notifier(Mutex::nosafepoint, "G1 Refine Control", true), _requested_active(false), - _refinement_stats(), - _worker_id(worker_id), _cr(cr) { - // set name - set_name("G1 Refine#%d", worker_id); + set_name("G1 Refine Control"); } void G1ConcurrentRefineThread::run_service() { - while (wait_for_completed_buffers()) { + while (wait_for_work()) { SuspendibleThreadSetJoiner sts_join; - G1ConcurrentRefineStats active_stats_start = _refinement_stats; report_active("Activated"); while (!should_terminate()) { if (sts_join.should_yield()) { - report_inactive("Paused", _refinement_stats - active_stats_start); + report_inactive("Paused"); sts_join.yield(); // Reset after yield rather than accumulating across yields, else a // very long running thread could overflow. - active_stats_start = _refinement_stats; report_active("Resumed"); - } else if (maybe_deactivate()) { - break; + } + // Look if we want to do refinement. If we don't, then don't do any refinement + // this round. This thread may have just woken up but no threads are currently + // needed, which is common. In this case we want to just go back to + // waiting, with a minimum of fuss; in particular, don't do any "premature" + // refinement. However, adjustment may be pending but temporarily + // blocked. In that case we wait for adjustment to succeed.
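For orientation, the reworked run_service() loop amounts to roughly the following control flow (a simplified sketch using the names introduced by this patch, not the literal code):

    while (wait_for_work()) {
      SuspendibleThreadSetJoiner sts_join;
      while (!should_terminate()) {
        if (sts_join.should_yield()) {
          sts_join.yield();                          // pause for the safepoint, then resume
        }
        if (cr()->adjust_num_threads_periodically()) {
          do_refinement();                           // swap card tables, then sweep the refinement table
        } else {
          deactivate();                              // no refinement threads wanted; go back to waiting
          break;
        }
      }
    }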
+ Ticks adjust_start = Ticks::now(); + if (cr()->adjust_num_threads_periodically()) { + GCTraceTime(Info, gc, refine) tm("Concurrent Refine Cycle"); + do_refinement(); } else { - do_refinement_step(); + log_debug(gc, refine)("Concurrent Refine Adjust Only (#threads wanted: %u adjustment_needed: %s wait_for_heap_lock: %s) %.2fms", + cr()->num_threads_wanted(), + BOOL_TO_STR(cr()->is_thread_adjustment_needed()), + BOOL_TO_STR(cr()->heap_was_locked()), + (Ticks::now() - adjust_start).seconds() * MILLIUNITS); + + deactivate(); + break; } } - report_inactive("Deactivated", _refinement_stats - active_stats_start); + report_inactive("Deactivated"); update_perf_counter_cpu_time(); } - log_debug(gc, refine)("Stopping %d", _worker_id); + log_debug(gc, refine)("Stopping %s", name()); } void G1ConcurrentRefineThread::report_active(const char* reason) const { - log_trace(gc, refine)("%s worker %u, current: %zu", - reason, - _worker_id, - G1BarrierSet::dirty_card_queue_set().num_cards()); + log_trace(gc, refine)("%s active (%s)", name(), reason); } -void G1ConcurrentRefineThread::report_inactive(const char* reason, - const G1ConcurrentRefineStats& stats) const { - log_trace(gc, refine) - ("%s worker %u, cards: %zu, refined %zu, rate %1.2fc/ms", - reason, - _worker_id, - G1BarrierSet::dirty_card_queue_set().num_cards(), - stats.refined_cards(), - stats.refinement_rate_ms()); +void G1ConcurrentRefineThread::report_inactive(const char* reason) const { + log_trace(gc, refine)("%s inactive (%s)", name(), reason); } void G1ConcurrentRefineThread::activate() { @@ -103,21 +107,12 @@ void G1ConcurrentRefineThread::activate() { } } -bool G1ConcurrentRefineThread::maybe_deactivate() { +bool G1ConcurrentRefineThread::deactivate() { assert(this == Thread::current(), "precondition"); - if (cr()->is_thread_wanted(_worker_id)) { - return false; - } else { - MutexLocker ml(&_notifier, Mutex::_no_safepoint_check_flag); - bool requested = _requested_active; - _requested_active = false; - return !requested; // Deactivate only if not recently requested active. - } -} - -bool G1ConcurrentRefineThread::try_refinement_step(size_t stop_at) { - assert(this == Thread::current(), "precondition"); - return _cr->try_refinement_step(_worker_id, stop_at, &_refinement_stats); + MutexLocker ml(&_notifier, Mutex::_no_safepoint_check_flag); + bool requested = _requested_active; + _requested_active = false; + return !requested; // Deactivate only if not recently requested active. } void G1ConcurrentRefineThread::stop_service() { @@ -128,23 +123,9 @@ jlong G1ConcurrentRefineThread::cpu_time() { return os::thread_cpu_time(this); } -// The (single) primary thread drives the controller for the refinement threads. -class G1PrimaryConcurrentRefineThread final : public G1ConcurrentRefineThread { - bool wait_for_completed_buffers() override; - bool maybe_deactivate() override; - void do_refinement_step() override; - // Updates jstat cpu usage for all refinement threads. - void update_perf_counter_cpu_time() override; - -public: - G1PrimaryConcurrentRefineThread(G1ConcurrentRefine* cr) : - G1ConcurrentRefineThread(cr, 0) - {} -}; - -// When inactive, the primary thread periodically wakes up and requests -// adjustment of the number of active refinement threads. -bool G1PrimaryConcurrentRefineThread::wait_for_completed_buffers() { +// When inactive, the control thread periodically wakes up to check if there is +// refinement work pending. 
+bool G1ConcurrentRefineThread::wait_for_work() { assert(this == Thread::current(), "precondition"); MonitorLocker ml(notifier(), Mutex::_no_safepoint_check_flag); if (!requested_active() && !should_terminate()) { @@ -157,78 +138,115 @@ return !should_terminate(); } -bool G1PrimaryConcurrentRefineThread::maybe_deactivate() { - // Don't deactivate while needing to adjust the number of active threads. - return !cr()->is_thread_adjustment_needed() && - G1ConcurrentRefineThread::maybe_deactivate(); +void G1ConcurrentRefineThread::do_refinement() { + G1ConcurrentRefineSweepState& state = _cr->sweep_state(); + + state.start_work(); + + // Swap card tables. + + // 1. Global card table + if (!state.swap_global_card_table()) { + log_debug(gc, refine)("GC pause after Global Card Table Swap"); + return; + } + + // 2. Java threads + if (!state.swap_java_threads_ct()) { + log_debug(gc, refine)("GC pause after Java Thread CT swap"); + return; + } + + // 3. GC threads + if (!state.swap_gc_threads_ct()) { + log_debug(gc, refine)("GC pause after GC Thread CT swap"); + return; + } + + G1CollectedHeap* g1h = G1CollectedHeap::heap(); + jlong epoch_yield_duration = g1h->yield_duration_in_refinement_epoch(); + jlong next_epoch_start = os::elapsed_counter(); + + jlong total_yield_during_sweep_duration = 0; + + // 4. Snapshot heap. + state.snapshot_heap(); + + // 5. Sweep refinement table until done. + bool interrupted_by_gc = false; + + log_info(gc, task)("Concurrent Refine Sweep Using %u of %u Workers", _cr->num_threads_wanted(), _cr->max_num_threads()); + + state.sweep_refinement_table_start(); + while (true) { + bool completed = state.sweep_refinement_table_step(); + + if (completed) { + break; + } + + if (SuspendibleThreadSet::should_yield()) { + jlong yield_during_sweep_start = os::elapsed_counter(); + SuspendibleThreadSet::yield(); + + // The yielding may have completed the task, check. + if (!state.is_in_progress()) { + log_debug(gc, refine)("GC completed sweeping, aborting concurrent operation"); + interrupted_by_gc = true; + break; + } else { + jlong yield_during_sweep_duration = os::elapsed_counter() - yield_during_sweep_start; + log_debug(gc, refine)("Yielded from card table sweeping for %.2fms, no GC in between, continue", + TimeHelper::counter_to_millis(yield_during_sweep_duration)); + total_yield_during_sweep_duration += yield_during_sweep_duration; + } + } + } + + if (!interrupted_by_gc) { + GCTraceTime(Info, gc, refine) tm("Concurrent Refine Complete Work"); + + state.add_yield_during_sweep_duration(total_yield_during_sweep_duration); + + state.complete_work(true); + + G1CollectedHeap* g1h = G1CollectedHeap::heap(); + G1Policy* policy = g1h->policy(); + G1ConcurrentRefineStats* stats = state.stats(); + policy->record_refinement_stats(stats); + + { + // The young gen revising mechanism reads the predictor and the values set + // here. Avoid inconsistencies by locking.
+ MutexLocker x(G1ReviseYoungLength_lock, Mutex::_no_safepoint_check_flag); + policy->record_dirtying_stats(TimeHelper::counter_to_millis(G1CollectedHeap::heap()->last_refinement_epoch_start()), + TimeHelper::counter_to_millis(next_epoch_start), + stats->cards_pending(), + TimeHelper::counter_to_millis(epoch_yield_duration), + 0 /* pending_cards_from_gc */, + stats->cards_to_cset()); + G1CollectedHeap::heap()->set_last_refinement_epoch_start(next_epoch_start, epoch_yield_duration); + } + stats->reset(); + } } -void G1PrimaryConcurrentRefineThread::do_refinement_step() { - // Try adjustment first. If it succeeds then don't do any refinement this - // round. This thread may have just woken up but no threads are currently - // needed, which is common. In this case we want to just go back to - // waiting, with a minimum of fuss; in particular, don't do any "premature" - // refinement. However, adjustment may be pending but temporarily - // blocked. In that case we *do* try refinement, rather than possibly - // uselessly spinning while waiting for adjustment to succeed. - if (!cr()->adjust_threads_periodically()) { - // No adjustment, so try refinement, with the target as a cuttoff. - if (!try_refinement_step(cr()->pending_cards_target())) { - // Refinement was cut off, so proceed with fewer threads. - cr()->reduce_threads_wanted(); +void G1ConcurrentRefineThread::update_perf_counter_cpu_time() { + // The control thread is responsible for updating the CPU time for all workers. + if (UsePerfData) { + { + ThreadTotalCPUTimeClosure tttc(CPUTimeGroups::CPUTimeType::gc_conc_refine); + cr()->worker_threads_do(&tttc); + } + { + ThreadTotalCPUTimeClosure tttc(CPUTimeGroups::CPUTimeType::gc_conc_refine_control); + cr()->control_thread_do(&tttc); } } } -void G1PrimaryConcurrentRefineThread::update_perf_counter_cpu_time() { - if (UsePerfData) { - ThreadTotalCPUTimeClosure tttc(CPUTimeGroups::CPUTimeType::gc_conc_refine); - cr()->threads_do(&tttc); - } -} - -class G1SecondaryConcurrentRefineThread final : public G1ConcurrentRefineThread { - bool wait_for_completed_buffers() override; - void do_refinement_step() override; - void update_perf_counter_cpu_time() override { /* Nothing to do. The primary thread does all the work. */ } - -public: - G1SecondaryConcurrentRefineThread(G1ConcurrentRefine* cr, uint worker_id) : - G1ConcurrentRefineThread(cr, worker_id) - { - assert(worker_id > 0, "precondition"); - } -}; - -bool G1SecondaryConcurrentRefineThread::wait_for_completed_buffers() { - assert(this == Thread::current(), "precondition"); - MonitorLocker ml(notifier(), Mutex::_no_safepoint_check_flag); - while (!requested_active() && !should_terminate()) { - ml.wait(); - } - return !should_terminate(); -} - -void G1SecondaryConcurrentRefineThread::do_refinement_step() { - assert(this == Thread::current(), "precondition"); - // Secondary threads ignore the target and just drive the number of pending - // dirty cards down. The primary thread is responsible for noticing the - // target has been reached and reducing the number of wanted threads. This - // makes the control of wanted threads all under the primary, while avoiding - // useless spinning by secondary threads until the primary thread notices. - // (Useless spinning is still possible if there are no pending cards, but - // that should rarely happen.) 
- try_refinement_step(0); -} - -G1ConcurrentRefineThread* -G1ConcurrentRefineThread::create(G1ConcurrentRefine* cr, uint worker_id) { - G1ConcurrentRefineThread* crt; - if (worker_id == 0) { - crt = new (std::nothrow) G1PrimaryConcurrentRefineThread(cr); - } else { - crt = new (std::nothrow) G1SecondaryConcurrentRefineThread(cr, worker_id); - } +G1ConcurrentRefineThread* G1ConcurrentRefineThread::create(G1ConcurrentRefine* cr) { + G1ConcurrentRefineThread* crt = new (std::nothrow) G1ConcurrentRefineThread(cr); if (crt != nullptr) { crt->create_and_start(); } diff --git a/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.hpp b/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.hpp index b1e34e4b78d..8e635247cd3 100644 --- a/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.hpp +++ b/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.hpp @@ -33,8 +33,8 @@ // Forward Decl. class G1ConcurrentRefine; -// One or more G1 Concurrent Refinement Threads may be active if concurrent -// refinement is in progress. +// Concurrent refinement control thread watching card mark accrual on the card table +// and starting refinement work. class G1ConcurrentRefineThread: public ConcurrentGCThread { friend class VMStructs; friend class G1CollectedHeap; @@ -42,43 +42,34 @@ class G1ConcurrentRefineThread: public ConcurrentGCThread { Monitor _notifier; bool _requested_active; - G1ConcurrentRefineStats _refinement_stats; - uint _worker_id; G1ConcurrentRefine* _cr; NONCOPYABLE(G1ConcurrentRefineThread); -protected: - G1ConcurrentRefineThread(G1ConcurrentRefine* cr, uint worker_id); + G1ConcurrentRefineThread(G1ConcurrentRefine* cr); Monitor* notifier() { return &_notifier; } bool requested_active() const { return _requested_active; } // Returns !should_terminate(). // precondition: this is the current thread. - virtual bool wait_for_completed_buffers() = 0; + bool wait_for_work(); // Deactivate if appropriate. Returns true if deactivated. // precondition: this is the current thread. - virtual bool maybe_deactivate(); + bool deactivate(); - // Attempt to do some refinement work. - // precondition: this is the current thread. - virtual void do_refinement_step() = 0; + // Swap card table and do a complete re-examination/refinement pass over the + // refinement table. + void do_refinement(); // Update concurrent refine threads cpu time stats. - virtual void update_perf_counter_cpu_time() = 0; - - // Helper for do_refinement_step implementations. Try to perform some - // refinement work, limited by stop_at. Returns true if any refinement work - // was performed, false if no work available per stop_at. - // precondition: this is the current thread. - bool try_refinement_step(size_t stop_at); + void update_perf_counter_cpu_time(); void report_active(const char* reason) const; - void report_inactive(const char* reason, const G1ConcurrentRefineStats& stats) const; + void report_inactive(const char* reason) const; G1ConcurrentRefine* cr() const { return _cr; } @@ -86,23 +77,12 @@ protected: void stop_service() override; public: - static G1ConcurrentRefineThread* create(G1ConcurrentRefine* cr, uint worker_id); - virtual ~G1ConcurrentRefineThread() = default; - - uint worker_id() const { return _worker_id; } + static G1ConcurrentRefineThread* create(G1ConcurrentRefine* cr); // Activate this thread. // precondition: this is not the current thread. 
void activate(); - G1ConcurrentRefineStats* refinement_stats() { - return &_refinement_stats; - } - - const G1ConcurrentRefineStats* refinement_stats() const { - return &_refinement_stats; - } - // Total cpu time spent in this thread so far. jlong cpu_time(); }; diff --git a/src/hotspot/share/gc/g1/g1ConcurrentRefineThreadsNeeded.cpp b/src/hotspot/share/gc/g1/g1ConcurrentRefineThreadsNeeded.cpp index d34229bd359..3ab26bd72af 100644 --- a/src/hotspot/share/gc/g1/g1ConcurrentRefineThreadsNeeded.cpp +++ b/src/hotspot/share/gc/g1/g1ConcurrentRefineThreadsNeeded.cpp @@ -45,48 +45,22 @@ G1ConcurrentRefineThreadsNeeded::G1ConcurrentRefineThreadsNeeded(G1Policy* polic // // 1. Minimize the number of refinement threads running at once. // -// 2. Minimize the number of activations and deactivations for the -// refinement threads that run. -// -// 3. Delay performing refinement work. Having more dirty cards waiting to +// 2. Delay performing refinement work. Having more dirty cards waiting to // be refined can be beneficial, as further writes to the same card don't // create more work. void G1ConcurrentRefineThreadsNeeded::update(uint active_threads, size_t available_bytes, size_t num_cards, size_t target_num_cards) { + _predicted_time_until_next_gc_ms = _policy->predict_time_to_next_gc_ms(available_bytes); + + // Estimate number of cards that need to be processed before next GC. const G1Analytics* analytics = _policy->analytics(); - // Estimate time until next GC, based on remaining bytes available for - // allocation and the allocation rate. - double alloc_region_rate = analytics->predict_alloc_rate_ms(); - double alloc_bytes_rate = alloc_region_rate * G1HeapRegion::GrainBytes; - if (alloc_bytes_rate == 0.0) { - // A zero rate indicates we don't yet have data to use for predictions. - // Since we don't have any idea how long until the next GC, use a time of - // zero. - _predicted_time_until_next_gc_ms = 0.0; - } else { - // If the heap size is large and the allocation rate is small, we can get - // a predicted time until next GC that is so large it can cause problems - // (such as overflow) in other calculations. Limit the prediction to one - // hour, which is still large in this context. - const double one_hour_ms = 60.0 * 60.0 * MILLIUNITS; - double raw_time_ms = available_bytes / alloc_bytes_rate; - _predicted_time_until_next_gc_ms = MIN2(raw_time_ms, one_hour_ms); - } + double incoming_rate = analytics->predict_dirtied_cards_rate_ms(); + double raw_cards = incoming_rate * _predicted_time_until_next_gc_ms; + size_t incoming_cards = static_cast<size_t>(raw_cards); - // Estimate number of cards that need to be processed before next GC. There - // are no incoming cards when time is short, because in that case the - // controller activates refinement by mutator threads to stay on target even - // if threads deactivate in the meantime. This also covers the case of not - // having a real prediction of time until GC.
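As a concrete example of the estimate above (with made-up numbers, purely for illustration): if predict_dirtied_cards_rate_ms() returns 50 cards/ms and the predicted time until the next GC is 200 ms, then incoming_cards is 50 * 200 = 10000; together with, say, num_cards = 4000 cards currently pending, total_cards = num_cards + incoming_cards = 14000 becomes the value recorded in _predicted_cards_at_next_gc.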
- size_t incoming_cards = 0; - if (_predicted_time_until_next_gc_ms > _update_period_ms) { - double incoming_rate = analytics->predict_dirtied_cards_rate_ms(); - double raw_cards = incoming_rate * _predicted_time_until_next_gc_ms; - incoming_cards = static_cast<size_t>(raw_cards); - } size_t total_cards = num_cards + incoming_cards; _predicted_cards_at_next_gc = total_cards; @@ -100,9 +74,8 @@ void G1ConcurrentRefineThreadsNeeded::update(uint active_threads, // The calculation of the number of threads needed isn't very stable when // time is short, and can lead to starting up lots of threads for not much // profit. If we're in the last update period, don't change the number of - // threads running, other than to treat the current thread as running. That - // might not be sufficient, but hopefully we were already reasonably close. - // We won't accumulate more because mutator refinement will be activated. + // threads needed. That might not be sufficient, but hopefully we were + // already reasonably close. if (_predicted_time_until_next_gc_ms <= _update_period_ms) { _threads_needed = MAX2(active_threads, 1u); return; @@ -133,11 +106,12 @@ void G1ConcurrentRefineThreadsNeeded::update(uint active_threads, // close to the next GC we want to drive toward the target, so round up // then. The rest of the time we round to nearest, trying to remain near // the middle of the range. + double rthreads = nthreads; if (_predicted_time_until_next_gc_ms <= _update_period_ms * 5.0) { - nthreads = ::ceil(nthreads); + rthreads = ::ceil(nthreads); } else { - nthreads = ::round(nthreads); + rthreads = ::round(nthreads); } - _threads_needed = static_cast<uint>(MIN2(nthreads, UINT_MAX)); + _threads_needed = static_cast<uint>(MIN2(rthreads, UINT_MAX)); } diff --git a/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp b/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp deleted file mode 100644 index ec9d68af3bb..00000000000 --- a/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp +++ /dev/null @@ -1,599 +0,0 @@ -/* - * Copyright (c) 2001, 2025, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions.
- * - */ - -#include "gc/g1/g1BarrierSet.inline.hpp" -#include "gc/g1/g1CardTableEntryClosure.hpp" -#include "gc/g1/g1CollectedHeap.inline.hpp" -#include "gc/g1/g1ConcurrentRefineStats.hpp" -#include "gc/g1/g1ConcurrentRefineThread.hpp" -#include "gc/g1/g1DirtyCardQueue.hpp" -#include "gc/g1/g1FreeIdSet.hpp" -#include "gc/g1/g1HeapRegionRemSet.inline.hpp" -#include "gc/g1/g1RedirtyCardsQueue.hpp" -#include "gc/g1/g1RemSet.hpp" -#include "gc/g1/g1ThreadLocalData.hpp" -#include "gc/shared/bufferNode.hpp" -#include "gc/shared/bufferNodeList.hpp" -#include "gc/shared/suspendibleThreadSet.hpp" -#include "memory/iterator.hpp" -#include "runtime/atomicAccess.hpp" -#include "runtime/javaThread.hpp" -#include "runtime/mutex.hpp" -#include "runtime/mutexLocker.hpp" -#include "runtime/os.hpp" -#include "runtime/safepoint.hpp" -#include "runtime/threads.hpp" -#include "runtime/threadSMR.hpp" -#include "utilities/globalCounter.inline.hpp" -#include "utilities/macros.hpp" -#include "utilities/nonblockingQueue.inline.hpp" -#include "utilities/pair.hpp" -#include "utilities/quickSort.hpp" -#include "utilities/ticks.hpp" - -G1DirtyCardQueue::G1DirtyCardQueue(G1DirtyCardQueueSet* qset) : - PtrQueue(qset), - _refinement_stats(new G1ConcurrentRefineStats()) -{ } - -G1DirtyCardQueue::~G1DirtyCardQueue() { - delete _refinement_stats; -} - -// Assumed to be zero by concurrent threads. -static uint par_ids_start() { return 0; } - -G1DirtyCardQueueSet::G1DirtyCardQueueSet(BufferNode::Allocator* allocator) : - PtrQueueSet(allocator), - _num_cards(0), - _mutator_refinement_threshold(SIZE_MAX), - _completed(), - _paused(), - _free_ids(par_ids_start(), num_par_ids()), - _detached_refinement_stats() -{} - -G1DirtyCardQueueSet::~G1DirtyCardQueueSet() { - abandon_completed_buffers(); -} - -// Determines how many mutator threads can process the buffers in parallel. -uint G1DirtyCardQueueSet::num_par_ids() { - return (uint)os::initial_active_processor_count(); -} - -void G1DirtyCardQueueSet::flush_queue(G1DirtyCardQueue& queue) { - if (queue.buffer() != nullptr) { - G1ConcurrentRefineStats* stats = queue.refinement_stats(); - stats->inc_dirtied_cards(queue.size()); - } - PtrQueueSet::flush_queue(queue); -} - -void G1DirtyCardQueueSet::enqueue(G1DirtyCardQueue& queue, - volatile CardValue* card_ptr) { - CardValue* value = const_cast(card_ptr); - if (!try_enqueue(queue, value)) { - handle_zero_index(queue); - retry_enqueue(queue, value); - } -} - -void G1DirtyCardQueueSet::handle_zero_index(G1DirtyCardQueue& queue) { - assert(queue.index() == 0, "precondition"); - BufferNode* old_node = exchange_buffer_with_new(queue); - if (old_node != nullptr) { - assert(old_node->index() == 0, "invariant"); - G1ConcurrentRefineStats* stats = queue.refinement_stats(); - stats->inc_dirtied_cards(old_node->capacity()); - handle_completed_buffer(old_node, stats); - } -} - -void G1DirtyCardQueueSet::handle_zero_index_for_thread(Thread* t) { - G1DirtyCardQueue& queue = G1ThreadLocalData::dirty_card_queue(t); - G1BarrierSet::dirty_card_queue_set().handle_zero_index(queue); -} - -size_t G1DirtyCardQueueSet::num_cards() const { - return AtomicAccess::load(&_num_cards); -} - -void G1DirtyCardQueueSet::enqueue_completed_buffer(BufferNode* cbn) { - assert(cbn != nullptr, "precondition"); - // Increment _num_cards before adding to queue, so queue removal doesn't - // need to deal with _num_cards possibly going negative. - AtomicAccess::add(&_num_cards, cbn->size()); - // Perform push in CS. 
The old tail may be popped while the push is - // observing it (attaching it to the new buffer). We need to ensure it - // can't be reused until the push completes, to avoid ABA problems. - GlobalCounter::CriticalSection cs(Thread::current()); - _completed.push(*cbn); -} - -// Thread-safe attempt to remove and return the first buffer from -// the _completed queue, using the NonblockingQueue::try_pop() underneath. -// It has a limitation that it may return null when there are objects -// in the queue if there is a concurrent push/append operation. -BufferNode* G1DirtyCardQueueSet::dequeue_completed_buffer() { - Thread* current_thread = Thread::current(); - BufferNode* result = nullptr; - while (true) { - // Use GlobalCounter critical section to avoid ABA problem. - // The release of a buffer to its allocator's free list uses - // GlobalCounter::write_synchronize() to coordinate with this - // dequeuing operation. - // We use a CS per iteration, rather than over the whole loop, - // because we're not guaranteed to make progress. Lingering in - // one CS could defer releasing buffer to the free list for reuse, - // leading to excessive allocations. - GlobalCounter::CriticalSection cs(current_thread); - if (_completed.try_pop(&result)) return result; - } -} - -BufferNode* G1DirtyCardQueueSet::get_completed_buffer() { - BufferNode* result = dequeue_completed_buffer(); - if (result == nullptr) { // Unlikely if no paused buffers. - enqueue_previous_paused_buffers(); - result = dequeue_completed_buffer(); - if (result == nullptr) return nullptr; - } - AtomicAccess::sub(&_num_cards, result->size()); - return result; -} - -#ifdef ASSERT -void G1DirtyCardQueueSet::verify_num_cards() const { - size_t actual = 0; - for (BufferNode* cur = _completed.first(); - !_completed.is_end(cur); - cur = cur->next()) { - actual += cur->size(); - } - assert(actual == AtomicAccess::load(&_num_cards), - "Num entries in completed buffers should be %zu but are %zu", - AtomicAccess::load(&_num_cards), actual); -} -#endif // ASSERT - -G1DirtyCardQueueSet::PausedBuffers::PausedList::PausedList() : - _head(nullptr), _tail(nullptr), - _safepoint_id(SafepointSynchronize::safepoint_id()) -{} - -#ifdef ASSERT -G1DirtyCardQueueSet::PausedBuffers::PausedList::~PausedList() { - assert(AtomicAccess::load(&_head) == nullptr, "precondition"); - assert(_tail == nullptr, "precondition"); -} -#endif // ASSERT - -bool G1DirtyCardQueueSet::PausedBuffers::PausedList::is_next() const { - assert_not_at_safepoint(); - return _safepoint_id == SafepointSynchronize::safepoint_id(); -} - -void G1DirtyCardQueueSet::PausedBuffers::PausedList::add(BufferNode* node) { - assert_not_at_safepoint(); - assert(is_next(), "precondition"); - BufferNode* old_head = AtomicAccess::xchg(&_head, node); - if (old_head == nullptr) { - assert(_tail == nullptr, "invariant"); - _tail = node; - } else { - node->set_next(old_head); - } -} - -G1DirtyCardQueueSet::HeadTail G1DirtyCardQueueSet::PausedBuffers::PausedList::take() { - BufferNode* head = AtomicAccess::load(&_head); - BufferNode* tail = _tail; - AtomicAccess::store(&_head, (BufferNode*)nullptr); - _tail = nullptr; - return HeadTail(head, tail); -} - -G1DirtyCardQueueSet::PausedBuffers::PausedBuffers() : _plist(nullptr) {} - -#ifdef ASSERT -G1DirtyCardQueueSet::PausedBuffers::~PausedBuffers() { - assert(AtomicAccess::load(&_plist) == nullptr, "invariant"); -} -#endif // ASSERT - -void G1DirtyCardQueueSet::PausedBuffers::add(BufferNode* node) { - assert_not_at_safepoint(); - PausedList* plist = 
AtomicAccess::load_acquire(&_plist); - if (plist == nullptr) { - // Try to install a new next list. - plist = new PausedList(); - PausedList* old_plist = AtomicAccess::cmpxchg(&_plist, (PausedList*)nullptr, plist); - if (old_plist != nullptr) { - // Some other thread installed a new next list. Use it instead. - delete plist; - plist = old_plist; - } - } - assert(plist->is_next(), "invariant"); - plist->add(node); -} - -G1DirtyCardQueueSet::HeadTail G1DirtyCardQueueSet::PausedBuffers::take_previous() { - assert_not_at_safepoint(); - PausedList* previous; - { - // Deal with plist in a critical section, to prevent it from being - // deleted out from under us by a concurrent take_previous(). - GlobalCounter::CriticalSection cs(Thread::current()); - previous = AtomicAccess::load_acquire(&_plist); - if ((previous == nullptr) || // Nothing to take. - previous->is_next() || // Not from a previous safepoint. - // Some other thread stole it. - (AtomicAccess::cmpxchg(&_plist, previous, (PausedList*)nullptr) != previous)) { - return HeadTail(); - } - } - // We now own previous. - HeadTail result = previous->take(); - // There might be other threads examining previous (in concurrent - // take_previous()). Synchronize to wait until any such threads are - // done with such examination before deleting. - GlobalCounter::write_synchronize(); - delete previous; - return result; -} - -G1DirtyCardQueueSet::HeadTail G1DirtyCardQueueSet::PausedBuffers::take_all() { - assert_at_safepoint(); - HeadTail result; - PausedList* plist = AtomicAccess::load(&_plist); - if (plist != nullptr) { - AtomicAccess::store(&_plist, (PausedList*)nullptr); - result = plist->take(); - delete plist; - } - return result; -} - -void G1DirtyCardQueueSet::record_paused_buffer(BufferNode* node) { - assert_not_at_safepoint(); - assert(node->next() == nullptr, "precondition"); - // Ensure there aren't any paused buffers from a previous safepoint. - enqueue_previous_paused_buffers(); - // Cards for paused buffers are included in count, to contribute to - // notification checking after the coming safepoint if it doesn't GC. - // Note that this means the queue's _num_cards differs from the number - // of cards in the queued buffers when there are paused buffers. - AtomicAccess::add(&_num_cards, node->size()); - _paused.add(node); -} - -void G1DirtyCardQueueSet::enqueue_paused_buffers_aux(const HeadTail& paused) { - if (paused._head != nullptr) { - assert(paused._tail != nullptr, "invariant"); - // Cards from paused buffers are already recorded in the queue count. - _completed.append(*paused._head, *paused._tail); - } -} - -void G1DirtyCardQueueSet::enqueue_previous_paused_buffers() { - assert_not_at_safepoint(); - enqueue_paused_buffers_aux(_paused.take_previous()); -} - -void G1DirtyCardQueueSet::enqueue_all_paused_buffers() { - assert_at_safepoint(); - enqueue_paused_buffers_aux(_paused.take_all()); -} - -void G1DirtyCardQueueSet::abandon_completed_buffers() { - BufferNodeList list = take_all_completed_buffers(); - BufferNode* buffers_to_delete = list._head; - while (buffers_to_delete != nullptr) { - BufferNode* bn = buffers_to_delete; - buffers_to_delete = bn->next(); - bn->set_next(nullptr); - deallocate_buffer(bn); - } -} - -// Merge lists of buffers. The source queue set is emptied as a -// result. The queue sets must share the same allocator. 
-void G1DirtyCardQueueSet::merge_bufferlists(G1RedirtyCardsQueueSet* src) { - assert(allocator() == src->allocator(), "precondition"); - const BufferNodeList from = src->take_all_completed_buffers(); - if (from._head != nullptr) { - AtomicAccess::add(&_num_cards, from._entry_count); - _completed.append(*from._head, *from._tail); - } -} - -BufferNodeList G1DirtyCardQueueSet::take_all_completed_buffers() { - enqueue_all_paused_buffers(); - verify_num_cards(); - Pair pair = _completed.take_all(); - size_t num_cards = AtomicAccess::load(&_num_cards); - AtomicAccess::store(&_num_cards, size_t(0)); - return BufferNodeList(pair.first, pair.second, num_cards); -} - -class G1RefineBufferedCards : public StackObj { - BufferNode* const _node; - CardTable::CardValue** const _node_buffer; - const size_t _node_buffer_capacity; - const uint _worker_id; - G1ConcurrentRefineStats* _stats; - G1RemSet* const _g1rs; - - static inline ptrdiff_t compare_cards(const CardTable::CardValue* p1, - const CardTable::CardValue* p2) { - return p2 - p1; - } - - // Sorts the cards from start_index to _node_buffer_capacity in *decreasing* - // address order. Tests showed that this order is preferable to not sorting - // or increasing address order. - void sort_cards(size_t start_index) { - QuickSort::sort(&_node_buffer[start_index], - _node_buffer_capacity - start_index, - compare_cards); - } - - // Returns the index to the first clean card in the buffer. - size_t clean_cards() { - const size_t start = _node->index(); - assert(start <= _node_buffer_capacity, "invariant"); - - // Two-fingered compaction algorithm similar to the filtering mechanism in - // SATBMarkQueue. The main difference is that clean_card_before_refine() - // could change the buffer element in-place. - // We don't check for SuspendibleThreadSet::should_yield(), because - // cleaning and redirtying the cards is fast. - CardTable::CardValue** src = &_node_buffer[start]; - CardTable::CardValue** dst = &_node_buffer[_node_buffer_capacity]; - assert(src <= dst, "invariant"); - for ( ; src < dst; ++src) { - // Search low to high for a card to keep. - if (_g1rs->clean_card_before_refine(src)) { - // Found keeper. Search high to low for a card to discard. - while (src < --dst) { - if (!_g1rs->clean_card_before_refine(dst)) { - *dst = *src; // Replace discard with keeper. - break; - } - } - // If discard search failed (src == dst), the outer loop will also end. - } - } - - // dst points to the first retained clean card, or the end of the buffer - // if all the cards were discarded. - const size_t first_clean = dst - _node_buffer; - assert(first_clean >= start && first_clean <= _node_buffer_capacity, "invariant"); - // Discarded cards are considered as refined. 
- _stats->inc_refined_cards(first_clean - start); - _stats->inc_precleaned_cards(first_clean - start); - return first_clean; - } - - bool refine_cleaned_cards(size_t start_index) { - bool result = true; - size_t i = start_index; - for ( ; i < _node_buffer_capacity; ++i) { - if (SuspendibleThreadSet::should_yield()) { - redirty_unrefined_cards(i); - result = false; - break; - } - _g1rs->refine_card_concurrently(_node_buffer[i], _worker_id); - } - _node->set_index(i); - _stats->inc_refined_cards(i - start_index); - return result; - } - - void redirty_unrefined_cards(size_t start) { - for ( ; start < _node_buffer_capacity; ++start) { - *_node_buffer[start] = G1CardTable::dirty_card_val(); - } - } - -public: - G1RefineBufferedCards(BufferNode* node, - uint worker_id, - G1ConcurrentRefineStats* stats) : - _node(node), - _node_buffer(reinterpret_cast(BufferNode::make_buffer_from_node(node))), - _node_buffer_capacity(node->capacity()), - _worker_id(worker_id), - _stats(stats), - _g1rs(G1CollectedHeap::heap()->rem_set()) {} - - bool refine() { - size_t first_clean_index = clean_cards(); - if (first_clean_index == _node_buffer_capacity) { - _node->set_index(first_clean_index); - return true; - } - // This fence serves two purposes. First, the cards must be cleaned - // before processing the contents. Second, we can't proceed with - // processing a region until after the read of the region's top in - // collect_and_clean_cards(), for synchronization with possibly concurrent - // humongous object allocation (see comment at the StoreStore fence before - // setting the regions' tops in humongous allocation path). - // It's okay that reading region's top and reading region's type were racy - // wrto each other. We need both set, in any order, to proceed. - OrderAccess::fence(); - sort_cards(first_clean_index); - return refine_cleaned_cards(first_clean_index); - } -}; - -bool G1DirtyCardQueueSet::refine_buffer(BufferNode* node, - uint worker_id, - G1ConcurrentRefineStats* stats) { - Ticks start_time = Ticks::now(); - G1RefineBufferedCards buffered_cards(node, worker_id, stats); - bool result = buffered_cards.refine(); - stats->inc_refinement_time(Ticks::now() - start_time); - return result; -} - -void G1DirtyCardQueueSet::handle_refined_buffer(BufferNode* node, - bool fully_processed) { - if (fully_processed) { - assert(node->is_empty(), "Buffer not fully consumed: index: %zu, size: %zu", - node->index(), node->capacity()); - deallocate_buffer(node); - } else { - assert(!node->is_empty(), "Buffer fully consumed."); - // Buffer incompletely processed because there is a pending safepoint. - // Record partially processed buffer, to be finished later. - record_paused_buffer(node); - } -} - -void G1DirtyCardQueueSet::handle_completed_buffer(BufferNode* new_node, - G1ConcurrentRefineStats* stats) { - enqueue_completed_buffer(new_node); - - // No need for mutator refinement if number of cards is below limit. - if (AtomicAccess::load(&_num_cards) <= AtomicAccess::load(&_mutator_refinement_threshold)) { - return; - } - - // Don't try to process a buffer that will just get immediately paused. - // When going into a safepoint it's just a waste of effort. - // When coming out of a safepoint, Java threads may be running before the - // yield request (for non-Java threads) has been cleared. - if (SuspendibleThreadSet::should_yield()) { - return; - } - - // Only Java threads perform mutator refinement. 
- if (!Thread::current()->is_Java_thread()) { - return; - } - - BufferNode* node = get_completed_buffer(); - if (node == nullptr) return; // Didn't get a buffer to process. - - // Refine cards in buffer. - - uint worker_id = _free_ids.claim_par_id(); // temporarily claim an id - bool fully_processed = refine_buffer(node, worker_id, stats); - _free_ids.release_par_id(worker_id); // release the id - - // Deal with buffer after releasing id, to let another thread use id. - handle_refined_buffer(node, fully_processed); -} - -bool G1DirtyCardQueueSet::refine_completed_buffer_concurrently(uint worker_id, - size_t stop_at, - G1ConcurrentRefineStats* stats) { - // Not enough cards to trigger processing. - if (AtomicAccess::load(&_num_cards) <= stop_at) return false; - - BufferNode* node = get_completed_buffer(); - if (node == nullptr) return false; // Didn't get a buffer to process. - - bool fully_processed = refine_buffer(node, worker_id, stats); - handle_refined_buffer(node, fully_processed); - return true; -} - -void G1DirtyCardQueueSet::abandon_logs_and_stats() { - assert_at_safepoint(); - - // Disable mutator refinement until concurrent refinement decides otherwise. - set_mutator_refinement_threshold(SIZE_MAX); - - // Iterate over all the threads, resetting per-thread queues and stats. - struct AbandonThreadLogClosure : public ThreadClosure { - G1DirtyCardQueueSet& _qset; - AbandonThreadLogClosure(G1DirtyCardQueueSet& qset) : _qset(qset) {} - virtual void do_thread(Thread* t) { - G1DirtyCardQueue& queue = G1ThreadLocalData::dirty_card_queue(t); - _qset.reset_queue(queue); - queue.refinement_stats()->reset(); - } - } closure(*this); - Threads::threads_do(&closure); - - enqueue_all_paused_buffers(); - abandon_completed_buffers(); - - // Reset stats from detached threads. - MutexLocker ml(G1DetachedRefinementStats_lock, Mutex::_no_safepoint_check_flag); - _detached_refinement_stats.reset(); -} - -void G1DirtyCardQueueSet::update_refinement_stats(G1ConcurrentRefineStats& stats) { - assert_at_safepoint(); - - _concatenated_refinement_stats = stats; - - enqueue_all_paused_buffers(); - verify_num_cards(); - - // Collect and reset stats from detached threads. - MutexLocker ml(G1DetachedRefinementStats_lock, Mutex::_no_safepoint_check_flag); - _concatenated_refinement_stats += _detached_refinement_stats; - _detached_refinement_stats.reset(); -} - -G1ConcurrentRefineStats G1DirtyCardQueueSet::concatenate_log_and_stats(Thread* thread) { - assert_at_safepoint(); - - G1DirtyCardQueue& queue = G1ThreadLocalData::dirty_card_queue(thread); - // Flush the buffer if non-empty. Flush before accumulating and - // resetting stats, since flushing may modify the stats. 
- if (!queue.is_empty()) { - flush_queue(queue); - } - - G1ConcurrentRefineStats result = *queue.refinement_stats(); - queue.refinement_stats()->reset(); - return result; -} - -G1ConcurrentRefineStats G1DirtyCardQueueSet::concatenated_refinement_stats() const { - assert_at_safepoint(); - return _concatenated_refinement_stats; -} - -void G1DirtyCardQueueSet::record_detached_refinement_stats(G1ConcurrentRefineStats* stats) { - MutexLocker ml(G1DetachedRefinementStats_lock, Mutex::_no_safepoint_check_flag); - _detached_refinement_stats += *stats; - stats->reset(); -} - -size_t G1DirtyCardQueueSet::mutator_refinement_threshold() const { - return AtomicAccess::load(&_mutator_refinement_threshold); -} - -void G1DirtyCardQueueSet::set_mutator_refinement_threshold(size_t value) { - AtomicAccess::store(&_mutator_refinement_threshold, value); -} diff --git a/src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp b/src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp deleted file mode 100644 index 6beb536df87..00000000000 --- a/src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp +++ /dev/null @@ -1,302 +0,0 @@ -/* - * Copyright (c) 2001, 2024, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - * - */ - -#ifndef SHARE_GC_G1_G1DIRTYCARDQUEUE_HPP -#define SHARE_GC_G1_G1DIRTYCARDQUEUE_HPP - -#include "gc/g1/g1CardTable.hpp" -#include "gc/g1/g1ConcurrentRefineStats.hpp" -#include "gc/g1/g1FreeIdSet.hpp" -#include "gc/shared/bufferNode.hpp" -#include "gc/shared/bufferNodeList.hpp" -#include "gc/shared/ptrQueue.hpp" -#include "memory/allocation.hpp" -#include "memory/padded.hpp" -#include "utilities/nonblockingQueue.hpp" - -class G1PrimaryConcurrentRefineThread; -class G1DirtyCardQueueSet; -class G1RedirtyCardsQueueSet; -class Thread; - -// A ptrQueue whose elements are "oops", pointers to object heads. -class G1DirtyCardQueue: public PtrQueue { - G1ConcurrentRefineStats* _refinement_stats; - -public: - G1DirtyCardQueue(G1DirtyCardQueueSet* qset); - - // Flush before destroying; queue may be used to capture pending work while - // doing something else, with auto-flush on completion. - ~G1DirtyCardQueue(); - - G1ConcurrentRefineStats* refinement_stats() const { - return _refinement_stats; - } - - // Compiler support. 
- static ByteSize byte_offset_of_index() { - return PtrQueue::byte_offset_of_index(); - } - using PtrQueue::byte_width_of_index; - - static ByteSize byte_offset_of_buf() { - return PtrQueue::byte_offset_of_buf(); - } - using PtrQueue::byte_width_of_buf; - -}; - -class G1DirtyCardQueueSet: public PtrQueueSet { - // Head and tail of a list of BufferNodes, linked through their next() - // fields. Similar to BufferNodeList, but without the _entry_count. - struct HeadTail { - BufferNode* _head; - BufferNode* _tail; - HeadTail() : _head(nullptr), _tail(nullptr) {} - HeadTail(BufferNode* head, BufferNode* tail) : _head(head), _tail(tail) {} - }; - - // Concurrent refinement may stop processing in the middle of a buffer if - // there is a pending safepoint, to avoid long delays to safepoint. A - // partially processed buffer needs to be recorded for processing by the - // safepoint if it's a GC safepoint; otherwise it needs to be recorded for - // further concurrent refinement work after the safepoint. But if the - // buffer was obtained from the completed buffer queue then it can't simply - // be added back to the queue, as that would introduce a new source of ABA - // for the queue. - // - // The PausedBuffer object is used to record such buffers for the upcoming - // safepoint, and provides access to the buffers recorded for previous - // safepoints. Before obtaining a buffer from the completed buffers queue, - // we first transfer any buffers from previous safepoints to the queue. - // This is ABA-safe because threads cannot be in the midst of a queue pop - // across a safepoint. - // - // The paused buffers are conceptually an extension of the completed buffers - // queue, and operations which need to deal with all of the queued buffers - // (such as concatenating or abandoning logs) also need to deal with any - // paused buffers. In general, if a safepoint performs a GC then the paused - // buffers will be processed as part of it, and there won't be any paused - // buffers after a GC safepoint. - class PausedBuffers { - class PausedList : public CHeapObj { - BufferNode* volatile _head; - BufferNode* _tail; - size_t _safepoint_id; - - NONCOPYABLE(PausedList); - - public: - PausedList(); - DEBUG_ONLY(~PausedList();) - - // Return true if this list was created to hold buffers for the - // next safepoint. - // precondition: not at safepoint. - bool is_next() const; - - // Thread-safe add the buffer to the list. - // precondition: not at safepoint. - // precondition: is_next(). - void add(BufferNode* node); - - // Take all the buffers from the list. Not thread-safe. - HeadTail take(); - }; - - // The most recently created list, which might be for either the next or - // a previous safepoint, or might be null if the next list hasn't been - // created yet. We only need one list because of the requirement that - // threads calling add() must first ensure there are no paused buffers - // from a previous safepoint. There might be many list instances existing - // at the same time though; there can be many threads competing to create - // and install the next list, and meanwhile there can be a thread dealing - // with the previous list. - PausedList* volatile _plist; - DEFINE_PAD_MINUS_SIZE(1, DEFAULT_PADDING_SIZE, sizeof(PausedList*)); - - NONCOPYABLE(PausedBuffers); - - public: - PausedBuffers(); - DEBUG_ONLY(~PausedBuffers();) - - // Thread-safe add the buffer to paused list for next safepoint. - // precondition: not at safepoint. 
- // precondition: does not have paused buffers from a previous safepoint. - void add(BufferNode* node); - - // Thread-safe take all paused buffers for previous safepoints. - // precondition: not at safepoint. - HeadTail take_previous(); - - // Take all the paused buffers. - // precondition: at safepoint. - HeadTail take_all(); - }; - - DEFINE_PAD_MINUS_SIZE(0, DEFAULT_PADDING_SIZE, 0); - // Upper bound on the number of cards in the completed and paused buffers. - volatile size_t _num_cards; - DEFINE_PAD_MINUS_SIZE(1, DEFAULT_PADDING_SIZE, sizeof(size_t)); - // If the queue contains more cards than configured here, the - // mutator must start doing some of the concurrent refinement work. - volatile size_t _mutator_refinement_threshold; - DEFINE_PAD_MINUS_SIZE(2, DEFAULT_PADDING_SIZE, sizeof(size_t)); - // Buffers ready for refinement. - // NonblockingQueue has inner padding of one cache line. - NonblockingQueue _completed; - // Add a trailer padding after NonblockingQueue. - DEFINE_PAD_MINUS_SIZE(3, DEFAULT_PADDING_SIZE, sizeof(BufferNode*)); - // Buffers for which refinement is temporarily paused. - // PausedBuffers has inner padding, including trailer. - PausedBuffers _paused; - - G1FreeIdSet _free_ids; - - G1ConcurrentRefineStats _concatenated_refinement_stats; - G1ConcurrentRefineStats _detached_refinement_stats; - - // Verify _num_cards == sum of cards in the completed queue. - void verify_num_cards() const NOT_DEBUG_RETURN; - - // Thread-safe add a buffer to paused list for next safepoint. - // precondition: not at safepoint. - void record_paused_buffer(BufferNode* node); - void enqueue_paused_buffers_aux(const HeadTail& paused); - // Thread-safe transfer paused buffers for previous safepoints to the queue. - // precondition: not at safepoint. - void enqueue_previous_paused_buffers(); - // Transfer all paused buffers to the queue. - // precondition: at safepoint. - void enqueue_all_paused_buffers(); - - void abandon_completed_buffers(); - - // Refine the cards in "node" from its index to buffer_capacity. - // Stops processing if SuspendibleThreadSet::should_yield() is true. - // Returns true if the entire buffer was processed, false if there - // is a pending yield request. The node's index is updated to exclude - // the processed elements, e.g. up to the element before processing - // stopped, or one past the last element if the entire buffer was - // processed. Updates stats. - bool refine_buffer(BufferNode* node, - uint worker_id, - G1ConcurrentRefineStats* stats); - - // Deal with buffer after a call to refine_buffer. If fully processed, - // deallocate the buffer. Otherwise, record it as paused. - void handle_refined_buffer(BufferNode* node, bool fully_processed); - - // Thread-safe attempt to remove and return the first buffer from - // the _completed queue. - // Returns null if the queue is empty, or if a concurrent push/append - // interferes. It uses GlobalCounter critical section to avoid ABA problem. - BufferNode* dequeue_completed_buffer(); - // Remove and return a completed buffer from the list, or return null - // if none available. - BufferNode* get_completed_buffer(); - - // Called when queue is full or has no buffer. - void handle_zero_index(G1DirtyCardQueue& queue); - - // Enqueue the buffer, and optionally perform refinement by the mutator. - // Mutator refinement is only done by Java threads, and only if there - // are more than mutator_refinement_threshold cards in the completed buffers. - // Updates stats. 
- // - // Mutator refinement, if performed, stops processing a buffer if - // SuspendibleThreadSet::should_yield(), recording the incompletely - // processed buffer for later processing of the remainder. - void handle_completed_buffer(BufferNode* node, G1ConcurrentRefineStats* stats); - -public: - G1DirtyCardQueueSet(BufferNode::Allocator* allocator); - ~G1DirtyCardQueueSet(); - - // The number of parallel ids that can be claimed to allow collector or - // mutator threads to do card-processing work. - static uint num_par_ids(); - - static void handle_zero_index_for_thread(Thread* t); - - virtual void enqueue_completed_buffer(BufferNode* node); - - // Upper bound on the number of cards currently in this queue set. - // Read without synchronization. The value may be high because there - // is a concurrent modification of the set of buffers. - size_t num_cards() const; - - void merge_bufferlists(G1RedirtyCardsQueueSet* src); - - BufferNodeList take_all_completed_buffers(); - - void flush_queue(G1DirtyCardQueue& queue); - - using CardValue = G1CardTable::CardValue; - void enqueue(G1DirtyCardQueue& queue, volatile CardValue* card_ptr); - - // If there are more than stop_at cards in the completed buffers, pop - // a buffer, refine its contents, and return true. Otherwise return - // false. Updates stats. - // - // Stops processing a buffer if SuspendibleThreadSet::should_yield(), - // recording the incompletely processed buffer for later processing of - // the remainder. - bool refine_completed_buffer_concurrently(uint worker_id, - size_t stop_at, - G1ConcurrentRefineStats* stats); - - // If a full collection is happening, reset per-thread refinement stats and - // partial logs, and release completed logs. The full collection will make - // them all irrelevant. - // precondition: at safepoint. - void abandon_logs_and_stats(); - - // Update global refinement statistics with the ones given and the ones from - // detached threads. - // precondition: at safepoint. - void update_refinement_stats(G1ConcurrentRefineStats& stats); - // Add the given thread's partial logs to the global list and return and reset - // its refinement stats. - // precondition: at safepoint. - G1ConcurrentRefineStats concatenate_log_and_stats(Thread* thread); - - // Return the total of mutator refinement stats for all threads. - // precondition: at safepoint. - // precondition: only call after concatenate_logs_and_stats. - G1ConcurrentRefineStats concatenated_refinement_stats() const; - - // Accumulate refinement stats from threads that are detaching. - void record_detached_refinement_stats(G1ConcurrentRefineStats* stats); - - // Number of cards above which mutator threads should do refinement. - size_t mutator_refinement_threshold() const; - - // Set number of cards above which mutator threads should do refinement. 
- void set_mutator_refinement_threshold(size_t value); -}; - -#endif // SHARE_GC_G1_G1DIRTYCARDQUEUE_HPP diff --git a/src/hotspot/share/gc/g1/g1FromCardCache.cpp b/src/hotspot/share/gc/g1/g1FromCardCache.cpp index 4a29bcbc6dc..8f5c84da0e3 100644 --- a/src/hotspot/share/gc/g1/g1FromCardCache.cpp +++ b/src/hotspot/share/gc/g1/g1FromCardCache.cpp @@ -22,8 +22,6 @@ * */ -#include "gc/g1/g1ConcurrentRefine.hpp" -#include "gc/g1/g1DirtyCardQueue.hpp" #include "gc/g1/g1FromCardCache.hpp" #include "gc/shared/gc_globals.hpp" #include "memory/padded.inline.hpp" @@ -80,7 +78,7 @@ void G1FromCardCache::print(outputStream* out) { #endif uint G1FromCardCache::num_par_rem_sets() { - return G1DirtyCardQueueSet::num_par_ids() + G1ConcRefinementThreads + MAX2(ConcGCThreads, ParallelGCThreads); + return G1ConcRefinementThreads + ConcGCThreads; } void G1FromCardCache::clear(uint region_idx) { diff --git a/src/hotspot/share/gc/g1/g1FullGCCompactTask.cpp b/src/hotspot/share/gc/g1/g1FullGCCompactTask.cpp index cc71cf86172..5dbf70f36b3 100644 --- a/src/hotspot/share/gc/g1/g1FullGCCompactTask.cpp +++ b/src/hotspot/share/gc/g1/g1FullGCCompactTask.cpp @@ -147,6 +147,10 @@ void G1FullGCCompactTask::free_non_overlapping_regions(uint src_start_idx, uint for (uint i = non_overlapping_start; i <= src_end_idx; ++i) { G1HeapRegion* hr = _g1h->region_at(i); + if (VerifyDuringGC) { + // Satisfy some asserts in free_..._region + hr->clear_both_card_tables(); + } _g1h->free_humongous_region(hr, nullptr); } } diff --git a/src/hotspot/share/gc/g1/g1FullGCPrepareTask.inline.hpp b/src/hotspot/share/gc/g1/g1FullGCPrepareTask.inline.hpp index f9868bba678..64d85660ca7 100644 --- a/src/hotspot/share/gc/g1/g1FullGCPrepareTask.inline.hpp +++ b/src/hotspot/share/gc/g1/g1FullGCPrepareTask.inline.hpp @@ -35,6 +35,10 @@ #include "gc/shared/fullGCForwarding.inline.hpp" void G1DetermineCompactionQueueClosure::free_empty_humongous_region(G1HeapRegion* hr) { + if (VerifyDuringGC) { + // Satisfy some asserts in free_..._region. 
+ hr->clear_both_card_tables(); + } _g1h->free_humongous_region(hr, nullptr); _collector->set_free(hr->hrm_index()); add_to_compaction_queue(hr); diff --git a/src/hotspot/share/gc/g1/g1FullGCResetMetadataTask.cpp b/src/hotspot/share/gc/g1/g1FullGCResetMetadataTask.cpp index ae9a78a9cdf..02397392a6e 100644 --- a/src/hotspot/share/gc/g1/g1FullGCResetMetadataTask.cpp +++ b/src/hotspot/share/gc/g1/g1FullGCResetMetadataTask.cpp @@ -32,7 +32,7 @@ G1FullGCResetMetadataTask::G1ResetMetadataClosure::G1ResetMetadataClosure(G1Full void G1FullGCResetMetadataTask::G1ResetMetadataClosure::reset_region_metadata(G1HeapRegion* hr) { hr->rem_set()->clear(); - hr->clear_cardtable(); + hr->clear_both_card_tables(); } bool G1FullGCResetMetadataTask::G1ResetMetadataClosure::do_heap_region(G1HeapRegion* hr) { diff --git a/src/hotspot/share/gc/g1/g1GCPhaseTimes.cpp b/src/hotspot/share/gc/g1/g1GCPhaseTimes.cpp index 15fb65c5700..b211b1e32fb 100644 --- a/src/hotspot/share/gc/g1/g1GCPhaseTimes.cpp +++ b/src/hotspot/share/gc/g1/g1GCPhaseTimes.cpp @@ -50,8 +50,7 @@ G1GCPhaseTimes::G1GCPhaseTimes(STWGCTimer* gc_timer, uint max_gc_threads) : { assert(max_gc_threads > 0, "Must have some GC threads"); - _gc_par_phases[RetireTLABsAndFlushLogs] = new WorkerDataArray("RetireTLABsAndFlushLogs", "JT Retire TLABs And Flush Logs (ms):", max_gc_threads); - _gc_par_phases[NonJavaThreadFlushLogs] = new WorkerDataArray("NonJavaThreadFlushLogs", "Non-JT Flush Logs (ms):", max_gc_threads); + _gc_par_phases[RetireTLABs] = new WorkerDataArray("RetireTLABs", "JavaThread Retire TLABs (ms):", max_gc_threads); _gc_par_phases[GCWorkerStart] = new WorkerDataArray("GCWorkerStart", "GC Worker Start (ms):", max_gc_threads); _gc_par_phases[ExtRootScan] = new WorkerDataArray("ExtRootScan", "Ext Root Scanning (ms):", max_gc_threads); @@ -83,7 +82,7 @@ G1GCPhaseTimes::G1GCPhaseTimes(STWGCTimer* gc_timer, uint max_gc_threads) : _gc_par_phases[OptMergeRS]->create_thread_work_items(GCMergeRSWorkItemsStrings[i], i); } - _gc_par_phases[MergeLB] = new WorkerDataArray("MergeLB", "Log Buffers (ms):", max_gc_threads); + _gc_par_phases[SweepRT] = new WorkerDataArray("SweepRT", "Sweep (ms):", max_gc_threads); _gc_par_phases[ScanHR] = new WorkerDataArray("ScanHR", "Scan Heap Roots (ms):", max_gc_threads); _gc_par_phases[OptScanHR] = new WorkerDataArray("OptScanHR", "Optional Scan Heap Roots (ms):", max_gc_threads); _gc_par_phases[CodeRoots] = new WorkerDataArray("CodeRoots", "Code Root Scan (ms):", max_gc_threads); @@ -98,7 +97,7 @@ G1GCPhaseTimes::G1GCPhaseTimes(STWGCTimer* gc_timer, uint max_gc_threads) : _gc_par_phases[MergePSS] = new WorkerDataArray("MergePSS", "Merge Per-Thread State (ms):", max_gc_threads); _gc_par_phases[RestoreEvacuationFailedRegions] = new WorkerDataArray("RestoreEvacuationFailedRegions", "Restore Evacuation Failed Regions (ms):", max_gc_threads); _gc_par_phases[RemoveSelfForwards] = new WorkerDataArray("RemoveSelfForwards", "Remove Self Forwards (ms):", max_gc_threads); - _gc_par_phases[ClearCardTable] = new WorkerDataArray("ClearLoggedCards", "Clear Logged Cards (ms):", max_gc_threads); + _gc_par_phases[ClearCardTable] = new WorkerDataArray("ClearPendingCards", "Clear Pending Cards (ms):", max_gc_threads); _gc_par_phases[RecalculateUsed] = new WorkerDataArray("RecalculateUsed", "Recalculate Used Memory (ms):", max_gc_threads); #if COMPILER2_OR_JVMCI _gc_par_phases[UpdateDerivedPointers] = new WorkerDataArray("UpdateDerivedPointers", "Update Derived Pointers (ms):", max_gc_threads); @@ -107,11 +106,15 @@ 
G1GCPhaseTimes::G1GCPhaseTimes(STWGCTimer* gc_timer, uint max_gc_threads) : _gc_par_phases[ResetPartialArrayStateManager] = new WorkerDataArray("ResetPartialArrayStateManager", "Reset Partial Array State Manager (ms):", max_gc_threads); _gc_par_phases[ProcessEvacuationFailedRegions] = new WorkerDataArray("ProcessEvacuationFailedRegions", "Process Evacuation Failed Regions (ms):", max_gc_threads); + _gc_par_phases[ScanHR]->create_thread_work_items("Pending Cards:", ScanHRPendingCards); + _gc_par_phases[ScanHR]->create_thread_work_items("Scanned Empty:", ScanHRScannedEmptyCards); _gc_par_phases[ScanHR]->create_thread_work_items("Scanned Cards:", ScanHRScannedCards); _gc_par_phases[ScanHR]->create_thread_work_items("Scanned Blocks:", ScanHRScannedBlocks); _gc_par_phases[ScanHR]->create_thread_work_items("Claimed Chunks:", ScanHRClaimedChunks); _gc_par_phases[ScanHR]->create_thread_work_items("Found Roots:", ScanHRFoundRoots); + _gc_par_phases[OptScanHR]->create_thread_work_items("Pending Cards:", ScanHRPendingCards); + _gc_par_phases[OptScanHR]->create_thread_work_items("Scanned Empty:", ScanHRScannedEmptyCards); _gc_par_phases[OptScanHR]->create_thread_work_items("Scanned Cards:", ScanHRScannedCards); _gc_par_phases[OptScanHR]->create_thread_work_items("Scanned Blocks:", ScanHRScannedBlocks); _gc_par_phases[OptScanHR]->create_thread_work_items("Claimed Chunks:", ScanHRClaimedChunks); @@ -119,9 +122,6 @@ G1GCPhaseTimes::G1GCPhaseTimes(STWGCTimer* gc_timer, uint max_gc_threads) : _gc_par_phases[OptScanHR]->create_thread_work_items("Scanned Refs:", ScanHRScannedOptRefs); _gc_par_phases[OptScanHR]->create_thread_work_items("Used Memory:", ScanHRUsedMemory); - _gc_par_phases[MergeLB]->create_thread_work_items("Dirty Cards:", MergeLBDirtyCards); - _gc_par_phases[MergeLB]->create_thread_work_items("Skipped Cards:", MergeLBSkippedCards); - _gc_par_phases[CodeRoots]->create_thread_work_items("Scanned Nmethods:", CodeRootsScannedNMethods); _gc_par_phases[OptCodeRoots]->create_thread_work_items("Scanned Nmethods:", CodeRootsScannedNMethods); @@ -129,7 +129,10 @@ G1GCPhaseTimes::G1GCPhaseTimes(STWGCTimer* gc_timer, uint max_gc_threads) : _gc_par_phases[MergePSS]->create_thread_work_items("Copied Bytes:", MergePSSCopiedBytes); _gc_par_phases[MergePSS]->create_thread_work_items("LAB Waste:", MergePSSLABWasteBytes); _gc_par_phases[MergePSS]->create_thread_work_items("LAB Undo Waste:", MergePSSLABUndoWasteBytes); - _gc_par_phases[MergePSS]->create_thread_work_items("Evac Fail Extra Cards:", MergePSSEvacFailExtra); + _gc_par_phases[MergePSS]->create_thread_work_items("Pending Cards:", MergePSSPendingCards); + _gc_par_phases[MergePSS]->create_thread_work_items("To-Young-Gen Cards:", MergePSSToYoungGenCards); + _gc_par_phases[MergePSS]->create_thread_work_items("Evac-Fail Cards:", MergePSSEvacFail); + _gc_par_phases[MergePSS]->create_thread_work_items("Marked Cards:", MergePSSMarked); _gc_par_phases[RestoreEvacuationFailedRegions]->create_thread_work_items("Evacuation Failed Regions:", RestoreEvacFailureRegionsEvacFailedNum); _gc_par_phases[RestoreEvacuationFailedRegions]->create_thread_work_items("Pinned Regions:", RestoreEvacFailureRegionsPinnedNum); @@ -150,9 +153,6 @@ G1GCPhaseTimes::G1GCPhaseTimes(STWGCTimer* gc_timer, uint max_gc_threads) : _gc_par_phases[OptTermination]->create_thread_work_items("Optional Termination Attempts:"); - _gc_par_phases[RedirtyCards] = new WorkerDataArray("RedirtyCards", "Redirty Logged Cards (ms):", max_gc_threads); - 
_gc_par_phases[RedirtyCards]->create_thread_work_items("Redirtied Cards:"); - _gc_par_phases[ResizeThreadLABs] = new WorkerDataArray("ResizeTLABs", "Resize TLABs (ms):", max_gc_threads); _gc_par_phases[FreeCollectionSet] = new WorkerDataArray("FreeCSet", "Free Collection Set (ms):", max_gc_threads); @@ -171,9 +171,9 @@ void G1GCPhaseTimes::reset() { _cur_optional_evac_time_ms = 0.0; _cur_collection_nmethod_list_cleanup_time_ms = 0.0; _cur_merge_heap_roots_time_ms = 0.0; + _cur_merge_refinement_table_time_ms = 0.0; _cur_optional_merge_heap_roots_time_ms = 0.0; _cur_prepare_merge_heap_roots_time_ms = 0.0; - _cur_distribute_log_buffers_time_ms = 0.0; _cur_optional_prepare_merge_heap_roots_time_ms = 0.0; _cur_pre_evacuate_prepare_time_ms = 0.0; _cur_post_evacuate_cleanup_1_time_ms = 0.0; @@ -249,7 +249,7 @@ void G1GCPhaseTimes::record_gc_pause_end() { ASSERT_PHASE_UNINITIALIZED(MergeER); ASSERT_PHASE_UNINITIALIZED(MergeRS); ASSERT_PHASE_UNINITIALIZED(OptMergeRS); - ASSERT_PHASE_UNINITIALIZED(MergeLB); + ASSERT_PHASE_UNINITIALIZED(SweepRT); ASSERT_PHASE_UNINITIALIZED(ScanHR); ASSERT_PHASE_UNINITIALIZED(CodeRoots); ASSERT_PHASE_UNINITIALIZED(OptCodeRoots); @@ -425,8 +425,7 @@ double G1GCPhaseTimes::print_pre_evacuate_collection_set() const { } debug_time("Pre Evacuate Prepare", _cur_pre_evacuate_prepare_time_ms); - debug_phase(_gc_par_phases[RetireTLABsAndFlushLogs], 1); - debug_phase(_gc_par_phases[NonJavaThreadFlushLogs], 1); + debug_phase(_gc_par_phases[RetireTLABs], 1); debug_time("Choose Collection Set", (_recorded_young_cset_choice_time_ms + _recorded_non_young_cset_choice_time_ms)); debug_time("Region Register", _cur_region_register_time); @@ -458,8 +457,8 @@ double G1GCPhaseTimes::print_evacuate_initial_collection_set() const { debug_time("Prepare Merge Heap Roots", _cur_prepare_merge_heap_roots_time_ms); debug_phase_merge_remset(); - debug_time("Distribute Log Buffers", _cur_distribute_log_buffers_time_ms); - debug_phase(_gc_par_phases[MergeLB]); + debug_time("Merge Refinement Table", _cur_merge_refinement_table_time_ms); + debug_phase(_gc_par_phases[SweepRT], 1); info_time("Evacuate Collection Set", _cur_collection_initial_evac_time_ms); @@ -521,7 +520,6 @@ double G1GCPhaseTimes::print_post_evacuate_collection_set(bool evacuation_failed if (G1CollectedHeap::heap()->should_sample_collection_set_candidates()) { debug_phase(_gc_par_phases[SampleCollectionSetCandidates], 1); } - debug_phase(_gc_par_phases[RedirtyCards], 1); if (UseTLAB && ResizeTLAB) { debug_phase(_gc_par_phases[ResizeThreadLABs], 1); } diff --git a/src/hotspot/share/gc/g1/g1GCPhaseTimes.hpp b/src/hotspot/share/gc/g1/g1GCPhaseTimes.hpp index 045160a6162..8223148b791 100644 --- a/src/hotspot/share/gc/g1/g1GCPhaseTimes.hpp +++ b/src/hotspot/share/gc/g1/g1GCPhaseTimes.hpp @@ -46,8 +46,7 @@ class G1GCPhaseTimes : public CHeapObj { public: enum GCParPhases { - RetireTLABsAndFlushLogs, - NonJavaThreadFlushLogs, + RetireTLABs, GCWorkerStart, ExtRootScan, ThreadRoots, @@ -59,7 +58,7 @@ class G1GCPhaseTimes : public CHeapObj { MergeER = StrongOopStorageSetRoots + EnumRange().size(), MergeRS, OptMergeRS, - MergeLB, + SweepRT, ScanHR, OptScanHR, CodeRoots, @@ -71,7 +70,6 @@ class G1GCPhaseTimes : public CHeapObj { Other, GCWorkerTotal, GCWorkerEnd, - RedirtyCards, FreeCollectionSet, YoungFreeCSet, NonYoungFreeCSet, @@ -111,16 +109,19 @@ class G1GCPhaseTimes : public CHeapObj { MergeRSHowlArrayOfCards, MergeRSHowlBitmap, MergeRSHowlFull, - MergeRSCards, + MergeRSFromRemSetCards, + MergeRSTotalCards, MergeRSContainersSentinel }; static 
constexpr const char* GCMergeRSWorkItemsStrings[MergeRSContainersSentinel] = { "Merged Inline:", "Merged ArrayOfCards:", "Merged Howl:", "Merged Full:", "Merged Howl Inline:", "Merged Howl ArrayOfCards:", "Merged Howl BitMap:", "Merged Howl Full:", - "Merged Cards:" }; + "Merged From RS Cards:", "Total Cards:" }; enum GCScanHRWorkItems { + ScanHRPendingCards, + ScanHRScannedEmptyCards, ScanHRScannedCards, ScanHRScannedBlocks, ScanHRClaimedChunks, @@ -129,11 +130,6 @@ class G1GCPhaseTimes : public CHeapObj { ScanHRUsedMemory }; - enum GCMergeLBWorkItems { - MergeLBDirtyCards, - MergeLBSkippedCards - }; - enum GCCodeRootsWorkItems { CodeRootsScannedNMethods }; @@ -143,7 +139,10 @@ class G1GCPhaseTimes : public CHeapObj { MergePSSLABSize, MergePSSLABWasteBytes, MergePSSLABUndoWasteBytes, - MergePSSEvacFailExtra + MergePSSPendingCards, // To be scanned cards generated by GC (from cross-references and evacuation failure). + MergePSSToYoungGenCards, // To-young-gen cards generated by GC. + MergePSSEvacFail, // Evacuation failure generated dirty cards by GC. + MergePSSMarked, // Total newly marked cards. }; enum RestoreEvacFailureRegionsWorkItems { @@ -176,9 +175,9 @@ class G1GCPhaseTimes : public CHeapObj { double _cur_collection_nmethod_list_cleanup_time_ms; double _cur_merge_heap_roots_time_ms; + // Merge refinement table time. Note that this time is included in _cur_merge_heap_roots_time_ms. + double _cur_merge_refinement_table_time_ms; double _cur_optional_merge_heap_roots_time_ms; - // Included in above merge and optional-merge time. - double _cur_distribute_log_buffers_time_ms; double _cur_prepare_merge_heap_roots_time_ms; double _cur_optional_prepare_merge_heap_roots_time_ms; @@ -302,6 +301,10 @@ class G1GCPhaseTimes : public CHeapObj { _cur_merge_heap_roots_time_ms += ms; } + void record_merge_refinement_table_time(double ms) { + _cur_merge_refinement_table_time_ms = ms; + } + void record_or_add_optional_merge_heap_roots_time(double ms) { _cur_optional_merge_heap_roots_time_ms += ms; } @@ -310,10 +313,6 @@ class G1GCPhaseTimes : public CHeapObj { _cur_prepare_merge_heap_roots_time_ms += ms; } - void record_distribute_log_buffers_time_ms(double ms) { - _cur_distribute_log_buffers_time_ms += ms; - } - void record_or_add_optional_prepare_merge_heap_roots_time(double ms) { _cur_optional_prepare_merge_heap_roots_time_ms += ms; } @@ -382,10 +381,6 @@ class G1GCPhaseTimes : public CHeapObj { _recorded_prepare_heap_roots_time_ms = recorded_prepare_heap_roots_time_ms; } - double cur_distribute_log_buffers_time_ms() { - return _cur_distribute_log_buffers_time_ms; - } - double cur_collection_par_time_ms() { return _cur_collection_initial_evac_time_ms + _cur_optional_evac_time_ms + @@ -396,6 +391,10 @@ class G1GCPhaseTimes : public CHeapObj { _cur_collection_nmethod_list_cleanup_time_ms; } + double cur_merge_refinement_table_time() const { + return _cur_merge_refinement_table_time_ms; + } + double cur_resize_heap_time_ms() { return _cur_resize_heap_time_ms; } diff --git a/src/hotspot/share/gc/g1/g1HeapRegion.cpp b/src/hotspot/share/gc/g1/g1HeapRegion.cpp index 09bdfefccb7..ca4359dcc24 100644 --- a/src/hotspot/share/gc/g1/g1HeapRegion.cpp +++ b/src/hotspot/share/gc/g1/g1HeapRegion.cpp @@ -39,6 +39,7 @@ #include "logging/log.hpp" #include "logging/logStream.hpp" #include "memory/iterator.inline.hpp" +#include "memory/memRegion.hpp" #include "memory/resourceArea.hpp" #include "oops/access.inline.hpp" #include "oops/compressedOops.inline.hpp" @@ -137,11 +138,21 @@ void G1HeapRegion::hr_clear(bool 
clear_space) { if (clear_space) clear(SpaceDecorator::Mangle); } -void G1HeapRegion::clear_cardtable() { +void G1HeapRegion::clear_card_table() { G1CardTable* ct = G1CollectedHeap::heap()->card_table(); ct->clear_MemRegion(MemRegion(bottom(), end())); } +void G1HeapRegion::clear_refinement_table() { + G1CardTable* ct = G1CollectedHeap::heap()->refinement_table(); + ct->clear_MemRegion(MemRegion(bottom(), end())); +} + +void G1HeapRegion::clear_both_card_tables() { + clear_card_table(); + clear_refinement_table(); +} + void G1HeapRegion::set_free() { if (!is_free()) { report_region_type_change(G1HeapRegionTraceType::Free); @@ -591,8 +602,12 @@ class G1VerifyLiveAndRemSetClosure : public BasicOopIterateClosure { G1HeapRegion* _from; G1HeapRegion* _to; - CardValue _cv_obj; - CardValue _cv_field; + + CardValue _cv_obj_ct; // In card table. + CardValue _cv_field_ct; + + CardValue _cv_obj_rt; // In refinement table. + CardValue _cv_field_rt; RemSetChecker(G1VerifyFailureCounter* failures, oop containing_obj, T* p, oop obj) : Checker(failures, containing_obj, p, obj) { @@ -600,19 +615,23 @@ class G1VerifyLiveAndRemSetClosure : public BasicOopIterateClosure { _to = this->_g1h->heap_region_containing(obj); CardTable* ct = this->_g1h->card_table(); - _cv_obj = *ct->byte_for_const(this->_containing_obj); - _cv_field = *ct->byte_for_const(p); + _cv_obj_ct = *ct->byte_for_const(this->_containing_obj); + _cv_field_ct = *ct->byte_for_const(p); + + ct = this->_g1h->refinement_table(); + _cv_obj_rt = *ct->byte_for_const(this->_containing_obj); + _cv_field_rt = *ct->byte_for_const(p); } bool failed() const { if (_from != _to && !_from->is_young() && _to->rem_set()->is_complete() && _from->rem_set()->cset_group() != _to->rem_set()->cset_group()) { - const CardValue dirty = G1CardTable::dirty_card_val(); + const CardValue clean = G1CardTable::clean_card_val(); return !(_to->rem_set()->contains_reference(this->_p) || (this->_containing_obj->is_objArray() ? - _cv_field == dirty : - _cv_obj == dirty || _cv_field == dirty)); + (_cv_field_ct != clean || _cv_field_rt != clean) : + (_cv_obj_ct != clean || _cv_field_ct != clean || _cv_obj_rt != clean || _cv_field_rt != clean))); } return false; } @@ -630,7 +649,8 @@ class G1VerifyLiveAndRemSetClosure : public BasicOopIterateClosure { log.error("Missing rem set entry:"); this->print_containing_obj(&ls, _from); this->print_referenced_obj(&ls, _to, ""); - log.error("Obj head CV = %d, field CV = %d.", _cv_obj, _cv_field); + log.error("CT obj head CV = %d, field CV = %d.", _cv_obj_ct, _cv_field_ct); + log.error("RT Obj head CV = %d, field CV = %d.", _cv_obj_rt, _cv_field_rt); log.error("----------"); } }; diff --git a/src/hotspot/share/gc/g1/g1HeapRegion.hpp b/src/hotspot/share/gc/g1/g1HeapRegion.hpp index 71584ffb24d..17ec3055b52 100644 --- a/src/hotspot/share/gc/g1/g1HeapRegion.hpp +++ b/src/hotspot/share/gc/g1/g1HeapRegion.hpp @@ -42,7 +42,6 @@ class G1CollectedHeap; class G1CMBitMap; class G1CSetCandidateGroup; class G1Predictions; -class G1HeapRegion; class G1HeapRegionRemSet; class G1HeapRegionSetBase; class nmethod; @@ -478,7 +477,10 @@ public: // Callers must ensure this is not called by multiple threads at the same time. void hr_clear(bool clear_space); // Clear the card table corresponding to this region. - void clear_cardtable(); + void clear_card_table(); + void clear_refinement_table(); + + void clear_both_card_tables(); // Notify the region that an evacuation failure occurred for an object within this // region. 
diff --git a/src/hotspot/share/gc/g1/g1HeapRegionManager.cpp b/src/hotspot/share/gc/g1/g1HeapRegionManager.cpp index d4286a1caeb..795b6543bae 100644 --- a/src/hotspot/share/gc/g1/g1HeapRegionManager.cpp +++ b/src/hotspot/share/gc/g1/g1HeapRegionManager.cpp @@ -63,7 +63,8 @@ public: G1HeapRegionManager::G1HeapRegionManager() : _bot_mapper(nullptr), - _cardtable_mapper(nullptr), + _card_table_mapper(nullptr), + _refinement_table_mapper(nullptr), _committed_map(), _next_highest_used_hrm_index(0), _regions(), _heap_mapper(nullptr), @@ -74,7 +75,8 @@ G1HeapRegionManager::G1HeapRegionManager() : void G1HeapRegionManager::initialize(G1RegionToSpaceMapper* heap_storage, G1RegionToSpaceMapper* bitmap, G1RegionToSpaceMapper* bot, - G1RegionToSpaceMapper* cardtable) { + G1RegionToSpaceMapper* card_table, + G1RegionToSpaceMapper* refinement_table) { _next_highest_used_hrm_index = 0; _heap_mapper = heap_storage; @@ -82,7 +84,8 @@ void G1HeapRegionManager::initialize(G1RegionToSpaceMapper* heap_storage, _bitmap_mapper = bitmap; _bot_mapper = bot; - _cardtable_mapper = cardtable; + _card_table_mapper = card_table; + _refinement_table_mapper = refinement_table; _regions.initialize(heap_storage->reserved(), G1HeapRegion::GrainBytes); @@ -186,7 +189,8 @@ void G1HeapRegionManager::commit_regions(uint index, size_t num_regions, WorkerT _bitmap_mapper->commit_regions(index, num_regions, pretouch_workers); _bot_mapper->commit_regions(index, num_regions, pretouch_workers); - _cardtable_mapper->commit_regions(index, num_regions, pretouch_workers); + _card_table_mapper->commit_regions(index, num_regions, pretouch_workers); + _refinement_table_mapper->commit_regions(index, num_regions, pretouch_workers); } void G1HeapRegionManager::uncommit_regions(uint start, uint num_regions) { @@ -209,7 +213,8 @@ void G1HeapRegionManager::uncommit_regions(uint start, uint num_regions) { _bitmap_mapper->uncommit_regions(start, num_regions); _bot_mapper->uncommit_regions(start, num_regions); - _cardtable_mapper->uncommit_regions(start, num_regions); + _card_table_mapper->uncommit_regions(start, num_regions); + _refinement_table_mapper->uncommit_regions(start, num_regions); _committed_map.uncommit(start, end); } @@ -261,19 +266,23 @@ void G1HeapRegionManager::clear_auxiliary_data_structures(uint start, uint num_r // Signal G1BlockOffsetTable to clear the given regions. _bot_mapper->signal_mapping_changed(start, num_regions); // Signal G1CardTable to clear the given regions. - _cardtable_mapper->signal_mapping_changed(start, num_regions); + _card_table_mapper->signal_mapping_changed(start, num_regions); + // Signal refinement table to clear the given regions. 
+ _refinement_table_mapper->signal_mapping_changed(start, num_regions); } MemoryUsage G1HeapRegionManager::get_auxiliary_data_memory_usage() const { size_t used_sz = _bitmap_mapper->committed_size() + _bot_mapper->committed_size() + - _cardtable_mapper->committed_size(); + _card_table_mapper->committed_size() + + _refinement_table_mapper->committed_size(); size_t committed_sz = _bitmap_mapper->reserved_size() + _bot_mapper->reserved_size() + - _cardtable_mapper->reserved_size(); + _card_table_mapper->reserved_size() + + _refinement_table_mapper->reserved_size(); return MemoryUsage(0, used_sz, committed_sz, committed_sz); } diff --git a/src/hotspot/share/gc/g1/g1HeapRegionManager.hpp b/src/hotspot/share/gc/g1/g1HeapRegionManager.hpp index 19ae9887e94..b4ce3b0a8be 100644 --- a/src/hotspot/share/gc/g1/g1HeapRegionManager.hpp +++ b/src/hotspot/share/gc/g1/g1HeapRegionManager.hpp @@ -74,7 +74,8 @@ class G1HeapRegionManager: public CHeapObj { friend class G1HeapRegionClaimer; G1RegionToSpaceMapper* _bot_mapper; - G1RegionToSpaceMapper* _cardtable_mapper; + G1RegionToSpaceMapper* _card_table_mapper; + G1RegionToSpaceMapper* _refinement_table_mapper; // Keeps track of the currently committed regions in the heap. The committed regions // can either be active (ready for use) or inactive (ready for uncommit). @@ -161,7 +162,8 @@ public: void initialize(G1RegionToSpaceMapper* heap_storage, G1RegionToSpaceMapper* bitmap, G1RegionToSpaceMapper* bot, - G1RegionToSpaceMapper* cardtable); + G1RegionToSpaceMapper* card_table, + G1RegionToSpaceMapper* refinement_table); // Return the "dummy" region used for G1AllocRegion. This is currently a hardwired // new G1HeapRegion that owns G1HeapRegion at index 0. Since at the moment we commit diff --git a/src/hotspot/share/gc/g1/g1HeapVerifier.cpp b/src/hotspot/share/gc/g1/g1HeapVerifier.cpp index c5af7e34dd9..21b3545f7e0 100644 --- a/src/hotspot/share/gc/g1/g1HeapVerifier.cpp +++ b/src/hotspot/share/gc/g1/g1HeapVerifier.cpp @@ -42,6 +42,7 @@ #include "oops/compressedOops.inline.hpp" #include "oops/oop.inline.hpp" #include "runtime/handles.inline.hpp" +#include "runtime/threads.hpp" int G1HeapVerifier::_enabled_verification_types = G1HeapVerifier::G1VerifyAll; @@ -528,6 +529,7 @@ void G1HeapVerifier::verify_before_gc() { void G1HeapVerifier::verify_after_gc() { verify(VerifyOption::G1UseConcMarking, "After GC"); + verify_card_tables_in_sync(); } void G1HeapVerifier::verify_bitmap_clear(bool from_tams) { @@ -556,17 +558,17 @@ void G1HeapVerifier::verify_bitmap_clear(bool from_tams) { G1CollectedHeap::heap()->heap_region_iterate(&cl); } -#ifndef PRODUCT class G1VerifyCardTableCleanup: public G1HeapRegionClosure { G1HeapVerifier* _verifier; public: G1VerifyCardTableCleanup(G1HeapVerifier* verifier) : _verifier(verifier) { } virtual bool do_heap_region(G1HeapRegion* r) { + _verifier->verify_ct_clean_region(r); if (r->is_survivor()) { - _verifier->verify_dirty_region(r); + _verifier->verify_rt_clean_region(r); } else { - _verifier->verify_not_dirty_region(r); + _verifier->verify_rt_clean_from_top(r); } return false; } @@ -579,14 +581,35 @@ void G1HeapVerifier::verify_card_table_cleanup() { } } -void G1HeapVerifier::verify_not_dirty_region(G1HeapRegion* hr) { - // All of the region should be clean. 
- G1CardTable* ct = _g1h->card_table(); - MemRegion mr(hr->bottom(), hr->end()); - ct->verify_not_dirty_region(mr); +class G1VerifyCardTablesClean: public G1HeapRegionClosure { + G1HeapVerifier* _verifier; + bool _both_card_tables; + +public: + G1VerifyCardTablesClean(G1HeapVerifier* verifier, bool both_card_tables = true) + : _verifier(verifier), _both_card_tables(both_card_tables) { } + + virtual bool do_heap_region(G1HeapRegion* r) { + _verifier->verify_rt_clean_region(r); // Must be all Clean from bottom -> end. + if (_both_card_tables) { + _verifier->verify_ct_clean_region(r); + } + return false; + } +}; + +void G1HeapVerifier::verify_card_tables_clean(bool both_card_tables) { + G1VerifyCardTablesClean cl(this, both_card_tables); + _g1h->heap_region_iterate(&cl); } -void G1HeapVerifier::verify_dirty_region(G1HeapRegion* hr) { +void G1HeapVerifier::verify_rt_clean_from_top(G1HeapRegion* hr) { + G1CardTable* ct = _g1h->refinement_table(); + MemRegion mr(align_up(hr->top(), G1CardTable::card_size()), hr->end()); + ct->verify_region(mr, G1CardTable::clean_card_val(), true); +} + +void G1HeapVerifier::verify_rt_dirty_to_dummy_top(G1HeapRegion* hr) { // We cannot guarantee that [bottom(),end()] is dirty. Threads // dirty allocated blocks as they allocate them. The thread that // retires each region and replaces it with a new one will do a @@ -594,29 +617,56 @@ void G1HeapVerifier::verify_dirty_region(G1HeapRegion* hr) { // not dirty that area (one less thing to have to do while holding // a lock). So we can only verify that [bottom(),pre_dummy_top()] // is dirty. - G1CardTable* ct = _g1h->card_table(); + G1CardTable* ct = _g1h->refinement_table(); MemRegion mr(hr->bottom(), hr->pre_dummy_top()); - if (hr->is_young()) { - ct->verify_g1_young_region(mr); - } else { - ct->verify_dirty_region(mr); - } + ct->verify_dirty_region(mr); } -class G1VerifyDirtyYoungListClosure : public G1HeapRegionClosure { -private: - G1HeapVerifier* _verifier; -public: - G1VerifyDirtyYoungListClosure(G1HeapVerifier* verifier) : G1HeapRegionClosure(), _verifier(verifier) { } - virtual bool do_heap_region(G1HeapRegion* r) { - _verifier->verify_dirty_region(r); - return false; - } -}; +void G1HeapVerifier::verify_ct_clean_region(G1HeapRegion* hr) { + G1CardTable* ct = _g1h->card_table(); + MemRegion mr(hr->bottom(), hr->end()); + ct->verify_region(mr, G1CardTable::clean_card_val(), true); +} -void G1HeapVerifier::verify_dirty_young_regions() { - G1VerifyDirtyYoungListClosure cl(this); - _g1h->collection_set()->iterate(&cl); +void G1HeapVerifier::verify_rt_clean_region(G1HeapRegion* hr) { + G1CardTable* ct = _g1h->refinement_table(); + MemRegion mr(hr->bottom(), hr->end()); + ct->verify_region(mr, G1CardTable::clean_card_val(), true); +} + +#ifndef PRODUCT + +void G1HeapVerifier::verify_card_tables_in_sync() { + + // Non-Java thread card tables must be null. + class AssertCardTableBaseNull : public ThreadClosure { + public: + + void do_thread(Thread* thread) { + ResourceMark rm; + assert(G1ThreadLocalData::get_byte_map_base(thread) == nullptr, "thread " PTR_FORMAT " (%s) has non-null card table base", + p2i(thread), thread->name()); + } + } check_null_cl; + + Threads::non_java_threads_do(&check_null_cl); + + // Java thread card tables must be the same as the global card table. 
+ class AssertSameCardTableClosure : public ThreadClosure { + public: + + void do_thread(Thread* thread) { + G1CardTable::CardValue* global_ct_base = G1CollectedHeap::heap()->card_table_base(); + G1CardTable::CardValue* cur_ct_base = G1ThreadLocalData::get_byte_map_base(thread); + + ResourceMark rm; + assert(cur_ct_base == global_ct_base, + "thread " PTR_FORMAT " (%s) has wrong card table base, should be " PTR_FORMAT " is " PTR_FORMAT, + p2i(thread), thread->name(), p2i(global_ct_base), p2i(cur_ct_base)); + } + } check_same_cl; + + Threads::java_threads_do(&check_same_cl); } class G1CheckRegionAttrTableClosure : public G1HeapRegionClosure { diff --git a/src/hotspot/share/gc/g1/g1HeapVerifier.hpp b/src/hotspot/share/gc/g1/g1HeapVerifier.hpp index d4ab4c60214..6a26c77ec0d 100644 --- a/src/hotspot/share/gc/g1/g1HeapVerifier.hpp +++ b/src/hotspot/share/gc/g1/g1HeapVerifier.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2016, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -78,11 +78,16 @@ public: // Do sanity check on the contents of the in-cset fast test table. bool check_region_attr_table() PRODUCT_RETURN_( return true; ); - void verify_card_table_cleanup() PRODUCT_RETURN; + void verify_card_table_cleanup(); + void verify_card_tables_clean(bool both_card_tables); - void verify_not_dirty_region(G1HeapRegion* hr) PRODUCT_RETURN; - void verify_dirty_region(G1HeapRegion* hr) PRODUCT_RETURN; - void verify_dirty_young_regions() PRODUCT_RETURN; + void verify_ct_clean_region(G1HeapRegion* hr); + void verify_rt_dirty_to_dummy_top(G1HeapRegion* hr); + void verify_rt_clean_from_top(G1HeapRegion* hr); + void verify_rt_clean_region(G1HeapRegion* hr); + + // Verify that the global card table and the thread's card tables are in sync. + void verify_card_tables_in_sync() PRODUCT_RETURN; }; #endif // SHARE_GC_G1_G1HEAPVERIFIER_HPP diff --git a/src/hotspot/share/gc/g1/g1OopClosures.hpp b/src/hotspot/share/gc/g1/g1OopClosures.hpp index 3bff668bcec..a61c9d17f70 100644 --- a/src/hotspot/share/gc/g1/g1OopClosures.hpp +++ b/src/hotspot/share/gc/g1/g1OopClosures.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -86,19 +86,19 @@ public: // This closure is applied to the fields of the objects that have just been copied during evacuation. 
class G1ScanEvacuatedObjClosure : public G1ScanClosureBase { - friend class G1SkipCardEnqueueSetter; + friend class G1SkipCardMarkSetter; - enum SkipCardEnqueueTristate { + enum SkipCardMarkTristate { False = 0, True, Uninitialized }; - SkipCardEnqueueTristate _skip_card_enqueue; + SkipCardMarkTristate _skip_card_mark; public: G1ScanEvacuatedObjClosure(G1CollectedHeap* g1h, G1ParScanThreadState* par_scan_state) : - G1ScanClosureBase(g1h, par_scan_state), _skip_card_enqueue(Uninitialized) { } + G1ScanClosureBase(g1h, par_scan_state), _skip_card_mark(Uninitialized) { } template void do_oop_work(T* p); virtual void do_oop(oop* p) { do_oop_work(p); } @@ -109,22 +109,22 @@ public: } #ifdef ASSERT - bool skip_card_enqueue_set() const { return _skip_card_enqueue != Uninitialized; } + bool skip_card_mark_set() const { return _skip_card_mark != Uninitialized; } #endif }; -// RAII object to properly set the _skip_card_enqueue field in G1ScanEvacuatedObjClosure. -class G1SkipCardEnqueueSetter : public StackObj { +// RAII object to properly set the _skip_card_mark field in G1ScanEvacuatedObjClosure. +class G1SkipCardMarkSetter : public StackObj { G1ScanEvacuatedObjClosure* _closure; public: - G1SkipCardEnqueueSetter(G1ScanEvacuatedObjClosure* closure, bool skip_card_enqueue) : _closure(closure) { - assert(_closure->_skip_card_enqueue == G1ScanEvacuatedObjClosure::Uninitialized, "Must not be set"); - _closure->_skip_card_enqueue = skip_card_enqueue ? G1ScanEvacuatedObjClosure::True : G1ScanEvacuatedObjClosure::False; + G1SkipCardMarkSetter(G1ScanEvacuatedObjClosure* closure, bool skip_card_mark) : _closure(closure) { + assert(_closure->_skip_card_mark == G1ScanEvacuatedObjClosure::Uninitialized, "Must not be set"); + _closure->_skip_card_mark = skip_card_mark ? 
G1ScanEvacuatedObjClosure::True : G1ScanEvacuatedObjClosure::False; } - ~G1SkipCardEnqueueSetter() { - DEBUG_ONLY(_closure->_skip_card_enqueue = G1ScanEvacuatedObjClosure::Uninitialized;) + ~G1SkipCardMarkSetter() { + DEBUG_ONLY(_closure->_skip_card_mark = G1ScanEvacuatedObjClosure::Uninitialized;) } }; @@ -206,13 +206,20 @@ public: class G1ConcurrentRefineOopClosure: public BasicOopIterateClosure { G1CollectedHeap* _g1h; uint _worker_id; + bool _has_ref_to_cset; + bool _has_ref_to_old; public: G1ConcurrentRefineOopClosure(G1CollectedHeap* g1h, uint worker_id) : _g1h(g1h), - _worker_id(worker_id) { + _worker_id(worker_id), + _has_ref_to_cset(false), + _has_ref_to_old(false) { } + bool has_ref_to_cset() const { return _has_ref_to_cset; } + bool has_ref_to_old() const { return _has_ref_to_old; } + virtual ReferenceIterationMode reference_iteration_mode() { return DO_FIELDS; } template void do_oop_work(T* p); @@ -223,6 +230,7 @@ public: class G1RebuildRemSetClosure : public BasicOopIterateClosure { G1CollectedHeap* _g1h; uint _worker_id; + public: G1RebuildRemSetClosure(G1CollectedHeap* g1h, uint worker_id) : _g1h(g1h), _worker_id(worker_id) { } diff --git a/src/hotspot/share/gc/g1/g1OopClosures.inline.hpp b/src/hotspot/share/gc/g1/g1OopClosures.inline.hpp index c0c67fda949..87e3a1cc7c4 100644 --- a/src/hotspot/share/gc/g1/g1OopClosures.inline.hpp +++ b/src/hotspot/share/gc/g1/g1OopClosures.inline.hpp @@ -90,11 +90,11 @@ inline void G1ScanEvacuatedObjClosure::do_oop_work(T* p) { prefetch_and_push(p, obj); } else if (!G1HeapRegion::is_in_same_region(p, obj)) { handle_non_cset_obj_common(region_attr, p, obj); - assert(_skip_card_enqueue != Uninitialized, "Scan location has not been initialized."); - if (_skip_card_enqueue == True) { + assert(_skip_card_mark != Uninitialized, "Scan location has not been initialized."); + if (_skip_card_mark == True) { return; } - _par_scan_state->enqueue_card_if_tracked(region_attr, p, obj); + _par_scan_state->mark_card_if_tracked(region_attr, p, obj); } } @@ -127,6 +127,11 @@ inline static void check_obj_during_refinement(T* p, oop const obj) { template inline void G1ConcurrentRefineOopClosure::do_oop_work(T* p) { + // Early out if we already found a to-young reference. 
+ if (_has_ref_to_cset) { + return; + } + T o = RawAccess::oop_load(p); if (CompressedOops::is_null(o)) { return; @@ -146,7 +151,12 @@ inline void G1ConcurrentRefineOopClosure::do_oop_work(T* p) { return; } - G1HeapRegionRemSet* to_rem_set = _g1h->heap_region_containing(obj)->rem_set(); + G1HeapRegion* to_region = _g1h->heap_region_containing(obj); + if (to_region->is_young()) { + _has_ref_to_cset = true; + return; + } + G1HeapRegionRemSet* to_rem_set = to_region->rem_set(); assert(to_rem_set != nullptr, "Need per-region 'into' remsets."); if (to_rem_set->is_tracked()) { @@ -154,6 +164,7 @@ inline void G1ConcurrentRefineOopClosure::do_oop_work(T* p) { if (from->rem_set()->cset_group() != to_rem_set->cset_group()) { to_rem_set->add_reference(p, _worker_id); + _has_ref_to_old = true; } } } @@ -180,7 +191,7 @@ inline void G1ScanCardClosure::do_oop_work(T* p) { _heap_roots_found++; } else if (!G1HeapRegion::is_in_same_region(p, obj)) { handle_non_cset_obj_common(region_attr, p, obj); - _par_scan_state->enqueue_card_if_tracked(region_attr, p, obj); + _par_scan_state->mark_card_if_tracked(region_attr, p, obj); } } @@ -272,10 +283,14 @@ template void G1RebuildRemSetClosure::do_oop_work(T* p) { G1HeapRegion* to = _g1h->heap_region_containing(obj); G1HeapRegionRemSet* rem_set = to->rem_set(); if (rem_set->is_tracked()) { - G1HeapRegion* from = _g1h->heap_region_containing(p); + if (to->is_young()) { + G1BarrierSet::g1_barrier_set()->write_ref_field_post(p); + } else { + G1HeapRegion* from = _g1h->heap_region_containing(p); - if (from->rem_set()->cset_group() != rem_set->cset_group()) { - rem_set->add_reference(p, _worker_id); + if (from->rem_set()->cset_group() != rem_set->cset_group()) { + rem_set->add_reference(p, _worker_id); + } } } } diff --git a/src/hotspot/share/gc/g1/g1ParScanThreadState.cpp b/src/hotspot/share/gc/g1/g1ParScanThreadState.cpp index 42c3a872e6b..80e5fd44fcd 100644 --- a/src/hotspot/share/gc/g1/g1ParScanThreadState.cpp +++ b/src/hotspot/share/gc/g1/g1ParScanThreadState.cpp @@ -57,22 +57,21 @@ #define MAYBE_INLINE_EVACUATION NOT_DEBUG(inline) DEBUG_ONLY(NOINLINE) G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, - G1RedirtyCardsQueueSet* rdcqs, uint worker_id, uint num_workers, G1CollectionSet* collection_set, G1EvacFailureRegions* evac_failure_regions) : _g1h(g1h), _task_queue(g1h->task_queue(worker_id)), - _rdc_local_qset(rdcqs), - _ct(g1h->card_table()), + _ct(g1h->refinement_table()), _closures(nullptr), _plab_allocator(nullptr), _age_table(false), _tenuring_threshold(g1h->policy()->tenuring_threshold()), _scanner(g1h, this), _worker_id(worker_id), - _last_enqueued_card(SIZE_MAX), + _num_cards_marked_dirty(0), + _num_cards_marked_to_cset(0), _stack_trim_upper_threshold(GCDrainStackTargetSize * 2 + 1), _stack_trim_lower_threshold(GCDrainStackTargetSize), _trim_ticks(), @@ -88,7 +87,7 @@ G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, ALLOCATION_FAILURE_INJECTOR_ONLY(_allocation_failure_inject_counter(0) COMMA) _evacuation_failed_info(), _evac_failure_regions(evac_failure_regions), - _evac_failure_enqueued_cards(0) + _num_cards_from_evac_failure(0) { // We allocate number of young gen regions in the collection set plus one // entries, since entry 0 keeps track of surviving bytes for non-young regions. 
@@ -112,8 +111,7 @@ G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, initialize_numa_stats(); } -size_t G1ParScanThreadState::flush_stats(size_t* surviving_young_words, uint num_workers, BufferNodeList* rdc_buffers) { - *rdc_buffers = _rdc_local_qset.flush(); +size_t G1ParScanThreadState::flush_stats(size_t* surviving_young_words, uint num_workers) { flush_numa_stats(); // Update allocation statistics. _plab_allocator->flush_and_retire_stats(num_workers); @@ -147,8 +145,16 @@ size_t G1ParScanThreadState::lab_undo_waste_words() const { return _plab_allocator->undo_waste(); } -size_t G1ParScanThreadState::evac_failure_enqueued_cards() const { - return _evac_failure_enqueued_cards; +size_t G1ParScanThreadState::num_cards_pending() const { + return _num_cards_marked_dirty + _num_cards_from_evac_failure; +} + +size_t G1ParScanThreadState::num_cards_marked() const { + return num_cards_pending() + _num_cards_marked_to_cset; +} + +size_t G1ParScanThreadState::num_cards_from_evac_failure() const { + return _num_cards_from_evac_failure; } #ifdef ASSERT @@ -230,7 +236,7 @@ void G1ParScanThreadState::do_partial_array(PartialArrayState* state, bool stole PartialArraySplitter::Claim claim = _partial_array_splitter.claim(state, _task_queue, stolen); G1HeapRegionAttr dest_attr = _g1h->region_attr(to_array); - G1SkipCardEnqueueSetter x(&_scanner, dest_attr.is_new_survivor()); + G1SkipCardMarkSetter x(&_scanner, dest_attr.is_new_survivor()); // Process claimed task. to_array->oop_iterate_range(&_scanner, checked_cast(claim._start), @@ -250,7 +256,7 @@ void G1ParScanThreadState::start_partial_objarray(oop from_obj, // The source array is unused when processing states. _partial_array_splitter.start(_task_queue, nullptr, to_array, array_length); - assert(_scanner.skip_card_enqueue_set(), "must be"); + assert(_scanner.skip_card_mark_set(), "must be"); // Process the initial chunk. No need to process the type in the // klass, as it will already be handled by processing the built-in // module. @@ -451,7 +457,7 @@ void G1ParScanThreadState::do_iterate_object(oop const obj, _string_dedup_requests.add(old); } - assert(_scanner.skip_card_enqueue_set(), "must be"); + assert(_scanner.skip_card_mark_set(), "must be"); obj->oop_iterate_backwards(&_scanner, klass); } @@ -546,7 +552,7 @@ oop G1ParScanThreadState::do_copy_to_survivor_space(G1HeapRegionAttr const regio // Instead, we use dest_attr.is_young() because the two values are always // equal: successfully allocated young regions must be survivor regions. assert(dest_attr.is_young() == _g1h->heap_region_containing(obj)->is_survivor(), "must be"); - G1SkipCardEnqueueSetter x(&_scanner, dest_attr.is_young()); + G1SkipCardMarkSetter x(&_scanner, dest_attr.is_young()); do_iterate_object(obj, old, klass, region_attr, dest_attr, age); } @@ -569,7 +575,7 @@ G1ParScanThreadState* G1ParScanThreadStateSet::state_for_worker(uint worker_id) assert(worker_id < _num_workers, "out of bounds access"); if (_states[worker_id] == nullptr) { _states[worker_id] = - new G1ParScanThreadState(_g1h, rdcqs(), + new G1ParScanThreadState(_g1h, worker_id, _num_workers, _collection_set, @@ -595,22 +601,24 @@ void G1ParScanThreadStateSet::flush_stats() { // because it resets the PLAB allocator where we get this info from. 
size_t lab_waste_bytes = pss->lab_waste_words() * HeapWordSize; size_t lab_undo_waste_bytes = pss->lab_undo_waste_words() * HeapWordSize; - size_t copied_bytes = pss->flush_stats(_surviving_young_words_total, _num_workers, &_rdc_buffers[worker_id]) * HeapWordSize; - size_t evac_fail_enqueued_cards = pss->evac_failure_enqueued_cards(); + size_t copied_bytes = pss->flush_stats(_surviving_young_words_total, _num_workers) * HeapWordSize; + size_t pending_cards = pss->num_cards_pending(); + size_t to_young_gen_cards = pss->num_cards_marked() - pss->num_cards_pending(); + size_t evac_failure_cards = pss->num_cards_from_evac_failure(); + size_t marked_cards = pss->num_cards_marked(); p->record_or_add_thread_work_item(G1GCPhaseTimes::MergePSS, worker_id, copied_bytes, G1GCPhaseTimes::MergePSSCopiedBytes); p->record_or_add_thread_work_item(G1GCPhaseTimes::MergePSS, worker_id, lab_waste_bytes, G1GCPhaseTimes::MergePSSLABWasteBytes); p->record_or_add_thread_work_item(G1GCPhaseTimes::MergePSS, worker_id, lab_undo_waste_bytes, G1GCPhaseTimes::MergePSSLABUndoWasteBytes); - p->record_or_add_thread_work_item(G1GCPhaseTimes::MergePSS, worker_id, evac_fail_enqueued_cards, G1GCPhaseTimes::MergePSSEvacFailExtra); + p->record_or_add_thread_work_item(G1GCPhaseTimes::MergePSS, worker_id, pending_cards, G1GCPhaseTimes::MergePSSPendingCards); + p->record_or_add_thread_work_item(G1GCPhaseTimes::MergePSS, worker_id, to_young_gen_cards, G1GCPhaseTimes::MergePSSToYoungGenCards); + p->record_or_add_thread_work_item(G1GCPhaseTimes::MergePSS, worker_id, evac_failure_cards, G1GCPhaseTimes::MergePSSEvacFail); + p->record_or_add_thread_work_item(G1GCPhaseTimes::MergePSS, worker_id, marked_cards, G1GCPhaseTimes::MergePSSMarked); delete pss; _states[worker_id] = nullptr; } - G1DirtyCardQueueSet& dcq = G1BarrierSet::dirty_card_queue_set(); - dcq.merge_bufferlists(rdcqs()); - rdcqs()->verify_empty(); - _flushed = true; } @@ -652,7 +660,7 @@ oop G1ParScanThreadState::handle_evacuation_failure_par(oop old, markWord m, Kla // existing closure to scan evacuated objects; since we are iterating from a // collection set region (i.e. never a Survivor region), we always need to // gather cards for this case. 
- G1SkipCardEnqueueSetter x(&_scanner, false /* skip_card_enqueue */); + G1SkipCardMarkSetter x(&_scanner, false /* skip_card_mark */); do_iterate_object(old, old, klass, attr, attr, m.age()); } @@ -709,9 +717,7 @@ G1ParScanThreadStateSet::G1ParScanThreadStateSet(G1CollectedHeap* g1h, G1EvacFailureRegions* evac_failure_regions) : _g1h(g1h), _collection_set(collection_set), - _rdcqs(G1BarrierSet::dirty_card_queue_set().allocator()), _states(NEW_C_HEAP_ARRAY(G1ParScanThreadState*, num_workers, mtGC)), - _rdc_buffers(NEW_C_HEAP_ARRAY(BufferNodeList, num_workers, mtGC)), _surviving_young_words_total(NEW_C_HEAP_ARRAY(size_t, collection_set->young_region_length() + 1, mtGC)), _num_workers(num_workers), _flushed(false), @@ -719,7 +725,6 @@ G1ParScanThreadStateSet::G1ParScanThreadStateSet(G1CollectedHeap* g1h, { for (uint i = 0; i < num_workers; ++i) { _states[i] = nullptr; - _rdc_buffers[i] = BufferNodeList(); } memset(_surviving_young_words_total, 0, (collection_set->young_region_length() + 1) * sizeof(size_t)); } @@ -728,7 +733,6 @@ G1ParScanThreadStateSet::~G1ParScanThreadStateSet() { assert(_flushed, "thread local state from the per thread states should have been flushed"); FREE_C_HEAP_ARRAY(G1ParScanThreadState*, _states); FREE_C_HEAP_ARRAY(size_t, _surviving_young_words_total); - FREE_C_HEAP_ARRAY(BufferNodeList, _rdc_buffers); } #if TASKQUEUE_STATS diff --git a/src/hotspot/share/gc/g1/g1ParScanThreadState.hpp b/src/hotspot/share/gc/g1/g1ParScanThreadState.hpp index 4d569622238..3fb080d40be 100644 --- a/src/hotspot/share/gc/g1/g1ParScanThreadState.hpp +++ b/src/hotspot/share/gc/g1/g1ParScanThreadState.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -27,7 +27,6 @@ #include "gc/g1/g1CollectedHeap.hpp" #include "gc/g1/g1OopClosures.hpp" -#include "gc/g1/g1RedirtyCardsQueue.hpp" #include "gc/g1/g1YoungGCAllocationFailureInjector.hpp" #include "gc/shared/ageTable.hpp" #include "gc/shared/copyFailedInfo.hpp" @@ -52,7 +51,6 @@ class outputStream; class G1ParScanThreadState : public CHeapObj { G1CollectedHeap* _g1h; G1ScannerTasksQueue* _task_queue; - G1RedirtyCardsLocalQueueSet _rdc_local_qset; G1CardTable* _ct; G1EvacuationRootClosures* _closures; @@ -65,9 +63,8 @@ class G1ParScanThreadState : public CHeapObj { uint _worker_id; - // Remember the last enqueued card to avoid enqueuing the same card over and over; - // since we only ever scan a card once, this is sufficient. - size_t _last_enqueued_card; + size_t _num_cards_marked_dirty; + size_t _num_cards_marked_to_cset; // Upper and lower threshold to start and end work queue draining. uint const _stack_trim_upper_threshold; @@ -104,22 +101,19 @@ class G1ParScanThreadState : public CHeapObj { EvacuationFailedInfo _evacuation_failed_info; G1EvacFailureRegions* _evac_failure_regions; - // Number of additional cards into evacuation failed regions enqueued into - // the local DCQS. This is an approximation, as cards that would be added later - // outside of evacuation failure will not be subtracted again. - size_t _evac_failure_enqueued_cards; + // Number of additional cards into evacuation failed regions. 
+ size_t _num_cards_from_evac_failure; - // Enqueue the card if not already in the set; this is a best-effort attempt on + // Mark the card if not already in the set; this is a best-effort attempt on // detecting duplicates. - template bool enqueue_if_new(T* p); - // Enqueue the card of p into the (evacuation failed) region. - template void enqueue_card_into_evac_fail_region(T* p, oop obj); + template bool mark_if_new(T* p, bool into_survivor); + // Mark the card of p into the (evacuation failed) region. + template void mark_card_into_evac_fail_region(T* p, oop obj); bool inject_allocation_failure(uint region_idx) ALLOCATION_FAILURE_INJECTOR_RETURN_( return false; ); public: G1ParScanThreadState(G1CollectedHeap* g1h, - G1RedirtyCardsQueueSet* rdcqs, uint worker_id, uint num_workers, G1CollectionSet* collection_set, @@ -139,16 +133,16 @@ public: void push_on_queue(ScannerTask task); - // Apply the post barrier to the given reference field. Enqueues the card of p + // Apply the post barrier to the given reference field. Marks the card of p // if the barrier does not filter out the reference for some reason (e.g. // p and q are in the same region, p is in survivor, p is in collection set) // To be called during GC if nothing particular about p and obj are known. template void write_ref_field_post(T* p, oop obj); - // Enqueue the card if the reference's target region's remembered set is tracked. + // Mark the card if the reference's target region's remembered set is tracked. // Assumes that a significant amount of pre-filtering (like done by // write_ref_field_post() above) has already been performed. - template void enqueue_card_if_tracked(G1HeapRegionAttr region_attr, T* p, oop o); + template void mark_card_if_tracked(G1HeapRegionAttr region_attr, T* p, oop o); G1EvacuationRootClosures* closures() { return _closures; } uint worker_id() { return _worker_id; } @@ -156,11 +150,22 @@ public: size_t lab_waste_words() const; size_t lab_undo_waste_words() const; - size_t evac_failure_enqueued_cards() const; + // Newly marked cards during this garbage collection, to be refined concurrently + // later. Contains both marks generated by new cross-region references as well + // as cards generated from regions into evacuation failed regions. + // Does not contain cards into the next collection set (e.g. survivors) - they will not + // be refined concurrently. Calculation is done on a best-effort basis. + size_t num_cards_pending() const; + // Number of cards newly generated by references into evacuation failed regions. + // Calculation is done on a best-effort basis. + size_t num_cards_from_evac_failure() const; + // Sum of cards marked by evacuation. Contains both pending cards as well as cards + // into the next collection set (e.g. survivors). + size_t num_cards_marked() const; // Pass locally gathered statistics to global state. Returns the total number of // HeapWords copied. 
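[Editorial illustration] The accessor comments above describe the counter relationships only in prose; the arithmetic they imply, and that the MergePSS hunk earlier uses to derive the to-young-gen share, can be stated directly. A minimal standalone sketch, with all counter values hypothetical (only the relationships come from the comments above):

  #include <cassert>
  #include <cstddef>

  int main() {
    // Hypothetical per-thread totals gathered during one evacuation.
    size_t cards_marked_dirty   = 120;  // cross-region refs into tracked old regions
    size_t cards_from_evac_fail = 30;   // refs into evacuation-failed regions
    size_t cards_to_young_gen   = 50;   // refs into survivors / next collection set

    // num_cards_pending(): cards left for concurrent refinement after the pause.
    size_t pending = cards_marked_dirty + cards_from_evac_fail;
    // num_cards_marked(): everything evacuation wrote to the card table.
    size_t marked = pending + cards_to_young_gen;

    // The MergePSS accounting derives the to-young-gen share by subtraction,
    // exactly as in "num_cards_marked() - num_cards_pending()" earlier.
    assert(marked - pending == cards_to_young_gen);
    return 0;
  }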
- size_t flush_stats(size_t* surviving_young_words, uint num_workers, BufferNodeList* buffer_log); + size_t flush_stats(size_t* surviving_young_words, uint num_workers); #if TASKQUEUE_STATS PartialArrayTaskStats* partial_array_task_stats(); @@ -249,9 +254,7 @@ public: class G1ParScanThreadStateSet : public StackObj { G1CollectedHeap* _g1h; G1CollectionSet* _collection_set; - G1RedirtyCardsQueueSet _rdcqs; G1ParScanThreadState** _states; - BufferNodeList* _rdc_buffers; size_t* _surviving_young_words_total; uint _num_workers; bool _flushed; @@ -264,9 +267,6 @@ class G1ParScanThreadStateSet : public StackObj { G1EvacFailureRegions* evac_failure_regions); ~G1ParScanThreadStateSet(); - G1RedirtyCardsQueueSet* rdcqs() { return &_rdcqs; } - BufferNodeList* rdc_buffers() { return _rdc_buffers; } - void flush_stats(); void record_unused_optional_region(G1HeapRegion* hr); #if TASKQUEUE_STATS diff --git a/src/hotspot/share/gc/g1/g1ParScanThreadState.inline.hpp b/src/hotspot/share/gc/g1/g1ParScanThreadState.inline.hpp index 148284e7ef7..ee5bc93290e 100644 --- a/src/hotspot/share/gc/g1/g1ParScanThreadState.inline.hpp +++ b/src/hotspot/share/gc/g1/g1ParScanThreadState.inline.hpp @@ -96,25 +96,24 @@ G1OopStarChunkedList* G1ParScanThreadState::oops_into_optional_region(const G1He return &_oops_into_optional_regions[hr->index_in_opt_cset()]; } -template bool G1ParScanThreadState::enqueue_if_new(T* p) { - size_t card_index = ct()->index_for(p); - // If the card hasn't been added to the buffer, do it. - if (_last_enqueued_card != card_index) { - _rdc_local_qset.enqueue(ct()->byte_for_index(card_index)); - _last_enqueued_card = card_index; +template bool G1ParScanThreadState::mark_if_new(T* p, bool into_new_survivor) { + G1CardTable::CardValue* card = ct()->byte_for(p); + G1CardTable::CardValue value = *card; + if (value == G1CardTable::clean_card_val()) { + *card = into_new_survivor ? G1CardTable::g1_to_cset_card : G1CardTable::g1_dirty_card; return true; } else { return false; } } -template void G1ParScanThreadState::enqueue_card_into_evac_fail_region(T* p, oop obj) { +template void G1ParScanThreadState::mark_card_into_evac_fail_region(T* p, oop obj) { assert(!G1HeapRegion::is_in_same_region(p, obj), "Should have filtered out cross-region references already."); assert(!_g1h->heap_region_containing(p)->is_survivor(), "Should have filtered out from-newly allocated survivor references already."); assert(_g1h->heap_region_containing(obj)->in_collection_set(), "Only for enqeueing reference into collection set region"); - if (enqueue_if_new(p)) { - _evac_failure_enqueued_cards++; + if (mark_if_new(p, false /* into_new_survivor */)) { // The reference is never into survivor regions. 
+ _num_cards_from_evac_failure++; } } @@ -137,18 +136,18 @@ template void G1ParScanThreadState::write_ref_field_post(T* p, oop obj if (dest_attr.is_in_cset()) { assert(obj->is_forwarded(), "evac-failed but not forwarded: " PTR_FORMAT, p2i(obj)); assert(obj->forwardee() == obj, "evac-failed but not self-forwarded: " PTR_FORMAT, p2i(obj)); - enqueue_card_into_evac_fail_region(p, obj); + mark_card_into_evac_fail_region(p, obj); return; } - enqueue_card_if_tracked(dest_attr, p, obj); + mark_card_if_tracked(dest_attr, p, obj); } -template void G1ParScanThreadState::enqueue_card_if_tracked(G1HeapRegionAttr region_attr, T* p, oop o) { +template void G1ParScanThreadState::mark_card_if_tracked(G1HeapRegionAttr region_attr, T* p, oop o) { assert(!G1HeapRegion::is_in_same_region(p, o), "Should have filtered out cross-region references already."); assert(!_g1h->heap_region_containing(p)->is_survivor(), "Should have filtered out from-newly allocated survivor references already."); // We relabel all regions that failed evacuation as old gen without remembered, // and so pre-filter them out in the caller. - assert(!_g1h->heap_region_containing(o)->in_collection_set(), "Should not try to enqueue reference into collection set region"); + assert(!_g1h->heap_region_containing(o)->in_collection_set(), "Should not try to mark reference into collection set region"); #ifdef ASSERT G1HeapRegion* const hr_obj = _g1h->heap_region_containing(o); @@ -161,7 +160,14 @@ template void G1ParScanThreadState::enqueue_card_if_tracked(G1HeapRegi if (!region_attr.remset_is_tracked()) { return; } - enqueue_if_new(p); + bool into_survivor = region_attr.is_new_survivor(); + if (mark_if_new(p, into_survivor)) { + if (into_survivor) { + _num_cards_marked_to_cset++; + } else { + _num_cards_marked_dirty++; + } + } } #endif // SHARE_GC_G1_G1PARSCANTHREADSTATE_INLINE_HPP diff --git a/src/hotspot/share/gc/g1/g1Policy.cpp b/src/hotspot/share/gc/g1/g1Policy.cpp index 9f872aa6ccd..754cc502031 100644 --- a/src/hotspot/share/gc/g1/g1Policy.cpp +++ b/src/hotspot/share/gc/g1/g1Policy.cpp @@ -67,8 +67,7 @@ G1Policy::G1Policy(STWGCTimer* gc_timer) : _reserve_regions(0), _young_gen_sizer(), _free_regions_at_end_of_collection(0), - _card_rs_length(0), - _pending_cards_at_gc_start(0), + _pending_cards_from_gc(0), _concurrent_start_to_mixed(), _collection_set(nullptr), _g1h(nullptr), @@ -553,12 +552,9 @@ G1GCPhaseTimes* G1Policy::phase_times() const { return _phase_times; } -void G1Policy::revise_young_list_target_length(size_t card_rs_length, size_t code_root_rs_length) { +void G1Policy::revise_young_list_target_length(size_t pending_cards, size_t card_rs_length, size_t code_root_rs_length) { guarantee(use_adaptive_young_list_length(), "should not call this otherwise" ); - size_t thread_buffer_cards = _analytics->predict_dirtied_cards_in_thread_buffers(); - G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set(); - size_t pending_cards = dcqs.num_cards() + thread_buffer_cards; update_young_length_bounds(pending_cards, card_rs_length, code_root_rs_length); } @@ -567,7 +563,7 @@ void G1Policy::record_full_collection_start() { // Release the future to-space so that it is available for compaction into. 
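[Editorial illustration] The g1ParScanThreadState.inline.hpp hunks above replace the old enqueue path with a best-effort "write only if still clean" card transition that uses two distinct mark values. A standalone sketch of that idea using plain bytes in place of the real card table; the values and constant names here are invented for illustration and are not the actual G1CardTable constants:

  #include <cstdint>
  #include <cstdio>

  // Illustrative card values; the real constants live in G1CardTable.
  enum : uint8_t { clean_card = 0xff, dirty_card = 0x01, to_cset_card = 0x02 };

  // Best-effort duplicate filter: only the first marker of a clean card "wins";
  // later writes to the same card see a non-clean value and do nothing.
  static bool mark_if_new(uint8_t* card, bool into_new_survivor) {
    if (*card != clean_card) {
      return false;                    // already marked, treat as duplicate
    }
    *card = into_new_survivor ? to_cset_card : dirty_card;
    return true;
  }

  int main() {
    uint8_t card_table[4] = { clean_card, clean_card, clean_card, clean_card };
    bool first  = mark_if_new(&card_table[1], false);  // marks the card dirty
    bool second = mark_if_new(&card_table[1], false);  // duplicate, filtered out
    printf("first=%d second=%d value=0x%02x\n", first, second, (unsigned)card_table[1]);
    return 0;
  }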
collector_state()->set_in_young_only_phase(false); collector_state()->set_in_full_gc(true); - _pending_cards_at_gc_start = 0; + _collection_set->abandon_all_candidates(); } void G1Policy::record_full_collection_end() { @@ -600,59 +596,70 @@ void G1Policy::record_full_collection_end() { record_pause(G1GCPauseType::FullGC, start_time_sec, end_sec); } -static void log_refinement_stats(const char* kind, const G1ConcurrentRefineStats& stats) { +static void log_refinement_stats(const G1ConcurrentRefineStats& stats) { log_debug(gc, refine, stats) - ("%s refinement: %.2fms, refined: %zu" - ", precleaned: %zu, dirtied: %zu", - kind, - stats.refinement_time().seconds() * MILLIUNITS, + ("Refinement: sweep: %.2fms, yield: %.2fms refined: %zu, dirtied: %zu", + TimeHelper::counter_to_millis(stats.sweep_duration()), + TimeHelper::counter_to_millis(stats.yield_during_sweep_duration()), stats.refined_cards(), - stats.precleaned_cards(), - stats.dirtied_cards()); + stats.cards_pending()); } -void G1Policy::record_concurrent_refinement_stats(size_t pending_cards, - size_t thread_buffer_cards) { - _pending_cards_at_gc_start = pending_cards; - _analytics->report_dirtied_cards_in_thread_buffers(thread_buffer_cards); - - // Collect per-thread stats, mostly from mutator activity. - G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set(); - G1ConcurrentRefineStats mut_stats = dcqs.concatenated_refinement_stats(); - - // Collect specialized concurrent refinement thread stats. - G1ConcurrentRefine* cr = _g1h->concurrent_refine(); - G1ConcurrentRefineStats cr_stats = cr->get_and_reset_refinement_stats(); - - G1ConcurrentRefineStats total_stats = mut_stats + cr_stats; - - log_refinement_stats("Mutator", mut_stats); - log_refinement_stats("Concurrent", cr_stats); - log_refinement_stats("Total", total_stats); +void G1Policy::record_refinement_stats(G1ConcurrentRefineStats* refine_stats) { + log_refinement_stats(*refine_stats); // Record the rate at which cards were refined. - // Don't update the rate if the current sample is empty or time is zero. - Tickspan refinement_time = total_stats.refinement_time(); - size_t refined_cards = total_stats.refined_cards(); - if ((refined_cards > 0) && (refinement_time > Tickspan())) { - double rate = refined_cards / (refinement_time.seconds() * MILLIUNITS); + // Don't update the rate if the current sample is empty or time is zero (which is + // the case during GC). + double refinement_time = TimeHelper::counter_to_millis(refine_stats->sweep_duration()); + size_t refined_cards = refine_stats->refined_cards(); + if ((refined_cards > 0) && (refinement_time > 0)) { + double rate = refined_cards / refinement_time; _analytics->report_concurrent_refine_rate_ms(rate); - log_debug(gc, refine, stats)("Concurrent refinement rate: %.2f cards/ms", rate); + log_debug(gc, refine, stats)("Concurrent refinement rate: %.2f cards/ms predicted: %.2f cards/ms", rate, _analytics->predict_concurrent_refine_rate_ms()); } +} +template +static T saturated_sub(T x, T y) { + return (x < y) ? 
T() : (x - y); +} + +void G1Policy::record_dirtying_stats(double last_mutator_start_dirty_ms, + double last_mutator_end_dirty_ms, + size_t pending_cards, + double yield_duration_ms, + size_t next_pending_cards_from_gc, + size_t next_to_collection_set_cards) { + assert(SafepointSynchronize::is_at_safepoint() || G1ReviseYoungLength_lock->is_locked(), + "must be (at safepoint %s locked %s)", + BOOL_TO_STR(SafepointSynchronize::is_at_safepoint()), BOOL_TO_STR(G1ReviseYoungLength_lock->is_locked())); // Record mutator's card logging rate. - double mut_start_time = _analytics->prev_collection_pause_end_ms(); - double mut_end_time = cur_pause_start_sec() * MILLIUNITS; - double mut_time = mut_end_time - mut_start_time; + // Unlike above for conc-refine rate, here we should not require a // non-empty sample, since an application could go some time with only // young-gen or filtered out writes. But we'll ignore unusually short // sample periods, as they may just pollute the predictions. - if (mut_time > 1.0) { // Require > 1ms sample time. - double dirtied_rate = total_stats.dirtied_cards() / mut_time; + double const mutator_dirty_time_ms = (last_mutator_end_dirty_ms - last_mutator_start_dirty_ms) - yield_duration_ms; + assert(mutator_dirty_time_ms >= 0.0, + "must be (start: %.2f end: %.2f yield: %.2f)", + last_mutator_start_dirty_ms, last_mutator_end_dirty_ms, yield_duration_ms); + + if (mutator_dirty_time_ms > 1.0) { // Require > 1ms sample time. + // The subtractive term is pending_cards_from_gc() which includes both dirtied and dirty-as-young cards, + // which can be larger than what is actually considered as "pending" (dirty cards only). + size_t dirtied_cards = saturated_sub(pending_cards, pending_cards_from_gc()); + double dirtied_rate = dirtied_cards / mutator_dirty_time_ms; _analytics->report_dirtied_cards_rate_ms(dirtied_rate); - log_debug(gc, refine, stats)("Generate dirty cards rate: %.2f cards/ms", dirtied_rate); + log_debug(gc, refine, stats)("Generate dirty cards rate: %.2f cards/ms dirtying time %.2f (start %.2f end %.2f yield %.2f) dirtied %zu (pending %zu during_gc %zu)", + dirtied_rate, + mutator_dirty_time_ms, + last_mutator_start_dirty_ms, last_mutator_end_dirty_ms, yield_duration_ms, + dirtied_cards, pending_cards, pending_cards_from_gc()); } + + _pending_cards_from_gc = next_pending_cards_from_gc; + _to_collection_set_cards = next_to_collection_set_cards; } bool G1Policy::should_retain_evac_failed_region(uint index) const { @@ -761,27 +768,27 @@ bool G1Policy::concurrent_operation_is_full_mark(const char* msg) { ((_g1h->gc_cause() != GCCause::_g1_humongous_allocation) || need_to_start_conc_mark(msg)); } -double G1Policy::logged_cards_processing_time() const { +double G1Policy::pending_cards_processing_time() const { double all_cards_processing_time = average_time_ms(G1GCPhaseTimes::ScanHR) + average_time_ms(G1GCPhaseTimes::OptScanHR); - size_t logged_dirty_cards = phase_times()->sum_thread_work_items(G1GCPhaseTimes::MergeLB, G1GCPhaseTimes::MergeLBDirtyCards); + size_t pending_cards = phase_times()->sum_thread_work_items(G1GCPhaseTimes::ScanHR, G1GCPhaseTimes::ScanHRPendingCards) + + phase_times()->sum_thread_work_items(G1GCPhaseTimes::OptScanHR, G1GCPhaseTimes::ScanHRPendingCards); size_t scan_heap_roots_cards = phase_times()->sum_thread_work_items(G1GCPhaseTimes::ScanHR, G1GCPhaseTimes::ScanHRScannedCards) + phase_times()->sum_thread_work_items(G1GCPhaseTimes::OptScanHR, G1GCPhaseTimes::ScanHRScannedCards); - double merge_logged_cards_time = 
average_time_ms(G1GCPhaseTimes::MergeLB) + - phase_times()->cur_distribute_log_buffers_time_ms(); + double merge_pending_cards_time = phase_times()->cur_merge_refinement_table_time(); - // Approximate the time spent processing cards from log buffers by scaling - // the total processing time by the ratio of logged cards to total cards + // Approximate the time spent processing cards from pending cards by scaling + // the total processing time by the ratio of pending cards to total cards // processed. There might be duplicate cards in different log buffers, // leading to an overestimate. That effect should be relatively small // unless there are few cards to process, because cards in buffers are // dirtied to limit duplication. Also need to avoid scaling when both // counts are zero, which happens especially during early GCs. So ascribe - // all of the time to the logged cards unless there are more total cards. - if (logged_dirty_cards >= scan_heap_roots_cards) { - return all_cards_processing_time + merge_logged_cards_time; + // all of the time to the pending cards unless there are more total cards. + if (pending_cards >= scan_heap_roots_cards) { + return all_cards_processing_time + merge_pending_cards_time; } - return (all_cards_processing_time * logged_dirty_cards / scan_heap_roots_cards) + merge_logged_cards_time; + return (all_cards_processing_time * pending_cards / scan_heap_roots_cards) + merge_pending_cards_time; } // Anything below that is considered to be zero @@ -815,6 +822,22 @@ void G1Policy::record_young_collection_end(bool concurrent_operation_is_full_mar // We make the assumption that these are rare. bool update_stats = !allocation_failure; + size_t const total_cards_scanned = p->sum_thread_work_items(G1GCPhaseTimes::ScanHR, G1GCPhaseTimes::ScanHRScannedCards) + + p->sum_thread_work_items(G1GCPhaseTimes::OptScanHR, G1GCPhaseTimes::ScanHRScannedCards); + + // Number of scanned cards with "Dirty" value (and nothing else). + size_t const pending_cards_from_refinement_table = p->sum_thread_work_items(G1GCPhaseTimes::ScanHR, G1GCPhaseTimes::ScanHRPendingCards) + + p->sum_thread_work_items(G1GCPhaseTimes::OptScanHR, G1GCPhaseTimes::ScanHRPendingCards); + // Number of cards actually merged in the Merge RS phase. MergeRSCards below includes the cards from the Eager Reclaim phase. + size_t const merged_cards_from_card_rs = p->sum_thread_work_items(G1GCPhaseTimes::MergeRS, G1GCPhaseTimes::MergeRSFromRemSetCards) + + p->sum_thread_work_items(G1GCPhaseTimes::OptMergeRS, G1GCPhaseTimes::MergeRSFromRemSetCards); + // Number of cards attempted to merge in the Merge RS phase. + size_t const total_cards_from_rs = p->sum_thread_work_items(G1GCPhaseTimes::MergeRS, G1GCPhaseTimes::MergeRSTotalCards) + + p->sum_thread_work_items(G1GCPhaseTimes::OptMergeRS, G1GCPhaseTimes::MergeRSTotalCards); + + // Cards marked as being to collection set. May be inaccurate due to races. + size_t const total_non_young_rs_cards = MIN2(pending_cards_from_refinement_table + merged_cards_from_card_rs, total_cards_scanned); + if (update_stats) { // We maintain the invariant that all objects allocated by mutator // threads will be allocated out of eden regions. 
So, we can use @@ -827,6 +850,98 @@ void G1Policy::record_young_collection_end(bool concurrent_operation_is_full_mar uint regions_allocated = _collection_set->eden_region_length(); double alloc_rate_ms = (double) regions_allocated / app_time_ms; _analytics->report_alloc_rate_ms(alloc_rate_ms); + + double merge_refinement_table_time = p->cur_merge_refinement_table_time(); + if (merge_refinement_table_time != 0.0) { + _analytics->report_merge_refinement_table_time_ms(merge_refinement_table_time); + } + if (merged_cards_from_card_rs >= G1NumCardsCostSampleThreshold) { + double avg_time_merge_cards = average_time_ms(G1GCPhaseTimes::MergeER) + + average_time_ms(G1GCPhaseTimes::MergeRS) + + average_time_ms(G1GCPhaseTimes::OptMergeRS); + _analytics->report_cost_per_card_merge_ms(avg_time_merge_cards / merged_cards_from_card_rs, is_young_only_pause); + log_debug(gc, ergo, cset)("cost per card merge (young %s): avg time %.2f merged cards %zu cost(1m) %.2f pred_cost(1m-yo) %.2f pred_cost(1m-old) %.2f", + BOOL_TO_STR(is_young_only_pause), + avg_time_merge_cards, merged_cards_from_card_rs, 1e6 * avg_time_merge_cards / merged_cards_from_card_rs, _analytics->predict_card_merge_time_ms(1e6, true), _analytics->predict_card_merge_time_ms(1e6, false)); + } else { + log_debug(gc, ergo, cset)("cost per card merge (young: %s): skipped, total cards %zu", BOOL_TO_STR(is_young_only_pause), total_non_young_rs_cards); + } + + // Update prediction for card scan + + if (total_cards_scanned >= G1NumCardsCostSampleThreshold) { + double avg_card_scan_time = average_time_ms(G1GCPhaseTimes::ScanHR) + + average_time_ms(G1GCPhaseTimes::OptScanHR); + + _analytics->report_cost_per_card_scan_ms(avg_card_scan_time / total_cards_scanned, is_young_only_pause); + + log_debug(gc, ergo, cset)("cost per card scan (young: %s): avg time %.2f total cards %zu cost(1m) %.2f pred_cost(1m-yo) %.2f pred_cost(1m-old) %.2f", + BOOL_TO_STR(is_young_only_pause), + avg_card_scan_time, total_cards_scanned, 1e6 * avg_card_scan_time / total_cards_scanned, _analytics->predict_card_scan_time_ms(1e6, true), _analytics->predict_card_scan_time_ms(1e6, false)); + } else { + log_debug(gc, ergo, cset)("cost per card scan (young: %s): skipped, total cards %zu", BOOL_TO_STR(is_young_only_pause), total_cards_scanned); + } + + // Update prediction for the ratio between cards actually merged onto the card + // table from the remembered sets and the total number of cards attempted to + // merge. 
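[Editorial illustration] The cost sampling in the hunk above follows the usual G1 pattern: only take a per-card cost sample when enough cards were processed, then let the analytics predictor extrapolate to arbitrary card counts (the debug log lines scale by 1e6 to show a per-million-cards cost). A sketch of that arithmetic with made-up numbers, independent of G1Analytics; the threshold value is a stand-in for G1NumCardsCostSampleThreshold:

  #include <cstddef>
  #include <cstdio>

  int main() {
    // Hypothetical pause measurements.
    double merge_phase_time_ms = 2.4;     // MergeER + MergeRS + OptMergeRS
    size_t merged_cards        = 48000;
    size_t sample_threshold    = 1000;    // stand-in for G1NumCardsCostSampleThreshold

    if (merged_cards >= sample_threshold) {
      double cost_per_card_ms = merge_phase_time_ms / merged_cards;
      // What the "cost(1m)" value in the debug log corresponds to.
      printf("cost per 1m cards: %.2f ms\n", 1e6 * cost_per_card_ms);
      // A predictor seeded with such samples estimates future merge times as
      // predicted_cards * cost_per_card_ms (modulo decaying averages).
      size_t predicted_cards = 120000;
      printf("predicted merge time: %.2f ms\n", predicted_cards * cost_per_card_ms);
    }
    return 0;
  }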
+ double merge_to_scan_ratio = 1.0; + if (total_cards_from_rs > 0) { + merge_to_scan_ratio = (double)merged_cards_from_card_rs / total_cards_from_rs; + } + _analytics->report_card_merge_to_scan_ratio(merge_to_scan_ratio, is_young_only_pause); + + // Update prediction for code root scan + size_t const total_code_roots_scanned = p->sum_thread_work_items(G1GCPhaseTimes::CodeRoots, G1GCPhaseTimes::CodeRootsScannedNMethods) + + p->sum_thread_work_items(G1GCPhaseTimes::OptCodeRoots, G1GCPhaseTimes::CodeRootsScannedNMethods); + + if (total_code_roots_scanned >= G1NumCodeRootsCostSampleThreshold) { + double avg_time_code_root_scan = average_time_ms(G1GCPhaseTimes::CodeRoots) + + average_time_ms(G1GCPhaseTimes::OptCodeRoots); + + _analytics->report_cost_per_code_root_scan_ms(avg_time_code_root_scan / total_code_roots_scanned, is_young_only_pause); + } + + // Update prediction for copy cost per byte + size_t copied_bytes = p->sum_thread_work_items(G1GCPhaseTimes::MergePSS, G1GCPhaseTimes::MergePSSCopiedBytes); + + if (copied_bytes > 0) { + double avg_copy_time = average_time_ms(G1GCPhaseTimes::ObjCopy) + average_time_ms(G1GCPhaseTimes::OptObjCopy); + double cost_per_byte_ms = avg_copy_time / copied_bytes; + _analytics->report_cost_per_byte_ms(cost_per_byte_ms, is_young_only_pause); + } + + if (_collection_set->young_region_length() > 0) { + _analytics->report_young_other_cost_per_region_ms(young_other_time_ms() / + _collection_set->young_region_length()); + } + + if (_collection_set->initial_old_region_length() > 0) { + _analytics->report_non_young_other_cost_per_region_ms(non_young_other_time_ms() / + _collection_set->initial_old_region_length()); + } + + _analytics->report_constant_other_time_ms(constant_other_time_ms(pause_time_ms)); + + _analytics->report_pending_cards(pending_cards_from_refinement_table, is_young_only_pause); + + _analytics->report_card_rs_length(total_cards_scanned - total_non_young_rs_cards, is_young_only_pause); + _analytics->report_code_root_rs_length((double)total_code_roots_scanned, is_young_only_pause); + } + + { + double mutator_end_time = cur_pause_start_sec() * MILLIUNITS; + G1ConcurrentRefineStats* stats = _g1h->concurrent_refine()->sweep_state().stats(); + // Record any available refinement statistics. + record_refinement_stats(stats); + + double yield_duration_ms = TimeHelper::counter_to_millis(_g1h->yield_duration_in_refinement_epoch()); + record_dirtying_stats(TimeHelper::counter_to_millis(_g1h->last_refinement_epoch_start()), + mutator_end_time, + pending_cards_from_refinement_table, + yield_duration_ms, + phase_times()->sum_thread_work_items(G1GCPhaseTimes::MergePSS, G1GCPhaseTimes::MergePSSPendingCards), + phase_times()->sum_thread_work_items(G1GCPhaseTimes::MergePSS, G1GCPhaseTimes::MergePSSToYoungGenCards)); } record_pause(this_pause, start_time_sec, end_time_sec, allocation_failure); @@ -857,82 +972,6 @@ void G1Policy::record_young_collection_end(bool concurrent_operation_is_full_mar _eden_surv_rate_group->start_adding_regions(); - if (update_stats) { - // Update prediction for card merge. - size_t const merged_cards_from_log_buffers = p->sum_thread_work_items(G1GCPhaseTimes::MergeLB, G1GCPhaseTimes::MergeLBDirtyCards); - // MergeRSCards includes the cards from the Eager Reclaim phase. 
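[Editorial illustration] The pause-end block above feeds record_dirtying_stats() (defined earlier in this file) with the refinement-epoch start and end times, the yield time and the card counts; the rate it derives is the mutator-generated cards divided by the mutator dirtying time, with a saturating subtraction so that GC-generated cards cannot drive the count negative. A small worked sketch with hypothetical values; the helper mirrors the saturated_sub template added above:

  #include <cstddef>
  #include <cstdio>

  template <typename T>
  static T saturated_sub(T x, T y) {        // same idea as the helper added above
    return (x < y) ? T() : (x - y);
  }

  int main() {
    double epoch_start_ms        = 1000.0;  // last_mutator_start_dirty_ms
    double epoch_end_ms          = 1500.0;  // last_mutator_end_dirty_ms (pause start)
    double yield_ms              = 20.0;    // time refinement yielded to safepoints
    size_t pending_cards         = 90000;   // dirty cards found on the card table
    size_t pending_cards_from_gc = 15000;   // cards the previous GC itself dirtied

    double mutator_dirty_time_ms = (epoch_end_ms - epoch_start_ms) - yield_ms;
    if (mutator_dirty_time_ms > 1.0) {      // ignore unusually short samples
      size_t dirtied_cards = saturated_sub(pending_cards, pending_cards_from_gc);
      printf("dirtying rate: %.2f cards/ms\n", dirtied_cards / mutator_dirty_time_ms);
    }
    return 0;
  }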
- size_t const merged_cards_from_rs = p->sum_thread_work_items(G1GCPhaseTimes::MergeRS, G1GCPhaseTimes::MergeRSCards) + - p->sum_thread_work_items(G1GCPhaseTimes::OptMergeRS, G1GCPhaseTimes::MergeRSCards); - size_t const total_cards_merged = merged_cards_from_rs + - merged_cards_from_log_buffers; - - if (total_cards_merged >= G1NumCardsCostSampleThreshold) { - double avg_time_merge_cards = average_time_ms(G1GCPhaseTimes::MergeER) + - average_time_ms(G1GCPhaseTimes::MergeRS) + - average_time_ms(G1GCPhaseTimes::MergeLB) + - p->cur_distribute_log_buffers_time_ms() + - average_time_ms(G1GCPhaseTimes::OptMergeRS); - _analytics->report_cost_per_card_merge_ms(avg_time_merge_cards / total_cards_merged, is_young_only_pause); - } - - // Update prediction for card scan - size_t const total_cards_scanned = p->sum_thread_work_items(G1GCPhaseTimes::ScanHR, G1GCPhaseTimes::ScanHRScannedCards) + - p->sum_thread_work_items(G1GCPhaseTimes::OptScanHR, G1GCPhaseTimes::ScanHRScannedCards); - - if (total_cards_scanned >= G1NumCardsCostSampleThreshold) { - double avg_time_dirty_card_scan = average_time_ms(G1GCPhaseTimes::ScanHR) + - average_time_ms(G1GCPhaseTimes::OptScanHR); - - _analytics->report_cost_per_card_scan_ms(avg_time_dirty_card_scan / total_cards_scanned, is_young_only_pause); - } - - // Update prediction for the ratio between cards from the remembered - // sets and actually scanned cards from the remembered sets. - // Due to duplicates in the log buffers, the number of scanned cards - // can be smaller than the cards in the log buffers. - const size_t scanned_cards_from_rs = (total_cards_scanned > merged_cards_from_log_buffers) ? total_cards_scanned - merged_cards_from_log_buffers : 0; - double scan_to_merge_ratio = 0.0; - if (merged_cards_from_rs > 0) { - scan_to_merge_ratio = (double)scanned_cards_from_rs / merged_cards_from_rs; - } - _analytics->report_card_scan_to_merge_ratio(scan_to_merge_ratio, is_young_only_pause); - - // Update prediction for code root scan - size_t const total_code_roots_scanned = p->sum_thread_work_items(G1GCPhaseTimes::CodeRoots, G1GCPhaseTimes::CodeRootsScannedNMethods) + - p->sum_thread_work_items(G1GCPhaseTimes::OptCodeRoots, G1GCPhaseTimes::CodeRootsScannedNMethods); - - if (total_code_roots_scanned >= G1NumCodeRootsCostSampleThreshold) { - double avg_time_code_root_scan = average_time_ms(G1GCPhaseTimes::CodeRoots) + - average_time_ms(G1GCPhaseTimes::OptCodeRoots); - - _analytics->report_cost_per_code_root_scan_ms(avg_time_code_root_scan / total_code_roots_scanned, is_young_only_pause); - } - - // Update prediction for copy cost per byte - size_t copied_bytes = p->sum_thread_work_items(G1GCPhaseTimes::MergePSS, G1GCPhaseTimes::MergePSSCopiedBytes); - - if (copied_bytes > 0) { - double cost_per_byte_ms = (average_time_ms(G1GCPhaseTimes::ObjCopy) + average_time_ms(G1GCPhaseTimes::OptObjCopy)) / copied_bytes; - _analytics->report_cost_per_byte_ms(cost_per_byte_ms, is_young_only_pause); - } - - if (_collection_set->young_region_length() > 0) { - _analytics->report_young_other_cost_per_region_ms(young_other_time_ms() / - _collection_set->young_region_length()); - } - - if (_collection_set->initial_old_region_length() > 0) { - _analytics->report_non_young_other_cost_per_region_ms(non_young_other_time_ms() / - _collection_set->initial_old_region_length()); - } - - _analytics->report_constant_other_time_ms(constant_other_time_ms(pause_time_ms)); - - _analytics->report_pending_cards((double)pending_cards_at_gc_start(), is_young_only_pause); - 
_analytics->report_card_rs_length((double)_card_rs_length, is_young_only_pause); - _analytics->report_code_root_rs_length((double)total_code_roots_scanned, is_young_only_pause); - } - assert(!(G1GCPauseTypeHelper::is_concurrent_start_pause(this_pause) && collector_state()->mark_or_rebuild_in_progress()), "If the last pause has been concurrent start, we should not have been in the marking window"); if (G1GCPauseTypeHelper::is_concurrent_start_pause(this_pause)) { @@ -963,29 +1002,26 @@ void G1Policy::record_young_collection_end(bool concurrent_operation_is_full_mar } // Note that _mmu_tracker->max_gc_time() returns the time in seconds. - double logged_cards_time_goal_ms = _mmu_tracker->max_gc_time() * MILLIUNITS * G1RSetUpdatingPauseTimePercent / 100.0; + double pending_cards_time_goal_ms = _mmu_tracker->max_gc_time() * MILLIUNITS * G1RSetUpdatingPauseTimePercent / 100.0; - double const logged_cards_time_ms = logged_cards_processing_time(); - size_t logged_cards = - phase_times()->sum_thread_work_items(G1GCPhaseTimes::MergeLB, - G1GCPhaseTimes::MergeLBDirtyCards); - bool exceeded_goal = logged_cards_time_goal_ms < logged_cards_time_ms; - size_t predicted_thread_buffer_cards = _analytics->predict_dirtied_cards_in_thread_buffers(); + double const pending_cards_time_ms = pending_cards_processing_time(); + size_t pending_cards = phase_times()->sum_thread_work_items(G1GCPhaseTimes::ScanHR, G1GCPhaseTimes::ScanHRPendingCards) + + phase_times()->sum_thread_work_items(G1GCPhaseTimes::OptScanHR, G1GCPhaseTimes::ScanHRPendingCards); + + bool exceeded_goal = pending_cards_time_goal_ms < pending_cards_time_ms; G1ConcurrentRefine* cr = _g1h->concurrent_refine(); log_debug(gc, ergo, refine) - ("GC refinement: goal: %zu + %zu / %1.2fms, actual: %zu / %1.2fms, %s", + ("GC refinement: goal: %zu / %1.2fms, actual: %zu / %1.2fms, %s", cr->pending_cards_target(), - predicted_thread_buffer_cards, - logged_cards_time_goal_ms, - logged_cards, - logged_cards_time_ms, + pending_cards_time_goal_ms, + pending_cards, + pending_cards_time_ms, (exceeded_goal ? " (exceeded goal)" : "")); - cr->adjust_after_gc(logged_cards_time_ms, - logged_cards, - predicted_thread_buffer_cards, - logged_cards_time_goal_ms); + cr->adjust_after_gc(pending_cards_time_ms, + pending_cards, + pending_cards_time_goal_ms); } G1IHOPControl* G1Policy::create_ihop_control(const G1OldGenAllocationTracker* old_gen_alloc_tracker, @@ -1057,34 +1093,27 @@ double G1Policy::predict_base_time_ms(size_t pending_cards, size_t code_root_rs_length) const { bool in_young_only_phase = collector_state()->in_young_only_phase(); - size_t unique_cards_from_rs = _analytics->predict_scan_card_num(card_rs_length, in_young_only_phase); - // Assume that all cards from the log buffers will be scanned, i.e. there are no - // duplicates in that set. - size_t effective_scanned_cards = unique_cards_from_rs + pending_cards; + // Cards from the refinement table and the cards from the young gen remset are + // unique to each other as they are located on the card table. 
+ size_t effective_scanned_cards = card_rs_length + pending_cards; - double card_merge_time = _analytics->predict_card_merge_time_ms(pending_cards + card_rs_length, in_young_only_phase); + double refinement_table_merge_time = _analytics->predict_merge_refinement_table_time_ms(); double card_scan_time = _analytics->predict_card_scan_time_ms(effective_scanned_cards, in_young_only_phase); double code_root_scan_time = _analytics->predict_code_root_scan_time_ms(code_root_rs_length, in_young_only_phase); double constant_other_time = _analytics->predict_constant_other_time_ms(); double survivor_evac_time = predict_survivor_regions_evac_time(); - double total_time = card_merge_time + card_scan_time + code_root_scan_time + constant_other_time + survivor_evac_time; + double total_time = refinement_table_merge_time + card_scan_time + code_root_scan_time + constant_other_time + survivor_evac_time; log_trace(gc, ergo, heap)("Predicted base time: total %f lb_cards %zu card_rs_length %zu effective_scanned_cards %zu " - "card_merge_time %f card_scan_time %f code_root_rs_length %zu code_root_scan_time %f " + "refinement_table_merge_time %f card_scan_time %f code_root_rs_length %zu code_root_scan_time %f " "constant_other_time %f survivor_evac_time %f", total_time, pending_cards, card_rs_length, effective_scanned_cards, - card_merge_time, card_scan_time, code_root_rs_length, code_root_scan_time, + refinement_table_merge_time, card_scan_time, code_root_rs_length, code_root_scan_time, constant_other_time, survivor_evac_time); return total_time; } -double G1Policy::predict_base_time_ms(size_t pending_cards) const { - bool for_young_only_phase = collector_state()->in_young_only_phase(); - size_t card_rs_length = _analytics->predict_card_rs_length(for_young_only_phase); - return predict_base_time_ms(pending_cards, card_rs_length); -} - double G1Policy::predict_base_time_ms(size_t pending_cards, size_t card_rs_length) const { bool for_young_only_phase = collector_state()->in_young_only_phase(); size_t code_root_rs_length = _analytics->predict_code_root_rs_length(for_young_only_phase); @@ -1428,6 +1457,64 @@ size_t G1Policy::allowed_waste_in_collection_set() const { return G1HeapWastePercent * _g1h->capacity() / 100; } +bool G1Policy::try_get_available_bytes_estimate(size_t& available_bytes) const { + // Getting used young bytes requires holding Heap_lock. But we can't use + // normal lock and block until available. Blocking on the lock could + // deadlock with a GC VMOp that is holding the lock and requesting a + // safepoint. Instead try to lock, and return the result of that attempt, + // and the estimate if successful. + if (Heap_lock->try_lock()) { + size_t used_bytes = estimate_used_young_bytes_locked(); + Heap_lock->unlock(); + + size_t young_bytes = young_list_target_length() * G1HeapRegion::GrainBytes; + available_bytes = young_bytes - MIN2(young_bytes, used_bytes); + return true; + } else { + available_bytes = 0; + return false; + } +} + +double G1Policy::predict_time_to_next_gc_ms(size_t available_bytes) const { + double alloc_region_rate = _analytics->predict_alloc_rate_ms(); + double alloc_bytes_rate = alloc_region_rate * G1HeapRegion::GrainBytes; + if (alloc_bytes_rate == 0.0) { + // A zero rate indicates we don't yet have data to use for predictions. + // Since we don't have any idea how long until the next GC, use a time of + // zero. 
+ return 0.0; + } else { + // If the heap size is large and the allocation rate is small, we can get + // a predicted time until next GC that is so large it can cause problems + // (such as overflow) in other calculations. Limit the prediction to one + // hour, which is still large in this context. + const double one_hour_ms = 60.0 * 60.0 * MILLIUNITS; + double raw_time_ms = available_bytes / alloc_bytes_rate; + return MIN2(raw_time_ms, one_hour_ms); + } +} + +uint64_t G1Policy::adjust_wait_time_ms(double wait_time_ms, uint64_t min_time_ms) { + return MAX2(static_cast(sqrt(wait_time_ms) * 4.0), min_time_ms); +} + +double G1Policy::last_mutator_dirty_start_time_ms() { + return TimeHelper::counter_to_millis(_g1h->last_refinement_epoch_start()); +} + +size_t G1Policy::current_pending_cards() { + double now = os::elapsedTime() * MILLIUNITS; + return _pending_cards_from_gc + _analytics->predict_dirtied_cards_rate_ms() * (now - last_mutator_dirty_start_time_ms()); +} + +size_t G1Policy::current_to_collection_set_cards() { + // The incremental part is covered by the dirtied_cards_rate, i.e. pending cards + // cover both to collection set cards and other interesting cards because we do not + // know which is which until we look. + return _to_collection_set_cards; +} + uint G1Policy::min_retained_old_cset_length() const { // Guarantee some progress with retained regions regardless of available time by // taking at least one region. diff --git a/src/hotspot/share/gc/g1/g1Policy.hpp b/src/hotspot/share/gc/g1/g1Policy.hpp index e9f7529e509..01bad97ab84 100644 --- a/src/hotspot/share/gc/g1/g1Policy.hpp +++ b/src/hotspot/share/gc/g1/g1Policy.hpp @@ -48,6 +48,7 @@ class G1HeapRegion; class G1CollectionSet; class G1CollectionSetCandidates; class G1CollectionSetChooser; +class G1ConcurrentRefineStats; class G1IHOPControl; class G1Analytics; class G1SurvivorRegions; @@ -101,9 +102,18 @@ class G1Policy: public CHeapObj { uint _free_regions_at_end_of_collection; - size_t _card_rs_length; - - size_t _pending_cards_at_gc_start; + // Tracks the number of cards marked as dirty (only) during garbage collection + // (evacuation) on the card table. + // This is needed to properly account for those cards in the heuristics to start + // refinement at the correct time which needs to know how many cards are currently + // approximately on the card table. + // After the first completed refinement sweep of the refinement table between two + // garbage collections this value is reset to zero as that refinement processed all + // those cards. + size_t _pending_cards_from_gc; + // Tracks the approximate number of cards found as to-collection-set by either the + // garbage collection or the most recent refinement sweep. 
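[Editorial illustration] Two of the helpers added to g1Policy.cpp above are pure arithmetic and easy to check in isolation: the wait-time damping (square root of the predicted time until the next GC, scaled by 4 and floored at a minimum) and the running pending-card estimate (cards left over from the last GC plus the predicted dirtying rate times the elapsed mutator time). A standalone sketch with invented inputs; only the formulas are taken from the patch:

  #include <algorithm>
  #include <cmath>
  #include <cstddef>
  #include <cstdint>
  #include <cstdio>

  // Same shape as G1Policy::adjust_wait_time_ms(): grow wait times sub-linearly
  // with the predicted distance to the next GC, but never below min_time_ms.
  static uint64_t adjust_wait_time_ms(double wait_time_ms, uint64_t min_time_ms) {
    return std::max(static_cast<uint64_t>(std::sqrt(wait_time_ms) * 4.0), min_time_ms);
  }

  int main() {
    // Hypothetical inputs.
    double time_to_next_gc_ms = 40000.0;    // predicted from the allocation rate
    uint64_t min_wait_ms      = 10;
    printf("wait: %llu ms\n",
           (unsigned long long)adjust_wait_time_ms(time_to_next_gc_ms, min_wait_ms));

    // Running pending-card estimate, as in current_pending_cards().
    size_t pending_cards_from_gc = 15000;
    double dirtied_rate_per_ms   = 150.0;   // predicted mutator dirtying rate
    double elapsed_mutator_ms    = 800.0;   // since the refinement epoch started
    double estimate = pending_cards_from_gc + dirtied_rate_per_ms * elapsed_mutator_ms;
    printf("estimated pending cards: %.0f\n", estimate);
    return 0;
  }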
+ size_t _to_collection_set_cards; G1ConcurrentStartToMixedTimeTracker _concurrent_start_to_mixed; @@ -111,7 +121,7 @@ class G1Policy: public CHeapObj { return collector_state()->in_young_only_phase() && !collector_state()->mark_or_rebuild_in_progress(); } - double logged_cards_processing_time() const; + double pending_cards_processing_time() const; public: const G1Predictions& predictor() const { return _predictor; } const G1Analytics* analytics() const { return const_cast(_analytics); } @@ -129,16 +139,10 @@ public: hr->install_surv_rate_group(_survivor_surv_rate_group); } - void record_card_rs_length(size_t num_cards) { - _card_rs_length = num_cards; - } - double cur_pause_start_sec() const { return _cur_pause_start_sec; } - double predict_base_time_ms(size_t pending_cards) const; - double predict_base_time_ms(size_t pending_cards, size_t card_rs_length) const; // Base time contains handling remembered sets and constant other time of the @@ -239,7 +243,13 @@ private: public: size_t predict_bytes_to_copy(G1HeapRegion* hr) const; - size_t pending_cards_at_gc_start() const { return _pending_cards_at_gc_start; } + + double last_mutator_dirty_start_time_ms(); + size_t pending_cards_from_gc() const { return _pending_cards_from_gc; } + + size_t current_pending_cards(); + + size_t current_to_collection_set_cards(); // GC efficiency for collecting the region based on the time estimate for // merging and scanning incoming references. @@ -286,7 +296,7 @@ public: // Check the current value of the young list RSet length and // compare it against the last prediction. If the current value is // higher, recalculate the young list target length prediction. - void revise_young_list_target_length(size_t card_rs_length, size_t code_root_rs_length); + void revise_young_list_target_length(size_t pending_cards, size_t card_rs_length, size_t code_root_rs_length); // This should be called after the heap is resized. void record_new_heap_size(uint new_number_of_regions); @@ -325,7 +335,6 @@ public: // Amount of allowed waste in bytes in the collection set. size_t allowed_waste_in_collection_set() const; - private: // Predict the number of bytes of surviving objects from survivor and old @@ -359,17 +368,39 @@ public: bool use_adaptive_young_list_length() const; + // Try to get an estimate of the currently available bytes in the young gen. This + // operation considers itself low-priority: if other threads need the resources + // required to get the information, return false to indicate that the caller + // should retry "soon". + bool try_get_available_bytes_estimate(size_t& bytes) const; + // Estimate time until next GC, based on remaining bytes available for + // allocation and the allocation rate. + double predict_time_to_next_gc_ms(size_t available_bytes) const; + + // Adjust wait times to make them less frequent the longer the next GC is away. + // But don't increase the wait time too rapidly, further bound it by min_time_ms. + // This reduces the number of thread wakeups that just immediately + // go back to waiting, while still being responsive to behavior changes. + uint64_t adjust_wait_time_ms(double wait_time_ms, uint64_t min_time_ms); + +private: // Return an estimate of the number of bytes used in young gen. // precondition: holding Heap_lock size_t estimate_used_young_bytes_locked() const; +public: + void transfer_survivors_to_cset(const G1SurvivorRegions* survivors); - // Record and log stats and pending cards before not-full collection. 
- // thread_buffer_cards is the number of cards that were in per-thread - // buffers. pending_cards includes thread_buffer_cards. - void record_concurrent_refinement_stats(size_t pending_cards, - size_t thread_buffer_cards); + // Record and log stats and pending cards to update predictors. + void record_refinement_stats(G1ConcurrentRefineStats* stats); + + void record_dirtying_stats(double last_mutator_start_dirty_ms, + double last_mutator_end_dirty_ms, + size_t pending_cards, + double yield_duration, + size_t next_pending_cards_from_gc, + size_t next_to_collection_set_cards); bool should_retain_evac_failed_region(G1HeapRegion* r) const { return should_retain_evac_failed_region(r->hrm_index()); diff --git a/src/hotspot/share/gc/g1/g1RedirtyCardsQueue.cpp b/src/hotspot/share/gc/g1/g1RedirtyCardsQueue.cpp deleted file mode 100644 index 45e262c440a..00000000000 --- a/src/hotspot/share/gc/g1/g1RedirtyCardsQueue.cpp +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) 2019, 2025, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. 
- * - */ - -#include "gc/g1/g1RedirtyCardsQueue.hpp" -#include "gc/shared/bufferNode.hpp" -#include "runtime/atomicAccess.hpp" -#include "utilities/debug.hpp" -#include "utilities/macros.hpp" - -// G1RedirtyCardsLocalQueueSet - -G1RedirtyCardsLocalQueueSet::G1RedirtyCardsLocalQueueSet(G1RedirtyCardsQueueSet* shared_qset) : - PtrQueueSet(shared_qset->allocator()), - _shared_qset(shared_qset), - _buffers(), - _queue(this) -{} - -#ifdef ASSERT -G1RedirtyCardsLocalQueueSet::~G1RedirtyCardsLocalQueueSet() { - assert(_buffers._head == nullptr, "unflushed qset"); - assert(_buffers._tail == nullptr, "invariant"); - assert(_buffers._entry_count == 0, "invariant"); -} -#endif // ASSERT - -void G1RedirtyCardsLocalQueueSet::enqueue_completed_buffer(BufferNode* node) { - _buffers._entry_count += node->size(); - node->set_next(_buffers._head); - _buffers._head = node; - if (_buffers._tail == nullptr) { - _buffers._tail = node; - } -} - -void G1RedirtyCardsLocalQueueSet::enqueue(void* value) { - if (!try_enqueue(_queue, value)) { - BufferNode* old_node = exchange_buffer_with_new(_queue); - if (old_node != nullptr) { - enqueue_completed_buffer(old_node); - } - retry_enqueue(_queue, value); - } -} - -BufferNodeList G1RedirtyCardsLocalQueueSet::flush() { - flush_queue(_queue); - BufferNodeList cur_buffers = _buffers; - _shared_qset->add_bufferlist(_buffers); - _buffers = BufferNodeList(); - return cur_buffers; -} - -// G1RedirtyCardsLocalQueueSet::Queue - -G1RedirtyCardsLocalQueueSet::Queue::Queue(G1RedirtyCardsLocalQueueSet* qset) : - PtrQueue(qset) -{} - -#ifdef ASSERT -G1RedirtyCardsLocalQueueSet::Queue::~Queue() { - assert(buffer() == nullptr, "unflushed queue"); -} -#endif // ASSERT - -// G1RedirtyCardsQueueSet - -G1RedirtyCardsQueueSet::G1RedirtyCardsQueueSet(BufferNode::Allocator* allocator) : - PtrQueueSet(allocator), - _list(), - _entry_count(0), - _tail(nullptr) - DEBUG_ONLY(COMMA _collecting(true)) -{} - -G1RedirtyCardsQueueSet::~G1RedirtyCardsQueueSet() { - verify_empty(); -} - -#ifdef ASSERT -void G1RedirtyCardsQueueSet::verify_empty() const { - assert(_list.empty(), "precondition"); - assert(_tail == nullptr, "invariant"); - assert(_entry_count == 0, "invariant"); -} -#endif // ASSERT - -BufferNode* G1RedirtyCardsQueueSet::all_completed_buffers() const { - DEBUG_ONLY(_collecting = false;) - return _list.top(); -} - -BufferNodeList G1RedirtyCardsQueueSet::take_all_completed_buffers() { - DEBUG_ONLY(_collecting = false;) - BufferNodeList result(_list.pop_all(), _tail, _entry_count); - _tail = nullptr; - _entry_count = 0; - DEBUG_ONLY(_collecting = true;) - return result; -} - -void G1RedirtyCardsQueueSet::update_tail(BufferNode* node) { - // Node is the tail of a (possibly single element) list just prepended to - // _list. If, after that prepend, node's follower is null, then node is - // also the tail of _list, so record it as such. 
- if (node->next() == nullptr) { - assert(_tail == nullptr, "invariant"); - _tail = node; - } -} - -void G1RedirtyCardsQueueSet::enqueue_completed_buffer(BufferNode* node) { - assert(_collecting, "precondition"); - AtomicAccess::add(&_entry_count, node->size()); - _list.push(*node); - update_tail(node); -} - -void G1RedirtyCardsQueueSet::add_bufferlist(const BufferNodeList& buffers) { - assert(_collecting, "precondition"); - if (buffers._head != nullptr) { - assert(buffers._tail != nullptr, "invariant"); - AtomicAccess::add(&_entry_count, buffers._entry_count); - _list.prepend(*buffers._head, *buffers._tail); - update_tail(buffers._tail); - } -} diff --git a/src/hotspot/share/gc/g1/g1RedirtyCardsQueue.hpp b/src/hotspot/share/gc/g1/g1RedirtyCardsQueue.hpp deleted file mode 100644 index add66f24cca..00000000000 --- a/src/hotspot/share/gc/g1/g1RedirtyCardsQueue.hpp +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (c) 2019, 2024, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - * - */ - -#ifndef SHARE_GC_G1_G1REDIRTYCARDSQUEUE_HPP -#define SHARE_GC_G1_G1REDIRTYCARDSQUEUE_HPP - -#include "gc/shared/bufferNode.hpp" -#include "gc/shared/bufferNodeList.hpp" -#include "gc/shared/ptrQueue.hpp" -#include "memory/padded.hpp" -#include "utilities/macros.hpp" - -class G1RedirtyCardsQueueSet; - -// A thread-local qset and queue. It provides an uncontended staging -// area for completed buffers, to be flushed to the shared qset en masse. -class G1RedirtyCardsLocalQueueSet : private PtrQueueSet { - class Queue : public PtrQueue { - public: - Queue(G1RedirtyCardsLocalQueueSet* qset); - ~Queue() NOT_DEBUG(= default); - }; - - G1RedirtyCardsQueueSet* _shared_qset; - BufferNodeList _buffers; - Queue _queue; - - // Add the buffer to the local list. - virtual void enqueue_completed_buffer(BufferNode* node); - -public: - G1RedirtyCardsLocalQueueSet(G1RedirtyCardsQueueSet* shared_qset); - ~G1RedirtyCardsLocalQueueSet() NOT_DEBUG(= default); - - void enqueue(void* value); - - // Transfer all completed buffers to the shared qset. - // Returns the flushed BufferNodeList which is later used - // as a shortcut into the shared qset. - BufferNodeList flush(); -}; - -// Card table entries to be redirtied and the cards reprocessed later. -// Has two phases, collecting and processing. During the collecting -// phase buffers are added to the set. Once collecting is complete and -// processing starts, buffers can no longer be added. 
Taking all the -// collected (and processed) buffers reverts back to collecting, allowing -// the set to be reused for another round of redirtying. -class G1RedirtyCardsQueueSet : public PtrQueueSet { - DEFINE_PAD_MINUS_SIZE(1, DEFAULT_PADDING_SIZE, 0); - BufferNode::Stack _list; - DEFINE_PAD_MINUS_SIZE(2, DEFAULT_PADDING_SIZE, sizeof(size_t)); - volatile size_t _entry_count; - DEFINE_PAD_MINUS_SIZE(3, DEFAULT_PADDING_SIZE, sizeof(BufferNode*)); - BufferNode* _tail; - DEBUG_ONLY(mutable bool _collecting;) - - void update_tail(BufferNode* node); - -public: - G1RedirtyCardsQueueSet(BufferNode::Allocator* allocator); - ~G1RedirtyCardsQueueSet(); - - void verify_empty() const NOT_DEBUG_RETURN; - - // Collect buffers. These functions are thread-safe. - // precondition: Must not be concurrent with buffer processing. - virtual void enqueue_completed_buffer(BufferNode* node); - void add_bufferlist(const BufferNodeList& buffers); - - // Processing phase operations. - // precondition: Must not be concurrent with buffer collection. - BufferNode* all_completed_buffers() const; - BufferNodeList take_all_completed_buffers(); -}; - -#endif // SHARE_GC_G1_G1REDIRTYCARDSQUEUE_HPP diff --git a/src/hotspot/share/gc/g1/g1RemSet.cpp b/src/hotspot/share/gc/g1/g1RemSet.cpp index 2a09512730c..d2df416edc2 100644 --- a/src/hotspot/share/gc/g1/g1RemSet.cpp +++ b/src/hotspot/share/gc/g1/g1RemSet.cpp @@ -27,11 +27,12 @@ #include "gc/g1/g1BlockOffsetTable.inline.hpp" #include "gc/g1/g1CardSet.inline.hpp" #include "gc/g1/g1CardTable.inline.hpp" +#include "gc/g1/g1CardTableClaimTable.inline.hpp" #include "gc/g1/g1CardTableEntryClosure.hpp" #include "gc/g1/g1CollectedHeap.inline.hpp" #include "gc/g1/g1CollectionSet.inline.hpp" #include "gc/g1/g1ConcurrentRefine.hpp" -#include "gc/g1/g1DirtyCardQueue.hpp" +#include "gc/g1/g1ConcurrentRefineSweepTask.hpp" #include "gc/g1/g1FromCardCache.hpp" #include "gc/g1/g1GCParPhaseTimesTracker.hpp" #include "gc/g1/g1GCPhaseTimes.hpp" @@ -42,8 +43,6 @@ #include "gc/g1/g1Policy.hpp" #include "gc/g1/g1RemSet.hpp" #include "gc/g1/g1RootClosures.hpp" -#include "gc/shared/bufferNode.hpp" -#include "gc/shared/bufferNodeList.hpp" #include "gc/shared/gc_globals.hpp" #include "gc/shared/gcTraceTime.inline.hpp" #include "jfr/jfrEvents.hpp" @@ -63,7 +62,7 @@ // Collects information about the overall heap root scan progress during an evacuation. // // Scanning the remembered sets works by first merging all sources of cards to be -// scanned (log buffers, remembered sets) into a single data structure to remove +// scanned (refinement table, remembered sets) into a single data structure to remove // duplicates and simplify work distribution. // // During the following card scanning we not only scan this combined set of cards, but @@ -89,37 +88,13 @@ class G1RemSetScanState : public CHeapObj { class G1DirtyRegions; - size_t _max_reserved_regions; - - // Card table iteration claim for each heap region, from 0 (completely unscanned) - // to (>=) G1HeapRegion::CardsPerRegion (completely scanned). - uint volatile* _card_table_scan_state; - - uint _scan_chunks_per_region; // Number of chunks per region. - uint8_t _log_scan_chunks_per_region; // Log of number of chunks per region. - bool* _region_scan_chunks; - size_t _num_total_scan_chunks; // Total number of elements in _region_scan_chunks. - uint8_t _scan_chunks_shift; // For conversion between card index and chunk index. 
-public: - uint scan_chunk_size_in_cards() const { return (uint)1 << _scan_chunks_shift; } - - // Returns whether the chunk corresponding to the given region/card in region contain a - // dirty card, i.e. actually needs scanning. - bool chunk_needs_scan(uint const region_idx, uint const card_in_region) const { - size_t const idx = ((size_t)region_idx << _log_scan_chunks_per_region) + (card_in_region >> _scan_chunks_shift); - assert(idx < _num_total_scan_chunks, "Index %zu out of bounds %zu", - idx, _num_total_scan_chunks); - return _region_scan_chunks[idx]; - } - -private: + G1CardTableClaimTable _card_claim_table; // The complete set of regions which card table needs to be cleared at the end - // of GC because we scribbled over these card tables. + // of GC because we scribbled over these card table entries. // // Regions may be added for two reasons: - // - they were part of the collection set: they may contain g1_young_card_val - // or regular card marks that we never scan so we must always clear their card - // table + // - they were part of the collection set: they may contain regular card marks + // that we never scan so we must always clear their card table. // - or in case g1 does an optional evacuation pass, g1 marks the cards in there // as g1_scanned_card_val. If G1 only did an initial evacuation pass, the // scanning already cleared these cards. In that case they are not in this set @@ -129,7 +104,7 @@ private: // in the current evacuation pass. G1DirtyRegions* _next_dirty_regions; - // Set of (unique) regions that can be added to concurrently. +// Set of (unique) regions that can be added to concurrently. class G1DirtyRegions : public CHeapObj { uint* _buffer; uint _cur_idx; @@ -147,8 +122,6 @@ private: reset(); } - static size_t chunk_size() { return M; } - ~G1DirtyRegions() { FREE_C_HEAP_ARRAY(uint, _buffer); FREE_C_HEAP_ARRAY(bool, _contains); @@ -197,7 +170,7 @@ private: // entries from free regions. HeapWord** _scan_top; - class G1ClearCardTableTask : public G1AbstractSubTask { +class G1ClearCardTableTask : public G1AbstractSubTask { G1CollectedHeap* _g1h; G1DirtyRegions* _regions; uint volatile _cur_dirty_regions; @@ -229,9 +202,9 @@ private: virtual ~G1ClearCardTableTask() { _scan_state->cleanup(); -#ifndef PRODUCT - G1CollectedHeap::heap()->verifier()->verify_card_table_cleanup(); -#endif + if (VerifyDuringGC) { + G1CollectedHeap::heap()->verifier()->verify_card_table_cleanup(); + } } void do_work(uint worker_id) override { @@ -243,7 +216,15 @@ private: for (uint i = next; i < max; i++) { G1HeapRegion* r = _g1h->region_at(_regions->at(i)); - r->clear_cardtable(); + // The card table contains "dirty" card marks. Clear unconditionally. + // + // Humongous reclaim candidates are not in the dirty set. This is fine because + // their card and refinement table should always be clear as they are typeArrays. + r->clear_card_table(); + // There is no need to clear the refinement table here: at the start of the collection + // we had to clear the refinement card table for collection set regions already, and any + // old regions use it for old->collection set candidates, so they should not be cleared + // either. 
} } } @@ -251,56 +232,41 @@ private: public: G1RemSetScanState() : - _max_reserved_regions(0), - _card_table_scan_state(nullptr), - _scan_chunks_per_region(G1CollectedHeap::get_chunks_per_region()), - _log_scan_chunks_per_region(log2i(_scan_chunks_per_region)), - _region_scan_chunks(nullptr), - _num_total_scan_chunks(0), - _scan_chunks_shift(0), + _card_claim_table(G1CollectedHeap::get_chunks_per_region_for_scan()), _all_dirty_regions(nullptr), _next_dirty_regions(nullptr), - _scan_top(nullptr) { - } + _scan_top(nullptr) { } ~G1RemSetScanState() { - FREE_C_HEAP_ARRAY(uint, _card_table_scan_state); - FREE_C_HEAP_ARRAY(bool, _region_scan_chunks); FREE_C_HEAP_ARRAY(HeapWord*, _scan_top); } - void initialize(size_t max_reserved_regions) { - assert(_card_table_scan_state == nullptr, "Must not be initialized twice"); - _max_reserved_regions = max_reserved_regions; - _card_table_scan_state = NEW_C_HEAP_ARRAY(uint, max_reserved_regions, mtGC); - _num_total_scan_chunks = max_reserved_regions * _scan_chunks_per_region; - _region_scan_chunks = NEW_C_HEAP_ARRAY(bool, _num_total_scan_chunks, mtGC); - - _scan_chunks_shift = (uint8_t)log2i(G1HeapRegion::CardsPerRegion / _scan_chunks_per_region); + void initialize(uint max_reserved_regions) { + _card_claim_table.initialize(max_reserved_regions); _scan_top = NEW_C_HEAP_ARRAY(HeapWord*, max_reserved_regions, mtGC); } + // Reset the claim and clear scan top for all regions, including + // regions currently not available or free. Since regions might + // become used during the collection these values must be valid + // for those regions as well. void prepare() { - // Reset the claim and clear scan top for all regions, including - // regions currently not available or free. Since regions might - // become used during the collection these values must be valid - // for those regions as well. - for (size_t i = 0; i < _max_reserved_regions; i++) { + size_t max_reserved_regions = _card_claim_table.max_reserved_regions(); + + for (size_t i = 0; i < max_reserved_regions; i++) { clear_scan_top((uint)i); } - _all_dirty_regions = new G1DirtyRegions(_max_reserved_regions); - _next_dirty_regions = new G1DirtyRegions(_max_reserved_regions); + _all_dirty_regions = new G1DirtyRegions(max_reserved_regions); + _next_dirty_regions = new G1DirtyRegions(max_reserved_regions); } void prepare_for_merge_heap_roots() { - assert(_next_dirty_regions->size() == 0, "next dirty regions must be empty"); + // We populate the next dirty regions at the start of GC with all old/humongous + // regions. 
+ //assert(_next_dirty_regions->size() == 0, "next dirty regions must be empty"); - for (size_t i = 0; i < _max_reserved_regions; i++) { - _card_table_scan_state[i] = 0; - } - - ::memset(_region_scan_chunks, false, _num_total_scan_chunks * sizeof(*_region_scan_chunks)); + _card_claim_table.reset_all_to_unclaimed(); } void complete_evac_phase(bool merge_dirty_regions) { @@ -321,38 +287,10 @@ public: return (hr != nullptr && !hr->in_collection_set() && hr->is_old_or_humongous()); } - size_t num_visited_cards() const { - size_t result = 0; - for (uint i = 0; i < _num_total_scan_chunks; i++) { - if (_region_scan_chunks[i]) { - result++; - } - } - return result * (G1HeapRegion::CardsPerRegion / _scan_chunks_per_region); - } - size_t num_cards_in_dirty_regions() const { return _next_dirty_regions->size() * G1HeapRegion::CardsPerRegion; } - void set_chunk_range_dirty(size_t const region_card_idx, size_t const card_length) { - size_t chunk_idx = region_card_idx >> _scan_chunks_shift; - // Make sure that all chunks that contain the range are marked. Calculate the - // chunk of the last card that is actually marked. - size_t const end_chunk = (region_card_idx + card_length - 1) >> _scan_chunks_shift; - for (; chunk_idx <= end_chunk; chunk_idx++) { - _region_scan_chunks[chunk_idx] = true; - } - } - - void set_chunk_dirty(size_t const card_idx) { - assert((card_idx >> _scan_chunks_shift) < _num_total_scan_chunks, - "Trying to access index %zu out of bounds %zu", - card_idx >> _scan_chunks_shift, _num_total_scan_chunks); - size_t const chunk_idx = card_idx >> _scan_chunks_shift; - _region_scan_chunks[chunk_idx] = true; - } - G1AbstractSubTask* create_cleanup_after_scan_heap_roots_task() { return new G1ClearCardTableTask(G1CollectedHeap::heap(), _all_dirty_regions, this); } @@ -391,22 +329,16 @@ public: } bool has_cards_to_scan(uint region) { - assert(region < _max_reserved_regions, "Tried to access invalid region %u", region); - return _card_table_scan_state[region] < G1HeapRegion::CardsPerRegion; - } - - uint claim_cards_to_scan(uint region, uint increment) { - assert(region < _max_reserved_regions, "Tried to access invalid region %u", region); - return AtomicAccess::fetch_then_add(&_card_table_scan_state[region], increment, memory_order_relaxed); + return _card_claim_table.has_unclaimed_cards(region); } void add_dirty_region(uint const region) { -#ifdef ASSERT + #ifdef ASSERT G1HeapRegion* hr = G1CollectedHeap::heap()->region_at(region); assert(!hr->in_collection_set() && hr->is_old_or_humongous(), "Region %u is not suitable for scanning, is %sin collection set or %s", hr->hrm_index(), hr->in_collection_set() ? "" : "not ", hr->get_short_type_str()); -#endif + #endif _next_dirty_regions->add_dirty_region(region); } @@ -431,14 +363,16 @@ public: void clear_scan_top(uint region_idx) { set_scan_top(region_idx, nullptr); } + + G1CardTableChunkClaimer claimer(uint region_idx) { + return G1CardTableChunkClaimer(&_card_claim_table, region_idx); + } }; -G1RemSet::G1RemSet(G1CollectedHeap* g1h, - G1CardTable* ct) : +G1RemSet::G1RemSet(G1CollectedHeap* g1h) : _scan_state(new G1RemSetScanState()), _prev_period_summary(false), _g1h(g1h), - _ct(ct), _g1p(_g1h->policy()) { } @@ -450,36 +384,6 @@ void G1RemSet::initialize(uint max_reserved_regions) { _scan_state->initialize(max_reserved_regions); } -// Helper class to claim dirty chunks within the card table. 
-class G1CardTableChunkClaimer { - G1RemSetScanState* _scan_state; - uint _region_idx; - uint _cur_claim; - -public: - G1CardTableChunkClaimer(G1RemSetScanState* scan_state, uint region_idx) : - _scan_state(scan_state), - _region_idx(region_idx), - _cur_claim(0) { - guarantee(size() <= G1HeapRegion::CardsPerRegion, "Should not claim more space than possible."); - } - - bool has_next() { - while (true) { - _cur_claim = _scan_state->claim_cards_to_scan(_region_idx, size()); - if (_cur_claim >= G1HeapRegion::CardsPerRegion) { - return false; - } - if (_scan_state->chunk_needs_scan(_region_idx, _cur_claim)) { - return true; - } - } - } - - uint value() const { return _cur_claim; } - uint size() const { return _scan_state->scan_chunk_size_in_cards(); } -}; - // Scans a heap region for dirty cards. class G1ScanHRForRegionClosure : public G1HeapRegionClosure { using CardValue = CardTable::CardValue; @@ -495,6 +399,8 @@ class G1ScanHRForRegionClosure : public G1HeapRegionClosure { uint _worker_id; + size_t _cards_pending; + size_t _cards_empty; size_t _cards_scanned; size_t _blocks_scanned; size_t _chunks_claimed; @@ -508,9 +414,9 @@ class G1ScanHRForRegionClosure : public G1HeapRegionClosure { HeapWord* _scanned_to; CardValue _scanned_card_value; - HeapWord* scan_memregion(uint region_idx_for_card, MemRegion mr) { + HeapWord* scan_memregion(uint region_idx_for_card, MemRegion mr, size_t &roots_found) { G1HeapRegion* const card_region = _g1h->region_at(region_idx_for_card); - G1ScanCardClosure card_cl(_g1h, _pss, _heap_roots_found); + G1ScanCardClosure card_cl(_g1h, _pss, roots_found); HeapWord* const scanned_to = card_region->oops_on_memregion_seq_iterate_careful(mr, &card_cl); assert(scanned_to != nullptr, "Should be able to scan range"); @@ -520,8 +426,8 @@ class G1ScanHRForRegionClosure : public G1HeapRegionClosure { return scanned_to; } - void do_claimed_block(uint const region_idx, CardValue* const dirty_l, CardValue* const dirty_r) { - _ct->change_dirty_cards_to(dirty_l, dirty_r, _scanned_card_value); + void do_claimed_block(uint const region_idx, CardValue* const dirty_l, CardValue* const dirty_r, size_t& pending_cards) { + pending_cards += _ct->change_dirty_cards_to(dirty_l, dirty_r, _scanned_card_value); size_t num_cards = pointer_delta(dirty_r, dirty_l, sizeof(CardValue)); _blocks_scanned++; @@ -536,115 +442,22 @@ class G1ScanHRForRegionClosure : public G1HeapRegionClosure { return; } MemRegion mr(MAX2(card_start, _scanned_to), scan_end); - _scanned_to = scan_memregion(region_idx, mr); + size_t roots_found = 0; + _scanned_to = scan_memregion(region_idx, mr, roots_found); + if (roots_found == 0) { + _cards_empty += num_cards; + } _cards_scanned += num_cards; + _heap_roots_found += roots_found; } - // To locate consecutive dirty cards inside a chunk. 
- class ChunkScanner { - using Word = size_t; - - CardValue* const _start_card; - CardValue* const _end_card; - - static const size_t ExpandedToScanMask = G1CardTable::WordAlreadyScanned; - static const size_t ToScanMask = G1CardTable::g1_card_already_scanned; - - static bool is_card_dirty(const CardValue* const card) { - return (*card & ToScanMask) == 0; - } - - static bool is_word_aligned(const void* const addr) { - return ((uintptr_t)addr) % sizeof(Word) == 0; - } - - CardValue* find_first_dirty_card(CardValue* i_card) const { - while (!is_word_aligned(i_card)) { - if (is_card_dirty(i_card)) { - return i_card; - } - i_card++; - } - - for (/* empty */; i_card < _end_card; i_card += sizeof(Word)) { - Word word_value = *reinterpret_cast(i_card); - bool has_dirty_cards_in_word = (~word_value & ExpandedToScanMask) != 0; - - if (has_dirty_cards_in_word) { - for (uint i = 0; i < sizeof(Word); ++i) { - if (is_card_dirty(i_card)) { - return i_card; - } - i_card++; - } - assert(false, "should have early-returned"); - } - } - - return _end_card; - } - - CardValue* find_first_non_dirty_card(CardValue* i_card) const { - while (!is_word_aligned(i_card)) { - if (!is_card_dirty(i_card)) { - return i_card; - } - i_card++; - } - - for (/* empty */; i_card < _end_card; i_card += sizeof(Word)) { - Word word_value = *reinterpret_cast(i_card); - bool all_cards_dirty = (word_value == G1CardTable::WordAllDirty); - - if (!all_cards_dirty) { - for (uint i = 0; i < sizeof(Word); ++i) { - if (!is_card_dirty(i_card)) { - return i_card; - } - i_card++; - } - assert(false, "should have early-returned"); - } - } - - return _end_card; - } - - public: - ChunkScanner(CardValue* const start_card, CardValue* const end_card) : - _start_card(start_card), - _end_card(end_card) { - assert(is_word_aligned(start_card), "precondition"); - assert(is_word_aligned(end_card), "precondition"); - } - - template - void on_dirty_cards(Func&& f) { - for (CardValue* cur_card = _start_card; cur_card < _end_card; /* empty */) { - CardValue* dirty_l = find_first_dirty_card(cur_card); - CardValue* dirty_r = find_first_non_dirty_card(dirty_l); - - assert(dirty_l <= dirty_r, "inv"); - - if (dirty_l == dirty_r) { - assert(dirty_r == _end_card, "finished the entire chunk"); - return; - } - - f(dirty_l, dirty_r); - - cur_card = dirty_r + 1; - } - } - }; - void scan_heap_roots(G1HeapRegion* r) { uint const region_idx = r->hrm_index(); ResourceMark rm; - G1CardTableChunkClaimer claim(_scan_state, region_idx); + G1CardTableChunkClaimer claim = _scan_state->claimer(region_idx); // Set the current scan "finger" to null for every heap region to scan. Since // the claim value is monotonically increasing, the check to not scan below this @@ -652,6 +465,8 @@ class G1ScanHRForRegionClosure : public G1HeapRegionClosure { // to resetting this value for every claim. 
_scanned_to = nullptr; + size_t pending_cards = 0; + while (claim.has_next()) { _chunks_claimed++; @@ -660,11 +475,12 @@ class G1ScanHRForRegionClosure : public G1HeapRegionClosure { CardValue* const start_card = _ct->byte_for_index(region_card_base_idx); CardValue* const end_card = start_card + claim.size(); - ChunkScanner chunk_scanner{start_card, end_card}; + G1ChunkScanner chunk_scanner{start_card, end_card}; chunk_scanner.on_dirty_cards([&] (CardValue* dirty_l, CardValue* dirty_r) { - do_claimed_block(region_idx, dirty_l, dirty_r); + do_claimed_block(region_idx, dirty_l, dirty_r, pending_cards); }); } + _cards_pending += pending_cards; } public: @@ -679,6 +495,8 @@ public: _scan_state(scan_state), _phase(phase), _worker_id(worker_id), + _cards_pending(0), + _cards_empty(0), _cards_scanned(0), _blocks_scanned(0), _chunks_claimed(0), @@ -706,6 +524,8 @@ public: Tickspan rem_set_root_scan_time() const { return _rem_set_root_scan_time; } Tickspan rem_set_trim_partially_time() const { return _rem_set_trim_partially_time; } + size_t cards_pending() const { return _cards_pending; } + size_t cards_scanned_empty() const { return _cards_empty; } size_t cards_scanned() const { return _cards_scanned; } size_t blocks_scanned() const { return _blocks_scanned; } size_t chunks_claimed() const { return _chunks_claimed; } @@ -728,6 +548,9 @@ void G1RemSet::scan_heap_roots(G1ParScanThreadState* pss, p->record_or_add_time_secs(objcopy_phase, worker_id, cl.rem_set_trim_partially_time().seconds()); p->record_or_add_time_secs(scan_phase, worker_id, cl.rem_set_root_scan_time().seconds()); + + p->record_or_add_thread_work_item(scan_phase, worker_id, cl.cards_pending(), G1GCPhaseTimes::ScanHRPendingCards); + p->record_or_add_thread_work_item(scan_phase, worker_id, cl.cards_scanned_empty(), G1GCPhaseTimes::ScanHRScannedEmptyCards); p->record_or_add_thread_work_item(scan_phase, worker_id, cl.cards_scanned(), G1GCPhaseTimes::ScanHRScannedCards); p->record_or_add_thread_work_item(scan_phase, worker_id, cl.blocks_scanned(), G1GCPhaseTimes::ScanHRScannedBlocks); p->record_or_add_thread_work_item(scan_phase, worker_id, cl.chunks_claimed(), G1GCPhaseTimes::ScanHRClaimedChunks); @@ -901,6 +724,7 @@ void G1RemSet::prepare_region_for_scan(G1HeapRegion* r) { assert_scan_top_is_null(hrm_index); } else if (r->is_old_or_humongous()) { _scan_state->set_scan_top(hrm_index, r->top()); + _scan_state->add_dirty_region(hrm_index); } else { assert_scan_top_is_null(hrm_index); assert(r->is_free(), @@ -956,6 +780,90 @@ public: } }; +// Task to merge a non-dirty refinement table into the (primary) card table. +class MergeRefinementTableTask : public WorkerTask { + + G1CardTableClaimTable* _scan_state; + uint _max_workers; + + class G1MergeRefinementTableRegionClosure : public G1HeapRegionClosure { + G1CardTableClaimTable* _scan_state; + + bool do_heap_region(G1HeapRegion* r) override { + if (!_scan_state->has_unclaimed_cards(r->hrm_index())) { + return false; + } + + // We can blindly clear all collection set region's refinement tables: these + // regions will be evacuated and need their refinement table reset in case + // of evacuation failure. + // Young regions contain random marks, which are obvious to just clear. The + // card marks of other collection set region's refinement tables are also + // uninteresting. 
+ if (r->in_collection_set()) { + uint claim = _scan_state->claim_all_cards(r->hrm_index()); + // Concurrent refinement may have started merging this region (we also + // get here for non-young regions), the claim may be non-zero for those. + // We could get away here with just clearing the area from the current + // claim to the last card in the region, but for now just do it all. + if (claim < G1HeapRegion::CardsPerRegion) { + r->clear_refinement_table(); + } + return false; + } + + assert(r->is_old_or_humongous(), "must be"); + + G1CollectedHeap* g1h = G1CollectedHeap::heap(); + G1CardTable* card_table = g1h->card_table(); + G1CardTable* refinement_table = g1h->refinement_table(); + + size_t const region_card_base_idx = (size_t)r->hrm_index() << G1HeapRegion::LogCardsPerRegion; + + G1CardTableChunkClaimer claim(_scan_state, r->hrm_index()); + + while (claim.has_next()) { + size_t const start_idx = region_card_base_idx + claim.value(); + + size_t* card_cur_word = (size_t*)card_table->byte_for_index(start_idx); + + size_t* refinement_cur_word = (size_t*)refinement_table->byte_for_index(start_idx); + size_t* const refinement_end_word = refinement_cur_word + claim.size() / (sizeof(size_t) / sizeof(G1CardTable::CardValue)); + + for (; refinement_cur_word < refinement_end_word; ++refinement_cur_word, ++card_cur_word) { + size_t value = *refinement_cur_word; + *refinement_cur_word = G1CardTable::WordAllClean; + // Dirty is "0", so we need to logically-and here. This is also safe + // for all other possible values in the card table; at this point this + // can be either g1_dirty_card or g1_to_cset_card which will both be + // scanned. + size_t new_value = *card_cur_word & value; + *card_cur_word = new_value; + } + } + + return false; + } + + public: + G1MergeRefinementTableRegionClosure(G1CardTableClaimTable* scan_state) : G1HeapRegionClosure(), _scan_state(scan_state) { + } + }; + +public: + MergeRefinementTableTask(G1CardTableClaimTable* scan_state, uint max_workers) : + WorkerTask("Merge Refinement Table"), _scan_state(scan_state), _max_workers(max_workers) { guarantee(_scan_state != nullptr, "must be"); } + + void work(uint worker_id) override { + G1CollectedHeap* g1h = G1CollectedHeap::heap(); + + G1GCParPhaseTimesTracker x(g1h->phase_times(), G1GCPhaseTimes::SweepRT, worker_id, false /* allow multiple invocation */); + + G1MergeRefinementTableRegionClosure cl(_scan_state); + _scan_state->heap_region_iterate_from_worker_offset(&cl, worker_id, _max_workers); + } +}; + class G1MergeHeapRootsTask : public WorkerTask { class G1MergeCardSetStats { @@ -973,12 +881,16 @@ class G1MergeHeapRootsTask : public WorkerTask { _merged[tag]++; } - void inc_remset_cards(size_t increment = 1) { - _merged[G1GCPhaseTimes::MergeRSCards] += increment; + void inc_merged_cards(size_t increment = 1) { + _merged[G1GCPhaseTimes::MergeRSFromRemSetCards] += increment; + } + + void inc_total_cards(size_t increment = 1) { + _merged[G1GCPhaseTimes::MergeRSTotalCards] += increment; } void dec_remset_cards(size_t decrement) { - _merged[G1GCPhaseTimes::MergeRSCards] -= decrement; + _merged[G1GCPhaseTimes::MergeRSTotalCards] -= decrement; } size_t merged(uint i) const { return _merged[i]; } @@ -1031,10 +943,10 @@ class G1MergeHeapRootsTask : public WorkerTask { } void mark_card(G1CardTable::CardValue* value) { - if (_ct->mark_clean_as_dirty(value)) { - _scan_state->set_chunk_dirty(_ct->index_for_cardvalue(value)); + if (_ct->mark_clean_as_from_remset(value)) { + _stats.inc_merged_cards(); } - _stats.inc_remset_cards(); + 
_stats.inc_total_cards(); } public: @@ -1054,7 +966,7 @@ class G1MergeHeapRootsTask : public WorkerTask { // Returns whether the given region actually needs iteration. bool start_iterate(uint const tag, uint const region_idx) { - assert(tag < G1GCPhaseTimes::MergeRSCards, "invalid tag %u", tag); + assert(tag < G1GCPhaseTimes::MergeRSFromRemSetCards, "invalid tag %u", tag); if (remember_if_interesting(region_idx)) { _region_base_idx = (size_t)region_idx << G1HeapRegion::LogCardsPerRegion; _stats.inc_card_set_merged(tag); @@ -1064,9 +976,9 @@ class G1MergeHeapRootsTask : public WorkerTask { } void do_card_range(uint const start_card_idx, uint const length) { - _ct->mark_range_dirty(_region_base_idx + start_card_idx, length); - _stats.inc_remset_cards(length); - _scan_state->set_chunk_range_dirty(_region_base_idx + start_card_idx, length); + size_t cards_changed = _ct->mark_clean_range_as_from_remset(_region_base_idx + start_card_idx, length); + _stats.inc_merged_cards(cards_changed); + _stats.inc_total_cards(length); } G1MergeCardSetStats stats() { @@ -1086,12 +998,19 @@ class G1MergeHeapRootsTask : public WorkerTask { class G1ClearBitmapClosure : public G1HeapRegionClosure { G1CollectedHeap* _g1h; G1RemSetScanState* _scan_state; + bool _initial_evacuation; void assert_bitmap_clear(G1HeapRegion* hr, const G1CMBitMap* bitmap) { assert(bitmap->get_next_marked_addr(hr->bottom(), hr->end()) == hr->end(), "Bitmap should have no mark for region %u (%s)", hr->hrm_index(), hr->get_short_type_str()); } + void assert_refinement_table_clear(G1HeapRegion* hr) { +#ifdef ASSERT + _g1h->refinement_table()->verify_region(MemRegion(hr->bottom(), hr->end()), G1CardTable::clean_card_val(), true); +#endif + } + bool should_clear_region(G1HeapRegion* hr) const { // The bitmap for young regions must obviously be clear as we never mark through them; // old regions that are currently being marked through are only in the collection set @@ -1110,14 +1029,31 @@ class G1MergeHeapRootsTask : public WorkerTask { } public: - G1ClearBitmapClosure(G1CollectedHeap* g1h, G1RemSetScanState* scan_state) : + G1ClearBitmapClosure(G1CollectedHeap* g1h, G1RemSetScanState* scan_state, bool initial_evacuation) : _g1h(g1h), - _scan_state(scan_state) + _scan_state(scan_state), + _initial_evacuation(initial_evacuation) { } bool do_heap_region(G1HeapRegion* hr) { assert(_g1h->is_in_cset(hr), "Should only be used iterating the collection set"); + // Collection set regions after the initial evacuation need their refinement + // table cleared because + // * we use the refinement table for recording references to other regions + // during evacuation failure handling + // * during previous passes we used the refinement table to contain marks for + // cross-region references. Now that we evacuate the region, they need to be + // cleared. + // + // We do not need to do this extra work for initial evacuation because we + // make sure the refinement table is clean for all regions either in + // concurrent refinement or in the merge refinement table phase earlier. + if (!_initial_evacuation) { + hr->clear_refinement_table(); + } else { + assert_refinement_table_clear(hr); + } // Evacuation failure uses the bitmap to record evacuation failed objects, // so the bitmap for the regions in the collection set must be cleared if not already. if (should_clear_region(hr)) { @@ -1177,145 +1113,23 @@ class G1MergeHeapRootsTask : public WorkerTask { } }; - // Visitor for the log buffer entries to merge them into the card table. 
- class G1MergeLogBufferCardsClosure : public G1CardTableEntryClosure { - - G1RemSetScanState* _scan_state; - G1CardTable* _ct; - - size_t _cards_dirty; - size_t _cards_skipped; - - void process_card(CardValue* card_ptr) { - if (*card_ptr == G1CardTable::dirty_card_val()) { - uint const region_idx = _ct->region_idx_for(card_ptr); - _scan_state->add_dirty_region(region_idx); - _scan_state->set_chunk_dirty(_ct->index_for_cardvalue(card_ptr)); - _cards_dirty++; - } - } - - public: - G1MergeLogBufferCardsClosure(G1CollectedHeap* g1h, G1RemSetScanState* scan_state) : - _scan_state(scan_state), - _ct(g1h->card_table()), - _cards_dirty(0), - _cards_skipped(0) - {} - - void do_card_ptr(CardValue* card_ptr) override { - // The only time we care about recording cards that - // contain references that point into the collection set - // is during RSet updating within an evacuation pause. - assert(SafepointSynchronize::is_at_safepoint(), "not during an evacuation pause"); - - uint const region_idx = _ct->region_idx_for(card_ptr); - - // The second clause must come after - the log buffers might contain cards to uncommitted - // regions. - // This code may count duplicate entries in the log buffers (even if rare) multiple - // times. - if (_scan_state->contains_cards_to_process(region_idx)) { - process_card(card_ptr); - } else { - // We may have had dirty cards in the (initial) collection set (or the - // young regions which are always in the initial collection set). We do - // not fix their cards here: we already added these regions to the set of - // regions to clear the card table at the end during the prepare() phase. - _cards_skipped++; - } - } - - size_t cards_dirty() const { return _cards_dirty; } - size_t cards_skipped() const { return _cards_skipped; } - }; - uint _num_workers; G1HeapRegionClaimer _hr_claimer; G1RemSetScanState* _scan_state; - // To mitigate contention due multiple threads accessing and popping BufferNodes from a shared - // G1DirtyCardQueueSet, we implement a sequential distribution phase. Here, BufferNodes are - // distributed to worker threads in a sequential manner utilizing the _dirty_card_buffers. By doing - // so, we effectively alleviate the bottleneck encountered during pop operations on the - // G1DirtyCardQueueSet. Importantly, this approach preserves the helping aspect among worker - // threads, allowing them to assist one another in case of imbalances in work distribution. 
- BufferNode::Stack* _dirty_card_buffers; - bool _initial_evacuation; volatile bool _fast_reclaim_handled; - void apply_closure_to_dirty_card_buffers(G1MergeLogBufferCardsClosure* cl, uint worker_id) { - G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set(); - for (uint i = 0; i < _num_workers; i++) { - uint index = (worker_id + i) % _num_workers; - while (BufferNode* node = _dirty_card_buffers[index].pop()) { - cl->apply_to_buffer(node, worker_id); - dcqs.deallocate_buffer(node); - } - } - } - public: G1MergeHeapRootsTask(G1RemSetScanState* scan_state, uint num_workers, bool initial_evacuation) : WorkerTask("G1 Merge Heap Roots"), _num_workers(num_workers), _hr_claimer(num_workers), _scan_state(scan_state), - _dirty_card_buffers(nullptr), _initial_evacuation(initial_evacuation), _fast_reclaim_handled(false) - { - if (initial_evacuation) { - Ticks start = Ticks::now(); - - _dirty_card_buffers = NEW_C_HEAP_ARRAY(BufferNode::Stack, num_workers, mtGC); - for (uint i = 0; i < num_workers; i++) { - new (&_dirty_card_buffers[i]) BufferNode::Stack(); - } - - G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set(); - BufferNodeList buffers = dcqs.take_all_completed_buffers(); - - size_t entries_per_thread = ceil(buffers._entry_count / (double)num_workers); - - BufferNode* head = buffers._head; - BufferNode* tail = head; - - uint worker = 0; - while (tail != nullptr) { - size_t count = tail->size(); - BufferNode* cur = tail->next(); - - while (count < entries_per_thread && cur != nullptr) { - tail = cur; - count += tail->size(); - cur = tail->next(); - } - - tail->set_next(nullptr); - _dirty_card_buffers[worker++ % num_workers].prepend(*head, *tail); - - assert(cur != nullptr || tail == buffers._tail, "Must be"); - head = cur; - tail = cur; - } - - Tickspan total = Ticks::now() - start; - G1CollectedHeap::heap()->phase_times()->record_distribute_log_buffers_time_ms(total.seconds() * 1000.0); - } - } - - ~G1MergeHeapRootsTask() { - if (_dirty_card_buffers != nullptr) { - using Stack = BufferNode::Stack; - for (uint i = 0; i < _num_workers; i++) { - _dirty_card_buffers[i].~Stack(); - } - FREE_C_HEAP_ARRAY(Stack, _dirty_card_buffers); - } - } + { } virtual void work(uint worker_id) { G1CollectedHeap* g1h = G1CollectedHeap::heap(); @@ -1368,50 +1182,28 @@ public: // Preparation for evacuation failure handling. { - G1ClearBitmapClosure clear(g1h, _scan_state); + G1ClearBitmapClosure clear(g1h, _scan_state, _initial_evacuation); g1h->collection_set_iterate_increment_from(&clear, &_hr_claimer, worker_id); } - - // Now apply the closure to all remaining log entries. 
- if (_initial_evacuation) { - assert(merge_remset_phase == G1GCPhaseTimes::MergeRS, "Wrong merge phase"); - G1GCParPhaseTimesTracker x(p, G1GCPhaseTimes::MergeLB, worker_id); - - G1MergeLogBufferCardsClosure cl(g1h, _scan_state); - apply_closure_to_dirty_card_buffers(&cl, worker_id); - - p->record_thread_work_item(G1GCPhaseTimes::MergeLB, worker_id, cl.cards_dirty(), G1GCPhaseTimes::MergeLBDirtyCards); - p->record_thread_work_item(G1GCPhaseTimes::MergeLB, worker_id, cl.cards_skipped(), G1GCPhaseTimes::MergeLBSkippedCards); - } } }; -void G1RemSet::print_merge_heap_roots_stats() { - LogTarget(Debug, gc, remset) lt; - if (lt.is_enabled()) { - LogStream ls(lt); +static void merge_refinement_table() { + G1CollectedHeap* g1h = G1CollectedHeap::heap(); - size_t num_visited_cards = _scan_state->num_visited_cards(); + G1ConcurrentRefineSweepState& state = g1h->concurrent_refine()->sweep_state_for_merge(); + WorkerThreads* workers = g1h->workers(); - size_t total_dirty_region_cards = _scan_state->num_cards_in_dirty_regions(); - - G1CollectedHeap* g1h = G1CollectedHeap::heap(); - size_t total_old_region_cards = - (g1h->num_committed_regions() - (g1h->num_free_regions() - g1h->collection_set()->cur_length())) * G1HeapRegion::CardsPerRegion; - - ls.print_cr("Visited cards %zu Total dirty %zu (%.2lf%%) Total old %zu (%.2lf%%)", - num_visited_cards, - total_dirty_region_cards, - percent_of(num_visited_cards, total_dirty_region_cards), - total_old_region_cards, - percent_of(num_visited_cards, total_old_region_cards)); - } + MergeRefinementTableTask cl(state.sweep_table(), workers->active_workers()); + log_debug(gc, ergo)("Running %s using %u workers", cl.name(), workers->active_workers()); + workers->run_task(&cl); } void G1RemSet::merge_heap_roots(bool initial_evacuation) { G1CollectedHeap* g1h = G1CollectedHeap::heap(); G1GCPhaseTimes* pt = g1h->phase_times(); + // 1. Prepare the merging process { Ticks start = Ticks::now(); @@ -1425,28 +1217,42 @@ void G1RemSet::merge_heap_roots(bool initial_evacuation) { } } - WorkerThreads* workers = g1h->workers(); - size_t const increment_length = g1h->collection_set()->regions_cur_length(); + // 2. (Optionally) Merge the refinement table into the card table (if needed). + G1ConcurrentRefineSweepState& state = g1h->concurrent_refine()->sweep_state(); + if (initial_evacuation && state.is_in_progress()) { + Ticks start = Ticks::now(); - uint const num_workers = initial_evacuation ? workers->active_workers() : - MIN2(workers->active_workers(), (uint)increment_length); + merge_refinement_table(); + g1h->phase_times()->record_merge_refinement_table_time((Ticks::now() - start).seconds() * MILLIUNITS); + } + + // 3. Merge other heap roots. Ticks start = Ticks::now(); { + WorkerThreads* workers = g1h->workers(); + + size_t const increment_length = g1h->collection_set()->groups_increment_length(); + + uint const num_workers = initial_evacuation ? 
workers->active_workers() : + MIN2(workers->active_workers(), (uint)increment_length); + G1MergeHeapRootsTask cl(_scan_state, num_workers, initial_evacuation); log_debug(gc, ergo)("Running %s using %u workers for %zu regions", cl.name(), num_workers, increment_length); workers->run_task(&cl, num_workers); } - print_merge_heap_roots_stats(); - if (initial_evacuation) { pt->record_merge_heap_roots_time((Ticks::now() - start).seconds() * 1000.0); } else { pt->record_or_add_optional_merge_heap_roots_time((Ticks::now() - start).seconds() * 1000.0); } + + if (VerifyDuringGC && initial_evacuation) { + g1h->verifier()->verify_card_tables_clean(false /* both_card_tables */); + } } void G1RemSet::complete_evac_phase(bool has_more_than_one_evacuation_phase) { @@ -1482,86 +1288,20 @@ inline void check_card_ptr(CardTable::CardValue* card_ptr, G1CardTable* ct) { #endif } -bool G1RemSet::clean_card_before_refine(CardValue** const card_ptr_addr) { - assert(!SafepointSynchronize::is_at_safepoint(), "Only call concurrently"); - - CardValue* card_ptr = *card_ptr_addr; - // Find the start address represented by the card. - HeapWord* start = _ct->addr_for(card_ptr); - // And find the region containing it. - G1HeapRegion* r = _g1h->heap_region_containing_or_null(start); - - // If this is a (stale) card into an uncommitted region, exit. - if (r == nullptr) { - return false; - } - - check_card_ptr(card_ptr, _ct); - - // If the card is no longer dirty, nothing to do. - // We cannot load the card value before the "r == nullptr" check above, because G1 - // could uncommit parts of the card table covering uncommitted regions. - if (*card_ptr != G1CardTable::dirty_card_val()) { - return false; - } - - // This check is needed for some uncommon cases where we should - // ignore the card. - // - // The region could be young. Cards for young regions are - // distinctly marked (set to g1_young_gen), so the post-barrier will - // filter them out. However, that marking is performed - // concurrently. A write to a young object could occur before the - // card has been marked young, slipping past the filter. - // - // The card could be stale, because the region has been freed since - // the card was recorded. In this case the region type could be - // anything. If (still) free or (reallocated) young, just ignore - // it. If (reallocated) old or humongous, the later card trimming - // and additional checks in iteration may detect staleness. At - // worst, we end up processing a stale card unnecessarily. - // - // In the normal (non-stale) case, the synchronization between the - // enqueueing of the card and processing it here will have ensured - // we see the up-to-date region type here. - if (!r->is_old_or_humongous()) { - return false; - } - - // Trim the region designated by the card to what's been allocated - // in the region. The card could be stale, or the card could cover - // (part of) an object at the end of the allocated space and extend - // beyond the end of allocation. - - // Non-humongous objects are either allocated in the old regions during GC. - // So if region is old then top is stable. - // Humongous object allocation sets top last; if top has not yet been set, - // this is a stale card and we'll end up with an empty intersection. - // If this is not a stale card, the synchronization between the - // enqueuing of the card and processing it here will have ensured - // we see the up-to-date top here. 
- HeapWord* scan_limit = r->top(); - - if (scan_limit <= start) { - // If the trimmed region is empty, the card must be stale. - return false; - } - - // Okay to clean and process the card now. There are still some - // stale card cases that may be detected by iteration and dealt with - // as iteration failure. - *const_cast(card_ptr) = G1CardTable::clean_card_val(); - - return true; -} - -void G1RemSet::refine_card_concurrently(CardValue* const card_ptr, - const uint worker_id) { +G1RemSet::RefineResult G1RemSet::refine_card_concurrently(CardValue* const card_ptr, + const uint worker_id) { assert(!_g1h->is_stw_gc_active(), "Only call concurrently"); - check_card_ptr(card_ptr, _ct); + G1CardTable* ct = _g1h->refinement_table(); + check_card_ptr(card_ptr, ct); + + // That card is already known to contain a reference to the collection set. Skip + // further processing. + if (*card_ptr == G1CardTable::g1_to_cset_card) { + return AlreadyToCSet; + } // Construct the MemRegion representing the card. - HeapWord* start = _ct->addr_for(card_ptr); + HeapWord* start = ct->addr_for(card_ptr); // And find the region containing it. G1HeapRegion* r = _g1h->heap_region_containing(start); // This reload of the top is safe even though it happens after the full @@ -1571,7 +1311,7 @@ void G1RemSet::refine_card_concurrently(CardValue* const card_ptr, // cannot span across safepoint, so we don't need to worry about top being // changed during safepoint. HeapWord* scan_limit = r->top(); - assert(scan_limit > start, "sanity"); + assert(scan_limit > start, "sanity region %u (%s) scan_limit " PTR_FORMAT " start " PTR_FORMAT, r->hrm_index(), r->get_short_type_str(), p2i(scan_limit), p2i(start)); // Don't use addr_for(card_ptr + 1) which can ask for // a card beyond the heap. @@ -1581,43 +1321,21 @@ void G1RemSet::refine_card_concurrently(CardValue* const card_ptr, G1ConcurrentRefineOopClosure conc_refine_cl(_g1h, worker_id); if (r->oops_on_memregion_seq_iterate_careful(dirty_region, &conc_refine_cl) != nullptr) { - return; + if (conc_refine_cl.has_ref_to_cset()) { + return HasRefToCSet; + } else if (conc_refine_cl.has_ref_to_old()) { + return HasRefToOld; + } else { + return NoCrossRegion; + } } - // If unable to process the card then we encountered an unparsable // part of the heap (e.g. a partially allocated object, so only // temporarily a problem) while processing a stale card. Despite // the card being stale, we can't simply ignore it, because we've - // already marked the card cleaned, so taken responsibility for + // already marked the card as cleaned, so taken responsibility for // ensuring the card gets scanned. - // - // However, the card might have gotten re-dirtied and re-enqueued - // while we worked. (In fact, it's pretty likely.) - if (*card_ptr == G1CardTable::dirty_card_val()) { - return; - } - - enqueue_for_reprocessing(card_ptr); -} - -// Re-dirty and re-enqueue the card to retry refinement later. -// This is used to deal with a rare race condition in concurrent refinement. -void G1RemSet::enqueue_for_reprocessing(CardValue* card_ptr) { - // We can't use the thread-local queue, because that might be the queue - // that is being processed by us; we could be a Java thread conscripted to - // perform refinement on our queue's current buffer. This situation only - // arises from rare race condition, so it's not worth any significant - // development effort or clever lock-free queue implementation. Instead - // we use brute force, allocating and enqueuing an entire buffer for just - // this card. 
Since buffers are processed in FIFO order and we try to - // keep some in the queue, it is likely that the racing state will have - // resolved by the time this card comes up for reprocessing. - *card_ptr = G1CardTable::dirty_card_val(); - G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set(); - void** buffer = dcqs.allocate_buffer(); - size_t index = dcqs.buffer_capacity() - 1; - buffer[index] = card_ptr; - dcqs.enqueue_completed_buffer(BufferNode::make_node_from_buffer(buffer, index)); + return CouldNotParse; } void G1RemSet::print_periodic_summary_info(const char* header, uint period_count, bool show_thread_times) { diff --git a/src/hotspot/share/gc/g1/g1RemSet.hpp b/src/hotspot/share/gc/g1/g1RemSet.hpp index 50cc029a9a1..8b2353cdbb3 100644 --- a/src/hotspot/share/gc/g1/g1RemSet.hpp +++ b/src/hotspot/share/gc/g1/g1RemSet.hpp @@ -26,6 +26,7 @@ #define SHARE_GC_G1_G1REMSET_HPP #include "gc/g1/g1CardTable.hpp" +#include "gc/g1/g1CardTableClaimTable.hpp" #include "gc/g1/g1GCPhaseTimes.hpp" #include "gc/g1/g1HeapRegion.hpp" #include "gc/g1/g1OopClosures.hpp" @@ -65,20 +66,15 @@ private: G1CollectedHeap* _g1h; - G1CardTable* _ct; - G1Policy* _g1p; - - void print_merge_heap_roots_stats(); + G1Policy* _g1p; void assert_scan_top_is_null(uint hrm_index) NOT_DEBUG_RETURN; - void enqueue_for_reprocessing(CardValue* card_ptr); - public: // Initialize data that depends on the heap size being known. void initialize(uint max_num_regions); - G1RemSet(G1CollectedHeap* g1h, G1CardTable* ct); + G1RemSet(G1CollectedHeap* g1h); ~G1RemSet(); // Scan all cards in the non-collection set regions that potentially contain @@ -101,7 +97,7 @@ public: // Print coarsening stats. void print_coarsen_stats(); - // Creates a task for cleaining up temporary data structures and the + // Creates a task for cleaning up temporary data structures and the // card table, removing temporary duplicate detection information. G1AbstractSubTask* create_cleanup_after_scan_heap_roots_task(); // Excludes the given region from heap root scanning. @@ -122,16 +118,19 @@ public: G1GCPhaseTimes::GCParPhases scan_phase, G1GCPhaseTimes::GCParPhases objcopy_phase); - // Two methods for concurrent refinement support, executed concurrently to - // the mutator: - // Cleans the card at "*card_ptr_addr" before refinement, returns true iff the - // card needs later refinement. - bool clean_card_before_refine(CardValue** const card_ptr_addr); + enum RefineResult { + HasRefToCSet, // The (dirty) card has a reference to the collection set. + AlreadyToCSet, // The card is already one marked as having a reference to the collection set. + HasRefToOld, // The dirty card contains references to other old regions (not the collection set). + NoCrossRegion, // There is no interesting reference in the card any more. The mutator changed all + // references to such after dirtying the card. + CouldNotParse // The card is unparsable, need to retry later. + }; // Refine the region corresponding to "card_ptr". Must be called after // being filtered by clean_card_before_refine(), and after proper // fence/synchronization. - void refine_card_concurrently(CardValue* const card_ptr, - const uint worker_id); + RefineResult refine_card_concurrently(CardValue* const card_ptr, + const uint worker_id); // Print accumulated summary info from the start of the VM. 
void print_summary_info(); diff --git a/src/hotspot/share/gc/g1/g1RemSetSummary.cpp b/src/hotspot/share/gc/g1/g1RemSetSummary.cpp index 49cc993dac2..3e9cf938097 100644 --- a/src/hotspot/share/gc/g1/g1RemSetSummary.cpp +++ b/src/hotspot/share/gc/g1/g1RemSetSummary.cpp @@ -27,7 +27,6 @@ #include "gc/g1/g1CollectedHeap.inline.hpp" #include "gc/g1/g1ConcurrentRefine.hpp" #include "gc/g1/g1ConcurrentRefineThread.hpp" -#include "gc/g1/g1DirtyCardQueue.hpp" #include "gc/g1/g1HeapRegion.hpp" #include "gc/g1/g1HeapRegionRemSet.inline.hpp" #include "gc/g1/g1RemSet.hpp" @@ -37,39 +36,61 @@ #include "runtime/javaThread.hpp" void G1RemSetSummary::update() { - class CollectData : public ThreadClosure { + G1ConcurrentRefine* refine = G1CollectedHeap::heap()->concurrent_refine(); + + class CollectWorkerData : public ThreadClosure { G1RemSetSummary* _summary; uint _counter; public: - CollectData(G1RemSetSummary * summary) : _summary(summary), _counter(0) {} + CollectWorkerData(G1RemSetSummary* summary) : _summary(summary), _counter(0) {} virtual void do_thread(Thread* t) { G1ConcurrentRefineThread* crt = static_cast(t); - _summary->set_refine_thread_cpu_time(_counter, crt->cpu_time()); + _summary->set_worker_thread_cpu_time(_counter, crt->cpu_time()); _counter++; } } collector(this); - G1CollectedHeap* g1h = G1CollectedHeap::heap(); - g1h->concurrent_refine()->threads_do(&collector); + refine->worker_threads_do(&collector); + + class CollectControlData : public ThreadClosure { + G1RemSetSummary* _summary; + public: + CollectControlData(G1RemSetSummary* summary) : _summary(summary) {} + virtual void do_thread(Thread* t) { + G1ConcurrentRefineThread* crt = static_cast(t); + _summary->set_control_thread_cpu_time(crt->cpu_time()); + } + } control(this); + + refine->control_thread_do(&control); } -void G1RemSetSummary::set_refine_thread_cpu_time(uint thread, jlong value) { - assert(_refine_threads_cpu_times != nullptr, "just checking"); - assert(thread < _num_refine_threads, "just checking"); - _refine_threads_cpu_times[thread] = value; +void G1RemSetSummary::set_worker_thread_cpu_time(uint thread, jlong value) { + assert(_worker_threads_cpu_times != nullptr, "just checking"); + assert(thread < _num_worker_threads, "just checking"); + _worker_threads_cpu_times[thread] = value; } -jlong G1RemSetSummary::refine_thread_cpu_time(uint thread) const { - assert(_refine_threads_cpu_times != nullptr, "just checking"); - assert(thread < _num_refine_threads, "just checking"); - return _refine_threads_cpu_times[thread]; +void G1RemSetSummary::set_control_thread_cpu_time(jlong value) { + _control_thread_cpu_time = value; +} + +jlong G1RemSetSummary::worker_thread_cpu_time(uint thread) const { + assert(_worker_threads_cpu_times != nullptr, "just checking"); + assert(thread < _num_worker_threads, "just checking"); + return _worker_threads_cpu_times[thread]; +} + +jlong G1RemSetSummary::control_thread_cpu_time() const { + return _control_thread_cpu_time; } G1RemSetSummary::G1RemSetSummary(bool should_update) : - _num_refine_threads(G1ConcRefinementThreads), - _refine_threads_cpu_times(NEW_C_HEAP_ARRAY(jlong, _num_refine_threads, mtGC)) { + _num_worker_threads(G1ConcRefinementThreads), + _worker_threads_cpu_times(NEW_C_HEAP_ARRAY(jlong, _num_worker_threads, mtGC)), + _control_thread_cpu_time(0) { - memset(_refine_threads_cpu_times, 0, sizeof(jlong) * _num_refine_threads); + memset(_worker_threads_cpu_times, 0, sizeof(jlong) * _num_worker_threads); if (should_update) { update(); @@ -77,23 +98,25 @@ 
G1RemSetSummary::G1RemSetSummary(bool should_update) : } G1RemSetSummary::~G1RemSetSummary() { - FREE_C_HEAP_ARRAY(jlong, _refine_threads_cpu_times); + FREE_C_HEAP_ARRAY(jlong, _worker_threads_cpu_times); } void G1RemSetSummary::set(G1RemSetSummary* other) { assert(other != nullptr, "just checking"); - assert(_num_refine_threads == other->_num_refine_threads, "just checking"); + assert(_num_worker_threads == other->_num_worker_threads, "just checking"); - memcpy(_refine_threads_cpu_times, other->_refine_threads_cpu_times, sizeof(jlong) * _num_refine_threads); + memcpy(_worker_threads_cpu_times, other->_worker_threads_cpu_times, sizeof(jlong) * _num_worker_threads); + _control_thread_cpu_time = other->_control_thread_cpu_time; } void G1RemSetSummary::subtract_from(G1RemSetSummary* other) { assert(other != nullptr, "just checking"); - assert(_num_refine_threads == other->_num_refine_threads, "just checking"); + assert(_num_worker_threads == other->_num_worker_threads, "just checking"); - for (uint i = 0; i < _num_refine_threads; i++) { - set_refine_thread_cpu_time(i, other->refine_thread_cpu_time(i) - refine_thread_cpu_time(i)); + for (uint i = 0; i < _num_worker_threads; i++) { + set_worker_thread_cpu_time(i, other->worker_thread_cpu_time(i) - worker_thread_cpu_time(i)); } + _control_thread_cpu_time = other->_control_thread_cpu_time - _control_thread_cpu_time; } class G1PerRegionTypeRemSetCounters { @@ -376,9 +399,10 @@ public: void G1RemSetSummary::print_on(outputStream* out, bool show_thread_times) { if (show_thread_times) { out->print_cr(" Concurrent refinement threads times (s)"); + out->print_cr(" Control %5.2f Workers", (double)control_thread_cpu_time() / NANOSECS_PER_SEC); out->print(" "); - for (uint i = 0; i < _num_refine_threads; i++) { - out->print(" %5.2f", (double)refine_thread_cpu_time(i) / NANOSECS_PER_SEC); + for (uint i = 0; i < _num_worker_threads; i++) { + out->print(" %5.2f", (double)worker_thread_cpu_time(i) / NANOSECS_PER_SEC); } out->cr(); } diff --git a/src/hotspot/share/gc/g1/g1RemSetSummary.hpp b/src/hotspot/share/gc/g1/g1RemSetSummary.hpp index 373f38952c8..dd7d55d5a2e 100644 --- a/src/hotspot/share/gc/g1/g1RemSetSummary.hpp +++ b/src/hotspot/share/gc/g1/g1RemSetSummary.hpp @@ -33,10 +33,12 @@ class G1RemSet; // A G1RemSetSummary manages statistical information about the remembered set. class G1RemSetSummary { - size_t _num_refine_threads; - jlong* _refine_threads_cpu_times; + size_t _num_worker_threads; + jlong* _worker_threads_cpu_times; + jlong _control_thread_cpu_time; - void set_refine_thread_cpu_time(uint thread, jlong value); + void set_worker_thread_cpu_time(uint thread, jlong value); + void set_control_thread_cpu_time(jlong value); // Update this summary with current data from various places. void update(); @@ -53,7 +55,8 @@ public: void print_on(outputStream* out, bool show_thread_times); - jlong refine_thread_cpu_time(uint thread) const; + jlong worker_thread_cpu_time(uint thread) const; + jlong control_thread_cpu_time() const; }; #endif // SHARE_GC_G1_G1REMSETSUMMARY_HPP diff --git a/src/hotspot/share/gc/g1/g1ReviseYoungLengthTask.cpp b/src/hotspot/share/gc/g1/g1ReviseYoungLengthTask.cpp new file mode 100644 index 00000000000..2f7acd9b710 --- /dev/null +++ b/src/hotspot/share/gc/g1/g1ReviseYoungLengthTask.cpp @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "gc/g1/g1CollectedHeap.hpp" +#include "gc/g1/g1Policy.hpp" +#include "gc/g1/g1ReviseYoungLengthTask.hpp" +#include "gc/g1/g1ServiceThread.hpp" +#include "gc/shared/suspendibleThreadSet.hpp" + + +jlong G1ReviseYoungLengthTask::reschedule_delay_ms() const { + G1Policy* policy = G1CollectedHeap::heap()->policy(); + size_t available_bytes; + if (policy->try_get_available_bytes_estimate(available_bytes)) { + double predicted_time_to_next_gc_ms = policy->predict_time_to_next_gc_ms(available_bytes); + + // Use a prime number close to 50ms as minimum time, different to other components + // that derive their wait time from the try_get_available_bytes_estimate() call + // to minimize interference. + uint64_t const min_wait_time_ms = 47; + + return policy->adjust_wait_time_ms(predicted_time_to_next_gc_ms, min_wait_time_ms); + } else { + // Failed to get estimate of available bytes. Try again asap. 
+ return 1; + } +} + +class G1ReviseYoungLengthTask::RemSetSamplingClosure : public G1HeapRegionClosure { + size_t _sampled_code_root_rs_length; + +public: + RemSetSamplingClosure() : _sampled_code_root_rs_length(0) { } + + bool do_heap_region(G1HeapRegion* r) override { + G1HeapRegionRemSet* rem_set = r->rem_set(); + _sampled_code_root_rs_length += rem_set->code_roots_list_length(); + return false; + } + + size_t sampled_code_root_rs_length() const { return _sampled_code_root_rs_length; } +}; + +void G1ReviseYoungLengthTask::adjust_young_list_target_length() { + G1CollectedHeap* g1h = G1CollectedHeap::heap(); + G1Policy* policy = g1h->policy(); + + assert(policy->use_adaptive_young_list_length(), "should not call otherwise"); + + size_t pending_cards; + size_t current_to_collection_set_cards; + { + MutexLocker x(G1ReviseYoungLength_lock, Mutex::_no_safepoint_check_flag); + pending_cards = policy->current_pending_cards(); + current_to_collection_set_cards = policy->current_to_collection_set_cards(); + } + + RemSetSamplingClosure cl; + g1h->collection_set()->iterate(&cl); + + policy->revise_young_list_target_length(pending_cards, + current_to_collection_set_cards, + cl.sampled_code_root_rs_length()); +} + +G1ReviseYoungLengthTask::G1ReviseYoungLengthTask(const char* name) : + G1ServiceTask(name) { } + +void G1ReviseYoungLengthTask::execute() { + SuspendibleThreadSetJoiner sts; + + adjust_young_list_target_length(); + + schedule(reschedule_delay_ms()); +} diff --git a/src/hotspot/share/gc/g1/g1ReviseYoungLengthTask.hpp b/src/hotspot/share/gc/g1/g1ReviseYoungLengthTask.hpp new file mode 100644 index 00000000000..baa8af75fb7 --- /dev/null +++ b/src/hotspot/share/gc/g1/g1ReviseYoungLengthTask.hpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef SHARE_GC_G1_G1REVISEYOUNGLENGTHTASK_HPP +#define SHARE_GC_G1_G1REVISEYOUNGLENGTHTASK_HPP + +#include "gc/g1/g1CardSetMemory.hpp" +#include "gc/g1/g1HeapRegionRemSet.hpp" +#include "gc/g1/g1MonotonicArenaFreePool.hpp" +#include "gc/g1/g1ServiceThread.hpp" +#include "utilities/growableArray.hpp" +#include "utilities/ticks.hpp" + +// ServiceTask to revise the young generation target length. +class G1ReviseYoungLengthTask : public G1ServiceTask { + + // The delay used to reschedule this task. + jlong reschedule_delay_ms() const; + + class RemSetSamplingClosure; // Helper class for calculating remembered set summary. 
+ + // Adjust the target length (in regions) of the young gen, based on the + // current length of the remembered sets. + // + // At the end of the GC G1 determines the length of the young gen based on + // how much time the next GC can take, and when the next GC may occur + // according to the MMU. + // + // The assumption is that a significant part of the GC is spent on scanning + // the remembered sets (and many other components), so this thread constantly + // reevaluates the prediction for the remembered set scanning costs, and potentially + // resizes the young gen. This may do a premature GC or even increase the young + // gen size to keep pause time length goal. + void adjust_young_list_target_length(); + +public: + explicit G1ReviseYoungLengthTask(const char* name); + + void execute() override; +}; + +#endif // SHARE_GC_G1_G1REVISEYOUNGLENGTHTASK_HPP \ No newline at end of file diff --git a/src/hotspot/share/gc/g1/g1ThreadLocalData.hpp b/src/hotspot/share/gc/g1/g1ThreadLocalData.hpp index d0dcb59d7f0..858081b0581 100644 --- a/src/hotspot/share/gc/g1/g1ThreadLocalData.hpp +++ b/src/hotspot/share/gc/g1/g1ThreadLocalData.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -25,7 +25,7 @@ #define SHARE_GC_G1_G1THREADLOCALDATA_HPP #include "gc/g1/g1BarrierSet.hpp" -#include "gc/g1/g1DirtyCardQueue.hpp" +#include "gc/g1/g1CardTable.hpp" #include "gc/g1/g1RegionPinCache.hpp" #include "gc/shared/gc_globals.hpp" #include "gc/shared/satbMarkQueue.hpp" @@ -36,7 +36,7 @@ class G1ThreadLocalData { private: SATBMarkQueue _satb_mark_queue; - G1DirtyCardQueue _dirty_card_queue; + G1CardTable::CardValue* _byte_map_base; // Per-thread cache of pinned object count to reduce atomic operation traffic // due to region pinning. 
Holds the last region where the mutator pinned an @@ -45,8 +45,8 @@ private: G1ThreadLocalData() : _satb_mark_queue(&G1BarrierSet::satb_mark_queue_set()), - _dirty_card_queue(&G1BarrierSet::dirty_card_queue_set()), - _pin_cache() {} + _byte_map_base(nullptr), + _pin_cache() { } static G1ThreadLocalData* data(Thread* thread) { assert(UseG1GC, "Sanity"); @@ -57,10 +57,6 @@ private: return Thread::gc_data_offset() + byte_offset_of(G1ThreadLocalData, _satb_mark_queue); } - static ByteSize dirty_card_queue_offset() { - return Thread::gc_data_offset() + byte_offset_of(G1ThreadLocalData, _dirty_card_queue); - } - public: static void create(Thread* thread) { new (data(thread)) G1ThreadLocalData(); @@ -74,10 +70,6 @@ public: return data(thread)->_satb_mark_queue; } - static G1DirtyCardQueue& dirty_card_queue(Thread* thread) { - return data(thread)->_dirty_card_queue; - } - static ByteSize satb_mark_queue_active_offset() { return satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active(); } @@ -90,14 +82,20 @@ public: return satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_buf(); } - static ByteSize dirty_card_queue_index_offset() { - return dirty_card_queue_offset() + G1DirtyCardQueue::byte_offset_of_index(); + static ByteSize card_table_base_offset() { + return Thread::gc_data_offset() + byte_offset_of(G1ThreadLocalData, _byte_map_base); } - static ByteSize dirty_card_queue_buffer_offset() { - return dirty_card_queue_offset() + G1DirtyCardQueue::byte_offset_of_buf(); + static void set_byte_map_base(Thread* thread, G1CardTable::CardValue* new_byte_map_base) { + data(thread)->_byte_map_base = new_byte_map_base; } +#ifndef PRODUCT + static G1CardTable::CardValue* get_byte_map_base(Thread* thread) { + return data(thread)->_byte_map_base; + } +#endif + static G1RegionPinCache& pin_count_cache(Thread* thread) { return data(thread)->_pin_cache; } diff --git a/src/hotspot/share/gc/g1/g1YoungCollector.cpp b/src/hotspot/share/gc/g1/g1YoungCollector.cpp index ee25e5fc028..e97e59575e3 100644 --- a/src/hotspot/share/gc/g1/g1YoungCollector.cpp +++ b/src/hotspot/share/gc/g1/g1YoungCollector.cpp @@ -39,7 +39,6 @@ #include "gc/g1/g1MonitoringSupport.hpp" #include "gc/g1/g1ParScanThreadState.inline.hpp" #include "gc/g1/g1Policy.hpp" -#include "gc/g1/g1RedirtyCardsQueue.hpp" #include "gc/g1/g1RegionPinCache.inline.hpp" #include "gc/g1/g1RemSet.hpp" #include "gc/g1/g1RootProcessor.hpp" @@ -914,13 +913,8 @@ class G1STWRefProcProxyTask : public RefProcProxyTask { TaskTerminator _terminator; G1ScannerTasksQueueSet& _task_queues; - // Special closure for enqueuing discovered fields: during enqueue the card table - // may not be in shape to properly handle normal barrier calls (e.g. card marks - // in regions that failed evacuation, scribbling of various values by card table - // scan code). Additionally the regular barrier enqueues into the "global" - // DCQS, but during GC we need these to-be-refined entries in the GC local queue - // so that after clearing the card table, the redirty cards phase will properly - // mark all dirty cards to be picked up by refinement. + // G1 specific closure for marking discovered fields. Need to mark the card in the + // refinement table as the card table is in use by garbage collection. 
class G1EnqueueDiscoveredFieldClosure : public EnqueueDiscoveredFieldClosure { G1CollectedHeap* _g1h; G1ParScanThreadState* _pss; diff --git a/src/hotspot/share/gc/g1/g1YoungCollector.hpp b/src/hotspot/share/gc/g1/g1YoungCollector.hpp index 2c4929958fe..76d443b1a9f 100644 --- a/src/hotspot/share/gc/g1/g1YoungCollector.hpp +++ b/src/hotspot/share/gc/g1/g1YoungCollector.hpp @@ -45,7 +45,6 @@ class G1MonotonicArenaMemoryStats; class G1NewTracer; class G1ParScanThreadStateSet; class G1Policy; -class G1RedirtyCardsQueueSet; class G1RemSet; class G1SurvivorRegions; class G1YoungGCAllocationFailureInjector; diff --git a/src/hotspot/share/gc/g1/g1YoungGCPostEvacuateTasks.cpp b/src/hotspot/share/gc/g1/g1YoungGCPostEvacuateTasks.cpp index 5b13e8fc206..2737def7e84 100644 --- a/src/hotspot/share/gc/g1/g1YoungGCPostEvacuateTasks.cpp +++ b/src/hotspot/share/gc/g1/g1YoungGCPostEvacuateTasks.cpp @@ -287,7 +287,7 @@ public: _chunk_bitmap(mtGC) { _num_evac_fail_regions = _evac_failure_regions->num_regions_evac_failed(); - _num_chunks_per_region = G1CollectedHeap::get_chunks_per_region(); + _num_chunks_per_region = G1CollectedHeap::get_chunks_per_region_for_scan(); _chunk_size = static_cast(G1HeapRegion::GrainWords / _num_chunks_per_region); @@ -300,7 +300,7 @@ public: double worker_cost() const override { assert(_evac_failure_regions->has_regions_evac_failed(), "Should not call this if there were no evacuation failures"); - double workers_per_region = (double)G1CollectedHeap::get_chunks_per_region() / G1RestoreRetainedRegionChunksPerWorker; + double workers_per_region = (double)G1CollectedHeap::get_chunks_per_region_for_scan() / G1RestoreRetainedRegionChunksPerWorker; return workers_per_region * _evac_failure_regions->num_regions_evac_failed(); } @@ -480,43 +480,6 @@ public: } }; -class RedirtyLoggedCardTableEntryClosure : public G1CardTableEntryClosure { - size_t _num_dirtied; - G1CollectedHeap* _g1h; - G1CardTable* _g1_ct; - G1EvacFailureRegions* _evac_failure_regions; - - G1HeapRegion* region_for_card(CardValue* card_ptr) const { - return _g1h->heap_region_containing(_g1_ct->addr_for(card_ptr)); - } - - bool will_become_free(G1HeapRegion* hr) const { - // A region will be freed by during the FreeCollectionSet phase if the region is in the - // collection set and has not had an evacuation failure. - return _g1h->is_in_cset(hr) && !_evac_failure_regions->contains(hr->hrm_index()); - } - -public: - RedirtyLoggedCardTableEntryClosure(G1CollectedHeap* g1h, G1EvacFailureRegions* evac_failure_regions) : - G1CardTableEntryClosure(), - _num_dirtied(0), - _g1h(g1h), - _g1_ct(g1h->card_table()), - _evac_failure_regions(evac_failure_regions) { } - - void do_card_ptr(CardValue* card_ptr) override { - G1HeapRegion* hr = region_for_card(card_ptr); - - // Should only dirty cards in regions that won't be freed. 
- if (!will_become_free(hr)) { - *card_ptr = G1CardTable::dirty_card_val(); - _num_dirtied++; - } - } - - size_t num_dirtied() const { return _num_dirtied; } -}; - class G1PostEvacuateCollectionSetCleanupTask2::ProcessEvacuationFailedRegionsTask : public G1AbstractSubTask { G1EvacFailureRegions* _evac_failure_regions; G1HeapRegionClaimer _claimer; @@ -572,48 +535,6 @@ public: } }; -class G1PostEvacuateCollectionSetCleanupTask2::RedirtyLoggedCardsTask : public G1AbstractSubTask { - BufferNodeList* _rdc_buffers; - uint _num_buffer_lists; - G1EvacFailureRegions* _evac_failure_regions; - -public: - RedirtyLoggedCardsTask(G1EvacFailureRegions* evac_failure_regions, BufferNodeList* rdc_buffers, uint num_buffer_lists) : - G1AbstractSubTask(G1GCPhaseTimes::RedirtyCards), - _rdc_buffers(rdc_buffers), - _num_buffer_lists(num_buffer_lists), - _evac_failure_regions(evac_failure_regions) { } - - double worker_cost() const override { - // Needs more investigation. - return G1CollectedHeap::heap()->workers()->active_workers(); - } - - void do_work(uint worker_id) override { - RedirtyLoggedCardTableEntryClosure cl(G1CollectedHeap::heap(), _evac_failure_regions); - - uint start = worker_id; - for (uint i = 0; i < _num_buffer_lists; i++) { - uint index = (start + i) % _num_buffer_lists; - - BufferNode* next = AtomicAccess::load(&_rdc_buffers[index]._head); - BufferNode* tail = AtomicAccess::load(&_rdc_buffers[index]._tail); - - while (next != nullptr) { - BufferNode* node = next; - next = AtomicAccess::cmpxchg(&_rdc_buffers[index]._head, node, (node != tail ) ? node->next() : nullptr); - if (next == node) { - cl.apply_to_buffer(node, worker_id); - next = (node != tail ) ? node->next() : nullptr; - } else { - break; // If there is contention, move to the next BufferNodeList - } - } - } - record_work_item(worker_id, 0, cl.num_dirtied()); - } -}; - // Helper class to keep statistics for the collection set freeing class FreeCSetStats { size_t _before_used_bytes; // Usage in regions successfully evacuate @@ -797,7 +718,6 @@ public: JFREventForRegion event(r, _worker_id); TimerForRegion timer(timer_for_region(r)); - if (r->is_young()) { assert_tracks_surviving_words(r); r->record_surv_words_in_group(_surviving_young_words[r->young_index_in_cset()]); @@ -908,24 +828,34 @@ public: } }; -class G1PostEvacuateCollectionSetCleanupTask2::ResizeTLABsTask : public G1AbstractSubTask { +class G1PostEvacuateCollectionSetCleanupTask2::ResizeTLABsAndSwapCardTableTask : public G1AbstractSubTask { G1JavaThreadsListClaimer _claimer; // There is not much work per thread so the number of threads per worker is high. 
static const uint ThreadsPerWorker = 250; public: - ResizeTLABsTask() : G1AbstractSubTask(G1GCPhaseTimes::ResizeThreadLABs), _claimer(ThreadsPerWorker) { } + ResizeTLABsAndSwapCardTableTask() + : G1AbstractSubTask(G1GCPhaseTimes::ResizeThreadLABs), _claimer(ThreadsPerWorker) + { + G1BarrierSet::g1_barrier_set()->swap_global_card_table(); + } void do_work(uint worker_id) override { - class ResizeClosure : public ThreadClosure { + + class ResizeAndSwapCardTableClosure : public ThreadClosure { public: void do_thread(Thread* thread) { - static_cast<JavaThread*>(thread)->tlab().resize(); + if (UseTLAB && ResizeTLAB) { + static_cast<JavaThread*>(thread)->tlab().resize(); + } + + G1BarrierSet::g1_barrier_set()->update_card_table_base(thread); } - } cl; - _claimer.apply(&cl); + } resize_and_swap_cl; + + _claimer.apply(&resize_and_swap_cl); } double worker_cost() const override { @@ -968,13 +898,8 @@ G1PostEvacuateCollectionSetCleanupTask2::G1PostEvacuateCollectionSetCleanupTask2 if (evac_failure_regions->has_regions_evac_failed()) { add_parallel_task(new ProcessEvacuationFailedRegionsTask(evac_failure_regions)); } - add_parallel_task(new RedirtyLoggedCardsTask(evac_failure_regions, - per_thread_states->rdc_buffers(), - per_thread_states->num_workers())); - if (UseTLAB && ResizeTLAB) { - add_parallel_task(new ResizeTLABsTask()); - } + add_parallel_task(new ResizeTLABsAndSwapCardTableTask()); add_parallel_task(new FreeCollectionSetTask(evacuation_info, per_thread_states->surviving_young_words(), evac_failure_regions)); diff --git a/src/hotspot/share/gc/g1/g1YoungGCPostEvacuateTasks.hpp b/src/hotspot/share/gc/g1/g1YoungGCPostEvacuateTasks.hpp index ad850af2eac..bc3a08e2080 100644 --- a/src/hotspot/share/gc/g1/g1YoungGCPostEvacuateTasks.hpp +++ b/src/hotspot/share/gc/g1/g1YoungGCPostEvacuateTasks.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2021, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -55,9 +55,8 @@ public: // - Eagerly Reclaim Humongous Objects (s) // - Update Derived Pointers (s) // - Clear Retained Region Data (on evacuation failure) -// - Redirty Logged Cards // - Free Collection Set -// - Resize TLABs +// - Resize TLABs and Swap Card Table // - Reset the reusable PartialArrayStateManager.
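The combined ResizeTLABsAndSwapCardTableTask above concentrates the table exchange at one point in the pause: its constructor runs once, before the parallel workers start, and calls swap_global_card_table(); each worker then walks its claimed slice of Java threads, optionally resizing the TLAB and re-pointing the thread's card table base via update_card_table_base(). A rough, self-contained sketch of that two-step shape, with placeholder types (CardTablePtrHolder, MutatorThread) and guessed bodies rather than the HotSpot implementations:

    #include <atomic>

    struct CardTablePtrHolder { std::atomic<void*> current; void* refinement; };
    struct MutatorThread      { void* card_table_base; };

    // Step 1 (serial, in the task constructor): the two global tables trade roles.
    static void swap_global_card_table(CardTablePtrHolder& tables) {
      void* old_current = tables.current.exchange(tables.refinement);
      tables.refinement = old_current;
    }

    // Step 2 (parallel, once per claimed Java thread): publish the new base so the
    // thread's write barrier marks into the other table after the pause.
    static void update_card_table_base(CardTablePtrHolder& tables, MutatorThread& t) {
      t.card_table_base = tables.current.load();
    }

Folding the base update into the existing TLAB-resize walk avoids a second pass over all Java threads at the end of the pause.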
class G1PostEvacuateCollectionSetCleanupTask2 : public G1BatchedTask { class EagerlyReclaimHumongousObjectsTask; @@ -66,9 +65,8 @@ class G1PostEvacuateCollectionSetCleanupTask2 : public G1BatchedTask { #endif class ProcessEvacuationFailedRegionsTask; - class RedirtyLoggedCardsTask; class FreeCollectionSetTask; - class ResizeTLABsTask; + class ResizeTLABsAndSwapCardTableTask; class ResetPartialArrayStateManagerTask; public: diff --git a/src/hotspot/share/gc/g1/g1YoungGCPreEvacuateTasks.cpp b/src/hotspot/share/gc/g1/g1YoungGCPreEvacuateTasks.cpp index 7214d624def..b11213ddeb3 100644 --- a/src/hotspot/share/gc/g1/g1YoungGCPreEvacuateTasks.cpp +++ b/src/hotspot/share/gc/g1/g1YoungGCPreEvacuateTasks.cpp @@ -24,7 +24,6 @@ #include "gc/g1/g1CollectedHeap.inline.hpp" #include "gc/g1/g1ConcurrentRefineStats.hpp" -#include "gc/g1/g1DirtyCardQueue.hpp" #include "gc/g1/g1RegionPinCache.inline.hpp" #include "gc/g1/g1ThreadLocalData.hpp" #include "gc/g1/g1YoungGCPreEvacuateTasks.hpp" @@ -35,23 +34,21 @@ #include "runtime/thread.inline.hpp" #include "runtime/threads.hpp" -class G1PreEvacuateCollectionSetBatchTask::JavaThreadRetireTLABAndFlushLogs : public G1AbstractSubTask { +class G1PreEvacuateCollectionSetBatchTask::JavaThreadRetireTLABs : public G1AbstractSubTask { G1JavaThreadsListClaimer _claimer; // Per worker thread statistics. ThreadLocalAllocStats* _local_tlab_stats; - G1ConcurrentRefineStats* _local_refinement_stats; uint _num_workers; // There is relatively little work to do per thread. static const uint ThreadsPerWorker = 250; - struct RetireTLABAndFlushLogsClosure : public ThreadClosure { + struct RetireTLABClosure : public ThreadClosure { ThreadLocalAllocStats _tlab_stats; - G1ConcurrentRefineStats _refinement_stats; - RetireTLABAndFlushLogsClosure() : _tlab_stats(), _refinement_stats() { } + RetireTLABClosure() : _tlab_stats() { } void do_thread(Thread* thread) override { assert(thread->is_Java_thread(), "must be"); @@ -61,37 +58,29 @@ class G1PreEvacuateCollectionSetBatchTask::JavaThreadRetireTLABAndFlushLogs : pu if (UseTLAB) { thread->retire_tlab(&_tlab_stats); } - // Concatenate logs. - G1DirtyCardQueueSet& qset = G1BarrierSet::dirty_card_queue_set(); - _refinement_stats += qset.concatenate_log_and_stats(thread); // Flush region pin count cache. 
G1ThreadLocalData::pin_count_cache(thread).flush(); } }; public: - JavaThreadRetireTLABAndFlushLogs() : - G1AbstractSubTask(G1GCPhaseTimes::RetireTLABsAndFlushLogs), + JavaThreadRetireTLABs() : + G1AbstractSubTask(G1GCPhaseTimes::RetireTLABs), _claimer(ThreadsPerWorker), _local_tlab_stats(nullptr), - _local_refinement_stats(nullptr), _num_workers(0) { } - ~JavaThreadRetireTLABAndFlushLogs() { - static_assert(std::is_trivially_destructible::value, "must be"); - FREE_C_HEAP_ARRAY(G1ConcurrentRefineStats, _local_refinement_stats); - + ~JavaThreadRetireTLABs() { static_assert(std::is_trivially_destructible::value, "must be"); FREE_C_HEAP_ARRAY(ThreadLocalAllocStats, _local_tlab_stats); } void do_work(uint worker_id) override { - RetireTLABAndFlushLogsClosure tc; + RetireTLABClosure tc; _claimer.apply(&tc); _local_tlab_stats[worker_id] = tc._tlab_stats; - _local_refinement_stats[worker_id] = tc._refinement_stats; } double worker_cost() const override { @@ -101,11 +90,9 @@ public: void set_max_workers(uint max_workers) override { _num_workers = max_workers; _local_tlab_stats = NEW_C_HEAP_ARRAY(ThreadLocalAllocStats, _num_workers, mtGC); - _local_refinement_stats = NEW_C_HEAP_ARRAY(G1ConcurrentRefineStats, _num_workers, mtGC); for (uint i = 0; i < _num_workers; i++) { ::new (&_local_tlab_stats[i]) ThreadLocalAllocStats(); - ::new (&_local_refinement_stats[i]) G1ConcurrentRefineStats(); } } @@ -116,85 +103,15 @@ public: } return result; } - - G1ConcurrentRefineStats refinement_stats() const { - G1ConcurrentRefineStats result; - for (uint i = 0; i < _num_workers; i++) { - result += _local_refinement_stats[i]; - } - return result; - } -}; - -class G1PreEvacuateCollectionSetBatchTask::NonJavaThreadFlushLogs : public G1AbstractSubTask { - struct FlushLogsClosure : public ThreadClosure { - G1ConcurrentRefineStats _refinement_stats; - - FlushLogsClosure() : _refinement_stats() { } - - void do_thread(Thread* thread) override { - G1DirtyCardQueueSet& qset = G1BarrierSet::dirty_card_queue_set(); - _refinement_stats += qset.concatenate_log_and_stats(thread); - - assert(G1ThreadLocalData::pin_count_cache(thread).count() == 0, "NonJava thread has pinned Java objects"); - } - } _tc; - -public: - NonJavaThreadFlushLogs() : G1AbstractSubTask(G1GCPhaseTimes::NonJavaThreadFlushLogs), _tc() { } - - void do_work(uint worker_id) override { - Threads::non_java_threads_do(&_tc); - } - - double worker_cost() const override { - return 1.0; - } - - G1ConcurrentRefineStats refinement_stats() const { return _tc._refinement_stats; } }; G1PreEvacuateCollectionSetBatchTask::G1PreEvacuateCollectionSetBatchTask() : G1BatchedTask("Pre Evacuate Prepare", G1CollectedHeap::heap()->phase_times()), - _old_pending_cards(G1BarrierSet::dirty_card_queue_set().num_cards()), - _java_retire_task(new JavaThreadRetireTLABAndFlushLogs()), - _non_java_retire_task(new NonJavaThreadFlushLogs()) { + _java_retire_task(new JavaThreadRetireTLABs()) { - // Disable mutator refinement until concurrent refinement decides otherwise. 
- G1BarrierSet::dirty_card_queue_set().set_mutator_refinement_threshold(SIZE_MAX); - - add_serial_task(_non_java_retire_task); add_parallel_task(_java_retire_task); } -static void verify_empty_dirty_card_logs() { -#ifdef ASSERT - ResourceMark rm; - - struct Verifier : public ThreadClosure { - Verifier() {} - void do_thread(Thread* t) override { - G1DirtyCardQueue& queue = G1ThreadLocalData::dirty_card_queue(t); - assert(queue.is_empty(), "non-empty dirty card queue for thread %s", t->name()); - } - } verifier; - Threads::threads_do(&verifier); -#endif -} - G1PreEvacuateCollectionSetBatchTask::~G1PreEvacuateCollectionSetBatchTask() { _java_retire_task->tlab_stats().publish(); - - G1DirtyCardQueueSet& qset = G1BarrierSet::dirty_card_queue_set(); - - G1ConcurrentRefineStats total_refinement_stats; - total_refinement_stats += _java_retire_task->refinement_stats(); - total_refinement_stats += _non_java_retire_task->refinement_stats(); - qset.update_refinement_stats(total_refinement_stats); - - verify_empty_dirty_card_logs(); - - size_t pending_cards = qset.num_cards(); - size_t thread_buffer_cards = pending_cards - _old_pending_cards; - G1CollectedHeap::heap()->policy()->record_concurrent_refinement_stats(pending_cards, thread_buffer_cards); } diff --git a/src/hotspot/share/gc/g1/g1YoungGCPreEvacuateTasks.hpp b/src/hotspot/share/gc/g1/g1YoungGCPreEvacuateTasks.hpp index 791031d979f..7574862872c 100644 --- a/src/hotspot/share/gc/g1/g1YoungGCPreEvacuateTasks.hpp +++ b/src/hotspot/share/gc/g1/g1YoungGCPreEvacuateTasks.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2023, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -28,18 +28,13 @@ #include "gc/g1/g1BatchedTask.hpp" // Set of pre evacuate collection set tasks containing ("s" means serial): -// - Retire TLAB and Flush Logs (Java threads) +// - Retire TLABs (Java threads) // - Flush pin count cache (Java threads) -// - Flush Logs (s) (Non-Java threads) class G1PreEvacuateCollectionSetBatchTask : public G1BatchedTask { - class JavaThreadRetireTLABAndFlushLogs; - class NonJavaThreadFlushLogs; - - size_t _old_pending_cards; + class JavaThreadRetireTLABs; // References to the tasks to retain access to statistics. 
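With the per-thread queues gone, the pre-evacuate batch above shrinks to TLAB retirement and pin-count-cache flushing; there are no logs to concatenate and no detached refinement statistics to merge. Whether refinement work is worthwhile is instead driven by the number of dirty cards sitting on the card table: the g1_globals.hpp hunk below replaces G1UpdateBufferSize with the diagnostic G1PerThreadPendingCardThreshold, the number of pending cards allowed per GC worker thread before refinement is considered. A tiny sketch of that kind of trigger check, using a hypothetical helper (should_start_refinement) rather than anything in the patch:

    #include <cstddef>

    // Hypothetical trigger shape: with few enough pending cards it is cheaper to
    // leave them to be merged during the next pause than to start refinement.
    static bool should_start_refinement(size_t pending_cards,
                                        unsigned gc_workers,
                                        size_t per_thread_threshold /* G1PerThreadPendingCardThreshold */) {
      return pending_cards > static_cast<size_t>(gc_workers) * per_thread_threshold;
    }

The Juggle2 test added later in this patch sets the threshold to zero (together with -XX:+UnlockDiagnosticVMOptions) so that refinement starts even with very few pending cards, stressing its interaction with (mostly) full collections.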
- JavaThreadRetireTLABAndFlushLogs* _java_retire_task; - NonJavaThreadFlushLogs* _non_java_retire_task; + JavaThreadRetireTLABs* _java_retire_task; public: G1PreEvacuateCollectionSetBatchTask(); diff --git a/src/hotspot/share/gc/g1/g1_globals.hpp b/src/hotspot/share/gc/g1/g1_globals.hpp index 1c712492f74..b338c11d5be 100644 --- a/src/hotspot/share/gc/g1/g1_globals.hpp +++ b/src/hotspot/share/gc/g1/g1_globals.hpp @@ -162,6 +162,11 @@ "a single expand attempt.") \ range(0, 100) \ \ + product(size_t, G1PerThreadPendingCardThreshold, 256, DIAGNOSTIC, \ + "Number of pending cards allowed on the card table per GC " \ + "worker thread before considering starting refinement.") \ + range(0, UINT_MAX) \ + \ product(uint, G1ShrinkByPercentOfAvailable, 50, DIAGNOSTIC, \ "When shrinking, maximum % of free space to free for a single " \ "shrink attempt.") \ @@ -188,10 +193,6 @@ "bound of acceptable deviation range.") \ constraint(G1CPUUsageShrinkConstraintFunc, AfterErgo) \ \ - product(size_t, G1UpdateBufferSize, 256, \ - "Size of an update buffer") \ - constraint(G1UpdateBufferSizeConstraintFunc, AfterErgo) \ - \ product(uint, G1RSetUpdatingPauseTimePercent, 10, \ "A target percentage of time that is allowed to be spend on " \ "processing remembered set update buffers during the collection " \ diff --git a/src/hotspot/share/gc/g1/jvmFlagConstraintsG1.cpp b/src/hotspot/share/gc/g1/jvmFlagConstraintsG1.cpp index 488a9c7aac9..2b084b387bc 100644 --- a/src/hotspot/share/gc/g1/jvmFlagConstraintsG1.cpp +++ b/src/hotspot/share/gc/g1/jvmFlagConstraintsG1.cpp @@ -206,12 +206,6 @@ JVMFlag::Error G1SATBBufferSizeConstraintFunc(size_t value, bool verbose) { verbose); } -JVMFlag::Error G1UpdateBufferSizeConstraintFunc(size_t value, bool verbose) { - return buffer_size_constraint_helper(FLAG_MEMBER_ENUM(G1UpdateBufferSize), - value, - verbose); -} - JVMFlag::Error gc_cpu_usage_threshold_helper(JVMFlagsEnum flagid, uint value, bool verbose) { diff --git a/src/hotspot/share/gc/g1/jvmFlagConstraintsG1.hpp b/src/hotspot/share/gc/g1/jvmFlagConstraintsG1.hpp index 89f05d73dcc..b2c7bb6dc96 100644 --- a/src/hotspot/share/gc/g1/jvmFlagConstraintsG1.hpp +++ b/src/hotspot/share/gc/g1/jvmFlagConstraintsG1.hpp @@ -47,7 +47,6 @@ \ /* G1 PtrQueue buffer size constraints */ \ f(size_t, G1SATBBufferSizeConstraintFunc) \ - f(size_t, G1UpdateBufferSizeConstraintFunc) \ \ /* G1 GC deviation counter threshold constraints */ \ f(uint, G1CPUUsageExpandConstraintFunc) \ diff --git a/src/hotspot/share/gc/g1/vmStructs_g1.hpp b/src/hotspot/share/gc/g1/vmStructs_g1.hpp index 651808b4ba0..67c930e1b63 100644 --- a/src/hotspot/share/gc/g1/vmStructs_g1.hpp +++ b/src/hotspot/share/gc/g1/vmStructs_g1.hpp @@ -82,8 +82,7 @@ declare_constant(G1HeapRegionType::StartsHumongousTag) \ declare_constant(G1HeapRegionType::ContinuesHumongousTag) \ declare_constant(G1HeapRegionType::OldMask) \ - declare_constant(BarrierSet::G1BarrierSet) \ - declare_constant(G1CardTable::g1_young_gen) + declare_constant(BarrierSet::G1BarrierSet) #define VM_TYPES_G1GC(declare_type, \ declare_toplevel_type, \ @@ -100,7 +99,6 @@ declare_toplevel_type(PtrQueue) \ declare_toplevel_type(G1HeapRegionType) \ declare_toplevel_type(SATBMarkQueue) \ - declare_toplevel_type(G1DirtyCardQueue) \ \ declare_toplevel_type(G1CollectedHeap*) \ declare_toplevel_type(G1HeapRegion*) \ diff --git a/src/hotspot/share/gc/shared/bufferNodeList.cpp b/src/hotspot/share/gc/shared/bufferNodeList.cpp deleted file mode 100644 index 768f40e0985..00000000000 --- a/src/hotspot/share/gc/shared/bufferNodeList.cpp 
+++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2019, 2025, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - * - */ - -#include "gc/shared/bufferNodeList.hpp" -#include "utilities/debug.hpp" - -BufferNodeList::BufferNodeList() : - _head(nullptr), _tail(nullptr), _entry_count(0) {} - -BufferNodeList::BufferNodeList(BufferNode* head, - BufferNode* tail, - size_t entry_count) : - _head(head), _tail(tail), _entry_count(entry_count) -{ - assert((_head == nullptr) == (_tail == nullptr), "invariant"); - assert((_head == nullptr) == (_entry_count == 0), "invariant"); -} diff --git a/src/hotspot/share/gc/shared/cardTable.cpp b/src/hotspot/share/gc/shared/cardTable.cpp index 76b8eb4d718..34f1847befe 100644 --- a/src/hotspot/share/gc/shared/cardTable.cpp +++ b/src/hotspot/share/gc/shared/cardTable.cpp @@ -225,6 +225,9 @@ uintx CardTable::ct_max_alignment_constraint() { #ifndef PRODUCT void CardTable::verify_region(MemRegion mr, CardValue val, bool val_equals) { + if (mr.is_empty()) { + return; + } CardValue* start = byte_for(mr.start()); CardValue* end = byte_for(mr.last()); bool failures = false; @@ -255,7 +258,8 @@ void CardTable::verify_dirty_region(MemRegion mr) { } #endif -void CardTable::print_on(outputStream* st) const { - st->print_cr("Card table byte_map: [" PTR_FORMAT "," PTR_FORMAT "] _byte_map_base: " PTR_FORMAT, +void CardTable::print_on(outputStream* st, const char* description) const { + st->print_cr("%s table byte_map: [" PTR_FORMAT "," PTR_FORMAT "] _byte_map_base: " PTR_FORMAT, + description, p2i(_byte_map), p2i(_byte_map + _byte_map_size), p2i(_byte_map_base)); } diff --git a/src/hotspot/share/gc/shared/cardTable.hpp b/src/hotspot/share/gc/shared/cardTable.hpp index ee41be06be0..63dcfe7aecb 100644 --- a/src/hotspot/share/gc/shared/cardTable.hpp +++ b/src/hotspot/share/gc/shared/cardTable.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2000, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -203,12 +203,12 @@ public: virtual bool is_in_young(const void* p) const = 0; - // Print a description of the memory for the card table - virtual void print_on(outputStream* st) const; + // Print card table information. 
+ void print_on(outputStream* st, const char* description = "Card") const; // val_equals -> it will check that all cards covered by mr equal val // !val_equals -> it will check that all cards covered by mr do not equal val - void verify_region(MemRegion mr, CardValue val, bool val_equals) PRODUCT_RETURN; + virtual void verify_region(MemRegion mr, CardValue val, bool val_equals) PRODUCT_RETURN; void verify_not_dirty_region(MemRegion mr) PRODUCT_RETURN; void verify_dirty_region(MemRegion mr) PRODUCT_RETURN; }; diff --git a/src/hotspot/share/gc/shared/workerDataArray.hpp b/src/hotspot/share/gc/shared/workerDataArray.hpp index b2a81bc9482..587f9bbd167 100644 --- a/src/hotspot/share/gc/shared/workerDataArray.hpp +++ b/src/hotspot/share/gc/shared/workerDataArray.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -34,7 +34,7 @@ template class WorkerDataArray : public CHeapObj { friend class WDAPrinter; public: - static const uint MaxThreadWorkItems = 9; + static const uint MaxThreadWorkItems = 10; private: T* _data; uint _length; diff --git a/src/hotspot/share/jvmci/jvmciRuntime.cpp b/src/hotspot/share/jvmci/jvmciRuntime.cpp index 137782f93ef..e75527235f0 100644 --- a/src/hotspot/share/jvmci/jvmciRuntime.cpp +++ b/src/hotspot/share/jvmci/jvmciRuntime.cpp @@ -589,10 +589,6 @@ void JVMCIRuntime::write_barrier_pre(JavaThread* thread, oopDesc* obj) { G1BarrierSetRuntime::write_ref_field_pre_entry(obj, thread); } -void JVMCIRuntime::write_barrier_post(JavaThread* thread, volatile CardValue* card_addr) { - G1BarrierSetRuntime::write_ref_field_post_entry(card_addr, thread); -} - #endif // INCLUDE_G1GC JRT_LEAF(jboolean, JVMCIRuntime::validate_object(JavaThread* thread, oopDesc* parent, oopDesc* child)) diff --git a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp index 3ddf7de0510..7ddb9be540a 100644 --- a/src/hotspot/share/jvmci/vmStructs_jvmci.cpp +++ b/src/hotspot/share/jvmci/vmStructs_jvmci.cpp @@ -560,6 +560,7 @@ declare_constant(BranchData::not_taken_off_set) \ \ declare_constant_with_value("CardTable::dirty_card", CardTable::dirty_card_val()) \ + declare_constant_with_value("CardTable::clean_card", CardTable::clean_card_val()) \ declare_constant_with_value("LockStack::_end_offset", LockStack::end_offset()) \ declare_constant_with_value("OMCache::oop_to_oop_difference", OMCache::oop_to_oop_difference()) \ declare_constant_with_value("OMCache::oop_to_monitor_difference", OMCache::oop_to_monitor_difference()) \ @@ -928,7 +929,6 @@ declare_function(JVMCIRuntime::vm_error) \ declare_function(JVMCIRuntime::load_and_clear_exception) \ G1GC_ONLY(declare_function(JVMCIRuntime::write_barrier_pre)) \ - G1GC_ONLY(declare_function(JVMCIRuntime::write_barrier_post)) \ SHENANDOAHGC_ONLY(declare_function(ShenandoahRuntime::load_reference_barrier_strong)) \ SHENANDOAHGC_ONLY(declare_function(ShenandoahRuntime::load_reference_barrier_strong_narrow)) \ SHENANDOAHGC_ONLY(declare_function(ShenandoahRuntime::load_reference_barrier_weak)) \ @@ -947,12 +947,10 @@ static_field(G1HeapRegion, LogOfHRGrainBytes, uint) #define VM_INT_CONSTANTS_JVMCI_G1GC(declare_constant, declare_constant_with_value, declare_preprocessor_constant) \ - declare_constant_with_value("G1CardTable::g1_young_gen", 
G1CardTable::g1_young_card_val()) \ declare_constant_with_value("G1ThreadLocalData::satb_mark_queue_active_offset", in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset())) \ declare_constant_with_value("G1ThreadLocalData::satb_mark_queue_index_offset", in_bytes(G1ThreadLocalData::satb_mark_queue_index_offset())) \ declare_constant_with_value("G1ThreadLocalData::satb_mark_queue_buffer_offset", in_bytes(G1ThreadLocalData::satb_mark_queue_buffer_offset())) \ - declare_constant_with_value("G1ThreadLocalData::dirty_card_queue_index_offset", in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset())) \ - declare_constant_with_value("G1ThreadLocalData::dirty_card_queue_buffer_offset", in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset())) + declare_constant_with_value("G1ThreadLocalData::card_table_base_offset", in_bytes(G1ThreadLocalData::card_table_base_offset())) \ #endif // INCLUDE_G1GC diff --git a/src/hotspot/share/oops/oop.cpp b/src/hotspot/share/oops/oop.cpp index 51480c68c22..f874a39bf31 100644 --- a/src/hotspot/share/oops/oop.cpp +++ b/src/hotspot/share/oops/oop.cpp @@ -87,7 +87,16 @@ void oopDesc::print_value_on(outputStream* st) const { java_lang_String::print(obj, st); print_address_on(st); } else { - klass()->oop_print_value_on(obj, st); + Klass* k = klass_without_asserts(); + if (k == nullptr) { + st->print("null klass"); + } else if (!Metaspace::contains(k)) { + st->print("klass not in Metaspace"); + } else if (!k->is_klass()) { + st->print("klass not a Klass"); + } else { + k->oop_print_value_on(obj, st); + } } } diff --git a/src/hotspot/share/runtime/arguments.cpp b/src/hotspot/share/runtime/arguments.cpp index 6cfeb1dcb0f..b7ab68e143c 100644 --- a/src/hotspot/share/runtime/arguments.cpp +++ b/src/hotspot/share/runtime/arguments.cpp @@ -546,6 +546,7 @@ static SpecialFlag const special_jvm_flags[] = { { "MetaspaceReclaimPolicy", JDK_Version::undefined(), JDK_Version::jdk(21), JDK_Version::undefined() }, { "ZGenerational", JDK_Version::jdk(23), JDK_Version::jdk(24), JDK_Version::undefined() }, { "ZMarkStackSpaceLimit", JDK_Version::undefined(), JDK_Version::jdk(25), JDK_Version::undefined() }, + { "G1UpdateBufferSize", JDK_Version::undefined(), JDK_Version::jdk(26), JDK_Version::jdk(27) }, #if defined(AARCH64) { "NearCpool", JDK_Version::undefined(), JDK_Version::jdk(25), JDK_Version::undefined() }, #endif diff --git a/src/hotspot/share/runtime/cpuTimeCounters.cpp b/src/hotspot/share/runtime/cpuTimeCounters.cpp index c7e48441662..e5364550b6c 100644 --- a/src/hotspot/share/runtime/cpuTimeCounters.cpp +++ b/src/hotspot/share/runtime/cpuTimeCounters.cpp @@ -36,6 +36,8 @@ const char* CPUTimeGroups::to_string(CPUTimeType val) { return "gc_conc_mark"; case CPUTimeType::gc_conc_refine: return "gc_conc_refine"; + case CPUTimeType::gc_conc_refine_control: + return "gc_conc_refine_control"; case CPUTimeType::gc_service: return "gc_service"; case CPUTimeType::vm: @@ -53,6 +55,7 @@ bool CPUTimeGroups::is_gc_counter(CPUTimeType val) { case CPUTimeType::gc_parallel_workers: case CPUTimeType::gc_conc_mark: case CPUTimeType::gc_conc_refine: + case CPUTimeType::gc_conc_refine_control: case CPUTimeType::gc_service: return true; default: diff --git a/src/hotspot/share/runtime/cpuTimeCounters.hpp b/src/hotspot/share/runtime/cpuTimeCounters.hpp index efa44f9173d..9ad00492731 100644 --- a/src/hotspot/share/runtime/cpuTimeCounters.hpp +++ b/src/hotspot/share/runtime/cpuTimeCounters.hpp @@ -40,6 +40,7 @@ public: gc_parallel_workers, gc_conc_mark, gc_conc_refine, + gc_conc_refine_control, 
gc_service, vm, conc_dedup, diff --git a/src/hotspot/share/runtime/mutexLocker.cpp b/src/hotspot/share/runtime/mutexLocker.cpp index e0eafbc416b..8274d767e4e 100644 --- a/src/hotspot/share/runtime/mutexLocker.cpp +++ b/src/hotspot/share/runtime/mutexLocker.cpp @@ -98,15 +98,15 @@ Mutex* PerfDataManager_lock = nullptr; #if INCLUDE_G1GC Monitor* G1CGC_lock = nullptr; -Mutex* G1DetachedRefinementStats_lock = nullptr; Mutex* G1FreeList_lock = nullptr; Mutex* G1MarkStackChunkList_lock = nullptr; Mutex* G1MarkStackFreeList_lock = nullptr; Monitor* G1OldGCCount_lock = nullptr; Mutex* G1OldSets_lock = nullptr; -Mutex* G1Uncommit_lock = nullptr; +Mutex* G1ReviseYoungLength_lock = nullptr; Monitor* G1RootRegionScan_lock = nullptr; Mutex* G1RareEvent_lock = nullptr; +Mutex* G1Uncommit_lock = nullptr; #endif Mutex* Management_lock = nullptr; @@ -211,7 +211,6 @@ void mutex_init() { #if INCLUDE_G1GC if (UseG1GC) { MUTEX_DEFN(G1CGC_lock , PaddedMonitor, nosafepoint); - MUTEX_DEFN(G1DetachedRefinementStats_lock, PaddedMutex , nosafepoint-2); MUTEX_DEFN(G1FreeList_lock , PaddedMutex , service-1); MUTEX_DEFN(G1MarkStackChunkList_lock , PaddedMutex , nosafepoint); MUTEX_DEFN(G1MarkStackFreeList_lock , PaddedMutex , nosafepoint); @@ -341,8 +340,9 @@ void mutex_init() { #if INCLUDE_G1GC if (UseG1GC) { - MUTEX_DEFL(G1OldGCCount_lock , PaddedMonitor, Threads_lock, true); - MUTEX_DEFL(G1RareEvent_lock , PaddedMutex , Threads_lock, true); + MUTEX_DEFL(G1OldGCCount_lock , PaddedMonitor, Threads_lock, true); + MUTEX_DEFL(G1RareEvent_lock , PaddedMutex , Threads_lock, true); + MUTEX_DEFL(G1ReviseYoungLength_lock , PaddedMutex , Threads_lock, true); } #endif diff --git a/src/hotspot/share/runtime/mutexLocker.hpp b/src/hotspot/share/runtime/mutexLocker.hpp index 3a73edc7bf2..8cd408c99c9 100644 --- a/src/hotspot/share/runtime/mutexLocker.hpp +++ b/src/hotspot/share/runtime/mutexLocker.hpp @@ -93,13 +93,13 @@ extern Mutex* FullGCALot_lock; // a lock to make FullGCALot MT #if INCLUDE_G1GC extern Monitor* G1CGC_lock; // used for coordination between fore- & background G1 concurrent GC threads. -extern Mutex* G1DetachedRefinementStats_lock; // Lock protecting detached refinement stats for G1. extern Mutex* G1FreeList_lock; // protects the G1 free region list during safepoints extern Mutex* G1MarkStackChunkList_lock; // Protects access to the G1 global mark stack chunk list. extern Mutex* G1MarkStackFreeList_lock; // Protects access to the G1 global mark stack free list. extern Monitor* G1OldGCCount_lock; // in support of "concurrent" full gc extern Mutex* G1OldSets_lock; // protects the G1 old region sets extern Mutex* G1RareEvent_lock; // Synchronizes (rare) parallel GC operations. +extern Mutex* G1ReviseYoungLength_lock; // Protects access to young gen length revising operations. extern Monitor* G1RootRegionScan_lock; // used to notify that the G1 CM threads have finished scanning the root regions extern Mutex* G1Uncommit_lock; // protects the G1 uncommit list when not at safepoints #endif diff --git a/src/hotspot/share/runtime/vmOperation.hpp b/src/hotspot/share/runtime/vmOperation.hpp index 89a806bb75d..ada5014beee 100644 --- a/src/hotspot/share/runtime/vmOperation.hpp +++ b/src/hotspot/share/runtime/vmOperation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -59,6 +59,7 @@ template(G1PauseRemark) \ template(G1PauseCleanup) \ template(G1TryInitiateConcMark) \ + template(G1RendezvousGCThreads) \ template(ZMarkEndOld) \ template(ZMarkEndYoung) \ template(ZMarkFlushOperation) \ diff --git a/test/hotspot/jtreg/compiler/gcbarriers/TestG1BarrierGeneration.java b/test/hotspot/jtreg/compiler/gcbarriers/TestG1BarrierGeneration.java index a9df0019ab1..01e015d50cb 100644 --- a/test/hotspot/jtreg/compiler/gcbarriers/TestG1BarrierGeneration.java +++ b/test/hotspot/jtreg/compiler/gcbarriers/TestG1BarrierGeneration.java @@ -506,10 +506,10 @@ public class TestG1BarrierGeneration { @Test @IR(failOn = IRNode.SAFEPOINT) @IR(applyIfAnd = {"UseCompressedOops", "false", "ReduceInitialCardMarks", "false"}, - counts = {IRNode.G1_STORE_P_WITH_BARRIER_FLAG, POST_ONLY, "1"}, + counts = {IRNode.G1_STORE_P_WITH_BARRIER_FLAG, POST_ONLY, ">1"}, phase = CompilePhase.FINAL_CODE) @IR(applyIfAnd = {"UseCompressedOops", "true", "ReduceInitialCardMarks", "false"}, - counts = {IRNode.G1_ENCODE_P_AND_STORE_N_WITH_BARRIER_FLAG, POST_ONLY, "1"}, + counts = {IRNode.G1_ENCODE_P_AND_STORE_N_WITH_BARRIER_FLAG, POST_ONLY, ">1"}, phase = CompilePhase.FINAL_CODE) @IR(applyIfAnd = {"UseCompressedOops", "false", "ReduceInitialCardMarks", "true"}, failOn = {IRNode.G1_STORE_P_WITH_BARRIER_FLAG, ANY}, diff --git a/test/hotspot/jtreg/gc/g1/TestGCLogMessages.java b/test/hotspot/jtreg/gc/g1/TestGCLogMessages.java index 17ae437358d..d28c0888579 100644 --- a/test/hotspot/jtreg/gc/g1/TestGCLogMessages.java +++ b/test/hotspot/jtreg/gc/g1/TestGCLogMessages.java @@ -108,8 +108,7 @@ public class TestGCLogMessages { new LogMessageWithLevel("Other:", Level.INFO), // Pre Evacuate Collection Set - new LogMessageWithLevel("JT Retire TLABs And Flush Logs \\(ms\\):", Level.DEBUG), - new LogMessageWithLevel("Non-JT Flush Logs \\(ms\\):", Level.DEBUG), + new LogMessageWithLevel("JavaThread Retire TLABs \\(ms\\):", Level.DEBUG), new LogMessageWithLevel("Choose Collection Set:", Level.DEBUG), new LogMessageWithLevel("Region Register:", Level.DEBUG), new LogMessageWithLevel("Prepare Heap Roots:", Level.DEBUG), @@ -126,10 +125,11 @@ public class TestGCLogMessages { new LogMessageWithLevel("Merged Howl ArrayOfCards:", Level.DEBUG), new LogMessageWithLevel("Merged Howl BitMap:", Level.DEBUG), new LogMessageWithLevel("Merged Howl Full:", Level.DEBUG), - new LogMessageWithLevel("Log Buffers \\(ms\\):", Level.DEBUG), - new LogMessageWithLevel("Dirty Cards:", Level.DEBUG), - new LogMessageWithLevel("Merged Cards:", Level.DEBUG), - new LogMessageWithLevel("Skipped Cards:", Level.DEBUG), + new LogMessageWithLevel("Merged From RS Cards:", Level.DEBUG), + new LogMessageWithLevel("Total Cards:", Level.DEBUG), + new LogMessageWithLevel("Merge Refinement Table:", Level.DEBUG), + new LogMessageWithLevel("Sweep \\(ms\\):", Level.DEBUG), + // Evacuate Collection Set new LogMessageWithLevel("Ext Root Scanning \\(ms\\):", Level.DEBUG), new LogMessageWithLevel("Thread Roots \\(ms\\):", Level.TRACE), @@ -173,15 +173,16 @@ public class TestGCLogMessages { new LogMessageWithLevel("Merge Per-Thread State \\(ms\\):", Level.DEBUG), new LogMessageWithLevel("LAB Waste:", Level.DEBUG), new LogMessageWithLevel("LAB Undo Waste:", Level.DEBUG), - new LogMessageWithLevel("Evac Fail Extra Cards:", Level.DEBUG), - new LogMessageWithLevel("Clear Logged Cards \\(ms\\):", Level.DEBUG), + new LogMessageWithLevel("Pending Cards:", Level.DEBUG), + new 
LogMessageWithLevel("To-Young-Gen Cards:", Level.DEBUG), + new LogMessageWithLevel("Evac-Fail Cards:", Level.DEBUG), + new LogMessageWithLevel("Marked Cards:", Level.DEBUG), + new LogMessageWithLevel("Clear Pending Cards \\(ms\\):", Level.DEBUG), new LogMessageWithLevel("Recalculate Used Memory \\(ms\\):", Level.DEBUG), // Post Evacuate Cleanup 2 new LogMessageWithLevel("Post Evacuate Cleanup 2:", Level.DEBUG), new LogMessageWithLevelC2OrJVMCIOnly("Update Derived Pointers", Level.DEBUG), - new LogMessageWithLevel("Redirty Logged Cards \\(ms\\):", Level.DEBUG), - new LogMessageWithLevel("Redirtied Cards:", Level.DEBUG), new LogMessageWithLevel("Resize TLABs \\(ms\\):", Level.DEBUG), new LogMessageWithLevel("Free Collection Set \\(ms\\):", Level.DEBUG), new LogMessageWithLevel("Serial Free Collection Set:", Level.TRACE), @@ -243,9 +244,7 @@ public class TestGCLogMessages { } LogMessageWithLevel concRefineMessages[] = new LogMessageWithLevel[] { - new LogMessageWithLevel("Mutator refinement: ", Level.DEBUG), - new LogMessageWithLevel("Concurrent refinement: ", Level.DEBUG), - new LogMessageWithLevel("Total refinement: ", Level.DEBUG), + new LogMessageWithLevel("Refinement: sweep: ", Level.DEBUG), // "Concurrent refinement rate" optionally printed if any. // "Generate dirty cards rate" optionally printed if any. }; diff --git a/test/hotspot/jtreg/runtime/CommandLine/OptionsValidation/TestOptionsWithRanges.java b/test/hotspot/jtreg/runtime/CommandLine/OptionsValidation/TestOptionsWithRanges.java index 033b74f7eb1..d4b47422c38 100644 --- a/test/hotspot/jtreg/runtime/CommandLine/OptionsValidation/TestOptionsWithRanges.java +++ b/test/hotspot/jtreg/runtime/CommandLine/OptionsValidation/TestOptionsWithRanges.java @@ -235,7 +235,6 @@ public class TestOptionsWithRanges { */ excludeTestMaxRange("ConcGCThreads"); excludeTestMaxRange("G1ConcRefinementThreads"); - excludeTestMaxRange("G1UpdateBufferSize"); excludeTestMaxRange("InitialHeapSize"); excludeTestMaxRange("MaxHeapSize"); excludeTestMaxRange("MaxRAM"); diff --git a/test/hotspot/jtreg/testlibrary_tests/ir_framework/tests/TestIRMatching.java b/test/hotspot/jtreg/testlibrary_tests/ir_framework/tests/TestIRMatching.java index c7f8badf83b..5615cce983a 100644 --- a/test/hotspot/jtreg/testlibrary_tests/ir_framework/tests/TestIRMatching.java +++ b/test/hotspot/jtreg/testlibrary_tests/ir_framework/tests/TestIRMatching.java @@ -1486,7 +1486,7 @@ class CompilationOutputOfFails { @Test @IR(failOn = IRNode.ALLOC) - @IR(counts = {IRNode.COUNTED_LOOP, "1"}) // not fail + @IR(counts = {IRNode.COUNTED_LOOP, ">1"}) // not fail public void macro3() { for (int i = 0; i < 100; i++) { obj = new Object(); diff --git a/test/hotspot/jtreg/vmTestbase/gc/ArrayJuggle/Juggle2.java b/test/hotspot/jtreg/vmTestbase/gc/ArrayJuggle/Juggle2.java index 1ab01e8179f..eff35559626 100644 --- a/test/hotspot/jtreg/vmTestbase/gc/ArrayJuggle/Juggle2.java +++ b/test/hotspot/jtreg/vmTestbase/gc/ArrayJuggle/Juggle2.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002, 2023, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2002, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -30,6 +30,11 @@ /* @test @key stress randomness @library /vmTestbase /test/lib @run main/othervm -Xlog:gc=debug:gc.log gc.ArrayJuggle.Juggle2 */ /* @test @key stress randomness @library /vmTestbase /test/lib @run main/othervm -Xlog:gc=debug:gc.log gc.ArrayJuggle.Juggle2 -tg */ +/* + * The next test stresses the interaction between (mostly) full garbage collections and refinement. + */ +/* @test @key stress randomness @library /vmTestbase /test/lib @run main/othervm -XX:-G1UseAdaptiveIHOP -XX:InitiatingHeapOccupancyPercent=0 -XX:G1HeapRegionSize=1m -XX:G1RSetUpdatingPauseTimePercent=0 -XX:+UnlockDiagnosticVMOptions -XX:G1PerThreadPendingCardThreshold=0 -XX:+VerifyAfterGC -Xlog:gc=debug,gc+refine=debug:gc.log gc.ArrayJuggle.Juggle2 -tg */ + package gc.ArrayJuggle; import nsk.share.test.*; diff --git a/test/jdk/jdk/jfr/event/gc/collection/TestG1ParallelPhases.java b/test/jdk/jdk/jfr/event/gc/collection/TestG1ParallelPhases.java index 568104a7b50..d69d47f1911 100644 --- a/test/jdk/jdk/jfr/event/gc/collection/TestG1ParallelPhases.java +++ b/test/jdk/jdk/jfr/event/gc/collection/TestG1ParallelPhases.java @@ -87,8 +87,6 @@ public class TestG1ParallelPhases { .collect(toSet()); Set allPhases = of( - "RetireTLABsAndFlushLogs", - "NonJavaThreadFlushLogs", "ExtRootScan", "ThreadRoots", "VM Global", @@ -100,31 +98,32 @@ public class TestG1ParallelPhases { "CMRefRoots", "MergeER", "MergeRS", - "MergeLB", "ScanHR", "CodeRoots", "ObjCopy", "Termination", - "RedirtyCards", "RecalculateUsed", "ResizeTLABs", "FreeCSet", "UpdateDerivedPointers", "EagerlyReclaimHumongousObjects", "ResetPartialArrayStateManager", - "ClearLoggedCards", + "ClearPendingCards", "MergePSS", "NonYoungFreeCSet", "YoungFreeCSet", "RebuildFreeList", "SampleCandidates", "ResetMarkingState", - "NoteStartOfMark" + "NoteStartOfMark", + "RetireTLABs" ); // Some GC phases may or may not occur depending on environment. Filter them out // since we can not reliably guarantee that they occur (or not). Set optPhases = of( + // Does not always occur + "SweepRT", // The following phases only occur on evacuation failure. "RestoreEvacuationFailedRegions", "RemoveSelfForwards",