From 2c7efc08805237bcc5103fad6d88d4b947de3e6e Mon Sep 17 00:00:00 2001 From: Albert Mingkun Yang Date: Thu, 28 May 2026 07:20:06 +0000 Subject: [PATCH] 8381834: Improve TLAB sizing heuristics Reviewed-by: jsikstro, tschatzl, aboldtch --- .../gc/shared/threadLocalAllocBuffer.cpp | 218 ++++++++++-------- .../gc/shared/threadLocalAllocBuffer.hpp | 38 ++- .../shared/threadLocalAllocBuffer.inline.hpp | 18 +- 3 files changed, 164 insertions(+), 110 deletions(-) diff --git a/src/hotspot/share/gc/shared/threadLocalAllocBuffer.cpp b/src/hotspot/share/gc/shared/threadLocalAllocBuffer.cpp index 59d7befb32d..fae0365719a 100644 --- a/src/hotspot/share/gc/shared/threadLocalAllocBuffer.cpp +++ b/src/hotspot/share/gc/shared/threadLocalAllocBuffer.cpp @@ -35,6 +35,7 @@ #include "runtime/perfData.hpp" #include "runtime/threadSMR.hpp" #include "utilities/copy.hpp" +#include "utilities/integerCast.hpp" size_t ThreadLocalAllocBuffer::_max_size = 0; unsigned int ThreadLocalAllocBuffer::_target_num_refills = 0; @@ -75,43 +76,54 @@ size_t ThreadLocalAllocBuffer::remaining() { } void ThreadLocalAllocBuffer::accumulate_and_reset_statistics(ThreadLocalAllocStats* stats) { - size_t capacity = Universe::heap()->tlab_capacity(); - size_t used = Universe::heap()->tlab_used(); - _gc_waste += (unsigned)remaining(); - uint64_t total_allocated = thread()->allocated_bytes(); - uint64_t allocated_since_last_gc = total_allocated - _allocated_before_last_gc; - _allocated_before_last_gc = total_allocated; + const uint64_t allocated_bytes = thread()->allocated_bytes(); - print_stats("gc"); + const size_t allocated_since_last_gc = integer_cast_permit_tautology(allocated_bytes - _allocated_before_last_gc); + _allocated_before_last_gc = allocated_bytes; - if (_num_refills > 0) { - // Update allocation history if a reasonable amount of eden was allocated. - bool update_allocation_history = used > 0.5 * capacity; - - if (update_allocation_history) { - // Average the fraction of eden allocated in a tlab by this - // thread for use in the next resize operation. - // _gc_waste is not subtracted because it's included in - // "used". - // The result can be larger than 1.0 due to direct to old allocations. - // These allocations should ideally not be counted but since it is not possible - // to filter them out here we just cap the fraction to be at most 1.0. - // Keep alloc_frac as float and not double to avoid the double to float conversion - float alloc_frac = MIN2(1.0f, allocated_since_last_gc / (float) used); - _allocation_fraction.sample(alloc_frac); + if (allocated_since_last_gc > 0) { + const size_t tlab_capacity = Universe::heap()->tlab_capacity(); + const size_t tlab_used = Universe::heap()->tlab_used(); + if (tlab_used > 0.5 * tlab_capacity) { + // To avoid divide-by-zero + const size_t effective_tlab_capacity = MAX2(tlab_capacity, size_t(1)); + const float alloc_frac = (float)allocated_since_last_gc / effective_tlab_capacity; + _allocation_fraction.sample(MIN2(alloc_frac, 1.0f)); } - - stats->update_fast_allocations(_num_refills, - _allocated_size, - _gc_waste, - _refill_waste); + stats->update_current_thread_stats(_num_refills, + allocated_since_last_gc, + _allocated_size, + _gc_waste, + _refill_waste, + _num_slow_allocations); } else { - assert(_num_refills == 0 && _refill_waste == 0 && _gc_waste == 0, + assert(_num_refills == 0 && _refill_waste == 0 + && _gc_waste == 0 && _num_slow_allocations == 0, "tlab stats == 0"); } - stats->update_num_slow_allocations(_num_slow_allocations); + { + Log(gc, tlab) log; + if (log.is_trace()) { + Thread* thrd = thread(); + size_t waste = _gc_waste + _refill_waste; + double waste_percent = percent_of(waste, _allocated_size); + log.trace("TLAB GC: thread: " PTR_FORMAT " [id: %2d]" + " desired: %zuK" + " allocated: %zuK" + " slow allocs: %d refill waste: %zuB" + " refills: %d waste %4.1f%% gc: %dB" + " slow: %dB", + p2i(thrd), thrd->osthread()->thread_id(), + _desired_size*HeapWordSize/K, + allocated_since_last_gc/K, + _num_slow_allocations, _refill_waste_limit * HeapWordSize, + _num_refills, waste_percent, + _gc_waste * HeapWordSize, + _refill_waste * HeapWordSize); + } + } reset_statistics(); } @@ -147,20 +159,27 @@ void ThreadLocalAllocBuffer::record_refill_waste() { } void ThreadLocalAllocBuffer::resize() { - // Compute the next tlab size using expected allocation amount assert(ResizeTLAB, "Should not call this otherwise"); - size_t alloc = (size_t)(_allocation_fraction.average() * - (Universe::heap()->tlab_capacity() / HeapWordSize)); + size_t capacity_in_words = Universe::heap()->tlab_capacity() / HeapWordSize; + float alloc_fraction = _allocation_fraction.average(); + if (alloc_fraction == 0.0) { + // No samples, use global alloc fraction as an approximation. + const float total_frac = ThreadLocalAllocStats::total_requested_size_fraction_avg(); + const uint num_threads = ThreadLocalAllocStats::num_allocating_threads_avg(); + alloc_fraction = total_frac / num_threads; + } + size_t alloc = (size_t)(alloc_fraction * capacity_in_words); size_t new_size = alloc / _target_num_refills; new_size = clamp(new_size, min_size(), max_size()); size_t aligned_new_size = align_object_size(new_size); - log_trace(gc, tlab)("TLAB new size: thread: " PTR_FORMAT " [id: %2d]" - " refills %d alloc: %8.6f desired_size: %zu -> %zu", + log_trace(gc, tlab)("TLAB resize: thread: " PTR_FORMAT " [id: %2d]" + " alloc-fraction: %.3f desired_size: %zuK -> %zuK", p2i(thread()), thread()->osthread()->thread_id(), - _target_num_refills, _allocation_fraction.average(), desired_size(), aligned_new_size); + alloc_fraction, + desired_size() * HeapWordSize/K, aligned_new_size * HeapWordSize/K); set_desired_size(aligned_new_size); set_refill_waste_limit(initial_refill_waste_limit()); @@ -179,11 +198,24 @@ void ThreadLocalAllocBuffer::fill(HeapWord* start, size_t new_size) { _num_refills++; _allocated_size += new_size; - print_stats("fill"); + assert(top <= start + new_size - alignment_reserve(), "size too small"); initialize(start, top, start + new_size - alignment_reserve()); - + { + Log(gc, tlab) log; + if (log.is_trace()) { + Thread* thrd = thread(); + log.trace("TLAB fill: thread: " PTR_FORMAT " [id: %2d]" + " capacity: %zuK" + " slow allocs: %d " + " refills: %d", + p2i(thrd), thrd->osthread()->thread_id(), + pointer_delta(_end, _start, sizeof(char)) / K, + _num_slow_allocations, + _num_refills); + } + } // Reset amount of internal fragmentation set_refill_waste_limit(initial_refill_waste_limit()); } @@ -206,13 +238,6 @@ void ThreadLocalAllocBuffer::initialize() { set_desired_size(initial_desired_size()); - size_t capacity = Universe::heap()->tlab_capacity() / HeapWordSize; - if (capacity > 0) { - // Keep alloc_frac as float and not double to avoid the double to float conversion - float alloc_frac = desired_size() * target_num_refills() / (float)capacity; - _allocation_fraction.sample(alloc_frac); - } - set_refill_waste_limit(initial_refill_waste_limit()); reset_statistics(); @@ -243,11 +268,11 @@ size_t ThreadLocalAllocBuffer::initial_desired_size() { if (TLABSize > 0) { init_sz = TLABSize / HeapWordSize; } else { - // Initial size is a function of the average number of allocating threads. - unsigned int num_threads = ThreadLocalAllocStats::num_allocating_threads_avg(); - - init_sz = (Universe::heap()->tlab_capacity() / HeapWordSize) / - (num_threads * target_num_refills()); + const size_t predicted_total_requested_size = (size_t)(ThreadLocalAllocStats::total_requested_size_fraction_avg() * Universe::heap()->tlab_capacity()); + const uint num_threads = ThreadLocalAllocStats::num_allocating_threads_avg(); + const size_t per_thread_requested_size = predicted_total_requested_size / num_threads; + const size_t tlab_size = per_thread_requested_size / _target_num_refills; + init_sz = tlab_size / HeapWordSize; init_sz = align_object_size(init_sz); } // We can't use clamp() between min_size() and max_size() here because some @@ -258,32 +283,7 @@ size_t ThreadLocalAllocBuffer::initial_desired_size() { return init_sz; } -void ThreadLocalAllocBuffer::print_stats(const char* tag) { - Log(gc, tlab) log; - if (!log.is_trace()) { - return; - } - - Thread* thrd = thread(); - size_t waste = _gc_waste + _refill_waste; - double waste_percent = percent_of(waste, _allocated_size); - size_t tlab_used = Universe::heap()->tlab_used(); - log.trace("TLAB: %s thread: " PTR_FORMAT " [id: %2d]" - " desired_size: %zuKB" - " slow allocs: %d refill waste: %zuB" - " alloc:%8.5f %8.0fKB refills: %d waste %4.1f%% gc: %dB" - " slow: %dB", - tag, p2i(thrd), thrd->osthread()->thread_id(), - _desired_size / (K / HeapWordSize), - _num_slow_allocations, _refill_waste_limit * HeapWordSize, - _allocation_fraction.average(), - _allocation_fraction.average() * tlab_used / K, - _num_refills, waste_percent, - _gc_waste * HeapWordSize, - _refill_waste * HeapWordSize); -} - -Thread* ThreadLocalAllocBuffer::thread() { +Thread* ThreadLocalAllocBuffer::thread() const { return (Thread*)(((char*)this) + in_bytes(start_offset()) - in_bytes(Thread::tlab_start_offset())); } @@ -314,6 +314,7 @@ PerfVariable* ThreadLocalAllocStats::_perf_max_refill_waste; PerfVariable* ThreadLocalAllocStats::_perf_total_num_slow_allocations; PerfVariable* ThreadLocalAllocStats::_perf_max_num_slow_allocations; AdaptiveWeightedAverage ThreadLocalAllocStats::_num_allocating_threads_avg(0); +AdaptiveWeightedAverage ThreadLocalAllocStats::_total_requested_size_fraction(0); static PerfVariable* create_perf_variable(const char* name, PerfData::Units unit, TRAPS) { ResourceMark rm; @@ -324,6 +325,9 @@ void ThreadLocalAllocStats::initialize() { _num_allocating_threads_avg = AdaptiveWeightedAverage(TLABAllocationWeight); _num_allocating_threads_avg.sample(1); // One allocating thread at startup + _total_requested_size_fraction = AdaptiveWeightedAverage(TLABAllocationWeight); + _total_requested_size_fraction.sample(0.10f); // 10% + if (UsePerfData) { EXCEPTION_MARK; _perf_num_allocating_threads = create_perf_variable("allocThreads", PerfData::U_None, CHECK); @@ -344,6 +348,7 @@ ThreadLocalAllocStats::ThreadLocalAllocStats() : _total_num_refills(0), _max_num_refills(0), _total_allocated_size(0), + _total_requested_bytes(0), _total_gc_waste(0), _max_gc_waste(0), _total_refill_waste(0), @@ -355,21 +360,25 @@ unsigned int ThreadLocalAllocStats::num_allocating_threads_avg() { return MAX2((unsigned int)(_num_allocating_threads_avg.average() + 0.5), 1U); } -void ThreadLocalAllocStats::update_fast_allocations(unsigned int num_refills, - size_t allocated_size, - size_t gc_waste, - size_t refill_waste) { - _num_allocating_threads += 1; - _total_num_refills += num_refills; - _max_num_refills = MAX2(_max_num_refills, num_refills); - _total_allocated_size += allocated_size; - _total_gc_waste += gc_waste; - _max_gc_waste = MAX2(_max_gc_waste, gc_waste); - _total_refill_waste += refill_waste; - _max_refill_waste = MAX2(_max_refill_waste, refill_waste); +float ThreadLocalAllocStats::total_requested_size_fraction_avg() { + return _total_requested_size_fraction.average(); } -void ThreadLocalAllocStats::update_num_slow_allocations(unsigned int num_slow_allocations) { +void ThreadLocalAllocStats::update_current_thread_stats(unsigned int num_refills, + size_t requested_bytes, + size_t alloc_size_for_tlab, + size_t gc_waste, + size_t refill_waste, + unsigned int num_slow_allocations) { + _num_allocating_threads += 1; + _total_num_refills += num_refills; + _max_num_refills = MAX2(_max_num_refills, num_refills); + _total_allocated_size += alloc_size_for_tlab; + _total_requested_bytes += requested_bytes; + _total_gc_waste += gc_waste; + _max_gc_waste = MAX2(_max_gc_waste, gc_waste); + _total_refill_waste += refill_waste; + _max_refill_waste = MAX2(_max_refill_waste, refill_waste); _total_num_slow_allocations += num_slow_allocations; _max_num_slow_allocations = MAX2(_max_num_slow_allocations, num_slow_allocations); } @@ -379,6 +388,7 @@ void ThreadLocalAllocStats::update(const ThreadLocalAllocStats& other) { _total_num_refills += other._total_num_refills; _max_num_refills = MAX2(_max_num_refills, other._max_num_refills); _total_allocated_size += other._total_allocated_size; + _total_requested_bytes += other._total_requested_bytes; _total_gc_waste += other._total_gc_waste; _max_gc_waste = MAX2(_max_gc_waste, other._max_gc_waste); _total_refill_waste += other._total_refill_waste; @@ -392,6 +402,7 @@ void ThreadLocalAllocStats::reset() { _total_num_refills = 0; _max_num_refills = 0; _total_allocated_size = 0; + _total_requested_bytes = 0; _total_gc_waste = 0; _max_gc_waste = 0; _total_refill_waste = 0; @@ -401,22 +412,37 @@ void ThreadLocalAllocStats::reset() { } void ThreadLocalAllocStats::publish() { - if (_total_allocated_size == 0) { + if (_total_requested_bytes == 0) { return; } _num_allocating_threads_avg.sample(_num_allocating_threads); + { + const size_t tlab_capacity = Universe::heap()->tlab_capacity(); + const size_t tlab_used = Universe::heap()->tlab_used(); + if (tlab_used > 0.5 * tlab_capacity) { + // To avoid divide-by-zero + const size_t effective_tlab_capacity = MAX2(tlab_capacity, size_t(1)); + const float requested_size_fraction = (float)_total_requested_bytes / effective_tlab_capacity; + _total_requested_size_fraction.sample(MIN2(requested_size_fraction, 1.0f)); + } + } + const size_t waste = _total_gc_waste + _total_refill_waste; const double waste_percent = percent_of(waste, _total_allocated_size); - log_debug(gc, tlab)("TLAB totals: thrds: %d refills: %d max: %d" - " slow allocs: %d max %d waste: %4.1f%%" - " gc: %zuB max: %zuB" - " slow: %zuB max: %zuB", - _num_allocating_threads, _total_num_refills, _max_num_refills, + + const double gc_waste_pct = percent_of(_total_gc_waste, _total_allocated_size); + const double refill_waste_pct = percent_of(_total_refill_waste, _total_allocated_size); + + log_debug(gc, tlab)("TLAB totals: thrds: %d alloc-frac: %.1f%% refills: %d max: %d" + " slow allocs: %d max %d waste: %.1f%%" + " gc: %zuB(%.1f%%) max: %zuB" + " refill: %zuB(%.1f%%) max: %zuB", + _num_allocating_threads, _total_requested_size_fraction.average() * 100, _total_num_refills, _max_num_refills, _total_num_slow_allocations, _max_num_slow_allocations, waste_percent, - _total_gc_waste * HeapWordSize, _max_gc_waste * HeapWordSize, - _total_refill_waste * HeapWordSize, _max_refill_waste * HeapWordSize); + _total_gc_waste * HeapWordSize, gc_waste_pct, _max_gc_waste * HeapWordSize, + _total_refill_waste * HeapWordSize, refill_waste_pct, _max_refill_waste * HeapWordSize); if (UsePerfData) { _perf_num_allocating_threads ->set_value(_num_allocating_threads); diff --git a/src/hotspot/share/gc/shared/threadLocalAllocBuffer.hpp b/src/hotspot/share/gc/shared/threadLocalAllocBuffer.hpp index b2bef8acd46..f69ceb00b52 100644 --- a/src/hotspot/share/gc/shared/threadLocalAllocBuffer.hpp +++ b/src/hotspot/share/gc/shared/threadLocalAllocBuffer.hpp @@ -52,6 +52,10 @@ private: HeapWord* _allocation_end; // end for allocations (actual TLAB end, excluding alignment_reserve) size_t _desired_size; // desired size (including alignment_reserve) + + // If too many slow allocations (outside TLAB) happen, we increase + // _refill_waste_limit. This reduces outside-TLAB allocations at + // the expense of wasting more memory, i.e., the TLAB is discarded sooner. size_t _refill_waste_limit; // hold onto tlab if free() is larger than this uint64_t _allocated_before_last_gc; // total bytes allocated up until the last gc @@ -59,12 +63,24 @@ private: static unsigned _target_num_refills; // expected number of refills between GCs unsigned _num_refills; + // TLAB retirement is invoked in two main contexts: + // 1. TLAB refill: + // The current TLAB is insufficient to satisfy a pending allocation + // request, triggering a refill. The remaining space in the current + // TLAB is treated as waste and tracked in _refill_waste. + // 2. Before GC: + // Invoked at the start of a GC cycle to ensure heap parsability. + // The unused space in the current TLAB is treated as waste and + // tracked in _gc_waste. unsigned _refill_waste; unsigned _gc_waste; unsigned _num_slow_allocations; + + // Allocated size for filling TLAB in HeapWords size_t _allocated_size; - AdaptiveWeightedAverage _allocation_fraction; // fraction of eden allocated in tlabs + // Fraction of eden allocated by this thread, used for sizing its TLAB. + AdaptiveWeightedAverage _allocation_fraction; void reset_statistics(); @@ -77,8 +93,6 @@ private: void set_refill_waste_limit(size_t waste) { _refill_waste_limit = waste; } size_t initial_refill_waste_limit(); - - static int target_num_refills() { return _target_num_refills; } size_t initial_desired_size(); size_t remaining(); @@ -91,9 +105,7 @@ private: void accumulate_and_reset_statistics(ThreadLocalAllocStats* stats); - void print_stats(const char* tag); - - Thread* thread(); + Thread* thread() const; // statistics @@ -190,11 +202,13 @@ private: static PerfVariable* _perf_max_num_slow_allocations; static AdaptiveWeightedAverage _num_allocating_threads_avg; + static AdaptiveWeightedAverage _total_requested_size_fraction; unsigned int _num_allocating_threads; unsigned int _total_num_refills; unsigned int _max_num_refills; size_t _total_allocated_size; + size_t _total_requested_bytes; size_t _total_gc_waste; size_t _max_gc_waste; size_t _total_refill_waste; @@ -205,14 +219,16 @@ private: public: static void initialize(); static unsigned int num_allocating_threads_avg(); + static float total_requested_size_fraction_avg(); ThreadLocalAllocStats(); - void update_fast_allocations(unsigned int num_refills, - size_t allocated_size, - size_t gc_waste, - size_t refill_waste); - void update_num_slow_allocations(unsigned int num_slow_allocations); + void update_current_thread_stats(unsigned int num_refills, + size_t requested_bytes, + size_t alloc_size_for_tlab, + size_t gc_waste, + size_t refill_waste, + unsigned int num_slow_allocations); void update(const ThreadLocalAllocStats& other); void reset(); diff --git a/src/hotspot/share/gc/shared/threadLocalAllocBuffer.inline.hpp b/src/hotspot/share/gc/shared/threadLocalAllocBuffer.inline.hpp index 727467f98d0..ee1cca56b9e 100644 --- a/src/hotspot/share/gc/shared/threadLocalAllocBuffer.inline.hpp +++ b/src/hotspot/share/gc/shared/threadLocalAllocBuffer.inline.hpp @@ -52,10 +52,22 @@ inline HeapWord* ThreadLocalAllocBuffer::allocate(size_t size) { } inline size_t ThreadLocalAllocBuffer::compute_size(size_t obj_size) { - // Compute the size for the new TLAB. - // The "last" tlab may be smaller to reduce fragmentation. const size_t available_size = Universe::heap()->unsafe_max_tlab_alloc() / HeapWordSize; - size_t new_tlab_size = MIN3(available_size, desired_size() + align_object_size(obj_size), max_size()); + size_t scaled_desired_size = desired_size(); + if (ResizeTLAB) { + // Extra boost if too many refills; 16X at most. + if (_num_refills > _target_num_refills) { + const uint excess = _num_refills - _target_num_refills; + const uint steps = MIN2(excess / 8, 4U); + // Cap before shifting to avoid overflow. + if (scaled_desired_size > (max_size() >> steps)) { + scaled_desired_size = max_size(); + } else { + scaled_desired_size <<= steps; + } + } + } + size_t new_tlab_size = MIN3(available_size, scaled_desired_size + align_object_size(obj_size), max_size()); // Make sure there's enough room for object and filler int[]. if (new_tlab_size < compute_min_size(obj_size)) {