8381834: Improve TLAB sizing heuristics

Reviewed-by: jsikstro, tschatzl, aboldtch
This commit is contained in:
Albert Mingkun Yang 2026-05-28 07:20:06 +00:00
parent 7df417c353
commit 2c7efc0880
3 changed files with 164 additions and 110 deletions

View File

@ -35,6 +35,7 @@
#include "runtime/perfData.hpp"
#include "runtime/threadSMR.hpp"
#include "utilities/copy.hpp"
#include "utilities/integerCast.hpp"
size_t ThreadLocalAllocBuffer::_max_size = 0;
unsigned int ThreadLocalAllocBuffer::_target_num_refills = 0;
@ -75,43 +76,54 @@ size_t ThreadLocalAllocBuffer::remaining() {
}
// Folds this TLAB's per-thread counters into *stats, optionally feeds a new
// sample into _allocation_fraction, emits a trace-level log line, and finally
// clears the per-thread counters via reset_statistics().
// Judging by the "TLAB GC" log tag and _allocated_before_last_gc, this is
// presumably called once per thread around a GC -- confirm against callers.
//
// NOTE(review): this text is a rendered diff whose +/- markers were lost, so
// pre-change and post-change statements appear side by side (for example the
// two reads of tlab_capacity()/tlab_used(), the two assert heads, and both the
// update_fast_allocations() and update_current_thread_stats() calls). Do not
// read it as a single compilable body.
void ThreadLocalAllocBuffer::accumulate_and_reset_statistics(ThreadLocalAllocStats* stats) {
size_t capacity = Universe::heap()->tlab_capacity();
size_t used = Universe::heap()->tlab_used();
// Whatever is still unused in the current TLAB is added to the GC waste counter.
_gc_waste += (unsigned)remaining();
uint64_t total_allocated = thread()->allocated_bytes();
uint64_t allocated_since_last_gc = total_allocated - _allocated_before_last_gc;
_allocated_before_last_gc = total_allocated;
// Bytes allocated by this thread since the previous call: difference between
// the thread's running total and the total remembered last time.
const uint64_t allocated_bytes = thread()->allocated_bytes();
print_stats("gc");
const size_t allocated_since_last_gc = integer_cast_permit_tautology<size_t>(allocated_bytes - _allocated_before_last_gc);
_allocated_before_last_gc = allocated_bytes;
if (_num_refills > 0) {
// Update allocation history if a reasonable amount of eden was allocated.
bool update_allocation_history = used > 0.5 * capacity;
if (update_allocation_history) {
// Average the fraction of eden allocated in a tlab by this
// thread for use in the next resize operation.
// _gc_waste is not subtracted because it's included in
// "used".
// The result can be larger than 1.0 due to direct to old allocations.
// These allocations should ideally not be counted but since it is not possible
// to filter them out here we just cap the fraction to be at most 1.0.
// Keep alloc_frac as float and not double to avoid the double to float conversion
float alloc_frac = MIN2(1.0f, allocated_since_last_gc / (float) used);
_allocation_fraction.sample(alloc_frac);
if (allocated_since_last_gc > 0) {
const size_t tlab_capacity = Universe::heap()->tlab_capacity();
const size_t tlab_used = Universe::heap()->tlab_used();
// Only record a sample when more than half of the TLAB capacity was used.
if (tlab_used > 0.5 * tlab_capacity) {
// To avoid divide-by-zero
const size_t effective_tlab_capacity = MAX2(tlab_capacity, size_t(1));
const float alloc_frac = (float)allocated_since_last_gc / effective_tlab_capacity;
// The sampled fraction is capped at 1.0.
_allocation_fraction.sample(MIN2(alloc_frac, 1.0f));
}
stats->update_fast_allocations(_num_refills,
_allocated_size,
_gc_waste,
_refill_waste);
stats->update_current_thread_stats(_num_refills,
allocated_since_last_gc,
_allocated_size,
_gc_waste,
_refill_waste,
_num_slow_allocations);
} else {
assert(_num_refills == 0 && _refill_waste == 0 && _gc_waste == 0,
assert(_num_refills == 0 && _refill_waste == 0
&& _gc_waste == 0 && _num_slow_allocations == 0,
"tlab stats == 0");
}
stats->update_num_slow_allocations(_num_slow_allocations);
// Per-thread summary, built only when trace logging for (gc, tlab) is enabled.
// Several counters are multiplied by HeapWordSize before printing, so they are
// presumably kept in heap words -- confirm.
{
Log(gc, tlab) log;
if (log.is_trace()) {
Thread* thrd = thread();
size_t waste = _gc_waste + _refill_waste;
double waste_percent = percent_of(waste, _allocated_size);
log.trace("TLAB GC: thread: " PTR_FORMAT " [id: %2d]"
" desired: %zuK"
" allocated: %zuK"
" slow allocs: %d refill waste: %zuB"
" refills: %d waste %4.1f%% gc: %dB"
" slow: %dB",
p2i(thrd), thrd->osthread()->thread_id(),
_desired_size*HeapWordSize/K,
allocated_since_last_gc/K,
_num_slow_allocations, _refill_waste_limit * HeapWordSize,
_num_refills, waste_percent,
_gc_waste * HeapWordSize,
_refill_waste * HeapWordSize);
}
}
// Start the next measurement interval with cleared per-thread counters.
reset_statistics();
}
@ -147,20 +159,27 @@ void ThreadLocalAllocBuffer::record_refill_waste() {
}
void ThreadLocalAllocBuffer::resize() {
// Compute the next tlab size using expected allocation amount
assert(ResizeTLAB, "Should not call this otherwise");
size_t alloc = (size_t)(_allocation_fraction.average() *
(Universe::heap()->tlab_capacity() / HeapWordSize));
size_t capacity_in_words = Universe::heap()->tlab_capacity() / HeapWordSize;
float alloc_fraction = _allocation_fraction.average();
if (alloc_fraction == 0.0) {
// No samples, use global alloc fraction as an approximation.
const float total_frac = ThreadLocalAllocStats::total_requested_size_fraction_avg();
const uint num_threads = ThreadLocalAllocStats::num_allocating_threads_avg();
alloc_fraction = total_frac / num_threads;
}
size_t alloc = (size_t)(alloc_fraction * capacity_in_words);
size_t new_size = alloc / _target_num_refills;
new_size = clamp(new_size, min_size(), max_size());
size_t aligned_new_size = align_object_size(new_size);
log_trace(gc, tlab)("TLAB new size: thread: " PTR_FORMAT " [id: %2d]"
" refills %d alloc: %8.6f desired_size: %zu -> %zu",
log_trace(gc, tlab)("TLAB resize: thread: " PTR_FORMAT " [id: %2d]"
" alloc-fraction: %.3f desired_size: %zuK -> %zuK",
p2i(thread()), thread()->osthread()->thread_id(),
_target_num_refills, _allocation_fraction.average(), desired_size(), aligned_new_size);
alloc_fraction,
desired_size() * HeapWordSize/K, aligned_new_size * HeapWordSize/K);
set_desired_size(aligned_new_size);
set_refill_waste_limit(initial_refill_waste_limit());
@ -179,11 +198,24 @@ void ThreadLocalAllocBuffer::fill(HeapWord* start,
size_t new_size) {
_num_refills++;
_allocated_size += new_size;
print_stats("fill");
assert(top <= start + new_size - alignment_reserve(), "size too small");
initialize(start, top, start + new_size - alignment_reserve());
{
Log(gc, tlab) log;
if (log.is_trace()) {
Thread* thrd = thread();
log.trace("TLAB fill: thread: " PTR_FORMAT " [id: %2d]"
" capacity: %zuK"
" slow allocs: %d "
" refills: %d",
p2i(thrd), thrd->osthread()->thread_id(),
pointer_delta(_end, _start, sizeof(char)) / K,
_num_slow_allocations,
_num_refills);
}
}
// Reset amount of internal fragmentation
set_refill_waste_limit(initial_refill_waste_limit());
}
@ -206,13 +238,6 @@ void ThreadLocalAllocBuffer::initialize() {
set_desired_size(initial_desired_size());
size_t capacity = Universe::heap()->tlab_capacity() / HeapWordSize;
if (capacity > 0) {
// Keep alloc_frac as float and not double to avoid the double to float conversion
float alloc_frac = desired_size() * target_num_refills() / (float)capacity;
_allocation_fraction.sample(alloc_frac);
}
set_refill_waste_limit(initial_refill_waste_limit());
reset_statistics();
@ -243,11 +268,11 @@ size_t ThreadLocalAllocBuffer::initial_desired_size() {
if (TLABSize > 0) {
init_sz = TLABSize / HeapWordSize;
} else {
// Initial size is a function of the average number of allocating threads.
unsigned int num_threads = ThreadLocalAllocStats::num_allocating_threads_avg();
init_sz = (Universe::heap()->tlab_capacity() / HeapWordSize) /
(num_threads * target_num_refills());
const size_t predicted_total_requested_size = (size_t)(ThreadLocalAllocStats::total_requested_size_fraction_avg() * Universe::heap()->tlab_capacity());
const uint num_threads = ThreadLocalAllocStats::num_allocating_threads_avg();
const size_t per_thread_requested_size = predicted_total_requested_size / num_threads;
const size_t tlab_size = per_thread_requested_size / _target_num_refills;
init_sz = tlab_size / HeapWordSize;
init_sz = align_object_size(init_sz);
}
// We can't use clamp() between min_size() and max_size() here because some
@ -258,32 +283,7 @@ size_t ThreadLocalAllocBuffer::initial_desired_size() {
return init_sz;
}
void ThreadLocalAllocBuffer::print_stats(const char* tag) {
Log(gc, tlab) log;
if (!log.is_trace()) {
return;
}
Thread* thrd = thread();
size_t waste = _gc_waste + _refill_waste;
double waste_percent = percent_of(waste, _allocated_size);
size_t tlab_used = Universe::heap()->tlab_used();
log.trace("TLAB: %s thread: " PTR_FORMAT " [id: %2d]"
" desired_size: %zuKB"
" slow allocs: %d refill waste: %zuB"
" alloc:%8.5f %8.0fKB refills: %d waste %4.1f%% gc: %dB"
" slow: %dB",
tag, p2i(thrd), thrd->osthread()->thread_id(),
_desired_size / (K / HeapWordSize),
_num_slow_allocations, _refill_waste_limit * HeapWordSize,
_allocation_fraction.average(),
_allocation_fraction.average() * tlab_used / K,
_num_refills, waste_percent,
_gc_waste * HeapWordSize,
_refill_waste * HeapWordSize);
}
// Derives the owning Thread* from this object's own address: starting at
// `this`, add start_offset() and subtract Thread::tlab_start_offset() (both in
// bytes). Presumably this relies on the TLAB being embedded directly inside
// its Thread object -- confirm against the Thread layout.
//
// NOTE(review): both the non-const and the const signature lines appear here
// because this text is a rendered diff whose +/- markers were lost.
Thread* ThreadLocalAllocBuffer::thread() {
Thread* ThreadLocalAllocBuffer::thread() const {
return (Thread*)(((char*)this) + in_bytes(start_offset()) - in_bytes(Thread::tlab_start_offset()));
}
@ -314,6 +314,7 @@ PerfVariable* ThreadLocalAllocStats::_perf_max_refill_waste;
PerfVariable* ThreadLocalAllocStats::_perf_total_num_slow_allocations;
PerfVariable* ThreadLocalAllocStats::_perf_max_num_slow_allocations;
AdaptiveWeightedAverage ThreadLocalAllocStats::_num_allocating_threads_avg(0);
AdaptiveWeightedAverage ThreadLocalAllocStats::_total_requested_size_fraction(0);
static PerfVariable* create_perf_variable(const char* name, PerfData::Units unit, TRAPS) {
ResourceMark rm;
@ -324,6 +325,9 @@ void ThreadLocalAllocStats::initialize() {
_num_allocating_threads_avg = AdaptiveWeightedAverage(TLABAllocationWeight);
_num_allocating_threads_avg.sample(1); // One allocating thread at startup
_total_requested_size_fraction = AdaptiveWeightedAverage(TLABAllocationWeight);
_total_requested_size_fraction.sample(0.10f); // 10%
if (UsePerfData) {
EXCEPTION_MARK;
_perf_num_allocating_threads = create_perf_variable("allocThreads", PerfData::U_None, CHECK);
@ -344,6 +348,7 @@ ThreadLocalAllocStats::ThreadLocalAllocStats() :
_total_num_refills(0),
_max_num_refills(0),
_total_allocated_size(0),
_total_requested_bytes(0),
_total_gc_waste(0),
_max_gc_waste(0),
_total_refill_waste(0),
@ -355,21 +360,25 @@ unsigned int ThreadLocalAllocStats::num_allocating_threads_avg() {
return MAX2((unsigned int)(_num_allocating_threads_avg.average() + 0.5), 1U);
}
void ThreadLocalAllocStats::update_fast_allocations(unsigned int num_refills,
size_t allocated_size,
size_t gc_waste,
size_t refill_waste) {
_num_allocating_threads += 1;
_total_num_refills += num_refills;
_max_num_refills = MAX2(_max_num_refills, num_refills);
_total_allocated_size += allocated_size;
_total_gc_waste += gc_waste;
_max_gc_waste = MAX2(_max_gc_waste, gc_waste);
_total_refill_waste += refill_waste;
_max_refill_waste = MAX2(_max_refill_waste, refill_waste);
// Returns the current value of the static weighted average
// _total_requested_size_fraction.
float ThreadLocalAllocStats::total_requested_size_fraction_avg() {
  const float current_average = _total_requested_size_fraction.average();
  return current_average;
}
void ThreadLocalAllocStats::update_num_slow_allocations(unsigned int num_slow_allocations) {
// Folds one thread's TLAB counters into this accumulator.
//
//   num_refills          - added to the refill total; also a candidate for the refill maximum
//   requested_bytes      - added to _total_requested_bytes
//   alloc_size_for_tlab  - added to _total_allocated_size
//   gc_waste             - added to the gc-waste total; also a candidate for its maximum
//   refill_waste         - added to the refill-waste total; also a candidate for its maximum
//   num_slow_allocations - added to the slow-allocation total; also a candidate for its maximum
//
// Each call also increments _num_allocating_threads by one.
void ThreadLocalAllocStats::update_current_thread_stats(unsigned int num_refills,
                                                        size_t requested_bytes,
                                                        size_t alloc_size_for_tlab,
                                                        size_t gc_waste,
                                                        size_t refill_waste,
                                                        unsigned int num_slow_allocations) {
  // One more thread has contributed to this accumulator.
  ++_num_allocating_threads;

  // Running totals.
  _total_num_refills          += num_refills;
  _total_allocated_size       += alloc_size_for_tlab;
  _total_requested_bytes      += requested_bytes;
  _total_gc_waste             += gc_waste;
  _total_refill_waste         += refill_waste;
  _total_num_slow_allocations += num_slow_allocations;

  // Largest single-thread contribution seen so far, per counter.
  _max_num_refills          = MAX2(_max_num_refills, num_refills);
  _max_gc_waste             = MAX2(_max_gc_waste, gc_waste);
  _max_refill_waste         = MAX2(_max_refill_waste, refill_waste);
  _max_num_slow_allocations = MAX2(_max_num_slow_allocations, num_slow_allocations);
}
@ -379,6 +388,7 @@ void ThreadLocalAllocStats::update(const ThreadLocalAllocStats& other) {
_total_num_refills += other._total_num_refills;
_max_num_refills = MAX2(_max_num_refills, other._max_num_refills);
_total_allocated_size += other._total_allocated_size;
_total_requested_bytes += other._total_requested_bytes;
_total_gc_waste += other._total_gc_waste;
_max_gc_waste = MAX2(_max_gc_waste, other._max_gc_waste);
_total_refill_waste += other._total_refill_waste;
@ -392,6 +402,7 @@ void ThreadLocalAllocStats::reset() {
_total_num_refills = 0;
_max_num_refills = 0;
_total_allocated_size = 0;
_total_requested_bytes = 0;
_total_gc_waste = 0;
_max_gc_waste = 0;
_total_refill_waste = 0;
@ -401,22 +412,37 @@ void ThreadLocalAllocStats::reset() {
}
void ThreadLocalAllocStats::publish() {
if (_total_allocated_size == 0) {
if (_total_requested_bytes == 0) {
return;
}
_num_allocating_threads_avg.sample(_num_allocating_threads);
{
const size_t tlab_capacity = Universe::heap()->tlab_capacity();
const size_t tlab_used = Universe::heap()->tlab_used();
if (tlab_used > 0.5 * tlab_capacity) {
// To avoid divide-by-zero
const size_t effective_tlab_capacity = MAX2(tlab_capacity, size_t(1));
const float requested_size_fraction = (float)_total_requested_bytes / effective_tlab_capacity;
_total_requested_size_fraction.sample(MIN2(requested_size_fraction, 1.0f));
}
}
const size_t waste = _total_gc_waste + _total_refill_waste;
const double waste_percent = percent_of(waste, _total_allocated_size);
log_debug(gc, tlab)("TLAB totals: thrds: %d refills: %d max: %d"
" slow allocs: %d max %d waste: %4.1f%%"
" gc: %zuB max: %zuB"
" slow: %zuB max: %zuB",
_num_allocating_threads, _total_num_refills, _max_num_refills,
const double gc_waste_pct = percent_of(_total_gc_waste, _total_allocated_size);
const double refill_waste_pct = percent_of(_total_refill_waste, _total_allocated_size);
log_debug(gc, tlab)("TLAB totals: thrds: %d alloc-frac: %.1f%% refills: %d max: %d"
" slow allocs: %d max %d waste: %.1f%%"
" gc: %zuB(%.1f%%) max: %zuB"
" refill: %zuB(%.1f%%) max: %zuB",
_num_allocating_threads, _total_requested_size_fraction.average() * 100, _total_num_refills, _max_num_refills,
_total_num_slow_allocations, _max_num_slow_allocations, waste_percent,
_total_gc_waste * HeapWordSize, _max_gc_waste * HeapWordSize,
_total_refill_waste * HeapWordSize, _max_refill_waste * HeapWordSize);
_total_gc_waste * HeapWordSize, gc_waste_pct, _max_gc_waste * HeapWordSize,
_total_refill_waste * HeapWordSize, refill_waste_pct, _max_refill_waste * HeapWordSize);
if (UsePerfData) {
_perf_num_allocating_threads ->set_value(_num_allocating_threads);

View File

@ -52,6 +52,10 @@ private:
HeapWord* _allocation_end; // end for allocations (actual TLAB end, excluding alignment_reserve)
size_t _desired_size; // desired size (including alignment_reserve)
// If too many slow allocations (outside TLAB) happen, we increase
// _refill_waste_limit. This reduces outside-TLAB allocations at
// the expense of wasting more memory, i.e., the TLAB is discarded sooner.
size_t _refill_waste_limit; // hold onto tlab if free() is larger than this
uint64_t _allocated_before_last_gc; // total bytes allocated up until the last gc
@ -59,12 +63,24 @@ private:
static unsigned _target_num_refills; // expected number of refills between GCs
unsigned _num_refills;
// TLAB retirement is invoked in two main contexts:
// 1. TLAB refill:
// The current TLAB is insufficient to satisfy a pending allocation
// request, triggering a refill. The remaining space in the current
// TLAB is treated as waste and tracked in _refill_waste.
// 2. Before GC:
// Invoked at the start of a GC cycle to ensure heap parsability.
// The unused space in the current TLAB is treated as waste and
// tracked in _gc_waste.
unsigned _refill_waste;
unsigned _gc_waste;
unsigned _num_slow_allocations;
// Allocated size for filling TLAB in HeapWords
size_t _allocated_size;
AdaptiveWeightedAverage _allocation_fraction; // fraction of eden allocated in tlabs
// Fraction of eden allocated by this thread, used for sizing its TLAB.
AdaptiveWeightedAverage _allocation_fraction;
void reset_statistics();
@ -77,8 +93,6 @@ private:
void set_refill_waste_limit(size_t waste) { _refill_waste_limit = waste; }
size_t initial_refill_waste_limit();
static int target_num_refills() { return _target_num_refills; }
size_t initial_desired_size();
size_t remaining();
@ -91,9 +105,7 @@ private:
void accumulate_and_reset_statistics(ThreadLocalAllocStats* stats);
void print_stats(const char* tag);
Thread* thread();
Thread* thread() const;
// statistics
@ -190,11 +202,13 @@ private:
static PerfVariable* _perf_max_num_slow_allocations;
static AdaptiveWeightedAverage _num_allocating_threads_avg;
static AdaptiveWeightedAverage _total_requested_size_fraction;
unsigned int _num_allocating_threads;
unsigned int _total_num_refills;
unsigned int _max_num_refills;
size_t _total_allocated_size;
size_t _total_requested_bytes;
size_t _total_gc_waste;
size_t _max_gc_waste;
size_t _total_refill_waste;
@ -205,14 +219,16 @@ private:
public:
static void initialize();
static unsigned int num_allocating_threads_avg();
static float total_requested_size_fraction_avg();
ThreadLocalAllocStats();
void update_fast_allocations(unsigned int num_refills,
size_t allocated_size,
size_t gc_waste,
size_t refill_waste);
void update_num_slow_allocations(unsigned int num_slow_allocations);
void update_current_thread_stats(unsigned int num_refills,
size_t requested_bytes,
size_t alloc_size_for_tlab,
size_t gc_waste,
size_t refill_waste,
unsigned int num_slow_allocations);
void update(const ThreadLocalAllocStats& other);
void reset();

View File

@ -52,10 +52,22 @@ inline HeapWord* ThreadLocalAllocBuffer::allocate(size_t size) {
}
inline size_t ThreadLocalAllocBuffer::compute_size(size_t obj_size) {
// Compute the size for the new TLAB.
// The "last" tlab may be smaller to reduce fragmentation.
const size_t available_size = Universe::heap()->unsafe_max_tlab_alloc() / HeapWordSize;
size_t new_tlab_size = MIN3(available_size, desired_size() + align_object_size(obj_size), max_size());
size_t scaled_desired_size = desired_size();
if (ResizeTLAB) {
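// Extra boost if too many refills: double the desired size once for every 8
// refills beyond the target, 16X at most (and never above max_size()).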
if (_num_refills > _target_num_refills) {
const uint excess = _num_refills - _target_num_refills;
const uint steps = MIN2(excess / 8, 4U);
// Cap before shifting to avoid overflow.
if (scaled_desired_size > (max_size() >> steps)) {
scaled_desired_size = max_size();
} else {
scaled_desired_size <<= steps;
}
}
}
size_t new_tlab_size = MIN3(available_size, scaled_desired_size + align_object_size(obj_size), max_size());
// Make sure there's enough room for object and filler int[].
if (new_tlab_size < compute_min_size(obj_size)) {