8381834: Improve TLAB sizing heuristics

Reviewed-by: jsikstro, tschatzl, aboldtch
2026-06-08 11:35:21 +00:00 · 2026-05-28 07:20:06 +00:00 · 2026-05-28 07:20:06 +00:00 · 2c7efc0880
commit 2c7efc0880
parent 7df417c353
3 changed files with 164 additions and 110 deletions
--- a/src/hotspot/share/gc/shared/threadLocalAllocBuffer.cpp
+++ b/src/hotspot/share/gc/shared/threadLocalAllocBuffer.cpp
@ -35,6 +35,7 @@
 #include "runtime/perfData.hpp"
 #include "runtime/threadSMR.hpp"
 #include "utilities/copy.hpp"
+#include "utilities/integerCast.hpp"

 size_t       ThreadLocalAllocBuffer::_max_size = 0;
 unsigned int ThreadLocalAllocBuffer::_target_num_refills = 0;
@ -75,43 +76,54 @@ size_t ThreadLocalAllocBuffer::remaining() {
 }

 void ThreadLocalAllocBuffer::accumulate_and_reset_statistics(ThreadLocalAllocStats* stats) {
-  size_t capacity = Universe::heap()->tlab_capacity();
-  size_t used = Universe::heap()->tlab_used();
-
+  // Folds this thread's TLAB counters into stats, updates the per-thread
+  // allocation fraction, logs the counters at trace level and finally
+  // clears them via reset_statistics().
+
+  // Whatever is still unused in the current TLAB is accounted as GC waste.
  _gc_waste += (unsigned)remaining();
-  uint64_t total_allocated = thread()->allocated_bytes();
-  uint64_t allocated_since_last_gc = total_allocated - _allocated_before_last_gc;
-  _allocated_before_last_gc = total_allocated;
+  const uint64_t allocated_bytes = thread()->allocated_bytes();

-  print_stats("gc");
+  // Bytes allocated by this thread since _allocated_before_last_gc was last
+  // updated; the baseline is then moved forward for the next call.
+  const size_t allocated_since_last_gc = integer_cast_permit_tautology<size_t>(allocated_bytes - _allocated_before_last_gc);
+  _allocated_before_last_gc = allocated_bytes;

-  if (_num_refills > 0) {
-    // Update allocation history if a reasonable amount of eden was allocated.
-    bool update_allocation_history = used > 0.5 * capacity;
-
-    if (update_allocation_history) {
-      // Average the fraction of eden allocated in a tlab by this
-      // thread for use in the next resize operation.
-      // _gc_waste is not subtracted because it's included in
-      // "used".
-      // The result can be larger than 1.0 due to direct to old allocations.
-      // These allocations should ideally not be counted but since it is not possible
-      // to filter them out here we just cap the fraction to be at most 1.0.
-      // Keep alloc_frac as float and not double to avoid the double to float conversion
-      float alloc_frac = MIN2(1.0f, allocated_since_last_gc / (float) used);
-      _allocation_fraction.sample(alloc_frac);
+  if (allocated_since_last_gc > 0) {
+    const size_t tlab_capacity = Universe::heap()->tlab_capacity();
+    const size_t tlab_used = Universe::heap()->tlab_used();
+    // Only sample the allocation fraction when more than half of the
+    // TLAB capacity has been used.
+    if (tlab_used > 0.5 * tlab_capacity) {
+      // To avoid divide-by-zero
+      const size_t effective_tlab_capacity = MAX2(tlab_capacity, size_t(1));
+      const float alloc_frac = (float)allocated_since_last_gc / effective_tlab_capacity;
+      // The sampled fraction is capped at 1.0.
+      _allocation_fraction.sample(MIN2(alloc_frac, 1.0f));
    }
-
-    stats->update_fast_allocations(_num_refills,
-                                   _allocated_size,
-                                   _gc_waste,
-                                   _refill_waste);
+    stats->update_current_thread_stats(_num_refills,
+                                       allocated_since_last_gc,
+                                       _allocated_size,
+                                       _gc_waste,
+                                       _refill_waste,
+                                       _num_slow_allocations);
  } else {
-    assert(_num_refills == 0 && _refill_waste == 0 && _gc_waste == 0,
+    // A thread that allocated nothing is expected to have no TLAB activity.
+    assert(_num_refills == 0 && _refill_waste == 0
+           && _gc_waste == 0 && _num_slow_allocations == 0,
           "tlab stats == 0");
  }

-  stats->update_num_slow_allocations(_num_slow_allocations);
+  {
+    Log(gc, tlab) log;
+    if (log.is_trace()) {
+      Thread* thrd = thread();
+      size_t waste = _gc_waste + _refill_waste;
+      double waste_percent = percent_of(waste, _allocated_size);
+      // Note: the "refill waste:" field prints _refill_waste_limit (in bytes);
+      // the accumulated _refill_waste is printed in the trailing "slow:" field.
+      // The unsigned counters are formatted with %u so that large values are
+      // not printed as negative numbers.
+      log.trace("TLAB GC: thread: " PTR_FORMAT " [id: %2d]"
+                " desired: %zuK"
+                " allocated: %zuK"
+                " slow allocs: %u  refill waste: %zuB"
+                " refills: %u waste %4.1f%% gc: %uB"
+                " slow: %uB",
+                p2i(thrd), thrd->osthread()->thread_id(),
+                _desired_size*HeapWordSize/K,
+                allocated_since_last_gc/K,
+                _num_slow_allocations, _refill_waste_limit * HeapWordSize,
+                _num_refills, waste_percent,
+                _gc_waste * HeapWordSize,
+                _refill_waste * HeapWordSize);
+    }
+  }

  reset_statistics();
 }
@ -147,20 +159,27 @@ void ThreadLocalAllocBuffer::record_refill_waste() {
 }

 void ThreadLocalAllocBuffer::resize() {
-  // Compute the next tlab size using expected allocation amount
  assert(ResizeTLAB, "Should not call this otherwise");
-  size_t alloc = (size_t)(_allocation_fraction.average() *
-                          (Universe::heap()->tlab_capacity() / HeapWordSize));
+  size_t capacity_in_words = Universe::heap()->tlab_capacity() / HeapWordSize;
+  float alloc_fraction = _allocation_fraction.average();
+  if (alloc_fraction == 0.0) {
+    // No samples, use global alloc fraction as an approximation.
+    const float total_frac = ThreadLocalAllocStats::total_requested_size_fraction_avg();
+    const uint num_threads = ThreadLocalAllocStats::num_allocating_threads_avg();
+    alloc_fraction = total_frac / num_threads;
+  }
+  size_t alloc = (size_t)(alloc_fraction * capacity_in_words);
  size_t new_size = alloc / _target_num_refills;

  new_size = clamp(new_size, min_size(), max_size());

  size_t aligned_new_size = align_object_size(new_size);

-  log_trace(gc, tlab)("TLAB new size: thread: " PTR_FORMAT " [id: %2d]"
-                      " refills %d  alloc: %8.6f desired_size: %zu -> %zu",
+  log_trace(gc, tlab)("TLAB resize: thread: " PTR_FORMAT " [id: %2d]"
+                      " alloc-fraction: %.3f desired_size: %zuK -> %zuK",
                      p2i(thread()), thread()->osthread()->thread_id(),
-                      _target_num_refills, _allocation_fraction.average(), desired_size(), aligned_new_size);
+                      alloc_fraction,
+                      desired_size() * HeapWordSize/K, aligned_new_size * HeapWordSize/K);

  set_desired_size(aligned_new_size);
  set_refill_waste_limit(initial_refill_waste_limit());
@ -179,11 +198,24 @@ void ThreadLocalAllocBuffer::fill(HeapWord* start,
                                  size_t    new_size) {
  _num_refills++;
  _allocated_size += new_size;
-  print_stats("fill");
+
  assert(top <= start + new_size - alignment_reserve(), "size too small");

  initialize(start, top, start + new_size - alignment_reserve());
-
+  {
+    Log(gc, tlab) log;
+    if (log.is_trace()) {
+      Thread* thrd = thread();
+      log.trace("TLAB fill: thread: " PTR_FORMAT " [id: %2d]"
+                " capacity: %zuK"
+                " slow allocs: %d "
+                " refills: %d",
+                p2i(thrd), thrd->osthread()->thread_id(),
+                pointer_delta(_end, _start, sizeof(char)) / K,
+                _num_slow_allocations,
+                _num_refills);
+    }
+  }
  // Reset amount of internal fragmentation
  set_refill_waste_limit(initial_refill_waste_limit());
 }
@ -206,13 +238,6 @@ void ThreadLocalAllocBuffer::initialize() {

  set_desired_size(initial_desired_size());

-  size_t capacity = Universe::heap()->tlab_capacity() / HeapWordSize;
-  if (capacity > 0) {
-    // Keep alloc_frac as float and not double to avoid the double to float conversion
-    float alloc_frac = desired_size() * target_num_refills() / (float)capacity;
-    _allocation_fraction.sample(alloc_frac);
-  }
-
  set_refill_waste_limit(initial_refill_waste_limit());

  reset_statistics();
@ -243,11 +268,11 @@ size_t ThreadLocalAllocBuffer::initial_desired_size() {
  if (TLABSize > 0) {
    init_sz = TLABSize / HeapWordSize;
  } else {
-    // Initial size is a function of the average number of allocating threads.
-    unsigned int num_threads = ThreadLocalAllocStats::num_allocating_threads_avg();
-
-    init_sz  = (Universe::heap()->tlab_capacity() / HeapWordSize) /
-                      (num_threads * target_num_refills());
+    const size_t predicted_total_requested_size = (size_t)(ThreadLocalAllocStats::total_requested_size_fraction_avg() * Universe::heap()->tlab_capacity());
+    const uint num_threads = ThreadLocalAllocStats::num_allocating_threads_avg();
+    const size_t per_thread_requested_size = predicted_total_requested_size / num_threads;
+    const size_t tlab_size = per_thread_requested_size / _target_num_refills;
+    init_sz = tlab_size / HeapWordSize;
    init_sz = align_object_size(init_sz);
  }
  // We can't use clamp() between min_size() and max_size() here because some
@ -258,32 +283,7 @@ size_t ThreadLocalAllocBuffer::initial_desired_size() {
  return init_sz;
 }

-void ThreadLocalAllocBuffer::print_stats(const char* tag) {
-  Log(gc, tlab) log;
-  if (!log.is_trace()) {
-    return;
-  }
-
-  Thread* thrd = thread();
-  size_t waste = _gc_waste + _refill_waste;
-  double waste_percent = percent_of(waste, _allocated_size);
-  size_t tlab_used  = Universe::heap()->tlab_used();
-  log.trace("TLAB: %s thread: " PTR_FORMAT " [id: %2d]"
-            " desired_size: %zuKB"
-            " slow allocs: %d  refill waste: %zuB"
-            " alloc:%8.5f %8.0fKB refills: %d waste %4.1f%% gc: %dB"
-            " slow: %dB",
-            tag, p2i(thrd), thrd->osthread()->thread_id(),
-            _desired_size / (K / HeapWordSize),
-            _num_slow_allocations, _refill_waste_limit * HeapWordSize,
-            _allocation_fraction.average(),
-            _allocation_fraction.average() * tlab_used / K,
-            _num_refills, waste_percent,
-            _gc_waste * HeapWordSize,
-            _refill_waste * HeapWordSize);
-}
-
-Thread* ThreadLocalAllocBuffer::thread() {
+Thread* ThreadLocalAllocBuffer::thread() const {
+  // Derives the owning Thread's address from this buffer's own address:
+  // adds this buffer's start_offset() and subtracts Thread::tlab_start_offset().
+  // NOTE(review): presumably relies on this TLAB being embedded in a Thread
+  // at a fixed offset - confirm against the Thread layout.
  return (Thread*)(((char*)this) + in_bytes(start_offset()) - in_bytes(Thread::tlab_start_offset()));
 }

@ -314,6 +314,7 @@ PerfVariable* ThreadLocalAllocStats::_perf_max_refill_waste;
 PerfVariable* ThreadLocalAllocStats::_perf_total_num_slow_allocations;
 PerfVariable* ThreadLocalAllocStats::_perf_max_num_slow_allocations;
 AdaptiveWeightedAverage ThreadLocalAllocStats::_num_allocating_threads_avg(0);
+AdaptiveWeightedAverage ThreadLocalAllocStats::_total_requested_size_fraction(0);

 static PerfVariable* create_perf_variable(const char* name, PerfData::Units unit, TRAPS) {
  ResourceMark rm;
@ -324,6 +325,9 @@ void ThreadLocalAllocStats::initialize() {
  _num_allocating_threads_avg = AdaptiveWeightedAverage(TLABAllocationWeight);
  _num_allocating_threads_avg.sample(1); // One allocating thread at startup

+  _total_requested_size_fraction = AdaptiveWeightedAverage(TLABAllocationWeight);
+  _total_requested_size_fraction.sample(0.10f); // 10%
+
  if (UsePerfData) {
    EXCEPTION_MARK;
    _perf_num_allocating_threads     = create_perf_variable("allocThreads",   PerfData::U_None,  CHECK);
@ -344,6 +348,7 @@ ThreadLocalAllocStats::ThreadLocalAllocStats() :
    _total_num_refills(0),
    _max_num_refills(0),
    _total_allocated_size(0),
+    _total_requested_bytes(0),
    _total_gc_waste(0),
    _max_gc_waste(0),
    _total_refill_waste(0),
@ -355,21 +360,25 @@ unsigned int ThreadLocalAllocStats::num_allocating_threads_avg() {
  return MAX2((unsigned int)(_num_allocating_threads_avg.average() + 0.5), 1U);
 }

-void ThreadLocalAllocStats::update_fast_allocations(unsigned int num_refills,
-                                                    size_t allocated_size,
-                                                    size_t gc_waste,
-                                                    size_t refill_waste) {
-  _num_allocating_threads  += 1;
-  _total_num_refills       += num_refills;
-  _max_num_refills          = MAX2(_max_num_refills, num_refills);
-  _total_allocated_size    += allocated_size;
-  _total_gc_waste          += gc_waste;
-  _max_gc_waste             = MAX2(_max_gc_waste, gc_waste);
-  _total_refill_waste      += refill_waste;
-  _max_refill_waste         = MAX2(_max_refill_waste, refill_waste);
+// Returns the current average of the sampled total requested-size fraction
+// (the value held by the static _total_requested_size_fraction).
+float ThreadLocalAllocStats::total_requested_size_fraction_avg() {
+  return _total_requested_size_fraction.average();
 }

-void ThreadLocalAllocStats::update_num_slow_allocations(unsigned int num_slow_allocations) {
+// Folds one thread's TLAB counters into this accumulator.
+// Each call counts as one allocating thread. Refills, requested bytes,
+// TLAB allocation size, GC waste, refill waste and slow allocations are
+// added to the running totals; for refills, both kinds of waste and slow
+// allocations the per-thread maximum is tracked as well.
+void ThreadLocalAllocStats::update_current_thread_stats(unsigned int num_refills,
+                                                        size_t requested_bytes,
+                                                        size_t alloc_size_for_tlab,
+                                                        size_t gc_waste,
+                                                        size_t refill_waste,
+                                                        unsigned int num_slow_allocations) {
+  _num_allocating_threads     += 1;
+  _total_num_refills          += num_refills;
+  _max_num_refills             = MAX2(_max_num_refills, num_refills);
+  _total_allocated_size       += alloc_size_for_tlab;
+  _total_requested_bytes      += requested_bytes;
+  _total_gc_waste             += gc_waste;
+  _max_gc_waste                = MAX2(_max_gc_waste, gc_waste);
+  _total_refill_waste         += refill_waste;
+  _max_refill_waste            = MAX2(_max_refill_waste, refill_waste);
  _total_num_slow_allocations += num_slow_allocations;
  _max_num_slow_allocations    = MAX2(_max_num_slow_allocations, num_slow_allocations);
 }
@ -379,6 +388,7 @@ void ThreadLocalAllocStats::update(const ThreadLocalAllocStats& other) {
  _total_num_refills          += other._total_num_refills;
  _max_num_refills             = MAX2(_max_num_refills, other._max_num_refills);
  _total_allocated_size       += other._total_allocated_size;
+  _total_requested_bytes      += other._total_requested_bytes;
  _total_gc_waste             += other._total_gc_waste;
  _max_gc_waste                = MAX2(_max_gc_waste, other._max_gc_waste);
  _total_refill_waste         += other._total_refill_waste;
@ -392,6 +402,7 @@ void ThreadLocalAllocStats::reset() {
  _total_num_refills          = 0;
  _max_num_refills            = 0;
  _total_allocated_size       = 0;
+  _total_requested_bytes      = 0;
  _total_gc_waste             = 0;
  _max_gc_waste               = 0;
  _total_refill_waste         = 0;
@ -401,22 +412,37 @@ void ThreadLocalAllocStats::reset() {
 }

 void ThreadLocalAllocStats::publish() {
-  if (_total_allocated_size == 0) {
+  if (_total_requested_bytes == 0) {
    return;
  }

  _num_allocating_threads_avg.sample(_num_allocating_threads);

+  {
+    const size_t tlab_capacity = Universe::heap()->tlab_capacity();
+    const size_t tlab_used = Universe::heap()->tlab_used();
+    if (tlab_used > 0.5 * tlab_capacity) {
+      // To avoid divide-by-zero
+      const size_t effective_tlab_capacity = MAX2(tlab_capacity, size_t(1));
+      const float requested_size_fraction = (float)_total_requested_bytes / effective_tlab_capacity;
+      _total_requested_size_fraction.sample(MIN2(requested_size_fraction, 1.0f));
+    }
+  }
+
  const size_t waste = _total_gc_waste + _total_refill_waste;
  const double waste_percent = percent_of(waste, _total_allocated_size);
-  log_debug(gc, tlab)("TLAB totals: thrds: %d  refills: %d max: %d"
-                      " slow allocs: %d max %d waste: %4.1f%%"
-                      " gc: %zuB max: %zuB"
-                      " slow: %zuB max: %zuB",
-                      _num_allocating_threads, _total_num_refills, _max_num_refills,
+
+  const double gc_waste_pct = percent_of(_total_gc_waste, _total_allocated_size);
+  const double refill_waste_pct = percent_of(_total_refill_waste, _total_allocated_size);
+
+  log_debug(gc, tlab)("TLAB totals: thrds: %d alloc-frac: %.1f%% refills: %d max: %d"
+                      " slow allocs: %d max %d waste: %.1f%%"
+                      " gc: %zuB(%.1f%%) max: %zuB"
+                      " refill: %zuB(%.1f%%) max: %zuB",
+                      _num_allocating_threads, _total_requested_size_fraction.average() * 100, _total_num_refills, _max_num_refills,
                      _total_num_slow_allocations, _max_num_slow_allocations, waste_percent,
-                      _total_gc_waste * HeapWordSize, _max_gc_waste * HeapWordSize,
-                      _total_refill_waste * HeapWordSize, _max_refill_waste * HeapWordSize);
+                      _total_gc_waste * HeapWordSize, gc_waste_pct, _max_gc_waste * HeapWordSize,
+                      _total_refill_waste * HeapWordSize, refill_waste_pct, _max_refill_waste * HeapWordSize);

  if (UsePerfData) {
    _perf_num_allocating_threads      ->set_value(_num_allocating_threads);
--- a/src/hotspot/share/gc/shared/threadLocalAllocBuffer.hpp
+++ b/src/hotspot/share/gc/shared/threadLocalAllocBuffer.hpp
@ -52,6 +52,10 @@ private:
  HeapWord* _allocation_end;                     // end for allocations (actual TLAB end, excluding alignment_reserve)

  size_t    _desired_size;                       // desired size   (including alignment_reserve)
+
+  // If too many slow allocations (outside TLAB) happen, we increase
+  // _refill_waste_limit. This reduces outside-TLAB allocations at
+  // the expense of wasting more memory, i.e., the TLAB is discarded sooner.
  size_t    _refill_waste_limit;                 // hold onto tlab if free() is larger than this
  uint64_t  _allocated_before_last_gc;           // total bytes allocated up until the last gc

@ -59,12 +63,24 @@ private:
  static unsigned _target_num_refills;           // expected number of refills between GCs

  unsigned  _num_refills;
+  // TLAB retirement is invoked in two main contexts:
+  // 1. TLAB refill:
+  //    The current TLAB is insufficient to satisfy a pending allocation
+  //    request, triggering a refill. The remaining space in the current
+  //    TLAB is treated as waste and tracked in _refill_waste.
+  // 2. Before GC:
+  //    Invoked at the start of a GC cycle to ensure heap parsability.
+  //    The unused space in the current TLAB is treated as waste and
+  //    tracked in _gc_waste.
  unsigned  _refill_waste;
  unsigned  _gc_waste;
  unsigned  _num_slow_allocations;
+
+  // Total size, in HeapWords, of the TLABs filled for this thread
  size_t    _allocated_size;

-  AdaptiveWeightedAverage _allocation_fraction;  // fraction of eden allocated in tlabs
+  // Fraction of eden allocated by this thread, used for sizing its TLAB.
+  AdaptiveWeightedAverage _allocation_fraction;

  void reset_statistics();

@ -77,8 +93,6 @@ private:
  void set_refill_waste_limit(size_t waste)      { _refill_waste_limit = waste;  }

  size_t initial_refill_waste_limit();
-
-  static int    target_num_refills()             { return _target_num_refills; }
  size_t initial_desired_size();

  size_t remaining();
@ -91,9 +105,7 @@ private:

  void accumulate_and_reset_statistics(ThreadLocalAllocStats* stats);

-  void print_stats(const char* tag);
-
-  Thread* thread();
+  Thread* thread() const;

  // statistics

@ -190,11 +202,13 @@ private:
  static PerfVariable* _perf_max_num_slow_allocations;

  static AdaptiveWeightedAverage _num_allocating_threads_avg;
+  static AdaptiveWeightedAverage _total_requested_size_fraction;

  unsigned int _num_allocating_threads;
  unsigned int _total_num_refills;
  unsigned int _max_num_refills;
  size_t       _total_allocated_size;
+  size_t       _total_requested_bytes;
  size_t       _total_gc_waste;
  size_t       _max_gc_waste;
  size_t       _total_refill_waste;
@ -205,14 +219,16 @@ private:
 public:
  static void initialize();
  static unsigned int num_allocating_threads_avg();
+  static float total_requested_size_fraction_avg();

  ThreadLocalAllocStats();

-  void update_fast_allocations(unsigned int num_refills,
-                               size_t allocated_size,
-                               size_t gc_waste,
-                               size_t refill_waste);
-  void update_num_slow_allocations(unsigned int num_slow_allocations);
+  void update_current_thread_stats(unsigned int num_refills,
+                                   size_t requested_bytes,
+                                   size_t alloc_size_for_tlab,
+                                   size_t gc_waste,
+                                   size_t refill_waste,
+                                   unsigned int num_slow_allocations);
  void update(const ThreadLocalAllocStats& other);

  void reset();
--- a/src/hotspot/share/gc/shared/threadLocalAllocBuffer.inline.hpp
+++ b/src/hotspot/share/gc/shared/threadLocalAllocBuffer.inline.hpp
@ -52,10 +52,22 @@ inline HeapWord* ThreadLocalAllocBuffer::allocate(size_t size) {
 }

 inline size_t ThreadLocalAllocBuffer::compute_size(size_t obj_size) {
-  // Compute the size for the new TLAB.
-  // The "last" tlab may be smaller to reduce fragmentation.
  const size_t available_size = Universe::heap()->unsafe_max_tlab_alloc() / HeapWordSize;
-  size_t new_tlab_size = MIN3(available_size, desired_size() + align_object_size(obj_size), max_size());
+  size_t scaled_desired_size = desired_size();
+  if (ResizeTLAB) {
+    // Extra boost if too many refills; 16X at most.
+    if (_num_refills > _target_num_refills) {
+      const uint excess = _num_refills - _target_num_refills;
+      const uint steps = MIN2(excess / 8, 4U);
+      // Cap before shifting to avoid overflow.
+      if (scaled_desired_size > (max_size() >> steps)) {
+        scaled_desired_size = max_size();
+      } else {
+        scaled_desired_size <<= steps;
+      }
+    }
+  }
+  size_t new_tlab_size = MIN3(available_size, scaled_desired_size + align_object_size(obj_size), max_size());

  // Make sure there's enough room for object and filler int[].
  if (new_tlab_size < compute_min_size(obj_size)) {