From 2c7efc08805237bcc5103fad6d88d4b947de3e6e Mon Sep 17 00:00:00 2001
From: Albert Mingkun Yang <ayang@openjdk.org>
Date: Thu, 28 May 2026 07:20:06 +0000
Subject: [PATCH] 8381834: Improve TLAB sizing heuristics

Reviewed-by: jsikstro, tschatzl, aboldtch
---
 .../gc/shared/threadLocalAllocBuffer.cpp      | 218 ++++++++++--------
 .../gc/shared/threadLocalAllocBuffer.hpp      |  38 ++-
 .../shared/threadLocalAllocBuffer.inline.hpp  |  18 +-
 3 files changed, 164 insertions(+), 110 deletions(-)

diff --git a/src/hotspot/share/gc/shared/threadLocalAllocBuffer.cpp b/src/hotspot/share/gc/shared/threadLocalAllocBuffer.cpp
index 59d7befb32d..fae0365719a 100644
--- a/src/hotspot/share/gc/shared/threadLocalAllocBuffer.cpp
+++ b/src/hotspot/share/gc/shared/threadLocalAllocBuffer.cpp
@@ -35,6 +35,7 @@
 #include "runtime/perfData.hpp"
 #include "runtime/threadSMR.hpp"
 #include "utilities/copy.hpp"
+#include "utilities/integerCast.hpp"
 
 size_t       ThreadLocalAllocBuffer::_max_size = 0;
 unsigned int ThreadLocalAllocBuffer::_target_num_refills = 0;
@@ -75,43 +76,54 @@ size_t ThreadLocalAllocBuffer::remaining() {
 }
 
 void ThreadLocalAllocBuffer::accumulate_and_reset_statistics(ThreadLocalAllocStats* stats) {
-  size_t capacity = Universe::heap()->tlab_capacity();
-  size_t used = Universe::heap()->tlab_used();
-
   _gc_waste += (unsigned)remaining();
-  uint64_t total_allocated = thread()->allocated_bytes();
-  uint64_t allocated_since_last_gc = total_allocated - _allocated_before_last_gc;
-  _allocated_before_last_gc = total_allocated;
+  const uint64_t allocated_bytes = thread()->allocated_bytes();
 
-  print_stats("gc");
+  const size_t allocated_since_last_gc = integer_cast_permit_tautology<size_t>(allocated_bytes - _allocated_before_last_gc);
+  _allocated_before_last_gc = allocated_bytes;
 
-  if (_num_refills > 0) {
-    // Update allocation history if a reasonable amount of eden was allocated.
-    bool update_allocation_history = used > 0.5 * capacity;
-
-    if (update_allocation_history) {
-      // Average the fraction of eden allocated in a tlab by this
-      // thread for use in the next resize operation.
-      // _gc_waste is not subtracted because it's included in
-      // "used".
-      // The result can be larger than 1.0 due to direct to old allocations.
-      // These allocations should ideally not be counted but since it is not possible
-      // to filter them out here we just cap the fraction to be at most 1.0.
-      // Keep alloc_frac as float and not double to avoid the double to float conversion
-      float alloc_frac = MIN2(1.0f, allocated_since_last_gc / (float) used);
-      _allocation_fraction.sample(alloc_frac);
+  if (allocated_since_last_gc > 0) {
+    const size_t tlab_capacity = Universe::heap()->tlab_capacity();
+    const size_t tlab_used = Universe::heap()->tlab_used();
+    if (tlab_used > 0.5 * tlab_capacity) {
+      // Clamp the divisor to at least 1 so that a zero tlab_capacity cannot cause a divide-by-zero.
+      const size_t effective_tlab_capacity = MAX2(tlab_capacity, size_t(1));
+      const float alloc_frac = (float)allocated_since_last_gc / effective_tlab_capacity;
+      _allocation_fraction.sample(MIN2(alloc_frac, 1.0f));
     }
-
-    stats->update_fast_allocations(_num_refills,
-                                   _allocated_size,
-                                   _gc_waste,
-                                   _refill_waste);
+    stats->update_current_thread_stats(_num_refills,
+                                       allocated_since_last_gc,
+                                       _allocated_size,
+                                       _gc_waste,
+                                       _refill_waste,
+                                       _num_slow_allocations);
   } else {
-    assert(_num_refills == 0 && _refill_waste == 0 && _gc_waste == 0,
+    assert(_num_refills == 0 && _refill_waste == 0
+           && _gc_waste == 0 && _num_slow_allocations == 0,
            "tlab stats == 0");
   }
 
-  stats->update_num_slow_allocations(_num_slow_allocations);
+  {
+    Log(gc, tlab) log;
+    if (log.is_trace()) {
+      Thread* thrd = thread();
+      size_t waste = _gc_waste + _refill_waste;
+      double waste_percent = percent_of(waste, _allocated_size);
+      log.trace("TLAB GC: thread: " PTR_FORMAT " [id: %2d]"
+                " desired: %zuK"
+                " allocated: %zuK"
+                " slow allocs: %d  refill waste: %zuB"
+                " refills: %d waste %4.1f%% gc: %dB"
+                " slow: %dB",
+                p2i(thrd), thrd->osthread()->thread_id(),
+                _desired_size*HeapWordSize/K,
+                allocated_since_last_gc/K,
+                _num_slow_allocations, _refill_waste_limit * HeapWordSize,
+                _num_refills, waste_percent,
+                _gc_waste * HeapWordSize,
+                _refill_waste * HeapWordSize);
+    }
+  }
 
   reset_statistics();
 }
@@ -147,20 +159,27 @@ void ThreadLocalAllocBuffer::record_refill_waste() {
 }
 
 void ThreadLocalAllocBuffer::resize() {
-  // Compute the next tlab size using expected allocation amount
   assert(ResizeTLAB, "Should not call this otherwise");
-  size_t alloc = (size_t)(_allocation_fraction.average() *
-                          (Universe::heap()->tlab_capacity() / HeapWordSize));
+  size_t capacity_in_words = Universe::heap()->tlab_capacity() / HeapWordSize;
+  float alloc_fraction = _allocation_fraction.average();
+  if (alloc_fraction == 0.0) {
+    // The average is still zero (no samples yet): approximate it with an even per-thread share of the global requested-size fraction.
+    const float total_frac = ThreadLocalAllocStats::total_requested_size_fraction_avg();
+    const uint num_threads = ThreadLocalAllocStats::num_allocating_threads_avg();
+    alloc_fraction = total_frac / num_threads;
+  }
+  size_t alloc = (size_t)(alloc_fraction * capacity_in_words);
   size_t new_size = alloc / _target_num_refills;
 
   new_size = clamp(new_size, min_size(), max_size());
 
   size_t aligned_new_size = align_object_size(new_size);
 
-  log_trace(gc, tlab)("TLAB new size: thread: " PTR_FORMAT " [id: %2d]"
-                      " refills %d  alloc: %8.6f desired_size: %zu -> %zu",
+  log_trace(gc, tlab)("TLAB resize: thread: " PTR_FORMAT " [id: %2d]"
+                      " alloc-fraction: %.3f desired_size: %zuK -> %zuK",
                       p2i(thread()), thread()->osthread()->thread_id(),
-                      _target_num_refills, _allocation_fraction.average(), desired_size(), aligned_new_size);
+                      alloc_fraction,
+                      desired_size() * HeapWordSize/K, aligned_new_size * HeapWordSize/K);
 
   set_desired_size(aligned_new_size);
   set_refill_waste_limit(initial_refill_waste_limit());
@@ -179,11 +198,24 @@ void ThreadLocalAllocBuffer::fill(HeapWord* start,
                                   size_t    new_size) {
   _num_refills++;
   _allocated_size += new_size;
-  print_stats("fill");
+
   assert(top <= start + new_size - alignment_reserve(), "size too small");
 
   initialize(start, top, start + new_size - alignment_reserve());
-
+  {
+    Log(gc, tlab) log;
+    if (log.is_trace()) {
+      Thread* thrd = thread();
+      log.trace("TLAB fill: thread: " PTR_FORMAT " [id: %2d]"
+                " capacity: %zuK"
+                " slow allocs: %d "
+                " refills: %d",
+                p2i(thrd), thrd->osthread()->thread_id(),
+                pointer_delta(_end, _start, sizeof(char)) / K,
+                _num_slow_allocations,
+                _num_refills);
+    }
+  }
   // Reset amount of internal fragmentation
   set_refill_waste_limit(initial_refill_waste_limit());
 }
@@ -206,13 +238,6 @@ void ThreadLocalAllocBuffer::initialize() {
 
   set_desired_size(initial_desired_size());
 
-  size_t capacity = Universe::heap()->tlab_capacity() / HeapWordSize;
-  if (capacity > 0) {
-    // Keep alloc_frac as float and not double to avoid the double to float conversion
-    float alloc_frac = desired_size() * target_num_refills() / (float)capacity;
-    _allocation_fraction.sample(alloc_frac);
-  }
-
   set_refill_waste_limit(initial_refill_waste_limit());
 
   reset_statistics();
@@ -243,11 +268,11 @@ size_t ThreadLocalAllocBuffer::initial_desired_size() {
   if (TLABSize > 0) {
     init_sz = TLABSize / HeapWordSize;
   } else {
-    // Initial size is a function of the average number of allocating threads.
-    unsigned int num_threads = ThreadLocalAllocStats::num_allocating_threads_avg();
-
-    init_sz  = (Universe::heap()->tlab_capacity() / HeapWordSize) /
-                      (num_threads * target_num_refills());
+    const size_t predicted_total_requested_size = (size_t)(ThreadLocalAllocStats::total_requested_size_fraction_avg() * Universe::heap()->tlab_capacity());
+    const uint num_threads = ThreadLocalAllocStats::num_allocating_threads_avg();
+    const size_t per_thread_requested_size = predicted_total_requested_size / num_threads;
+    const size_t tlab_size = per_thread_requested_size / _target_num_refills;
+    init_sz = tlab_size / HeapWordSize;
     init_sz = align_object_size(init_sz);
   }
   // We can't use clamp() between min_size() and max_size() here because some
@@ -258,32 +283,7 @@ size_t ThreadLocalAllocBuffer::initial_desired_size() {
   return init_sz;
 }
 
-void ThreadLocalAllocBuffer::print_stats(const char* tag) {
-  Log(gc, tlab) log;
-  if (!log.is_trace()) {
-    return;
-  }
-
-  Thread* thrd = thread();
-  size_t waste = _gc_waste + _refill_waste;
-  double waste_percent = percent_of(waste, _allocated_size);
-  size_t tlab_used  = Universe::heap()->tlab_used();
-  log.trace("TLAB: %s thread: " PTR_FORMAT " [id: %2d]"
-            " desired_size: %zuKB"
-            " slow allocs: %d  refill waste: %zuB"
-            " alloc:%8.5f %8.0fKB refills: %d waste %4.1f%% gc: %dB"
-            " slow: %dB",
-            tag, p2i(thrd), thrd->osthread()->thread_id(),
-            _desired_size / (K / HeapWordSize),
-            _num_slow_allocations, _refill_waste_limit * HeapWordSize,
-            _allocation_fraction.average(),
-            _allocation_fraction.average() * tlab_used / K,
-            _num_refills, waste_percent,
-            _gc_waste * HeapWordSize,
-            _refill_waste * HeapWordSize);
-}
-
-Thread* ThreadLocalAllocBuffer::thread() {
+Thread* ThreadLocalAllocBuffer::thread() const {
   return (Thread*)(((char*)this) + in_bytes(start_offset()) - in_bytes(Thread::tlab_start_offset()));
 }
 
@@ -314,6 +314,7 @@ PerfVariable* ThreadLocalAllocStats::_perf_max_refill_waste;
 PerfVariable* ThreadLocalAllocStats::_perf_total_num_slow_allocations;
 PerfVariable* ThreadLocalAllocStats::_perf_max_num_slow_allocations;
 AdaptiveWeightedAverage ThreadLocalAllocStats::_num_allocating_threads_avg(0);
+AdaptiveWeightedAverage ThreadLocalAllocStats::_total_requested_size_fraction(0);
 
 static PerfVariable* create_perf_variable(const char* name, PerfData::Units unit, TRAPS) {
   ResourceMark rm;
@@ -324,6 +325,9 @@ void ThreadLocalAllocStats::initialize() {
   _num_allocating_threads_avg = AdaptiveWeightedAverage(TLABAllocationWeight);
   _num_allocating_threads_avg.sample(1); // One allocating thread at startup
 
+  _total_requested_size_fraction = AdaptiveWeightedAverage(TLABAllocationWeight);
+  _total_requested_size_fraction.sample(0.10f); // Initial estimate: 10% of TLAB capacity
+
   if (UsePerfData) {
     EXCEPTION_MARK;
     _perf_num_allocating_threads     = create_perf_variable("allocThreads",   PerfData::U_None,  CHECK);
@@ -344,6 +348,7 @@ ThreadLocalAllocStats::ThreadLocalAllocStats() :
     _total_num_refills(0),
     _max_num_refills(0),
     _total_allocated_size(0),
+    _total_requested_bytes(0),
     _total_gc_waste(0),
     _max_gc_waste(0),
     _total_refill_waste(0),
@@ -355,21 +360,25 @@ unsigned int ThreadLocalAllocStats::num_allocating_threads_avg() {
   return MAX2((unsigned int)(_num_allocating_threads_avg.average() + 0.5), 1U);
 }
 
-void ThreadLocalAllocStats::update_fast_allocations(unsigned int num_refills,
-                                                    size_t allocated_size,
-                                                    size_t gc_waste,
-                                                    size_t refill_waste) {
-  _num_allocating_threads  += 1;
-  _total_num_refills       += num_refills;
-  _max_num_refills          = MAX2(_max_num_refills, num_refills);
-  _total_allocated_size    += allocated_size;
-  _total_gc_waste          += gc_waste;
-  _max_gc_waste             = MAX2(_max_gc_waste, gc_waste);
-  _total_refill_waste      += refill_waste;
-  _max_refill_waste         = MAX2(_max_refill_waste, refill_waste);
+float ThreadLocalAllocStats::total_requested_size_fraction_avg() {
+  return _total_requested_size_fraction.average();
 }
 
-void ThreadLocalAllocStats::update_num_slow_allocations(unsigned int num_slow_allocations) {
+void ThreadLocalAllocStats::update_current_thread_stats(unsigned int num_refills,
+                                                        size_t requested_bytes,
+                                                        size_t alloc_size_for_tlab,
+                                                        size_t gc_waste,
+                                                        size_t refill_waste,
+                                                        unsigned int num_slow_allocations) {
+  _num_allocating_threads     += 1;
+  _total_num_refills          += num_refills;
+  _max_num_refills             = MAX2(_max_num_refills, num_refills);
+  _total_allocated_size       += alloc_size_for_tlab;
+  _total_requested_bytes      += requested_bytes;
+  _total_gc_waste             += gc_waste;
+  _max_gc_waste                = MAX2(_max_gc_waste, gc_waste);
+  _total_refill_waste         += refill_waste;
+  _max_refill_waste            = MAX2(_max_refill_waste, refill_waste);
   _total_num_slow_allocations += num_slow_allocations;
   _max_num_slow_allocations    = MAX2(_max_num_slow_allocations, num_slow_allocations);
 }
@@ -379,6 +388,7 @@ void ThreadLocalAllocStats::update(const ThreadLocalAllocStats& other) {
   _total_num_refills          += other._total_num_refills;
   _max_num_refills             = MAX2(_max_num_refills, other._max_num_refills);
   _total_allocated_size       += other._total_allocated_size;
+  _total_requested_bytes      += other._total_requested_bytes;
   _total_gc_waste             += other._total_gc_waste;
   _max_gc_waste                = MAX2(_max_gc_waste, other._max_gc_waste);
   _total_refill_waste         += other._total_refill_waste;
@@ -392,6 +402,7 @@ void ThreadLocalAllocStats::reset() {
   _total_num_refills          = 0;
   _max_num_refills            = 0;
   _total_allocated_size       = 0;
+  _total_requested_bytes      = 0;
   _total_gc_waste             = 0;
   _max_gc_waste               = 0;
   _total_refill_waste         = 0;
@@ -401,22 +412,37 @@ void ThreadLocalAllocStats::reset() {
 }
 
 void ThreadLocalAllocStats::publish() {
-  if (_total_allocated_size == 0) {
+  if (_total_requested_bytes == 0) {
     return;
   }
 
   _num_allocating_threads_avg.sample(_num_allocating_threads);
 
+  {
+    const size_t tlab_capacity = Universe::heap()->tlab_capacity();
+    const size_t tlab_used = Universe::heap()->tlab_used();
+    if (tlab_used > 0.5 * tlab_capacity) {
+      // Clamp the divisor to at least 1 so that a zero tlab_capacity cannot cause a divide-by-zero.
+      const size_t effective_tlab_capacity = MAX2(tlab_capacity, size_t(1));
+      const float requested_size_fraction = (float)_total_requested_bytes / effective_tlab_capacity;
+      _total_requested_size_fraction.sample(MIN2(requested_size_fraction, 1.0f));
+    }
+  }
+
   const size_t waste = _total_gc_waste + _total_refill_waste;
   const double waste_percent = percent_of(waste, _total_allocated_size);
-  log_debug(gc, tlab)("TLAB totals: thrds: %d  refills: %d max: %d"
-                      " slow allocs: %d max %d waste: %4.1f%%"
-                      " gc: %zuB max: %zuB"
-                      " slow: %zuB max: %zuB",
-                      _num_allocating_threads, _total_num_refills, _max_num_refills,
+
+  const double gc_waste_pct = percent_of(_total_gc_waste, _total_allocated_size);
+  const double refill_waste_pct = percent_of(_total_refill_waste, _total_allocated_size);
+
+  log_debug(gc, tlab)("TLAB totals: thrds: %d alloc-frac: %.1f%% refills: %d max: %d"
+                      " slow allocs: %d max %d waste: %.1f%%"
+                      " gc: %zuB(%.1f%%) max: %zuB"
+                      " refill: %zuB(%.1f%%) max: %zuB",
+                      _num_allocating_threads, _total_requested_size_fraction.average() * 100, _total_num_refills, _max_num_refills,
                       _total_num_slow_allocations, _max_num_slow_allocations, waste_percent,
-                      _total_gc_waste * HeapWordSize, _max_gc_waste * HeapWordSize,
-                      _total_refill_waste * HeapWordSize, _max_refill_waste * HeapWordSize);
+                      _total_gc_waste * HeapWordSize, gc_waste_pct, _max_gc_waste * HeapWordSize,
+                      _total_refill_waste * HeapWordSize, refill_waste_pct, _max_refill_waste * HeapWordSize);
 
   if (UsePerfData) {
     _perf_num_allocating_threads      ->set_value(_num_allocating_threads);
diff --git a/src/hotspot/share/gc/shared/threadLocalAllocBuffer.hpp b/src/hotspot/share/gc/shared/threadLocalAllocBuffer.hpp
index b2bef8acd46..f69ceb00b52 100644
--- a/src/hotspot/share/gc/shared/threadLocalAllocBuffer.hpp
+++ b/src/hotspot/share/gc/shared/threadLocalAllocBuffer.hpp
@@ -52,6 +52,10 @@ private:
   HeapWord* _allocation_end;                     // end for allocations (actual TLAB end, excluding alignment_reserve)
 
   size_t    _desired_size;                       // desired size   (including alignment_reserve)
+
+  // If too many slow allocations (outside TLAB) happen, we increase
+  // _refill_waste_limit. This reduces outside-TLAB allocations at
+  // the expense of wasting more memory, i.e., the TLAB is discarded sooner.
   size_t    _refill_waste_limit;                 // hold onto tlab if free() is larger than this
   uint64_t  _allocated_before_last_gc;           // total bytes allocated up until the last gc
 
@@ -59,12 +63,24 @@ private:
   static unsigned _target_num_refills;           // expected number of refills between GCs
 
   unsigned  _num_refills;
+  // TLAB retirement is invoked in two main contexts:
+  // 1. TLAB refill:
+  //    The current TLAB is insufficient to satisfy a pending allocation
+  //    request, triggering a refill. The remaining space in the current
+  //    TLAB is treated as waste and tracked in _refill_waste.
+  // 2. Before GC:
+  //    Invoked at the start of a GC cycle to ensure heap parsability.
+  //    The unused space in the current TLAB is treated as waste and
+  //    tracked in _gc_waste.
   unsigned  _refill_waste;
   unsigned  _gc_waste;
   unsigned  _num_slow_allocations;
+
+  // Accumulated size (in HeapWords) of the TLABs installed by fill()
   size_t    _allocated_size;
 
-  AdaptiveWeightedAverage _allocation_fraction;  // fraction of eden allocated in tlabs
+  // Fraction of eden allocated by this thread, used for sizing its TLAB.
+  AdaptiveWeightedAverage _allocation_fraction;
 
   void reset_statistics();
 
@@ -77,8 +93,6 @@ private:
   void set_refill_waste_limit(size_t waste)      { _refill_waste_limit = waste;  }
 
   size_t initial_refill_waste_limit();
-
-  static int    target_num_refills()             { return _target_num_refills; }
   size_t initial_desired_size();
 
   size_t remaining();
@@ -91,9 +105,7 @@ private:
 
   void accumulate_and_reset_statistics(ThreadLocalAllocStats* stats);
 
-  void print_stats(const char* tag);
-
-  Thread* thread();
+  Thread* thread() const;
 
   // statistics
 
@@ -190,11 +202,13 @@ private:
   static PerfVariable* _perf_max_num_slow_allocations;
 
   static AdaptiveWeightedAverage _num_allocating_threads_avg;
+  static AdaptiveWeightedAverage _total_requested_size_fraction;
 
   unsigned int _num_allocating_threads;
   unsigned int _total_num_refills;
   unsigned int _max_num_refills;
   size_t       _total_allocated_size;
+  size_t       _total_requested_bytes;
   size_t       _total_gc_waste;
   size_t       _max_gc_waste;
   size_t       _total_refill_waste;
@@ -205,14 +219,16 @@ private:
 public:
   static void initialize();
   static unsigned int num_allocating_threads_avg();
+  static float total_requested_size_fraction_avg();
 
   ThreadLocalAllocStats();
 
-  void update_fast_allocations(unsigned int num_refills,
-                               size_t allocated_size,
-                               size_t gc_waste,
-                               size_t refill_waste);
-  void update_num_slow_allocations(unsigned int num_slow_allocations);
+  void update_current_thread_stats(unsigned int num_refills,
+                                   size_t requested_bytes,
+                                   size_t alloc_size_for_tlab,
+                                   size_t gc_waste,
+                                   size_t refill_waste,
+                                   unsigned int num_slow_allocations);
   void update(const ThreadLocalAllocStats& other);
 
   void reset();
diff --git a/src/hotspot/share/gc/shared/threadLocalAllocBuffer.inline.hpp b/src/hotspot/share/gc/shared/threadLocalAllocBuffer.inline.hpp
index 727467f98d0..ee1cca56b9e 100644
--- a/src/hotspot/share/gc/shared/threadLocalAllocBuffer.inline.hpp
+++ b/src/hotspot/share/gc/shared/threadLocalAllocBuffer.inline.hpp
@@ -52,10 +52,22 @@ inline HeapWord* ThreadLocalAllocBuffer::allocate(size_t size) {
 }
 
 inline size_t ThreadLocalAllocBuffer::compute_size(size_t obj_size) {
-  // Compute the size for the new TLAB.
-  // The "last" tlab may be smaller to reduce fragmentation.
   const size_t available_size = Universe::heap()->unsafe_max_tlab_alloc() / HeapWordSize;
-  size_t new_tlab_size = MIN3(available_size, desired_size() + align_object_size(obj_size), max_size());
+  size_t scaled_desired_size = desired_size();
+  if (ResizeTLAB) {
+    // Too many refills: double the desired size once per 8 refills above the target, at most 4 times (16X).
+    if (_num_refills > _target_num_refills) {
+      const uint excess = _num_refills - _target_num_refills;
+      const uint steps = MIN2(excess / 8, 4U);
+      // Saturate at max_size(); testing against the right-shifted limit keeps the left shift below from overflowing.
+      if (scaled_desired_size > (max_size() >> steps)) {
+        scaled_desired_size = max_size();
+      } else {
+        scaled_desired_size <<= steps;
+      }
+    }
+  }
+  size_t new_tlab_size = MIN3(available_size, scaled_desired_size + align_object_size(obj_size), max_size());
 
   // Make sure there's enough room for object and filler int[].
   if (new_tlab_size < compute_min_size(obj_size)) {