8371701: Add ability to set NUMA-affinity for threads

Reviewed-by: aboldtch, ayang
This commit is contained in:
Joel Sikström 2025-11-24 13:40:55 +00:00
parent 8ae4ea8114
commit 0a963b612d
6 changed files with 103 additions and 0 deletions

View File

@ -1747,6 +1747,9 @@ size_t os::pd_pretouch_memory(void* first, void* last, size_t page_size) {
return page_size;
}
// Set the NUMA-node affinity of 'thread'. No-op in this port: the empty
// body means requests to bind a thread to a NUMA node are silently ignored.
void os::numa_set_thread_affinity(Thread *thread, int node) {
}
// No-op implementation of os::numa_make_global for this port; the memory
// range [addr, addr+bytes) is left untouched.
void os::numa_make_global(char *addr, size_t bytes) {
}

View File

@ -1581,6 +1581,9 @@ size_t os::pd_pretouch_memory(void* first, void* last, size_t page_size) {
return page_size;
}
// Set the NUMA-node affinity of 'thread'. No-op in this port: the empty
// body means requests to bind a thread to a NUMA node are silently ignored.
void os::numa_set_thread_affinity(Thread *thread, int node) {
}
// No-op implementation of os::numa_make_global for this port; the memory
// range [addr, addr+bytes) is left untouched.
void os::numa_make_global(char *addr, size_t bytes) {
}

View File

@ -2995,6 +2995,10 @@ size_t os::pd_pretouch_memory(void* first, void* last, size_t page_size) {
return page_size;
}
// Bind 'thread' to NUMA node 'node' by delegating to the Linux-specific
// implementation, identified by the OS thread id.
// NOTE(review): presumably node == -1 restores the original affinity —
// confirm against Linux::numa_set_thread_affinity.
void os::numa_set_thread_affinity(Thread* thread, int node) {
Linux::numa_set_thread_affinity(thread->osthread()->thread_id(), node);
}
// Make [addr, addr+bytes) usable from all NUMA nodes by delegating to
// libnuma's interleave support via Linux::numa_interleave_memory.
void os::numa_make_global(char *addr, size_t bytes) {
Linux::numa_interleave_memory(addr, bytes);
}
@ -3177,6 +3181,8 @@ bool os::Linux::libnuma_init() {
libnuma_dlsym(handle, "numa_set_bind_policy")));
set_numa_bitmask_isbitset(CAST_TO_FN_PTR(numa_bitmask_isbitset_func_t,
libnuma_dlsym(handle, "numa_bitmask_isbitset")));
set_numa_bitmask_clearbit(CAST_TO_FN_PTR(numa_bitmask_clearbit_func_t,
libnuma_dlsym(handle, "numa_bitmask_clearbit")));
set_numa_bitmask_equal(CAST_TO_FN_PTR(numa_bitmask_equal_func_t,
libnuma_dlsym(handle, "numa_bitmask_equal")));
set_numa_distance(CAST_TO_FN_PTR(numa_distance_func_t,
@ -3191,20 +3197,32 @@ bool os::Linux::libnuma_init() {
libnuma_dlsym(handle, "numa_set_preferred")));
set_numa_get_run_node_mask(CAST_TO_FN_PTR(numa_get_run_node_mask_func_t,
libnuma_v2_dlsym(handle, "numa_get_run_node_mask")));
set_numa_sched_setaffinity(CAST_TO_FN_PTR(numa_sched_setaffinity_func_t,
libnuma_v2_dlsym(handle, "numa_sched_setaffinity")));
set_numa_allocate_cpumask(CAST_TO_FN_PTR(numa_allocate_cpumask_func_t,
libnuma_v2_dlsym(handle, "numa_allocate_cpumask")));
if (numa_available() != -1) {
set_numa_all_nodes((unsigned long*)libnuma_dlsym(handle, "numa_all_nodes"));
set_numa_all_nodes_ptr((struct bitmask **)libnuma_dlsym(handle, "numa_all_nodes_ptr"));
set_numa_nodes_ptr((struct bitmask **)libnuma_dlsym(handle, "numa_nodes_ptr"));
set_numa_all_cpus_ptr((struct bitmask **)libnuma_dlsym(handle, "numa_all_cpus_ptr"));
set_numa_interleave_bitmask(_numa_get_interleave_mask());
set_numa_membind_bitmask(_numa_get_membind());
set_numa_cpunodebind_bitmask(_numa_get_run_node_mask());
// Create an index -> node mapping, since nodes are not always consecutive
_nindex_to_node = new (mtInternal) GrowableArray<int>(0, mtInternal);
rebuild_nindex_to_node_map();
// Create a cpu -> node mapping
_cpu_to_node = new (mtInternal) GrowableArray<int>(0, mtInternal);
rebuild_cpu_to_node_map();
// Create a node -> CPUs mapping
_numa_affinity_masks = new (mtInternal) GrowableArray<struct bitmask*>(0, mtInternal);
build_numa_affinity_masks();
return true;
}
}
@ -3240,6 +3258,42 @@ size_t os::Linux::default_guard_size(os::ThreadType thr_type) {
return ((thr_type == java_thread || thr_type == compiler_thread) ? 0 : os::vm_page_size());
}
// Build one CPU affinity mask per NUMA node (stored in
// _numa_affinity_masks, indexed in node-iteration order) for use when
// binding threads to NUMA nodes.
//
// We only build the affinity masks if running libnuma v2 and every
// libnuma entry point used below was resolved, and we have the affinity
// mask of the process when it started (_numa_all_cpus_ptr).
//
// It's important that we respect any user configuration by removing the
// CPUs we're not allowed to run on from the affinity mask. For example,
// if the user runs the JVM with "numactl -C 0-1,4-5" on a machine with
// the following NUMA setup:
//   NUMA 0: CPUs 0-3, NUMA 1: CPUs 4-7
// We expect to get the following affinity masks:
//   Affinity masks: idx 0 = (0, 1), idx 1 = (4, 5)
void os::Linux::build_numa_affinity_masks() {
  // Each of these function pointers is resolved independently via dlsym
  // and may individually be null on older libnuma versions, so guard all
  // of them — not just _numa_node_to_cpus_v2 — before use.
  if (_numa_node_to_cpus_v2 == nullptr ||
      _numa_allocate_cpumask == nullptr ||
      _numa_bitmask_isbitset == nullptr ||
      _numa_bitmask_clearbit == nullptr ||
      _numa_all_cpus_ptr == nullptr) {
    return;
  }

  const int num_nodes = get_existing_num_nodes();
  const unsigned num_cpus = (unsigned)os::processor_count();

  for (int i = 0; i < num_nodes; i++) {
    struct bitmask* affinity_mask = _numa_allocate_cpumask();

    // Fill the affinity mask with all CPUs belonging to NUMA node i.
    _numa_node_to_cpus_v2(i, affinity_mask);

    // Clear the bits of all CPUs that the process is not allowed to
    // execute tasks on.
    for (unsigned j = 0; j < num_cpus; j++) {
      if (!_numa_bitmask_isbitset(_numa_all_cpus_ptr, j)) {
        _numa_bitmask_clearbit(affinity_mask, j);
      }
    }

    // NOTE(review): masks are pushed in loop-index order; if node ids are
    // not consecutive (the reason _nindex_to_node exists), index and node
    // id may diverge — confirm callers pass matching values.
    _numa_affinity_masks->push(affinity_mask);
  }
}
void os::Linux::rebuild_nindex_to_node_map() {
int highest_node_number = Linux::numa_max_node();
@ -3355,6 +3409,25 @@ int os::Linux::numa_node_to_cpus(int node, unsigned long *buffer, int bufferlen)
return -1;
}
// Pin thread 'tid' to the CPUs of NUMA node 'node', or — when 'node' is
// -1 — restore the CPU affinity the process had when the VM started.
//
// Silently does nothing unless libnuma v2's numa_sched_setaffinity was
// resolved, the process' initial CPU mask is known, and the per-node
// affinity masks have been built.
void os::Linux::numa_set_thread_affinity(pid_t tid, int node) {
  const bool can_set_affinity = _numa_sched_setaffinity != nullptr &&
                                _numa_all_cpus_ptr != nullptr &&
                                !_numa_affinity_masks->is_empty();
  if (!can_set_affinity) {
    return;
  }

  // node == -1 reverts to the start-up affinity; otherwise use the
  // prebuilt mask for that node.
  // NOTE(review): at(node) assumes 0 <= node < number of built masks —
  // confirm callers never pass a larger node id.
  struct bitmask* mask = (node == -1) ? _numa_all_cpus_ptr
                                      : _numa_affinity_masks->at(node);
  _numa_sched_setaffinity(tid, mask);
}
int os::Linux::get_node_by_cpu(int cpu_id) {
if (cpu_to_node() != nullptr && cpu_id >= 0 && cpu_id < cpu_to_node()->length()) {
return cpu_to_node()->at(cpu_id);
@ -3364,6 +3437,7 @@ int os::Linux::get_node_by_cpu(int cpu_id) {
GrowableArray<int>* os::Linux::_cpu_to_node;
GrowableArray<int>* os::Linux::_nindex_to_node;
GrowableArray<struct bitmask*>* os::Linux::_numa_affinity_masks;
os::Linux::sched_getcpu_func_t os::Linux::_sched_getcpu;
os::Linux::numa_node_to_cpus_func_t os::Linux::_numa_node_to_cpus;
os::Linux::numa_node_to_cpus_v2_func_t os::Linux::_numa_node_to_cpus_v2;
@ -3375,17 +3449,21 @@ os::Linux::numa_interleave_memory_func_t os::Linux::_numa_interleave_memory;
os::Linux::numa_interleave_memory_v2_func_t os::Linux::_numa_interleave_memory_v2;
os::Linux::numa_set_bind_policy_func_t os::Linux::_numa_set_bind_policy;
os::Linux::numa_bitmask_isbitset_func_t os::Linux::_numa_bitmask_isbitset;
os::Linux::numa_bitmask_clearbit_func_t os::Linux::_numa_bitmask_clearbit;
os::Linux::numa_bitmask_equal_func_t os::Linux::_numa_bitmask_equal;
os::Linux::numa_distance_func_t os::Linux::_numa_distance;
os::Linux::numa_get_membind_func_t os::Linux::_numa_get_membind;
os::Linux::numa_get_interleave_mask_func_t os::Linux::_numa_get_interleave_mask;
os::Linux::numa_get_run_node_mask_func_t os::Linux::_numa_get_run_node_mask;
os::Linux::numa_sched_setaffinity_func_t os::Linux::_numa_sched_setaffinity;
os::Linux::numa_allocate_cpumask_func_t os::Linux::_numa_allocate_cpumask;
os::Linux::numa_move_pages_func_t os::Linux::_numa_move_pages;
os::Linux::numa_set_preferred_func_t os::Linux::_numa_set_preferred;
os::Linux::NumaAllocationPolicy os::Linux::_current_numa_policy;
unsigned long* os::Linux::_numa_all_nodes;
struct bitmask* os::Linux::_numa_all_nodes_ptr;
struct bitmask* os::Linux::_numa_nodes_ptr;
struct bitmask* os::Linux::_numa_all_cpus_ptr;
struct bitmask* os::Linux::_numa_interleave_bitmask;
struct bitmask* os::Linux::_numa_membind_bitmask;
struct bitmask* os::Linux::_numa_cpunodebind_bitmask;

View File

@ -45,6 +45,10 @@ class os::Linux {
static GrowableArray<int>* _cpu_to_node;
static GrowableArray<int>* _nindex_to_node;
static GrowableArray<struct bitmask*>* _numa_affinity_masks;
static void build_numa_affinity_masks();
protected:
static physical_memory_size_type _physical_memory;
@ -230,8 +234,11 @@ class os::Linux {
typedef void (*numa_set_preferred_func_t)(int node);
typedef void (*numa_set_bind_policy_func_t)(int policy);
typedef int (*numa_bitmask_isbitset_func_t)(struct bitmask *bmp, unsigned int n);
typedef int (*numa_bitmask_clearbit_func_t)(struct bitmask *bmp, unsigned int n);
typedef int (*numa_bitmask_equal_func_t)(struct bitmask *bmp1, struct bitmask *bmp2);
typedef int (*numa_distance_func_t)(int node1, int node2);
typedef int (*numa_sched_setaffinity_func_t)(pid_t pid, struct bitmask* mask);
typedef struct bitmask* (*numa_allocate_cpumask_func_t)(void);
static sched_getcpu_func_t _sched_getcpu;
static numa_node_to_cpus_func_t _numa_node_to_cpus;
@ -244,6 +251,7 @@ class os::Linux {
static numa_interleave_memory_v2_func_t _numa_interleave_memory_v2;
static numa_set_bind_policy_func_t _numa_set_bind_policy;
static numa_bitmask_isbitset_func_t _numa_bitmask_isbitset;
static numa_bitmask_clearbit_func_t _numa_bitmask_clearbit;
static numa_bitmask_equal_func_t _numa_bitmask_equal;
static numa_distance_func_t _numa_distance;
static numa_get_membind_func_t _numa_get_membind;
@ -251,9 +259,12 @@ class os::Linux {
static numa_get_interleave_mask_func_t _numa_get_interleave_mask;
static numa_move_pages_func_t _numa_move_pages;
static numa_set_preferred_func_t _numa_set_preferred;
static numa_sched_setaffinity_func_t _numa_sched_setaffinity;
static numa_allocate_cpumask_func_t _numa_allocate_cpumask;
static unsigned long* _numa_all_nodes;
static struct bitmask* _numa_all_nodes_ptr;
static struct bitmask* _numa_nodes_ptr;
static struct bitmask* _numa_all_cpus_ptr;
static struct bitmask* _numa_interleave_bitmask;
static struct bitmask* _numa_membind_bitmask;
static struct bitmask* _numa_cpunodebind_bitmask;
@ -269,6 +280,7 @@ class os::Linux {
static void set_numa_interleave_memory_v2(numa_interleave_memory_v2_func_t func) { _numa_interleave_memory_v2 = func; }
static void set_numa_set_bind_policy(numa_set_bind_policy_func_t func) { _numa_set_bind_policy = func; }
static void set_numa_bitmask_isbitset(numa_bitmask_isbitset_func_t func) { _numa_bitmask_isbitset = func; }
static void set_numa_bitmask_clearbit(numa_bitmask_clearbit_func_t func) { _numa_bitmask_clearbit = func; }
static void set_numa_bitmask_equal(numa_bitmask_equal_func_t func) { _numa_bitmask_equal = func; }
static void set_numa_distance(numa_distance_func_t func) { _numa_distance = func; }
static void set_numa_get_membind(numa_get_membind_func_t func) { _numa_get_membind = func; }
@ -279,9 +291,12 @@ class os::Linux {
static void set_numa_all_nodes(unsigned long* ptr) { _numa_all_nodes = ptr; }
static void set_numa_all_nodes_ptr(struct bitmask **ptr) { _numa_all_nodes_ptr = (ptr == nullptr ? nullptr : *ptr); }
static void set_numa_nodes_ptr(struct bitmask **ptr) { _numa_nodes_ptr = (ptr == nullptr ? nullptr : *ptr); }
static void set_numa_all_cpus_ptr(struct bitmask **ptr) { _numa_all_cpus_ptr = (ptr == nullptr ? nullptr : *ptr); }
static void set_numa_interleave_bitmask(struct bitmask* ptr) { _numa_interleave_bitmask = ptr ; }
static void set_numa_membind_bitmask(struct bitmask* ptr) { _numa_membind_bitmask = ptr ; }
static void set_numa_cpunodebind_bitmask(struct bitmask* ptr) { _numa_cpunodebind_bitmask = ptr ; }
static void set_numa_sched_setaffinity(numa_sched_setaffinity_func_t func) { _numa_sched_setaffinity = func; }
static void set_numa_allocate_cpumask(numa_allocate_cpumask_func_t func) { _numa_allocate_cpumask = func; }
static int sched_getcpu_syscall(void);
enum NumaAllocationPolicy{
@ -292,6 +307,8 @@ class os::Linux {
static NumaAllocationPolicy _current_numa_policy;
public:
static void numa_set_thread_affinity(pid_t tid, int node);
static int sched_getcpu() { return _sched_getcpu != nullptr ? _sched_getcpu() : -1; }
static int numa_node_to_cpus(int node, unsigned long *buffer, int bufferlen);
static int numa_max_node() { return _numa_max_node != nullptr ? _numa_max_node() : -1; }

View File

@ -3752,6 +3752,7 @@ size_t os::pd_pretouch_memory(void* first, void* last, size_t page_size) {
return page_size;
}
// Per-thread NUMA binding is not implemented on this platform (no-op).
void os::numa_set_thread_affinity(Thread *thread, int node) { }
// No-op: the memory range is left untouched on this platform.
void os::numa_make_global(char *addr, size_t bytes) { }
// No-op: the NUMA-local placement hint is ignored on this platform.
void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint) { }
// Number of NUMA groups; clamped to at least 1 even if the node list is empty.
size_t os::numa_get_groups_num() { return MAX2(numa_node_list_holder.get_count(), 1); }

View File

@ -534,6 +534,7 @@ class os: AllStatic {
static void realign_memory(char *addr, size_t bytes, size_t alignment_hint);
// NUMA-specific interface
static void numa_set_thread_affinity(Thread* thread, int node);
static void numa_make_local(char *addr, size_t bytes, int lgrp_hint);
static void numa_make_global(char *addr, size_t bytes);
static size_t numa_get_groups_num();