From 0a963b612d0fcbfe002340098341862096650945 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joel=20Sikstr=C3=B6m?= Date: Mon, 24 Nov 2025 13:40:55 +0000 Subject: [PATCH] 8371701: Add ability to set NUMA-affinity for threads Reviewed-by: aboldtch, ayang --- src/hotspot/os/aix/os_aix.cpp | 3 ++ src/hotspot/os/bsd/os_bsd.cpp | 3 ++ src/hotspot/os/linux/os_linux.cpp | 78 +++++++++++++++++++++++++++ src/hotspot/os/linux/os_linux.hpp | 17 ++++++ src/hotspot/os/windows/os_windows.cpp | 1 + src/hotspot/share/runtime/os.hpp | 1 + 6 files changed, 103 insertions(+) diff --git a/src/hotspot/os/aix/os_aix.cpp b/src/hotspot/os/aix/os_aix.cpp index 5f81912c0d6..48bd5e05816 100644 --- a/src/hotspot/os/aix/os_aix.cpp +++ b/src/hotspot/os/aix/os_aix.cpp @@ -1747,6 +1747,9 @@ size_t os::pd_pretouch_memory(void* first, void* last, size_t page_size) { return page_size; } +void os::numa_set_thread_affinity(Thread *thread, int node) { +} + void os::numa_make_global(char *addr, size_t bytes) { } diff --git a/src/hotspot/os/bsd/os_bsd.cpp b/src/hotspot/os/bsd/os_bsd.cpp index 3e5fa8b84e1..0b37cb100f6 100644 --- a/src/hotspot/os/bsd/os_bsd.cpp +++ b/src/hotspot/os/bsd/os_bsd.cpp @@ -1581,6 +1581,9 @@ size_t os::pd_pretouch_memory(void* first, void* last, size_t page_size) { return page_size; } +void os::numa_set_thread_affinity(Thread *thread, int node) { +} + void os::numa_make_global(char *addr, size_t bytes) { } diff --git a/src/hotspot/os/linux/os_linux.cpp b/src/hotspot/os/linux/os_linux.cpp index b9f8307673c..456927d2a64 100644 --- a/src/hotspot/os/linux/os_linux.cpp +++ b/src/hotspot/os/linux/os_linux.cpp @@ -2995,6 +2995,10 @@ size_t os::pd_pretouch_memory(void* first, void* last, size_t page_size) { return page_size; } +void os::numa_set_thread_affinity(Thread* thread, int node) { + Linux::numa_set_thread_affinity(thread->osthread()->thread_id(), node); +} + void os::numa_make_global(char *addr, size_t bytes) { Linux::numa_interleave_memory(addr, bytes); } @@ -3177,6 +3181,8 @@ bool os::Linux::libnuma_init() { libnuma_dlsym(handle, "numa_set_bind_policy"))); set_numa_bitmask_isbitset(CAST_TO_FN_PTR(numa_bitmask_isbitset_func_t, libnuma_dlsym(handle, "numa_bitmask_isbitset"))); + set_numa_bitmask_clearbit(CAST_TO_FN_PTR(numa_bitmask_clearbit_func_t, + libnuma_dlsym(handle, "numa_bitmask_clearbit"))); set_numa_bitmask_equal(CAST_TO_FN_PTR(numa_bitmask_equal_func_t, libnuma_dlsym(handle, "numa_bitmask_equal"))); set_numa_distance(CAST_TO_FN_PTR(numa_distance_func_t, @@ -3191,20 +3197,32 @@ bool os::Linux::libnuma_init() { libnuma_dlsym(handle, "numa_set_preferred"))); set_numa_get_run_node_mask(CAST_TO_FN_PTR(numa_get_run_node_mask_func_t, libnuma_v2_dlsym(handle, "numa_get_run_node_mask"))); + set_numa_sched_setaffinity(CAST_TO_FN_PTR(numa_sched_setaffinity_func_t, + libnuma_v2_dlsym(handle, "numa_sched_setaffinity"))); + set_numa_allocate_cpumask(CAST_TO_FN_PTR(numa_allocate_cpumask_func_t, + libnuma_v2_dlsym(handle, "numa_allocate_cpumask"))); if (numa_available() != -1) { set_numa_all_nodes((unsigned long*)libnuma_dlsym(handle, "numa_all_nodes")); set_numa_all_nodes_ptr((struct bitmask **)libnuma_dlsym(handle, "numa_all_nodes_ptr")); set_numa_nodes_ptr((struct bitmask **)libnuma_dlsym(handle, "numa_nodes_ptr")); + set_numa_all_cpus_ptr((struct bitmask **)libnuma_dlsym(handle, "numa_all_cpus_ptr")); set_numa_interleave_bitmask(_numa_get_interleave_mask()); set_numa_membind_bitmask(_numa_get_membind()); set_numa_cpunodebind_bitmask(_numa_get_run_node_mask()); + // Create an index -> node mapping, since nodes are not always consecutive _nindex_to_node = new (mtInternal) GrowableArray(0, mtInternal); rebuild_nindex_to_node_map(); + // Create a cpu -> node mapping _cpu_to_node = new (mtInternal) GrowableArray(0, mtInternal); rebuild_cpu_to_node_map(); + + // Create a node -> CPUs mapping + _numa_affinity_masks = new (mtInternal) GrowableArray(0, mtInternal); + build_numa_affinity_masks(); + return true; } } @@ -3240,6 +3258,42 @@ size_t os::Linux::default_guard_size(os::ThreadType thr_type) { return ((thr_type == java_thread || thr_type == compiler_thread) ? 0 : os::vm_page_size()); } +void os::Linux::build_numa_affinity_masks() { + // We only build the affinity masks if running libnuma v2 (_numa_node_to_cpus_v2 + // is available) and we have the affinity mask of the process when it started. + if (_numa_node_to_cpus_v2 == nullptr || _numa_all_cpus_ptr == nullptr) { + return; + } + + // It's important that we respect any user configuration by removing the + // CPUs we're not allowed to run on from the affinity mask. For example, + // if the user runs the JVM with "numactl -C 0-1,4-5" on a machine with + // the following NUMA setup: + // NUMA 0: CPUs 0-3, NUMA 1: CPUs 4-7 + // We expect to get the following affinity masks: + // Affinity masks: idx 0 = (0, 1), idx 1 = (4, 5) + + const int num_nodes = get_existing_num_nodes(); + const unsigned num_cpus = (unsigned)os::processor_count(); + + for (int i = 0; i < num_nodes; i++) { + struct bitmask* affinity_mask = _numa_allocate_cpumask(); + + // Fill the affinity mask with all CPUs belonging to NUMA node i + _numa_node_to_cpus_v2(i, affinity_mask); + + // Clear the bits of all CPUs that the process is not allowed to + // execute tasks on + for (unsigned j = 0; j < num_cpus; j++) { + if (!_numa_bitmask_isbitset(_numa_all_cpus_ptr, j)) { + _numa_bitmask_clearbit(affinity_mask, j); + } + } + + _numa_affinity_masks->push(affinity_mask); + } +} + void os::Linux::rebuild_nindex_to_node_map() { int highest_node_number = Linux::numa_max_node(); @@ -3355,6 +3409,25 @@ int os::Linux::numa_node_to_cpus(int node, unsigned long *buffer, int bufferlen) return -1; } +void os::Linux::numa_set_thread_affinity(pid_t tid, int node) { + // We only set affinity if running libnuma v2 (_numa_sched_setaffinity + // is available) and we have all affinity mask + if (_numa_sched_setaffinity == nullptr || + _numa_all_cpus_ptr == nullptr || + _numa_affinity_masks->is_empty()) { + return; + } + + if (node == -1) { + // If the node is -1, the affinity is reverted to the original affinity + // of the thread when the VM was started + _numa_sched_setaffinity(tid, _numa_all_cpus_ptr); + } else { + // Normal case, set the affinity to the corresponding affinity mask + _numa_sched_setaffinity(tid, _numa_affinity_masks->at(node)); + } +} + int os::Linux::get_node_by_cpu(int cpu_id) { if (cpu_to_node() != nullptr && cpu_id >= 0 && cpu_id < cpu_to_node()->length()) { return cpu_to_node()->at(cpu_id); @@ -3364,6 +3437,7 @@ int os::Linux::get_node_by_cpu(int cpu_id) { GrowableArray* os::Linux::_cpu_to_node; GrowableArray* os::Linux::_nindex_to_node; +GrowableArray* os::Linux::_numa_affinity_masks; os::Linux::sched_getcpu_func_t os::Linux::_sched_getcpu; os::Linux::numa_node_to_cpus_func_t os::Linux::_numa_node_to_cpus; os::Linux::numa_node_to_cpus_v2_func_t os::Linux::_numa_node_to_cpus_v2; @@ -3375,17 +3449,21 @@ os::Linux::numa_interleave_memory_func_t os::Linux::_numa_interleave_memory; os::Linux::numa_interleave_memory_v2_func_t os::Linux::_numa_interleave_memory_v2; os::Linux::numa_set_bind_policy_func_t os::Linux::_numa_set_bind_policy; os::Linux::numa_bitmask_isbitset_func_t os::Linux::_numa_bitmask_isbitset; +os::Linux::numa_bitmask_clearbit_func_t os::Linux::_numa_bitmask_clearbit; os::Linux::numa_bitmask_equal_func_t os::Linux::_numa_bitmask_equal; os::Linux::numa_distance_func_t os::Linux::_numa_distance; os::Linux::numa_get_membind_func_t os::Linux::_numa_get_membind; os::Linux::numa_get_interleave_mask_func_t os::Linux::_numa_get_interleave_mask; os::Linux::numa_get_run_node_mask_func_t os::Linux::_numa_get_run_node_mask; +os::Linux::numa_sched_setaffinity_func_t os::Linux::_numa_sched_setaffinity; +os::Linux::numa_allocate_cpumask_func_t os::Linux::_numa_allocate_cpumask; os::Linux::numa_move_pages_func_t os::Linux::_numa_move_pages; os::Linux::numa_set_preferred_func_t os::Linux::_numa_set_preferred; os::Linux::NumaAllocationPolicy os::Linux::_current_numa_policy; unsigned long* os::Linux::_numa_all_nodes; struct bitmask* os::Linux::_numa_all_nodes_ptr; struct bitmask* os::Linux::_numa_nodes_ptr; +struct bitmask* os::Linux::_numa_all_cpus_ptr; struct bitmask* os::Linux::_numa_interleave_bitmask; struct bitmask* os::Linux::_numa_membind_bitmask; struct bitmask* os::Linux::_numa_cpunodebind_bitmask; diff --git a/src/hotspot/os/linux/os_linux.hpp b/src/hotspot/os/linux/os_linux.hpp index df96a17d8e9..9c0b6723b38 100644 --- a/src/hotspot/os/linux/os_linux.hpp +++ b/src/hotspot/os/linux/os_linux.hpp @@ -45,6 +45,10 @@ class os::Linux { static GrowableArray* _cpu_to_node; static GrowableArray* _nindex_to_node; + static GrowableArray* _numa_affinity_masks; + + static void build_numa_affinity_masks(); + protected: static physical_memory_size_type _physical_memory; @@ -230,8 +234,11 @@ class os::Linux { typedef void (*numa_set_preferred_func_t)(int node); typedef void (*numa_set_bind_policy_func_t)(int policy); typedef int (*numa_bitmask_isbitset_func_t)(struct bitmask *bmp, unsigned int n); + typedef int (*numa_bitmask_clearbit_func_t)(struct bitmask *bmp, unsigned int n); typedef int (*numa_bitmask_equal_func_t)(struct bitmask *bmp1, struct bitmask *bmp2); typedef int (*numa_distance_func_t)(int node1, int node2); + typedef int (*numa_sched_setaffinity_func_t)(pid_t pid, struct bitmask* mask); + typedef struct bitmask* (*numa_allocate_cpumask_func_t)(void); static sched_getcpu_func_t _sched_getcpu; static numa_node_to_cpus_func_t _numa_node_to_cpus; @@ -244,6 +251,7 @@ class os::Linux { static numa_interleave_memory_v2_func_t _numa_interleave_memory_v2; static numa_set_bind_policy_func_t _numa_set_bind_policy; static numa_bitmask_isbitset_func_t _numa_bitmask_isbitset; + static numa_bitmask_clearbit_func_t _numa_bitmask_clearbit; static numa_bitmask_equal_func_t _numa_bitmask_equal; static numa_distance_func_t _numa_distance; static numa_get_membind_func_t _numa_get_membind; @@ -251,9 +259,12 @@ class os::Linux { static numa_get_interleave_mask_func_t _numa_get_interleave_mask; static numa_move_pages_func_t _numa_move_pages; static numa_set_preferred_func_t _numa_set_preferred; + static numa_sched_setaffinity_func_t _numa_sched_setaffinity; + static numa_allocate_cpumask_func_t _numa_allocate_cpumask; static unsigned long* _numa_all_nodes; static struct bitmask* _numa_all_nodes_ptr; static struct bitmask* _numa_nodes_ptr; + static struct bitmask* _numa_all_cpus_ptr; static struct bitmask* _numa_interleave_bitmask; static struct bitmask* _numa_membind_bitmask; static struct bitmask* _numa_cpunodebind_bitmask; @@ -269,6 +280,7 @@ class os::Linux { static void set_numa_interleave_memory_v2(numa_interleave_memory_v2_func_t func) { _numa_interleave_memory_v2 = func; } static void set_numa_set_bind_policy(numa_set_bind_policy_func_t func) { _numa_set_bind_policy = func; } static void set_numa_bitmask_isbitset(numa_bitmask_isbitset_func_t func) { _numa_bitmask_isbitset = func; } + static void set_numa_bitmask_clearbit(numa_bitmask_clearbit_func_t func) { _numa_bitmask_clearbit = func; } static void set_numa_bitmask_equal(numa_bitmask_equal_func_t func) { _numa_bitmask_equal = func; } static void set_numa_distance(numa_distance_func_t func) { _numa_distance = func; } static void set_numa_get_membind(numa_get_membind_func_t func) { _numa_get_membind = func; } @@ -279,9 +291,12 @@ class os::Linux { static void set_numa_all_nodes(unsigned long* ptr) { _numa_all_nodes = ptr; } static void set_numa_all_nodes_ptr(struct bitmask **ptr) { _numa_all_nodes_ptr = (ptr == nullptr ? nullptr : *ptr); } static void set_numa_nodes_ptr(struct bitmask **ptr) { _numa_nodes_ptr = (ptr == nullptr ? nullptr : *ptr); } + static void set_numa_all_cpus_ptr(struct bitmask **ptr) { _numa_all_cpus_ptr = (ptr == nullptr ? nullptr : *ptr); } static void set_numa_interleave_bitmask(struct bitmask* ptr) { _numa_interleave_bitmask = ptr ; } static void set_numa_membind_bitmask(struct bitmask* ptr) { _numa_membind_bitmask = ptr ; } static void set_numa_cpunodebind_bitmask(struct bitmask* ptr) { _numa_cpunodebind_bitmask = ptr ; } + static void set_numa_sched_setaffinity(numa_sched_setaffinity_func_t func) { _numa_sched_setaffinity = func; } + static void set_numa_allocate_cpumask(numa_allocate_cpumask_func_t func) { _numa_allocate_cpumask = func; } static int sched_getcpu_syscall(void); enum NumaAllocationPolicy{ @@ -292,6 +307,8 @@ class os::Linux { static NumaAllocationPolicy _current_numa_policy; public: + static void numa_set_thread_affinity(pid_t tid, int node); + static int sched_getcpu() { return _sched_getcpu != nullptr ? _sched_getcpu() : -1; } static int numa_node_to_cpus(int node, unsigned long *buffer, int bufferlen); static int numa_max_node() { return _numa_max_node != nullptr ? _numa_max_node() : -1; } diff --git a/src/hotspot/os/windows/os_windows.cpp b/src/hotspot/os/windows/os_windows.cpp index ce2baeaf46c..28b20f3cdaf 100644 --- a/src/hotspot/os/windows/os_windows.cpp +++ b/src/hotspot/os/windows/os_windows.cpp @@ -3752,6 +3752,7 @@ size_t os::pd_pretouch_memory(void* first, void* last, size_t page_size) { return page_size; } +void os::numa_set_thread_affinity(Thread *thread, int node) { } void os::numa_make_global(char *addr, size_t bytes) { } void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint) { } size_t os::numa_get_groups_num() { return MAX2(numa_node_list_holder.get_count(), 1); } diff --git a/src/hotspot/share/runtime/os.hpp b/src/hotspot/share/runtime/os.hpp index e008f29eecc..b65bf643cbf 100644 --- a/src/hotspot/share/runtime/os.hpp +++ b/src/hotspot/share/runtime/os.hpp @@ -534,6 +534,7 @@ class os: AllStatic { static void realign_memory(char *addr, size_t bytes, size_t alignment_hint); // NUMA-specific interface + static void numa_set_thread_affinity(Thread* thread, int node); static void numa_make_local(char *addr, size_t bytes, int lgrp_hint); static void numa_make_global(char *addr, size_t bytes); static size_t numa_get_groups_num();