8373106: JFR suspend/resume deadlock on macOS in pthreads library

Reviewed-by: egahlin
This commit is contained in:
Markus Grönlund 2026-01-13 19:40:20 +00:00
parent f23752a75e
commit b070367bdf

View File

@ -232,41 +232,50 @@ void JfrSamplerThread::task_stacktrace(JfrSampleRequestType type, JavaThread** l
JavaThread* start = nullptr;
elapsedTimer sample_time;
sample_time.start();
ThreadsListHandle tlh;
// Resolve a sample session relative start position index into the thread list array.
// In cases where the last sampled thread is null or not-null but stale, find_index_of_JavaThread() returns -1.
_cur_index = tlh.list()->find_index_of_JavaThread(*last_thread);
JavaThread* current = _cur_index != -1 ? *last_thread : nullptr;
{
/*
* Take the Threads_lock for three purposes:
*
* 1) Avoid sampling right through a safepoint,
* which could result in touching oops in case of virtual threads.
* 2) Prevent JFR from issuing an epoch rotation while the sampler thread
* is actively processing a thread in state native, as both threads are outside the safepoint protocol.
* 3) Some operating systems (BSD / Mac) require a process lock when sending a signal with pthread_kill.
* Holding the Threads_lock prevents a JavaThread from calling os::create_thread(), which also takes the process lock.
* In a sense, we provide a coarse signal mask, so we can always send the resume signal.
*/
MutexLocker tlock(Threads_lock);
ThreadsListHandle tlh;
// Resolve a sample session relative start position index into the thread list array.
// In cases where the last sampled thread is null or not-null but stale, find_index_of_JavaThread() returns -1.
_cur_index = tlh.list()->find_index_of_JavaThread(*last_thread);
JavaThread* current = _cur_index != -1 ? *last_thread : nullptr;
while (num_samples < sample_limit) {
current = next_thread(tlh.list(), start, current);
if (current == nullptr) {
break;
}
if (is_excluded(current)) {
continue;
}
if (start == nullptr) {
start = current; // remember the thread where we started to attempt sampling
}
bool success;
if (JAVA_SAMPLE == type) {
success = sample_java_thread(current);
} else {
assert(type == NATIVE_SAMPLE, "invariant");
success = sample_native_thread(current);
}
if (success) {
num_samples++;
}
if (SafepointSynchronize::is_at_safepoint()) {
// At a safepoint there is nothing more to sample, so stop:
// for _thread_in_native, we cannot get the Threads_lock, and for _thread_in_Java, there are no such threads.
break;
while (num_samples < sample_limit) {
current = next_thread(tlh.list(), start, current);
if (current == nullptr) {
break;
}
if (is_excluded(current)) {
continue;
}
if (start == nullptr) {
start = current; // remember the thread where we started to attempt sampling
}
bool success;
if (JAVA_SAMPLE == type) {
success = sample_java_thread(current);
} else {
assert(type == NATIVE_SAMPLE, "invariant");
success = sample_native_thread(current);
}
if (success) {
num_samples++;
}
}
*last_thread = current; // remember the thread we last attempted to sample
}
*last_thread = current; // remember the thread we last attempted to sample
sample_time.stop();
log_trace(jfr)("JFR thread sampling done in %3.7f secs with %d java %d native samples",
sample_time.seconds(), type == JAVA_SAMPLE ? num_samples : 0, type == NATIVE_SAMPLE ? num_samples : 0);
@ -297,6 +306,7 @@ class OSThreadSampler : public SuspendedThreadTask {
// Sampling a thread in state _thread_in_Java
// involves a platform-specific thread suspend and CPU context retrieval.
bool JfrSamplerThread::sample_java_thread(JavaThread* jt) {
assert_lock_strong(Threads_lock);
if (jt->thread_state() != _thread_in_Java) {
return false;
}
@ -328,6 +338,7 @@ static JfrSamplerThread* _sampler_thread = nullptr;
// without thread suspension and CPU context retrieval,
// if we carefully order the loads of the thread state.
bool JfrSamplerThread::sample_native_thread(JavaThread* jt) {
assert_lock_strong(Threads_lock);
if (jt->thread_state() != _thread_in_native) {
return false;
}
@ -343,22 +354,6 @@ bool JfrSamplerThread::sample_native_thread(JavaThread* jt) {
SafepointMechanism::arm_local_poll_release(jt);
// Take the Threads_lock for two purposes:
// 1) Avoid sampling through a safepoint which could result
// in touching oops in case of virtual threads.
// 2) Prevent JFR from issuing an epoch rotation while the sampler thread
// is actively processing a thread in native, as both threads are now
// outside the safepoint protocol.
// OrderAccess::fence() as part of acquiring the lock prevents loads from floating up.
JfrMutexTryLock lock(Threads_lock);
if (!lock.acquired()) {
// Remove the native sample request and release the potentially waiting thread.
JfrSampleMonitor jsm(tl);
return false;
}
// Separate the arming of the poll (above) from the reading of JavaThread state (below).
if (UseSystemMemoryBarrier) {
SystemMemoryBarrier::emit();
@ -367,7 +362,6 @@ bool JfrSamplerThread::sample_native_thread(JavaThread* jt) {
}
if (jt->thread_state() != _thread_in_native || !jt->has_last_Java_frame()) {
assert_lock_strong(Threads_lock);
JfrSampleMonitor jsm(tl);
if (jsm.is_waiting()) {
// The thread has already returned from native,