Skip to content

Commit ffb2f89

Browse files
committed
src: fix deadlock in NodePlatform::DrainTasks
1 parent 68d7b6f commit ffb2f89

3 files changed

Lines changed: 56 additions & 19 deletions

File tree

src/node_mutex.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,8 @@ class ConditionVariableBase {
139139
inline void Broadcast(const ScopedLock&);
140140
inline void Signal(const ScopedLock&);
141141
inline void Wait(const ScopedLock& scoped_lock);
142+
// Returns 0 if signaled, UV_ETIMEDOUT on timeout.
143+
inline int TimedWait(const ScopedLock& scoped_lock, uint64_t timeout);
142144

143145
ConditionVariableBase(const ConditionVariableBase&) = delete;
144146
ConditionVariableBase& operator=(const ConditionVariableBase&) = delete;
@@ -175,6 +177,11 @@ struct LibuvMutexTraits {
175177
uv_cond_wait(cond, mutex);
176178
}
177179

180+
static inline int cond_timedwait(CondT* cond, MutexT* mutex,
181+
uint64_t timeout) {
182+
return uv_cond_timedwait(cond, mutex, timeout);
183+
}
184+
178185
static inline void mutex_destroy(MutexT* mutex) {
179186
uv_mutex_destroy(mutex);
180187
}
@@ -249,6 +256,12 @@ void ConditionVariableBase<Traits>::Wait(const ScopedLock& scoped_lock) {
249256
Traits::cond_wait(&cond_, &scoped_lock.mutex_.mutex_);
250257
}
251258

259+
template <typename Traits>
260+
int ConditionVariableBase<Traits>::TimedWait(const ScopedLock& scoped_lock,
261+
uint64_t timeout) {
262+
return Traits::cond_timedwait(&cond_, &scoped_lock.mutex_.mutex_, timeout);
263+
}
264+
252265
template <typename Traits>
253266
MutexBase<Traits>::MutexBase() {
254267
CHECK_EQ(0, Traits::mutex_init(&mutex_));

src/node_platform.cc

Lines changed: 38 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,14 @@ void WorkerThreadsTaskRunner::BlockingDrain() {
300300
pending_worker_tasks_.Lock().BlockingDrain();
301301
}
302302

303+
bool WorkerThreadsTaskRunner::TimedBlockingDrain(uint64_t timeout_in_ns) {
304+
return pending_worker_tasks_.Lock().TimedBlockingDrain(timeout_in_ns);
305+
}
306+
307+
bool WorkerThreadsTaskRunner::HasOutstandingTasks() {
308+
return pending_worker_tasks_.Lock().HasOutstandingTasks();
309+
}
310+
303311
void WorkerThreadsTaskRunner::Shutdown() {
304312
pending_worker_tasks_.Lock().Stop();
305313
delayed_task_scheduler_->Stop();
@@ -581,26 +589,23 @@ void NodePlatform::DrainTasks(Isolate* isolate) {
581589
if (!per_isolate) return;
582590

583591
do {
584-
// FIXME(54918): we should not be blocking on the worker tasks on the
585-
// main thread in one go. Doing so leads to two problems:
586-
// 1. If any of the worker tasks post another foreground task and wait
587-
// for it to complete, and that foreground task is posted right after
588-
// we flush the foreground task queue and before the foreground thread
589-
// goes into sleep, we'll never be able to wake up to execute that
590-
// foreground task and in turn the worker task will never complete, and
591-
// we have a deadlock.
592-
// 2. Worker tasks can be posted from any thread, not necessarily associated
593-
// with the current isolate, and we can be blocking on a worker task that
594-
// is associated with a completely unrelated isolate in the event loop.
595-
// This is suboptimal.
592+
// Worker tasks (e.g. V8 JIT compilation) may post foreground tasks and
593+
// wait for their completion. If we block indefinitely on worker tasks
594+
// without flushing foreground tasks, those worker tasks can never finish,
595+
// causing a deadlock (see https://github.com/nodejs/node/issues/54918).
596596
//
597-
// However, not blocking on the worker tasks at all can lead to loss of some
598-
// critical user-blocking worker tasks e.g. wasm async compilation tasks,
599-
// which should block the main thread until they are completed, as the
600-
// documentation suggets. As a compromise, we currently only block on
601-
// user-blocking tasks to reduce the chance of deadlocks while making sure
602-
// that criticl user-blocking tasks are not lost.
603-
worker_thread_task_runner_->BlockingDrain();
597+
// To avoid this, we interleave: wait briefly for worker tasks to complete,
598+
// then flush any foreground tasks that may have been posted, and repeat.
599+
// This ensures foreground tasks posted by workers get a chance to run.
600+
while (worker_thread_task_runner_->HasOutstandingTasks()) {
601+
// Wait up to 1ms for outstanding worker tasks to complete.
602+
constexpr uint64_t kDrainTimeoutNs = 1'000'000; // 1ms
603+
if (worker_thread_task_runner_->TimedBlockingDrain(kDrainTimeoutNs)) {
604+
break; // All outstanding tasks drained.
605+
}
606+
// Flush foreground tasks that worker tasks may be waiting on.
607+
per_isolate->FlushForegroundTasksInternal();
608+
}
604609
} while (per_isolate->FlushForegroundTasksInternal());
605610
}
606611

@@ -832,6 +837,20 @@ void TaskQueue<T>::Locked::BlockingDrain() {
832837
}
833838
}
834839

840+
template <class T>
841+
bool TaskQueue<T>::Locked::TimedBlockingDrain(uint64_t timeout_in_ns) {
842+
while (queue_->outstanding_tasks_ > 0) {
843+
int r = queue_->outstanding_tasks_drained_.TimedWait(lock_, timeout_in_ns);
844+
if (r != 0) return false; // Timed out, still has outstanding tasks.
845+
}
846+
return true;
847+
}
848+
849+
template <class T>
850+
bool TaskQueue<T>::Locked::HasOutstandingTasks() {
851+
return queue_->outstanding_tasks_ > 0;
852+
}
853+
835854
template <class T>
836855
void TaskQueue<T>::Locked::Stop() {
837856
queue_->stopped_ = true;

src/node_platform.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,9 @@ class TaskQueue {
5353
std::unique_ptr<T> BlockingPop();
5454
void NotifyOfOutstandingCompletion();
5555
void BlockingDrain();
56+
// Returns true if all outstanding tasks are drained, false on timeout.
57+
bool TimedBlockingDrain(uint64_t timeout_in_ns);
58+
bool HasOutstandingTasks();
5659
void Stop();
5760
PriorityQueue PopAll();
5861

@@ -196,6 +199,8 @@ class WorkerThreadsTaskRunner {
196199
double delay_in_seconds);
197200

198201
void BlockingDrain();
202+
bool TimedBlockingDrain(uint64_t timeout_in_ns);
203+
bool HasOutstandingTasks();
199204
void Shutdown();
200205

201206
int NumberOfWorkerThreads() const;

0 commit comments

Comments
 (0)