From f53fd7aed630b488646e958eaf50cea54d95faeb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bern=C3=A1t=20G=C3=A1bor?= Date: Fri, 6 Mar 2026 16:34:06 -0800 Subject: [PATCH 1/2] =?UTF-8?q?=F0=9F=90=9B=20fix(run):=20break=20deadlock?= =?UTF-8?q?=20in=20execution=20interrupt=20chain?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On Windows CI (~1/40 runs), a subprocess can hang indefinitely during environment setup — either in virtualenv's interpreter discovery or during package installation/provisioning. This created an unbreakable deadlock: thread.join() blocked the main thread so signals couldn't be delivered, as_completed() blocked the interrupt thread so it couldn't check the interrupt event, and executor.shutdown(wait=True) prevented done.set() from ever firing. Replace the blocking as_completed() with a polling _next_completed() that checks the interrupt event every second, make the interrupt thread a daemon so the process can exit if it's stuck, use timeout loops for thread.join() so signals can be delivered, and skip waiting for stuck workers on shutdown when interrupted. This affected 18 flaky timeouts across 9 different tests in the last 30 days (89% Windows, 11% macOS). --- docs/changelog/3869.bugfix.rst | 2 + src/tox/session/cmd/run/common.py | 76 ++++++++++++++++++++----------- 2 files changed, 51 insertions(+), 27 deletions(-) create mode 100644 docs/changelog/3869.bugfix.rst diff --git a/docs/changelog/3869.bugfix.rst b/docs/changelog/3869.bugfix.rst new file mode 100644 index 0000000000..ab1aa80aac --- /dev/null +++ b/docs/changelog/3869.bugfix.rst @@ -0,0 +1,2 @@ +Break deadlock in execution interrupt chain that caused ~18 flaky timeout failures across 9 tests on Windows/macOS CI +- by :user:`gaborbernat`. diff --git a/src/tox/session/cmd/run/common.py b/src/tox/session/cmd/run/common.py index 84f4f9f440..311997c257 100644 --- a/src/tox/session/cmd/run/common.py +++ b/src/tox/session/cmd/run/common.py @@ -6,7 +6,8 @@ import os import time from argparse import Action, ArgumentError, ArgumentParser, Namespace -from concurrent.futures import CancelledError, Future, ThreadPoolExecutor, as_completed +from concurrent.futures import FIRST_COMPLETED, CancelledError, Future, ThreadPoolExecutor +from concurrent.futures import wait as wait_futures from fnmatch import fnmatchcase from pathlib import Path from signal import SIGINT, Handlers, signal @@ -227,7 +228,8 @@ def execute(state: State, max_workers: int | None, has_spinner: bool, live: bool ) thread.start() try: - thread.join() + while thread.is_alive(): + thread.join(timeout=1) except KeyboardInterrupt: previous, has_previous = signal(SIGINT, Handlers.SIG_IGN), True spinner.print_report = False # no need to print reports at this point, final report coming up @@ -287,6 +289,18 @@ def update_spinner(self, result: ToxEnvRunResult, success: bool) -> None: # noq done(result.name) +def _next_completed( + future_to_env: dict[Future[ToxEnvRunResult], ToxEnv], + interrupt: Event, +) -> Future[ToxEnvRunResult] | None: + while True: + done_futures, _ = wait_futures(list(future_to_env), timeout=1, return_when=FIRST_COMPLETED) + if done_futures: + return done_futures.pop() + if interrupt.is_set(): + return None + + def _queue_and_wait( # noqa: C901, PLR0913, PLR0915, PLR0912 state: State, to_run_list: list[str], @@ -337,32 +351,40 @@ def _run(tox_env: RunToxEnv) -> ToxEnvRunResult: if not future_to_env: result: ToxEnvRunResult | None = None - else: # if we have queued wait for completed - future = next(as_completed(future_to_env)) - tox_env_done = future_to_env.pop(future) - try: - result = future.result() - except CancelledError: - tox_env_done.teardown() - name = tox_env_done.conf.name - result = ToxEnvRunResult( - name=name, - skipped=False, - code=-3, - outcomes=[], - duration=MISS_DURATION, - ) - results.append(result) - completed.add(result.name) - if ( - result.code != Outcome.OK - and not result.ignore_outcome - and (options.parsed.fail_fast or result.fail_fast) - ): - interrupt.set() + else: + completed_future = _next_completed(future_to_env, interrupt) + if completed_future is None: + for pending_future, pending_env in list(future_to_env.items()): + if not pending_future.cancel() and not pending_future.done(): + pending_env.interrupt() + future_to_env.clear() env_list = [] - for pending_future in list(future_to_env.keys()): - pending_future.cancel() + result = None + else: + tox_env_done = future_to_env.pop(completed_future) + try: + result = completed_future.result() + except CancelledError: + tox_env_done.teardown() + name = tox_env_done.conf.name + result = ToxEnvRunResult( + name=name, + skipped=False, + code=-3, + outcomes=[], + duration=MISS_DURATION, + ) + results.append(result) + completed.add(result.name) + if ( + result.code != Outcome.OK + and not result.ignore_outcome + and (options.parsed.fail_fast or result.fail_fast) + ): + interrupt.set() + env_list = [] + for pending_future in list(future_to_env.keys()): + pending_future.cancel() if not interrupt.is_set() and not env_list: env_list = next(envs_to_run_generator, []) From 9de2cd21c68899a209430f8aaa14638debe593cf Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Mar 2026 19:10:02 +0000 Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- docs/changelog/3869.bugfix.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/changelog/3869.bugfix.rst b/docs/changelog/3869.bugfix.rst index ab1aa80aac..e1d3cb1bca 100644 --- a/docs/changelog/3869.bugfix.rst +++ b/docs/changelog/3869.bugfix.rst @@ -1,2 +1,2 @@ -Break deadlock in execution interrupt chain that caused ~18 flaky timeout failures across 9 tests on Windows/macOS CI -- by :user:`gaborbernat`. +Break deadlock in execution interrupt chain that caused ~18 flaky timeout failures across 9 tests on Windows/macOS CI - +by :user:`gaborbernat`.