From 75a84e6766d20b13988c9843519fcb63de6862c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20K=C3=B6plinger?= Date: Fri, 27 Mar 2026 17:13:35 +0100 Subject: [PATCH] Fix xharness watchdog process cleanup in apple and android runners Fixes the noisy error on normal (non-timeout) runs: ./xharness-runner.apple.sh: line 187: 4322 Killed: 9 sleep "$command_timeout" This happened because the watchdog's sleep process was still running when the command finished, and the shell reported its termination. The original code never cleaned up the watchdog since it didn't track its PID. Changes: - Capture the watchdog PID (WATCHDOG_PID) and kill it after the command finishes, then wait on it to suppress the 'Killed' message. - Rename PID to COMMAND_PID for clarity. - Remove the inner '&' inside the watchdog subshell that caused the sleep && kill chain to be double-backgrounded, making the captured PID point to an already-exited subshell instead of the actual sleeper. - Remove the redundant explicit subshell (...) since backgrounding with '&' already implies one. --- .../xharness-runner/xharness-helix-job.android.sh | 13 +++++++++++-- .../tools/xharness-runner/xharness-runner.apple.sh | 9 ++++++++- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/src/Microsoft.DotNet.Helix/Sdk/tools/xharness-runner/xharness-helix-job.android.sh b/src/Microsoft.DotNet.Helix/Sdk/tools/xharness-runner/xharness-helix-job.android.sh index c6a5c4023ea..68f943d56d3 100644 --- a/src/Microsoft.DotNet.Helix/Sdk/tools/xharness-runner/xharness-helix-job.android.sh +++ b/src/Microsoft.DotNet.Helix/Sdk/tools/xharness-runner/xharness-helix-job.android.sh @@ -78,6 +78,15 @@ function report_infrastructure_failure() { } # Act out the actual commands (and time constrain them to create buffer for the end of this script) -source command.sh & PID=$! ; (sleep "$command_timeout" && kill -s 0 $PID > /dev/null 2>&1 && echo "ERROR: WORKLOAD TIMED OUT - Killing user command.." 
&& kill $PID 2> /dev/null & ) ; wait $PID +# shellcheck disable=SC1091 +source command.sh & +COMMAND_PID=$! +sleep "$command_timeout" && kill -s 0 $COMMAND_PID > /dev/null 2>&1 && echo "ERROR: WORKLOAD TIMED OUT - Killing user command.." && kill $COMMAND_PID 2> /dev/null & +WATCHDOG_PID=$! +wait $COMMAND_PID +exit_code=$? +# Kill the watchdog subshell now that the command has finished (its child sleep is not signalled by this kill and may linger until the timeout elapses) +kill $WATCHDOG_PID 2> /dev/null +wait $WATCHDOG_PID 2> /dev/null -exit $? +exit $exit_code diff --git a/src/Microsoft.DotNet.Helix/Sdk/tools/xharness-runner/xharness-runner.apple.sh b/src/Microsoft.DotNet.Helix/Sdk/tools/xharness-runner/xharness-runner.apple.sh index d9a19177447..df05d3dab70 100644 --- a/src/Microsoft.DotNet.Helix/Sdk/tools/xharness-runner/xharness-runner.apple.sh +++ b/src/Microsoft.DotNet.Helix/Sdk/tools/xharness-runner/xharness-runner.apple.sh @@ -184,8 +184,15 @@ start_time="$(date '+%Y-%m-%d %H:%M:%S')" # Act out the actual commands (and time constrain them to create buffer for the end of this script) # shellcheck disable=SC1091 -source command.sh & PID=$! ; (sleep "$command_timeout" && kill -s 0 $PID > /dev/null 2>&1 && echo "ERROR: WORKLOAD TIMED OUT - Killing user command.." && kill $PID 2> /dev/null & ) ; wait $PID +source command.sh & +COMMAND_PID=$! +sleep "$command_timeout" && kill -s 0 $COMMAND_PID > /dev/null 2>&1 && echo "ERROR: WORKLOAD TIMED OUT - Killing user command.." && kill $COMMAND_PID 2> /dev/null & +WATCHDOG_PID=$! +wait $COMMAND_PID exit_code=$? +# Kill the watchdog subshell now that the command has finished (its child sleep is not signalled by this kill and may linger until the timeout elapses) +kill $WATCHDOG_PID 2> /dev/null +wait $WATCHDOG_PID 2> /dev/null # In case of issues, include the syslog (last 2 MB from the time this work item has been running) if [ $exit_code -ne 0 ]; then