From 88f70a79844adf651ebfa195b0494770443ad052 Mon Sep 17 00:00:00 2001
From: PawelPlesniak <plesniakpaul@gmail.com>
Date: Tue, 10 Feb 2026 14:45:45 +0100
Subject: [PATCH 01/29] First step of integration tests

---
 integtest/process_manager_test.py | 243 ++++++++++++++++++++++++
 scripts/drunc_integtest_bundle.sh | 299 ++++++++++++++++++++++++++++++
 2 files changed, 542 insertions(+)
 create mode 100644 integtest/process_manager_test.py
 create mode 100644 scripts/drunc_integtest_bundle.sh

diff --git a/integtest/process_manager_test.py b/integtest/process_manager_test.py
new file mode 100644
index 000000000..ef81caa99
--- /dev/null
+++ b/integtest/process_manager_test.py
@@ -0,0 +1,243 @@
+import os
+import re
+
+import integrationtest.data_classes as data_classes
+import integrationtest.data_file_checks as data_file_checks
+import integrationtest.log_file_checks as log_file_checks
+import integrationtest.opmon_metric_checks as opmon_metric_checks
+
+pytest_plugins = "integrationtest.integrationtest_drunc"
+
+# Values that help determine the running conditions
+number_of_data_producers = 2
+data_rate_slowdown_factor = 1  # 10 for ProtoWIB/DuneWIB
+run_duration = 10  # seconds
+readout_window_time_before = 1000
+readout_window_time_after = 1001
+
+# Default values for validation parameters
+expected_number_of_data_files = 1
+check_for_logfile_errors = True
+expected_event_count = run_duration
+expected_event_count_tolerance = 2
+wibeth_frag_params = {
+    "fragment_type_description": "WIBEth",
+    "fragment_type": "WIBEth",
+    "expected_fragment_count": number_of_data_producers,
+    "min_size_bytes": 7272,
+    "max_size_bytes": 14472,
+}
+triggercandidate_frag_params = {
+    "fragment_type_description": "Trigger Candidate",
+    "fragment_type": "Trigger_Candidate",
+    "expected_fragment_count": 1,
+    "min_size_bytes": 128,
+    "max_size_bytes": 216,
+}
+hsi_frag_params = {
+    "fragment_type_description": "HSI",
+    "fragment_type": "Hardware_Signal",
+    "expected_fragment_count": 0,
+    "min_size_bytes": 72,
+    "max_size_bytes": 100,
+}
+ignored_logfile_problems = {
+    "-controller": [
+        "Worker with pid \\d+ was terminated due to signal",
+        "Connection '.*' not found on the application registry",
+    ],
+    "connectivity-service": [
+        "errorlog: -",
+    ],
+}
+
+# The next three variable declarations *must* be present as globals in the test
+# file. They're read by the "fixtures" in conftest.py to determine how
+# to run the config generation and nanorc
+
+# The arguments to pass to the config generator, excluding the json
+# output directory (the test framework handles that)
+
+# CCM includes FSM, hosts; moduleconfs includes connections
+object_databases = ["config/daqsystemtest/integrationtest-objects.data.xml"]
+
+conf_dict = data_classes.drunc_config()
+conf_dict.dro_map_config.n_streams = number_of_data_producers
+conf_dict.op_env = "integtest"
+conf_dict.session = "minimal"
+conf_dict.tpg_enabled = False
+
+# For testing, allow drunc to manage ConnectivityService (default is False, integrationtest manages Connectivity Service)
+# conf_dict.drunc_connsvc = True
+# For testing, specify connectivity service port (default is 0, a random port is chosen for the Connectivity Service)
+# conf_dict.connsvc_port = 12345
+
+substitution = data_classes.attribute_substitution(
+    obj_id="random-tc-generator",
+    obj_class="RandomTCMakerConf",
+    updates={"trigger_rate_hz": 1},
+)
+conf_dict.config_substitutions.append(
+    data_classes.attribute_substitution(
+        obj_class="TCReadoutMap",
+        obj_id="def-random-readout",
+        updates={
+            "time_before": readout_window_time_before,
+            "time_after": readout_window_time_after,
+        },
+    )
+)
+conf_dict.config_substitutions.append(substitution)
+
+
+confgen_arguments = {"MinimalSystem": conf_dict}
+# The commands to run in nanorc, as a list
+nanorc_command_list = "boot restart -n root-controller restart -n mlt logs -n root-controller logs -n mlt ps flush terminate boot terminate boot".split()
+
+# The tests themselves
+
+
+def test_nanorc_success(run_nanorc):
+    # print the name of the current test
+    current_test = os.environ.get("PYTEST_CURRENT_TEST")
+    match_obj = re.search(r".*\[(.+)-run_.*rc.*\d].*", current_test)
+    if match_obj:
+        current_test = match_obj.group(1)
+    banner_line = re.sub(".", "=", current_test)
+    print(banner_line)
+    print(current_test)
+    print(banner_line)
+
+    # Check that nanorc completed correctly
+    assert run_nanorc.completed_process.returncode == 0
+
+
+def test_log_files(run_nanorc):
+    # Check that at least some of the expected log files are present
+    assert any(
+        f"{run_nanorc.session}_df-01" in str(logname)
+        for logname in run_nanorc.log_files
+    )
+    assert any(
+        f"{run_nanorc.session}_dfo" in str(logname) for logname in run_nanorc.log_files
+    )
+    assert any(
+        f"{run_nanorc.session}_mlt" in str(logname) for logname in run_nanorc.log_files
+    )
+    assert any(
+        f"{run_nanorc.session}_ru" in str(logname) for logname in run_nanorc.log_files
+    )
+
+    if check_for_logfile_errors:
+        # Check that there are no warnings or errors in the log files
+        assert log_file_checks.logs_are_error_free(
+            run_nanorc.log_files, True, True, ignored_logfile_problems
+        )
+
+
+def test_data_files(run_nanorc):
+    # Run some tests on the output data file
+    all_ok = len(run_nanorc.data_files) == expected_number_of_data_files
+    print("")  # Clear potential dot from pytest
+    if all_ok:
+        print(
+            f"\N{WHITE HEAVY CHECK MARK} The correct number of raw data files was found ({expected_number_of_data_files})"
+        )
+    else:
+        print(
+            f"\N{POLICE CARS REVOLVING LIGHT} An incorrect number of raw data files was found, expected {expected_number_of_data_files}, found {len(run_nanorc.data_files)} \N{POLICE CARS REVOLVING LIGHT}"
+        )
+
+    fragment_check_list = [triggercandidate_frag_params, hsi_frag_params]
+    fragment_check_list.append(wibeth_frag_params)
+    nontrig_fragment_check_list = [hsi_frag_params, wibeth_frag_params]
+
+    for idx in range(len(run_nanorc.data_files)):
+        data_file = data_file_checks.DataFile(run_nanorc.data_files[idx])
+        all_ok &= data_file_checks.sanity_check(data_file)
+        all_ok &= data_file_checks.check_file_attributes(data_file)
+        all_ok &= data_file_checks.check_event_count(
+            data_file, expected_event_count, expected_event_count_tolerance
+        )
+        for jdx in range(len(fragment_check_list)):
+            all_ok &= data_file_checks.check_fragment_count(
+                data_file, fragment_check_list[jdx]
+            )
+            all_ok &= data_file_checks.check_fragment_sizes(
+                data_file, fragment_check_list[jdx]
+            )
+        for kdx in range(len(nontrig_fragment_check_list)):
+            all_ok &= data_file_checks.check_fragment_error_flags(
+                data_file, nontrig_fragment_check_list[kdx]
+            )
+
+    assert all_ok
+
+
+# 26-Nov-2025, KAB: added some sample opmon metric checks, for demonstration purposes
+def test_metric_files(run_nanorc):
+    print("")  # Clear potential dot from pytest
+
+    # 10-Dec-2025, KAB: we have noticed that sometimes drunc transitions (or other parts of
+    # a run control session) take a little longer than expected.  This can cause extra metric
+    # samples to be created.  This section of code takes that into account by increasing
+    # the max allowed sample count by the amount of extra time taken, divided by 10
+    # (metric samples are produced every 10 seconds, by default).
+    # I've tried to make this code backward compatible by handling cases in which the
+    # daq_session_overall_time is not available (e.g. the try/except).
+    #
+    # The expected DAQ session time is the sum of the time spent in the "running" state
+    # (specified in the run control commands above [run_duration]) plus the "wait" times in
+    # the RC commands plus the time spent in RC transitions.  With a run duration of 20 sec,
+    # the session time has been measured to be ~40 seconds, so we take the extra 20 seconds
+    # into account.
+    expected_daq_session_time = run_duration + 20
+    #
+    # To calculate the expected number of metric samples, we subtract a small-ish amount of
+    # time that the DAQ session spends in state(s) that don't produce metrics (say 3 seconds)
+    # and divide by 10, where 10 seconds is the interval between each reporting of metrics.
+    expected_metric_sample_count = int((expected_daq_session_time - 3) / 10)
+    #
+    # We'll set the maximum allowed sample count slightly higher than the expected value.
+    max_metric_sample_count = expected_metric_sample_count + 2
+    try:
+        # print(f"\nDAQ session overall time: {run_nanorc.daq_session_overall_time} seconds")
+        if run_nanorc.daq_session_overall_time is not None:
+            extra_time_taken = (
+                run_nanorc.daq_session_overall_time - expected_daq_session_time
+            )
+            if extra_time_taken > 10:
+                extra_sample_count_allowance = int(extra_time_taken / 10)
+                max_metric_sample_count += extra_sample_count_allowance
+    except AttributeError:
+        pass
+
+    session_name = (
+        run_nanorc.session_name if run_nanorc.session_name else run_nanorc.session
+    )
+    metric_data = opmon_metric_checks.collate_opmon_data_from_files(
+        run_nanorc.opmon_files
+    )
+
+    metric_key_list = [
+        session_name,
+        "df-01",
+        "df-01-trb",
+        "dfmodules.TRBInfo",
+        "generated_trigger_records",
+    ]
+    all_ok = True
+    # a 20-second run will likely result in 3 metric samples (at 10-second intervals), so a range
+    # of 1..5 should always succeed
+    all_ok &= opmon_metric_checks.check_metric_sample_count(
+        metric_data, metric_key_list, min_count=1, max_count=max_metric_sample_count
+    )
+    # the number of triggers expected in this test is based on the run duration, so we check for
+    # a reported number of generated trigger records between slightly above/below that
+    all_ok &= opmon_metric_checks.check_metric_value_sum(
+        metric_data,
+        metric_key_list,
+        min_value_sum=run_duration - 3,
+        max_value_sum=run_duration + 3,
+    )
+    assert all_ok
diff --git a/scripts/drunc_integtest_bundle.sh b/scripts/drunc_integtest_bundle.sh
new file mode 100644
index 000000000..785b07217
--- /dev/null
+++ b/scripts/drunc_integtest_bundle.sh
@@ -0,0 +1,299 @@
+#!/bin/bash
+
+# Defines a driver script for the drunc integration tests.
+# The purpose of these scripts is to run a set of integration tests with all of the features of drunc tested, so any introduced changes do not affect functionality of the existing infrastructure.
+# Based entirely on the implementation of daqsystemtest_integtest_bundle.sh
+# Original author: KAB, 10-Oct-2023
+
+integtest_list=( "process_manager_test.py" )
+let last_test_index=${#integtest_list[@]}-1
+
+usage() {
+    declare -r script_name=$(basename "$0")
+    echo """
+Usage:
+"${script_name}" [option(s)]
+
+Options:
+    -h, --help : prints out usage information
+    -f <zero-based index of the first test to be run, default=0>
+    -l <zero-based index of the last test to be run, default=${last_test_index}>
+    -k <pipe-delimited string to select which tests will be run ('egrep -i' match to test name)>
+    -n <number of times to run each individual test, default=1>
+    -N <number of times to run the full set of selected tests, default=1>
+    --stop-on-failure : causes the script to stop when one of the integtests reports a failure
+    --concise-output : suppresses run control and DAQApp messages in order to focus on test results
+    --tmpdir : specifies a root directory to use for test output, e.g. a directory instead of '/tmp'
+"""
+    let counter=0
+    echo "List of available tests:"
+    for tst in ${integtest_list[@]}; do
+        echo "    ${counter}: $tst"
+        let counter=${counter}+1
+    done
+    echo ""
+}
+
+# 29-Dec-2025, KAB: Determine if a non-standard pytest tmpdir has been specified
+# in the linux shell environment in which this script is being run. We need to know
+# this value in order to direct functionality in this script to the right place.
+# A user-specified command-line value for the tmpdir over-rides the value determined here.
+tmpdir_root=`dst_get_pytest_tmpdir`
+
+# Removes the ANSI characters associated with formatting, including color coding and font styling
+CaptureOutputNoANSI() {
+    tee -a >(sed -u 's/\x1b\[[0-9;]*m//g' >> "$1")
+}
+# Captures the output to the specified file, without changing the output
+CaptureOutput() {
+    tee -a $1
+}
+
+GETOPT_TEMP=`getopt -o hf:l:k:n:N: --long help,stop-on-failure,concise-output,tmpdir: -- "$@"`  # only list options that the case statement below handles; an accepted-but-unhandled option is never shifted and the parsing loop spins forever
+eval set -- "$GETOPT_TEMP"
+
+let first_test_index=0
+let individual_test_requested_iterations=1
+let full_set_requested_interations=1
+let stop_on_failure=0
+requested_test_names=
+PYTEST_COMMAND="pytest -s --tb=short"  # our core pytest command, with DAQ printout included and short pytest traceback
+
+while true; do
+    case "$1" in
+        -h|--help)
+            usage
+            exit 0
+            ;;
+        -f)
+            let first_test_index=$2
+            shift 2
+            ;;
+        -l)
+            let last_test_index=$2
+            shift 2
+            ;;
+        -k)
+            requested_test_names=$2
+            shift 2
+            ;;
+        -n)
+            let individual_test_requested_iterations=$2
+            shift 2
+            ;;
+        -N)
+            let full_set_requested_interations=$2
+            shift 2
+            ;;
+        --stop-on-failure)
+            let stop_on_failure=1
+            PYTEST_COMMAND="${PYTEST_COMMAND} -x"  # add the -x option to our pytest command to have it exit on first error
+            shift
+            ;;
+        --concise-output)
+            PYTEST_COMMAND="`echo ${PYTEST_COMMAND} | sed 's/ -s//'`"  # remove the -s option to turn off messages from DAQ processes
+            shift
+            ;;
+        --tmpdir)
+            tmpdir_root=$2
+            export PYTEST_DEBUG_TEMPROOT=${tmpdir_root}
+            shift 2
+            ;;
+        --)
+            shift
+            break
+            ;;
+    esac
+done
+
+# check if the numad daemon is running
+numad_grep_output=`ps -ef | grep numad | grep -v grep`
+if [[ "${numad_grep_output}" != "" ]]; then
+    echo "*********************************************************************"
+    echo "*** DANGER, DANGER, 'numad' appears to be running on this computer!"
+    echo "*** 'ps' output:  ${numad_grep_output}"
+    echo "*** <ctrl-c> now if you want to abort this testing."
+    echo "*********************************************************************"
+    sleep 3
+fi
+
+# other setup
+INITIAL_TIMESTAMP=`date '+%Y%m%d%H%M%S'`
+# 30-Dec-2025, KAB: check that the specified tmpdir exists and is writeable
+if [[ ! -d ${tmpdir_root} ]]; then
+    echo "*** ERROR: directory \"${tmpdir_root}\" does not exist."
+    exit 1
+fi
+if [[ ! -w ${tmpdir_root} ]]; then
+    echo "*** ERROR: directory \"${tmpdir_root}\" is not writeable in the current environment."
+    exit 1
+fi
+pytest_user_dir=${tmpdir_root}/pytest-of-${USER}
+mkdir -p ${pytest_user_dir}
+ITGRUNNER_LOG_FILE="${pytest_user_dir}/drunc_integtest_bundle_${INITIAL_TIMESTAMP}.log"
+CURRENT_PID=$$
+
+let number_of_individual_tests=0
+let test_index=0
+for TEST_NAME in "${integtest_list[@]}"; do
+    if [[ ${test_index} -ge ${first_test_index} && ${test_index} -le ${last_test_index} ]]; then
+        requested_test=`echo ${TEST_NAME} | egrep -i ${requested_test_names:-${TEST_NAME}}`
+        if [[ "${requested_test}" != "" ]]; then
+            let number_of_individual_tests=${number_of_individual_tests}+1
+        fi
+    fi
+    let test_index=${test_index}+1
+done
+let total_number_of_tests=${number_of_individual_tests}*${individual_test_requested_iterations}*${full_set_requested_interations}
+
+# run the tests
+let overall_test_index=0  # this is only used for user feedback
+let full_set_loop_count=0
+while [[ ${full_set_loop_count} -lt ${full_set_requested_interations} ]]; do
+    let test_index=0
+    for TEST_NAME in "${integtest_list[@]}"; do
+        if [[ ${test_index} -ge ${first_test_index} && ${test_index} -le ${last_test_index} ]]; then
+            CURRENT_TIMESTAMP=`date '+%Y%m%d%H%M%S'`
+            # 15-Dec-2025, KAB: added the export of the following environmental variable.  This is used
+            # by the integrationtest infrastructure to put a bread-crumb file in the directory where
+            # the test results are located.  That file, in turn, allows this script to find the directory
+            # for the current test, and make a copy of it if the test fails.
+            export DUNEDAQ_INTEGTEST_BUNDLE_INFO="${INITIAL_TIMESTAMP};${CURRENT_PID};${CURRENT_TIMESTAMP}"
+            requested_test=`echo ${TEST_NAME} | egrep -i ${requested_test_names:-${TEST_NAME}}`
+            if [[ "${requested_test}" != "" ]]; then
+                let individual_loop_count=0
+                while [[ ${individual_loop_count} -lt ${individual_test_requested_iterations} ]]; do
+                    let overall_test_index=${overall_test_index}+1
+                    echo ""
+                    echo -e "\U0001F535 \033[0;34mStarting test ${overall_test_index} of ${total_number_of_tests}...\033[0m \U0001F535" | CaptureOutput ${ITGRUNNER_LOG_FILE}
+
+                    echo -e "\u2B95 \033[0;1mRunning ${TEST_NAME}\033[0m \u2B05" | CaptureOutput ${ITGRUNNER_LOG_FILE}
+                    if [[ -e "./${TEST_NAME}" ]]; then
+                        ${PYTEST_COMMAND} ./${TEST_NAME} | CaptureOutputNoANSI ${ITGRUNNER_LOG_FILE}
+                    elif [[ -e "${DBT_AREA_ROOT}/pythoncode/drunc/integtest/${TEST_NAME}" ]]; then
+                        if [[ -w "${DBT_AREA_ROOT}" ]]; then
+                            ${PYTEST_COMMAND} ${DBT_AREA_ROOT}/pythoncode/drunc/integtest/${TEST_NAME} | CaptureOutputNoANSI ${ITGRUNNER_LOG_FILE}
+                        else
+                            ${PYTEST_COMMAND} -p no:cacheprovider ${DBT_AREA_ROOT}/pythoncode/drunc/integtest/${TEST_NAME} | CaptureOutputNoANSI ${ITGRUNNER_LOG_FILE}
+                        fi
+                    else
+                        ${PYTEST_COMMAND} -p no:cacheprovider ${DAQSYSTEMTEST_SHARE}/integtest/${TEST_NAME} | CaptureOutputNoANSI ${ITGRUNNER_LOG_FILE}
+                    fi
+                    let pytest_return_code=${PIPESTATUS[0]}
+
+                    let individual_loop_count=${individual_loop_count}+1
+
+                    # check if the test failed
+                    if [[ ${pytest_return_code} -ne 0 ]]; then
+                        # 15-Dec-2025, KAB: make a copy of the pytest directory. This allows
+                        # testers to take a look at the results within a reasonable time frame.
+                        # (If we can't find the "jq" JSON utility, we simply note that fact
+                        # and continue.)
+                        # This code makes use of a bread-crumb file that is created by the
+                        # integrationtest infrastructure.
+                        if [[ "`which jq 2>/dev/null`" != "" ]]; then
+                            current_pytest_rundir=""
+                            mapfile -t bundle_info_files < <(find "${pytest_user_dir}" -type f -name "bundle_script_info.json" -printf '%T@ %p\n' | grep -v 'failed-' | sort -nr | awk '{print $2}')
+                            for info_file in "${bundle_info_files[@]}"; do
+                                script_start_time=`jq -r .bundle_script_start_time ${info_file}`
+                                script_pid=`jq -r .bundle_script_process_id ${info_file}`
+                                individual_test_start_time=`jq -r .individual_test_start_time ${info_file}`
+                                if [[ ${script_start_time} -eq ${INITIAL_TIMESTAMP} ]] && \
+                                       [[ ${script_pid} -eq ${CURRENT_PID} ]] && \
+                                       [[ ${individual_test_start_time} -eq ${CURRENT_TIMESTAMP} ]]; then
+                                    current_pytest_rundir=$info_file
+                                    break
+                                fi
+                            done
+
+                            was_successfully_copied=""
+                            if [[ "${current_pytest_rundir}" != "" ]]; then
+                                pytest_tmpdir=`echo ${current_pytest_rundir} | xargs -r dirname | xargs -r dirname`
+                                if [[ "${pytest_tmpdir}" != "" ]]; then
+                                    pytest_rootdir=`echo ${pytest_tmpdir} | xargs -r dirname`
+                                    pytest_basedir=`echo ${pytest_tmpdir} | xargs -r basename`
+                                    if [[ "${pytest_rootdir}" != "" ]] && [[ "${pytest_basedir}" != "" ]]; then
+                                        new_dir="${pytest_rootdir}/failed-${pytest_basedir}"
+                                        echo ""
+                                        echo -e "\U1F535 Copying the files from failed test ${pytest_tmpdir} to ${new_dir}. \U1F535"
+                                        cp -pR "${pytest_tmpdir}" "${new_dir}"
+                                        if [[ $? == 0 ]]; then
+                                            was_successfully_copied="yes"
+                                            # 18-Dec-2025, KAB: added the removal of the "current" symbolic links
+                                            # from inside the copied directory (since they get broken in the copying)
+                                            rm -f "${new_dir}/configcurrent"
+                                            rm -f "${new_dir}/runcurrent"
+                                        fi
+                                    fi
+                                fi
+                            fi
+                            if [[ "${was_successfully_copied}" == "" ]]; then
+                                echo ""
+                                echo -e "\U1f7e1 WARNING: Unable to copy the pytest directory for this failed test (${current_pytest_rundir}). \U1f7e1"
+                            fi
+                        else
+                            echo ""
+                            echo -e "\U1f7e1 WARNING: Unable to find the 'jq' utility which is needed to help identify which pytest directory to copy for this failed test. \U1f7e1"
+                        fi
+
+                        # remove stale and surplus directories from failed tests
+                        test_dirs_to_remove=()
+                        mapfile -t all_failed_test_dirs < <(find ${pytest_user_dir} -maxdepth 1 -type d -printf '%T@ %p\n' | sort -nr | awk '{print $2}' | grep 'failed-')
+                        surplus_dirs=("${all_failed_test_dirs[@]:10}")
+                        for test_dir in "${surplus_dirs[@]}"; do
+                            test_dirs_to_remove+=(${test_dir})
+                        done
+                        stale_failed_test_dirs=(`find ${pytest_user_dir} -maxdepth 1 -type d -name 'failed-*' -cmin +1560 -print`)
+                        for test_dir in "${stale_failed_test_dirs[@]}"; do
+                            test_dirs_to_remove+=(${test_dir})
+                        done
+                        if [[ ${#test_dirs_to_remove[@]} -gt 0 ]];then
+                            echo -e "\U1F535 Removing ${#test_dirs_to_remove[@]} old failed test directory(ies). \U1F535"
+                            for test_dir in "${test_dirs_to_remove[@]}"; do
+                                if [[ -e "${test_dir}" ]]; then
+                                    rm -rf "${test_dir}"
+                                fi
+                            done
+                        fi
+
+                        # exit out of this script if the user has requested that we stop on a failure
+                        if [[ ${stop_on_failure} -gt 0 ]]; then
+                            break 3
+                        fi
+                    fi
+                done
+            fi
+        fi
+        let test_index=${test_index}+1
+    done
+
+    let full_set_loop_count=${full_set_loop_count}+1
+done
+
+# print out summary information
+echo ""                                                   | CaptureOutput ${ITGRUNNER_LOG_FILE}
+echo ""                                                   | CaptureOutput ${ITGRUNNER_LOG_FILE}
+echo "+++++++++++++++++++++++++++++++++++++++++++++++++"  | CaptureOutput ${ITGRUNNER_LOG_FILE}
+echo "++++++++++++++++++++ SUMMARY ++++++++++++++++++++"  | CaptureOutput ${ITGRUNNER_LOG_FILE}
+echo "+++++++++++++++++++++++++++++++++++++++++++++++++"  | CaptureOutput ${ITGRUNNER_LOG_FILE}
+echo ""                                                   | CaptureOutput ${ITGRUNNER_LOG_FILE}
+date                                                      | CaptureOutput ${ITGRUNNER_LOG_FILE}
+echo "Log file is: ${ITGRUNNER_LOG_FILE}"                 | CaptureOutput ${ITGRUNNER_LOG_FILE}
+echo ""                                                   | CaptureOutput ${ITGRUNNER_LOG_FILE}
+summary_string="`egrep $'=====|\u2B95' ${ITGRUNNER_LOG_FILE} | egrep ' in |Running'`"
+colorized_summary_string="`echo \"${summary_string}\" | sed 's/passed/passed \\\\U2705/' | sed 's/failed/failed \\\\U274c/' | sed 's/skipped/skipped \\\\U1f7e1/'`"
+echo -e "${colorized_summary_string}" | CaptureOutput ${ITGRUNNER_LOG_FILE}
+
+# check again if the numad daemon is running
+numad_grep_output=`ps -ef | grep numad | grep -v grep`
+if [[ "${numad_grep_output}" != "" ]]; then
+    echo ""                                                                                 | CaptureOutput ${ITGRUNNER_LOG_FILE}
+    echo "********************************************************************************" | CaptureOutput ${ITGRUNNER_LOG_FILE}
+    echo "*** WARNING: 'numad' appears to be running on this computer!"                     | CaptureOutput ${ITGRUNNER_LOG_FILE}
+    echo "*** 'ps' output:  ${numad_grep_output}"                                           | CaptureOutput ${ITGRUNNER_LOG_FILE}
+    echo "*** This daemon can adversely affect the running of these tests, especially ones" | CaptureOutput ${ITGRUNNER_LOG_FILE}
+    echo "*** that are resource intensive in the Readout Apps. This is because numad moves" | CaptureOutput ${ITGRUNNER_LOG_FILE}
+    echo "*** processes (threads?) to different cores/numa nodes periodically, and that"    | CaptureOutput ${ITGRUNNER_LOG_FILE}
+    echo "*** context switch can disrupt the stable running of the DAQ processes."          | CaptureOutput ${ITGRUNNER_LOG_FILE}
+    echo "********************************************************************************" | CaptureOutput ${ITGRUNNER_LOG_FILE}
+fi
\ No newline at end of file

From 9d43725d927c3167f90c4e737ec5756906c1bf88 Mon Sep 17 00:00:00 2001
From: PawelPlesniak <plesniakpaul@gmail.com>
Date: Tue, 10 Feb 2026 15:21:19 +0100
Subject: [PATCH 02/29] Add -c option to pytest command in integtest bundle script

---
 scripts/drunc_integtest_bundle.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/drunc_integtest_bundle.sh b/scripts/drunc_integtest_bundle.sh
index 785b07217..fe5d71f9f 100644
--- a/scripts/drunc_integtest_bundle.sh
+++ b/scripts/drunc_integtest_bundle.sh
@@ -57,7 +57,7 @@ let individual_test_requested_iterations=1
 let full_set_requested_interations=1
 let stop_on_failure=0
 requested_test_names=
-PYTEST_COMMAND="pytest -s --tb=short"  # our core pytest command, with DAQ printout included and short pytest traceback
+PYTEST_COMMAND="pytest -c -s --tb=short"  # our core pytest command, with DAQ printout included and short pytest traceback
 
 while true; do
     case "$1" in

From 863c86e5780544c6590a88983091af601533b0f7 Mon Sep 17 00:00:00 2001
From: PawelPlesniak <plesniakpaul@gmail.com>
Date: Wed, 11 Feb 2026 12:45:34 +0100
Subject: [PATCH 03/29] WIP

---
 integtest/process_manager_test.py | 91 +++++++++----------------------
 scripts/drunc_integtest_bundle.sh |  2 +-
 2 files changed, 27 insertions(+), 66 deletions(-)

diff --git a/integtest/process_manager_test.py b/integtest/process_manager_test.py
index ef81caa99..7edd8bd0b 100644
--- a/integtest/process_manager_test.py
+++ b/integtest/process_manager_test.py
@@ -1,8 +1,8 @@
+import getpass
 import os
 import re
 
 import integrationtest.data_classes as data_classes
-import integrationtest.data_file_checks as data_file_checks
 import integrationtest.log_file_checks as log_file_checks
 import integrationtest.opmon_metric_checks as opmon_metric_checks
 
@@ -16,31 +16,31 @@
 readout_window_time_after = 1001
 
 # Default values for validation parameters
-expected_number_of_data_files = 1
+# expected_number_of_data_files = 1
 check_for_logfile_errors = True
-expected_event_count = run_duration
-expected_event_count_tolerance = 2
-wibeth_frag_params = {
-    "fragment_type_description": "WIBEth",
-    "fragment_type": "WIBEth",
-    "expected_fragment_count": number_of_data_producers,
-    "min_size_bytes": 7272,
-    "max_size_bytes": 14472,
-}
-triggercandidate_frag_params = {
-    "fragment_type_description": "Trigger Candidate",
-    "fragment_type": "Trigger_Candidate",
-    "expected_fragment_count": 1,
-    "min_size_bytes": 128,
-    "max_size_bytes": 216,
-}
-hsi_frag_params = {
-    "fragment_type_description": "HSI",
-    "fragment_type": "Hardware_Signal",
-    "expected_fragment_count": 0,
-    "min_size_bytes": 72,
-    "max_size_bytes": 100,
-}
+# expected_event_count = run_duration
+# expected_event_count_tolerance = 2
+# wibeth_frag_params = {
+#     "fragment_type_description": "WIBEth",
+#     "fragment_type": "WIBEth",
+#     "expected_fragment_count": number_of_data_producers,
+#     "min_size_bytes": 7272,
+#     "max_size_bytes": 14472,
+# }
+# triggercandidate_frag_params = {
+#     "fragment_type_description": "Trigger Candidate",
+#     "fragment_type": "Trigger_Candidate",
+#     "expected_fragment_count": 1,
+#     "min_size_bytes": 128,
+#     "max_size_bytes": 216,
+# }
+# hsi_frag_params = {
+#     "fragment_type_description": "HSI",
+#     "fragment_type": "Hardware_Signal",
+#     "expected_fragment_count": 0,
+#     "min_size_bytes": 72,
+#     "max_size_bytes": 100,
+# }
 ignored_logfile_problems = {
     "-controller": [
         "Worker with pid \\d+ was terminated due to signal",
@@ -92,7 +92,7 @@
 
 confgen_arguments = {"MinimalSystem": conf_dict}
 # The commands to run in nanorc, as a list
-nanorc_command_list = "boot restart -n root-controller restart -n mlt logs -n root-controller logs -n mlt ps flush terminate boot terminate boot".split()
+nanorc_command_list = f"boot restart -n root-controller restart -n mlt logs -n root-controller logs -n mlt --how-far 20 --grep ABC ps -l ps -u {getpass.getuser()} flush terminate".split()
 
 # The tests themselves
 
@@ -135,45 +135,6 @@ def test_log_files(run_nanorc):
         )
 
 
-def test_data_files(run_nanorc):
-    # Run some tests on the output data file
-    all_ok = len(run_nanorc.data_files) == expected_number_of_data_files
-    print("")  # Clear potential dot from pytest
-    if all_ok:
-        print(
-            f"\N{WHITE HEAVY CHECK MARK} The correct number of raw data files was found ({expected_number_of_data_files})"
-        )
-    else:
-        print(
-            f"\N{POLICE CARS REVOLVING LIGHT} An incorrect number of raw data files was found, expected {expected_number_of_data_files}, found {len(run_nanorc.data_files)} \N{POLICE CARS REVOLVING LIGHT}"
-        )
-
-    fragment_check_list = [triggercandidate_frag_params, hsi_frag_params]
-    fragment_check_list.append(wibeth_frag_params)
-    nontrig_fragment_check_list = [hsi_frag_params, wibeth_frag_params]
-
-    for idx in range(len(run_nanorc.data_files)):
-        data_file = data_file_checks.DataFile(run_nanorc.data_files[idx])
-        all_ok &= data_file_checks.sanity_check(data_file)
-        all_ok &= data_file_checks.check_file_attributes(data_file)
-        all_ok &= data_file_checks.check_event_count(
-            data_file, expected_event_count, expected_event_count_tolerance
-        )
-        for jdx in range(len(fragment_check_list)):
-            all_ok &= data_file_checks.check_fragment_count(
-                data_file, fragment_check_list[jdx]
-            )
-            all_ok &= data_file_checks.check_fragment_sizes(
-                data_file, fragment_check_list[jdx]
-            )
-        for kdx in range(len(nontrig_fragment_check_list)):
-            all_ok &= data_file_checks.check_fragment_error_flags(
-                data_file, nontrig_fragment_check_list[kdx]
-            )
-
-    assert all_ok
-
-
 # 26-Nov-2025, KAB: added some sample opmon metric checks, for demonstration purposes
 def test_metric_files(run_nanorc):
     print("")  # Clear potential dot from pytest
diff --git a/scripts/drunc_integtest_bundle.sh b/scripts/drunc_integtest_bundle.sh
index fe5d71f9f..7cea68c87 100644
--- a/scripts/drunc_integtest_bundle.sh
+++ b/scripts/drunc_integtest_bundle.sh
@@ -57,7 +57,7 @@ let individual_test_requested_iterations=1
 let full_set_requested_interations=1
 let stop_on_failure=0
 requested_test_names=
-PYTEST_COMMAND="pytest -c -s --tb=short"  # our core pytest command, with DAQ printout included and short pytest traceback
+PYTEST_COMMAND="pytest -c /dev/null -s --tb=short"  # our core pytest command, with DAQ printout included and short pytest traceback
 
 while true; do
     case "$1" in

From 47a20043e083f1a9b0dcce6702c561f6b09f8c46 Mon Sep 17 00:00:00 2001
From: Emir Muhammad <emir.muhammad@cern.ch>
Date: Thu, 12 Mar 2026 17:43:01 +0100
Subject: [PATCH 04/29] Update name, fix tests

[TO SQUASH] Some more notes and minor cleanup; move to new nightly

Update name, fix tests
---
 integtest/process_manager_test.py | 121 ++++--------------------------
 scripts/drunc_integtest_bundle.sh |   0
 2 files changed, 13 insertions(+), 108 deletions(-)
 mode change 100644 => 100755 scripts/drunc_integtest_bundle.sh

diff --git a/integtest/process_manager_test.py b/integtest/process_manager_test.py
index 7edd8bd0b..16d941139 100644
--- a/integtest/process_manager_test.py
+++ b/integtest/process_manager_test.py
@@ -4,7 +4,6 @@
 
 import integrationtest.data_classes as data_classes
 import integrationtest.log_file_checks as log_file_checks
-import integrationtest.opmon_metric_checks as opmon_metric_checks
 
 pytest_plugins = "integrationtest.integrationtest_drunc"
 
@@ -15,32 +14,8 @@
 readout_window_time_before = 1000
 readout_window_time_after = 1001
 
-# Default values for validation parameters
-# expected_number_of_data_files = 1
 check_for_logfile_errors = True
-# expected_event_count = run_duration
-# expected_event_count_tolerance = 2
-# wibeth_frag_params = {
-#     "fragment_type_description": "WIBEth",
-#     "fragment_type": "WIBEth",
-#     "expected_fragment_count": number_of_data_producers,
-#     "min_size_bytes": 7272,
-#     "max_size_bytes": 14472,
-# }
-# triggercandidate_frag_params = {
-#     "fragment_type_description": "Trigger Candidate",
-#     "fragment_type": "Trigger_Candidate",
-#     "expected_fragment_count": 1,
-#     "min_size_bytes": 128,
-#     "max_size_bytes": 216,
-# }
-# hsi_frag_params = {
-#     "fragment_type_description": "HSI",
-#     "fragment_type": "Hardware_Signal",
-#     "expected_fragment_count": 0,
-#     "min_size_bytes": 72,
-#     "max_size_bytes": 100,
-# }
+
 ignored_logfile_problems = {
     "-controller": [
         "Worker with pid \\d+ was terminated due to signal",
@@ -92,12 +67,9 @@
 
 confgen_arguments = {"MinimalSystem": conf_dict}
 # The commands to run in nanorc, as a list
-nanorc_command_list = f"boot restart -n root-controller restart -n mlt logs -n root-controller logs -n mlt --how-far 20 --grep ABC ps -l ps -u {getpass.getuser()} flush terminate".split()
-
-# The tests themselves
-
+dunerc_command_list = f"boot restart -n root-controller restart -n mlt logs -n root-controller logs -n mlt --how-far 20 --grep ABC ps -l ps -u {getpass.getuser()} flush terminate".split()
 
-def test_nanorc_success(run_nanorc):
+def test_nanorc_success(run_dunerc):
     # print the name of the current test
     current_test = os.environ.get("PYTEST_CURRENT_TEST")
     match_obj = re.search(r".*\[(.+)-run_.*rc.*\d].*", current_test)
@@ -109,96 +81,29 @@ def test_nanorc_success(run_nanorc):
     print(banner_line)
 
     # Check that nanorc completed correctly
-    assert run_nanorc.completed_process.returncode == 0
+    assert run_dunerc.completed_process.returncode == 0
 
 
-def test_log_files(run_nanorc):
+def test_log_files(run_dunerc):
     # Check that at least some of the expected log files are present
     assert any(
-        f"{run_nanorc.session}_df-01" in str(logname)
-        for logname in run_nanorc.log_files
+        f"{run_dunerc.session}_df-01" in str(logname)
+        for logname in run_dunerc.log_files
     )
     assert any(
-        f"{run_nanorc.session}_dfo" in str(logname) for logname in run_nanorc.log_files
+        f"{run_dunerc.session}_dfo" in str(logname) for logname in run_dunerc.log_files
     )
     assert any(
-        f"{run_nanorc.session}_mlt" in str(logname) for logname in run_nanorc.log_files
+        f"{run_dunerc.session}_mlt" in str(logname) for logname in run_dunerc.log_files
     )
     assert any(
-        f"{run_nanorc.session}_ru" in str(logname) for logname in run_nanorc.log_files
+        f"{run_dunerc.session}_ru" in str(logname) for logname in run_dunerc.log_files
     )
 
     if check_for_logfile_errors:
         # Check that there are no warnings or errors in the log files
         assert log_file_checks.logs_are_error_free(
-            run_nanorc.log_files, True, True, ignored_logfile_problems
+            [
+                logname for logname in run_dunerc.log_files if "process_manager" in str(logname)
+            ], True, True, ignored_logfile_problems
         )
-
-
-# 26-Nov-2025, KAB: added some sample opmon metric checks, for demonstration purposes
-def test_metric_files(run_nanorc):
-    print("")  # Clear potential dot from pytest
-
-    # 10-Dec-2025, KAB: we have noticed that sometimes drunc transitions (or other parts of
-    # a run control session) take a little longer than expected.  This can cause extra metric
-    # samples to be created.  This section of code takes that into account by increasing
-    # the max allowed sample count by the amount of extra time taken, divided by 10
-    # (metric samples are produced every 10 seconds, by default).
-    # I've tried to make this code backward compatible by handling cases in which the
-    # daq_session_overall_time is not available (e.g. the try/catch).
-    #
-    # The expected DAQ session time is the sum of the time spent in the "running" state
-    # (specified in the run control commands above [run_duration]) plus the "wait" times in
-    # the RC commands plus the time spent in RC transitions.  With a run duration of 20 sec,
-    # the session time has been measured to be ~40 seconds, so we take the extra 20 seconds
-    # into account.
-    expected_daq_session_time = run_duration + 20
-    #
-    # To calculate the expected number of metric samples, we subtract a small-ish amount of
-    # time that the DAQ session spends in state(s) that don't produce metrics (say 3 seconds)
-    # and divide by 10, where 10 seconds is the interval between each reporting of metrics.
-    expected_metric_sample_count = int((expected_daq_session_time - 3) / 10)
-    #
-    # We'll set the maximum allowed sample count slightly higher than the expected value.
-    max_metric_sample_count = expected_metric_sample_count + 2
-    try:
-        # print(f"\nDAQ session overall time: {run_nanorc.daq_session_overall_time} seconds")
-        if run_nanorc.daq_session_overall_time is not None:
-            extra_time_taken = (
-                run_nanorc.daq_session_overall_time - expected_daq_session_time
-            )
-            if extra_time_taken > 10:
-                extra_sample_count_allowance = int(extra_time_taken / 10)
-                max_metric_sample_count += extra_sample_count_allowance
-    except AttributeError:
-        pass
-
-    session_name = (
-        run_nanorc.session_name if run_nanorc.session_name else run_nanorc.session
-    )
-    metric_data = opmon_metric_checks.collate_opmon_data_from_files(
-        run_nanorc.opmon_files
-    )
-
-    metric_key_list = [
-        session_name,
-        "df-01",
-        "df-01-trb",
-        "dfmodules.TRBInfo",
-        "generated_trigger_records",
-    ]
-    all_ok = True
-    # a 20-second run will likely result in 3 metric samples (at 10-second intervals), so a range
-    # of 1..5 should always succeed
-    all_ok &= opmon_metric_checks.check_metric_sample_count(
-        metric_data, metric_key_list, min_count=1, max_count=max_metric_sample_count
-    )
-    # the number of triggers expected in this test is based on the run duration, so we check for
-    # a reported number of generated trigger records between slightly above/below that
-    all_ok &= opmon_metric_checks.check_metric_value_sum(
-        metric_data,
-        metric_key_list,
-        min_value_sum=run_duration - 3,
-        max_value_sum=run_duration + 3,
-    )
-    assert all_ok
diff --git a/scripts/drunc_integtest_bundle.sh b/scripts/drunc_integtest_bundle.sh
old mode 100644
new mode 100755

From 1d0c3ba33a2fb5c21b55a4d9558485a856d9925c Mon Sep 17 00:00:00 2001
From: Emir Muhammad <emir.muhammad@cern.ch>
Date: Fri, 13 Mar 2026 12:36:06 +0100
Subject: [PATCH 05/29] add echo and comment commands

---
 src/drunc/controller/interface/commands.py | 17 +++++++++++++++++
 src/drunc/controller/interface/shell.py    |  4 ++++
 src/drunc/unified_shell/shell.py           |  4 ++++
 3 files changed, 25 insertions(+)

diff --git a/src/drunc/controller/interface/commands.py b/src/drunc/controller/interface/commands.py
index 4d26c00f3..efb22e7fa 100644
--- a/src/drunc/controller/interface/commands.py
+++ b/src/drunc/controller/interface/commands.py
@@ -243,6 +243,23 @@ def who_am_i(obj: ControllerContext) -> None:
     log.info(obj.get_token().user_name)
 
 
+# click_shell/_cmd.py, line 23. identchars only accepts ascii letters + digits + _
+@click.command("comment", 
+    hidden=True,
+    context_settings=dict(
+    ignore_unknown_options=True,
+    allow_extra_args=True,
+))
+def comment_handler():
+    """Ignore this line"""
+    pass
+
+@click.command("echo")
+@click.argument("text", required=False)
+@click.pass_obj
+def echo(obj, text: str | None) -> None:
+    log.info(text or "")
+    
 @click.command("who-is-in-charge")
 @click.option("--target", type=str, help="The target to address", default="")
 @click.option(
diff --git a/src/drunc/controller/interface/shell.py b/src/drunc/controller/interface/shell.py
index ab33f9cd5..25373004c 100644
--- a/src/drunc/controller/interface/shell.py
+++ b/src/drunc/controller/interface/shell.py
@@ -17,6 +17,8 @@
     take_control,
     wait,
     who_am_i,
+    echo,
+    comment_handler,
     who_is_in_charge,
 )
 from drunc.controller.interface.shell_utils import (
@@ -90,6 +92,8 @@ def controller_shell(ctx, controller_address: str, log_level: str) -> None:
     ctx.command.add_command(take_control, "take-control")
     ctx.command.add_command(surrender_control, "surrender-control")
     ctx.command.add_command(who_am_i, "whoami")
+    ctx.command.add_command(echo, "echo")
+    ctx.command.add_command(comment_handler, "comment-handler")
     ctx.command.add_command(who_is_in_charge, "who-is-in-charge")
     for transition in transitions.commands:
         ctx.command.add_command(*generate_fsm_command(ctx.obj, transition, desc.name))
diff --git a/src/drunc/unified_shell/shell.py b/src/drunc/unified_shell/shell.py
index b80d8e33d..3d9efa1d7 100644
--- a/src/drunc/unified_shell/shell.py
+++ b/src/drunc/unified_shell/shell.py
@@ -30,6 +30,8 @@
     to_error,
     wait,
     who_am_i,
+    echo,
+    comment_handler,
     who_is_in_charge,
 )
 from drunc.controller.interface.shell_utils import generate_fsm_command
@@ -381,6 +383,8 @@ def unified_shell(
         take_control,
         surrender_control,
         who_am_i,
+        echo,
+        comment_handler,
         who_is_in_charge,
         include,
         exclude,

From 430d09618113f09f8c684ecd3c3b7e324d676263 Mon Sep 17 00:00:00 2001
From: Emir Muhammad <emir.muhammad@cern.ch>
Date: Fri, 13 Mar 2026 14:20:43 +0100
Subject: [PATCH 06/29] Fix 'test will fail if your terminal window is too
 short' bug

---
 src/drunc/process_manager/interface/commands.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/drunc/process_manager/interface/commands.py b/src/drunc/process_manager/interface/commands.py
index 38fb61946..2937515ca 100644
--- a/src/drunc/process_manager/interface/commands.py
+++ b/src/drunc/process_manager/interface/commands.py
@@ -236,7 +236,7 @@ def logs(
         if grep is not None:
             line = line.replace(grep, f"[u]{grep}[/]")
 
-        obj.print(line)
+        obj.print(line, soft_wrap=True)
     if result.name is not None:
         obj.rule(f"[yellow]{display_name}[/yellow] end")
 

From 89e259e7513aa41c9daf52225e2afa88867c4d8b Mon Sep 17 00:00:00 2001
From: Emir Muhammad <emir.muhammad@cern.ch>
Date: Fri, 13 Mar 2026 14:22:34 +0100
Subject: [PATCH 07/29] add basic logging tests to show that it works

---
 integtest/process_manager_test.py          | 85 +++++++++++++++++++++-
 src/drunc/controller/interface/commands.py |  1 +
 2 files changed, 85 insertions(+), 1 deletion(-)

diff --git a/integtest/process_manager_test.py b/integtest/process_manager_test.py
index 16d941139..4b3f9b633 100644
--- a/integtest/process_manager_test.py
+++ b/integtest/process_manager_test.py
@@ -67,7 +67,35 @@
 
 confgen_arguments = {"MinimalSystem": conf_dict}
 # The commands to run in nanorc, as a list
-dunerc_command_list = f"boot restart -n root-controller restart -n mlt logs -n root-controller logs -n mlt --how-far 20 --grep ABC ps -l ps -u {getpass.getuser()} flush terminate".split()
+# NOTE THAT WE HAVE NOT TESTED FLUSH BECAUSE IT IS BROKEN
+# see #821
+
+dunerc_command_list = f"""
+boot
+
+echo testing_logs
+logs --name unknown
+logs --name root-controller --how-far 5
+logs --name mlt --how-far 5
+
+ps -u {getpass.getuser()}
+
+restart -n root-controller
+restart -n mlt
+wait 5
+kill -n mlt
+wait 2
+restart -n mlt
+restart -n trg-controller
+wait 5
+
+
+flush
+terminate
+
+""".split()
+
+
 
 def test_nanorc_success(run_dunerc):
     # print the name of the current test
@@ -84,6 +112,61 @@ def test_nanorc_success(run_dunerc):
     assert run_dunerc.completed_process.returncode == 0
 
 
+def test_log_command(run_dunerc) -> None:
+    test_str = "Bad query for logs: The process corresponding to the query doesn't exist"
+    assert test_str in run_dunerc.completed_process.stdout 
+
+
+def test_root_controller_logs(run_dunerc) -> None:
+    """
+    Verifies that:
+    - the stdout contains a "root-controller logs" header line and a "root-controller end" footer line
+    - there are exactly 5 lines between those two lines
+    - among those 5 lines, the one from "drunc.controller.core.init_controller" ends with "Controller ready"
+    """
+    stdout = run_dunerc.completed_process.stdout
+    assert isinstance(stdout, str)
+
+    lines = stdout.splitlines()
+
+    # 1) Find the header/footer lines
+    header_idx = next(
+        (i for i, line in enumerate(lines) if "root-controller logs" in line),
+        None,
+    )
+    footer_idx = next(
+        (i for i, line in enumerate(lines) if "root-controller end" in line),
+        None,
+    )
+
+    assert header_idx is not None, "Did not find the 'root-controller logs' header line in stdout."
+    assert footer_idx is not None, "Did not find the 'root-controller end' footer line in stdout."
+    assert footer_idx > header_idx, "Footer appears before header in stdout."
+
+    # 2) Check there are 5 lines between header and footer
+    between = lines[header_idx + 1 : footer_idx]
+    assert (
+        len(between) == 5
+    ), f"Expected exactly 5 lines between header and footer, found {len(between)}.\nBetween:\n" + "\n".join(
+        between
+    )
+
+    # 3) Check the init_controller line ends with "Controller ready"
+    # Example line:
+    # [2026/03/13 08:17:47 UTC] INFO ... drunc.controller.core.init_controller ... Controller ready
+    init_controller_ready_re = re.compile(
+        r"drunc\.controller\.core\.init_controller.*Controller ready\s*$"
+    )
+
+    matches = [line for line in between if init_controller_ready_re.search(line)]
+    assert (
+        len(matches) >= 1
+    ), "Did not find an init_controller line ending with 'Controller ready' within the 5 lines.\nBetween:\n" + "\n".join(
+        between
+    )
+
+
+
 def test_log_files(run_dunerc):
     # Check that at least some of the expected log files are present
     assert any(
diff --git a/src/drunc/controller/interface/commands.py b/src/drunc/controller/interface/commands.py
index efb22e7fa..566f350c5 100644
--- a/src/drunc/controller/interface/commands.py
+++ b/src/drunc/controller/interface/commands.py
@@ -244,6 +244,7 @@ def who_am_i(obj: ControllerContext) -> None:
 
 
 # click_shell/_cmd.py, line 23. identchars only accepts ascii letters + digits + _
+# Can't really be used by the integ test tho..
 @click.command("comment", 
     hidden=True,
     context_settings=dict(

From 3aea522a4435952d643779f2ebcb3d65bc4f713d Mon Sep 17 00:00:00 2001
From: Emir Muhammad <emir.muhammad@cern.ch>
Date: Fri, 13 Mar 2026 15:13:41 +0100
Subject: [PATCH 08/29] add wait and kill tests; add ps table parser

---
 integtest/process_manager_test.py | 200 ++++++++++++++++++++++++++++++
 1 file changed, 200 insertions(+)

diff --git a/integtest/process_manager_test.py b/integtest/process_manager_test.py
index 4b3f9b633..dc43ef511 100644
--- a/integtest/process_manager_test.py
+++ b/integtest/process_manager_test.py
@@ -1,6 +1,7 @@
 import getpass
 import os
 import re
+from datetime import datetime
 
 import integrationtest.data_classes as data_classes
 import integrationtest.log_file_checks as log_file_checks
@@ -78,17 +79,33 @@
 logs --name root-controller --how-far 5
 logs --name mlt --how-far 5
 
+echo test_wait
+wait 10
+
+echo on_boot
 ps -u {getpass.getuser()}
 
 restart -n root-controller
 restart -n mlt
 wait 5
+
+echo pre_kill_mlt
+ps -u {getpass.getuser()}
+
 kill -n mlt
 wait 2
+echo post_kill_mlt
+ps -u {getpass.getuser()}
+
+
+
 restart -n mlt
 restart -n trg-controller
 wait 5
 
+echo ps_after_recovery
+ps -u {getpass.getuser()}
+
 
 flush
 terminate
@@ -96,6 +113,167 @@
 """.split()
 
 
+UUID_RE = re.compile(
+    r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$"
+)
+ANSI_ESCAPE_RE = re.compile(r"\x1B\[[0-9;]*[A-Za-z]")
+
+
+def strip_ansi(text: str) -> str:
+    return ANSI_ESCAPE_RE.sub("", text)
+
+
+def _parse_ps_table_from_index(lines: list[str], start_idx: int) -> list[dict[str, str]]:
+    table_rows: list[dict[str, str]] = []
+
+    for line in lines[start_idx + 1 :]:
+        stripped = line.strip()
+
+        if stripped.startswith("└"):
+            break
+
+        if not stripped.startswith("│"):
+            continue
+
+        cells = [cell.strip() for cell in stripped.strip("│").split("│")]
+        if len(cells) < 7:
+            continue
+
+        table_rows.append(
+            {
+                "session": cells[0],
+                "friendly_name": cells[1],
+                "user": cells[2],
+                "host": cells[3],
+                "uuid": cells[4],
+                "alive": cells[5],
+                "exit_code": cells[6],
+            }
+        )
+
+    return table_rows
+
+
+def get_ps_table_after_echo(stdout: str, echo_marker: str) -> list[dict[str, str]]:
+    lines = strip_ansi(stdout).splitlines()
+
+    echo_idx = next(
+        (
+            idx
+            for idx, line in enumerate(lines)
+            if "drunc.echo" in line and line.rstrip().endswith(echo_marker)
+        ),
+        None,
+    )
+    assert echo_idx is not None, f"Could not find drunc.echo marker '{echo_marker}' in stdout."
+
+    table_start_idx = next(
+        (idx for idx in range(echo_idx + 1, len(lines)) if "Processes running" in lines[idx]),
+        None,
+    )
+    assert (
+        table_start_idx is not None
+    ), f"Could not find a 'Processes running' table after marker '{echo_marker}'."
+
+    table_rows = _parse_ps_table_from_index(lines, table_start_idx)
+    assert table_rows, f"Found table header after marker '{echo_marker}', but no rows were parsed."
+
+    return table_rows
+
+
+def get_uuid_for_friendly_name(ps_table: list[dict[str, str]], friendly_name: str) -> str:
+    for row in ps_table:
+        if row["friendly_name"].strip() == friendly_name:
+            return row["uuid"]
+
+    available_names = ", ".join(row["friendly_name"].strip() for row in ps_table)
+    raise AssertionError(
+        f"Could not find friendly name '{friendly_name}' in ps table. "
+        f"Available names: {available_names}"
+    )
+
+
+
+
+def test_kill_removes_mlt_from_ps_table(run_dunerc) -> None:
+    stdout = run_dunerc.completed_process.stdout
+
+    ps_before_kill = get_ps_table_after_echo(stdout, "pre_kill_mlt")
+    ps_after_kill = get_ps_table_after_echo(stdout, "post_kill_mlt")
+
+    mlt_before_kill = [
+        row for row in ps_before_kill if row["friendly_name"].strip() == "mlt"
+    ]
+    mlt_after_kill = [
+        row for row in ps_after_kill if row["friendly_name"].strip() == "mlt"
+    ]
+
+    assert mlt_before_kill, "Expected to find 'mlt' in ps table before kill, but it was missing."
+    assert not mlt_after_kill, "Expected 'mlt' to be absent from ps table after kill, but it is still present."
+
+
+def test_wait_command_duration_from_logs(run_dunerc) -> None:
+    stdout = run_dunerc.completed_process.stdout
+    lines = strip_ansi(stdout).splitlines()
+
+    echo_idx = next(
+        (
+            idx
+            for idx, line in enumerate(lines)
+            if "drunc.echo" in line and line.rstrip().endswith("test_wait")
+        ),
+        None,
+    )
+    assert echo_idx is not None, "Could not find drunc.echo marker 'test_wait' in stdout."
+
+    running_pattern = re.compile(r"Command wait running for (\d+) seconds\.")
+    ran_pattern = re.compile(r"Command wait ran for (\d+) seconds\.")
+    timestamp_pattern = re.compile(r"^\[(\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}) UTC\]")
+
+    running_idx = next(
+        (idx for idx in range(echo_idx + 1, len(lines)) if running_pattern.search(lines[idx])),
+        None,
+    )
+    assert running_idx is not None, "Did not find 'Command wait running for ... seconds.' after test_wait marker."
+
+    ran_idx = next(
+        (idx for idx in range(running_idx + 1, len(lines)) if ran_pattern.search(lines[idx])),
+        None,
+    )
+    assert ran_idx is not None, "Did not find 'Command wait ran for ... seconds.' after wait start log."
+
+    running_match = running_pattern.search(lines[running_idx])
+    ran_match = ran_pattern.search(lines[ran_idx])
+    assert running_match is not None
+    assert ran_match is not None
+
+    expected_seconds = 10
+    assert int(running_match.group(1)) == expected_seconds, (
+        f"Expected wait start log to report {expected_seconds} seconds, got {running_match.group(1)}."
+    )
+    assert int(ran_match.group(1)) == expected_seconds, (
+        f"Expected wait end log to report {expected_seconds} seconds, got {ran_match.group(1)}."
+    )
+
+    start_ts_match = timestamp_pattern.search(lines[running_idx])
+    end_ts_match = timestamp_pattern.search(lines[ran_idx])
+    assert start_ts_match is not None, "Could not parse timestamp in wait start log line."
+    assert end_ts_match is not None, "Could not parse timestamp in wait end log line."
+
+    start_ts = datetime.strptime(start_ts_match.group(1), "%Y/%m/%d %H:%M:%S")
+    end_ts = datetime.strptime(end_ts_match.group(1), "%Y/%m/%d %H:%M:%S")
+    elapsed_seconds = (end_ts - start_ts).total_seconds()
+
+    tolerance_seconds = 1
+    assert abs(elapsed_seconds - expected_seconds) <= tolerance_seconds, (
+        f"Expected wait log timestamps to differ by {expected_seconds}±{tolerance_seconds} seconds, "
+        f"got {elapsed_seconds} seconds."
+    )
+
+
+
+
+
 
 def test_nanorc_success(run_dunerc):
     # print the name of the current test
@@ -167,6 +345,28 @@ def test_root_controller_logs(run_dunerc) -> None:
 
 
 
+# def test_restart_changes_process_uuid(run_dunerc) -> None:
+#     stdout = run_dunerc.completed_process.stdout
+
+#     ps_before_restart = get_ps_table_after_echo(stdout, "ps_before_restart")
+#     ps_after_restart = get_ps_table_after_echo(stdout, "ps_after_restart")
+
+#     root_before = get_uuid_for_friendly_name(ps_before_restart, "root-controller")
+#     root_after = get_uuid_for_friendly_name(ps_after_restart, "root-controller")
+#     assert root_before != root_after, (
+#         "Expected root-controller UUID to change after restart, "
+#         f"but it stayed the same ({root_before})."
+#     )
+
+#     mlt_before = get_uuid_for_friendly_name(ps_before_restart, "mlt")
+#     mlt_after = get_uuid_for_friendly_name(ps_after_restart, "mlt")
+#     assert mlt_before != mlt_after, (
+#         "Expected mlt UUID to change after restart, "
+#         f"but it stayed the same ({mlt_before})."
+#     )
+
+
+
 def test_log_files(run_dunerc):
     # Check that at least some of the expected log files are present
     assert any(

From fcdb8cf543a068ad47fd3f891b44c37fa40a21f5 Mon Sep 17 00:00:00 2001
From: Emir Muhammad <emir.muhammad@cern.ch>
Date: Fri, 13 Mar 2026 15:48:52 +0100
Subject: [PATCH 09/29] Add more tests, fix tiny terminal bug again (different
 source)

---
 integtest/process_manager_test.py | 154 +++++++++++++++++++++---------
 1 file changed, 109 insertions(+), 45 deletions(-)

diff --git a/integtest/process_manager_test.py b/integtest/process_manager_test.py
index dc43ef511..04f9731f1 100644
--- a/integtest/process_manager_test.py
+++ b/integtest/process_manager_test.py
@@ -72,7 +72,13 @@
 # see #821
 
 dunerc_command_list = f"""
+
+echo pre_boot
+ps -u {getpass.getuser()}
 boot
+echo on_boot
+ps -u {getpass.getuser()}
+
 
 echo testing_logs
 logs --name unknown
@@ -82,23 +88,20 @@
 echo test_wait
 wait 10
 
-echo on_boot
-ps -u {getpass.getuser()}
-
-restart -n root-controller
+echo WE_STILL_NEED_TO_TEST-RESTART
 restart -n mlt
+restart -n root-controller
 wait 5
 
+
 echo pre_kill_mlt
 ps -u {getpass.getuser()}
-
 kill -n mlt
 wait 2
 echo post_kill_mlt
 ps -u {getpass.getuser()}
 
 
-
 restart -n mlt
 restart -n trg-controller
 wait 5
@@ -115,6 +118,7 @@
 
 UUID_RE = re.compile(
     r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$"
+    r"|^[0-9a-fA-F]{8}-[-0-9a-fA-F]*\u2026"  # truncated by Rich table column width
 )
 ANSI_ESCAPE_RE = re.compile(r"\x1B\[[0-9;]*[A-Za-z]")
 
@@ -123,7 +127,9 @@ def strip_ansi(text: str) -> str:
     return ANSI_ESCAPE_RE.sub("", text)
 
 
-def _parse_ps_table_from_index(lines: list[str], start_idx: int) -> list[dict[str, str]]:
+def _parse_ps_table_from_index(
+    lines: list[str], start_idx: int
+) -> list[dict[str, str]]:
     table_rows: list[dict[str, str]] = []
 
     for line in lines[start_idx + 1 :]:
@@ -165,23 +171,27 @@ def get_ps_table_after_echo(stdout: str, echo_marker: str) -> list[dict[str, str
         ),
         None,
     )
-    assert echo_idx is not None, f"Could not find drunc.echo marker '{echo_marker}' in stdout."
+    assert echo_idx is not None, (
+        f"Could not find drunc.echo marker '{echo_marker}' in stdout."
+    )
 
     table_start_idx = next(
-        (idx for idx in range(echo_idx + 1, len(lines)) if "Processes running" in lines[idx]),
+        (
+            idx
+            for idx in range(echo_idx + 1, len(lines))
+            if "Processes running" in lines[idx]
+        ),
         None,
     )
-    assert (
-        table_start_idx is not None
-    ), f"Could not find a 'Processes running' table after marker '{echo_marker}'."
+    if table_start_idx is None:
+        return []
 
-    table_rows = _parse_ps_table_from_index(lines, table_start_idx)
-    assert table_rows, f"Found table header after marker '{echo_marker}', but no rows were parsed."
+    return _parse_ps_table_from_index(lines, table_start_idx)
 
-    return table_rows
 
-
-def get_uuid_for_friendly_name(ps_table: list[dict[str, str]], friendly_name: str) -> str:
+def get_uuid_for_friendly_name(
+    ps_table: list[dict[str, str]], friendly_name: str
+) -> str:
     for row in ps_table:
         if row["friendly_name"].strip() == friendly_name:
             return row["uuid"]
@@ -193,6 +203,24 @@ def get_uuid_for_friendly_name(ps_table: list[dict[str, str]], friendly_name: st
     )
 
 
+def test_boot(run_dunerc) -> None:
+    stdout = run_dunerc.completed_process.stdout
+
+    ps_pre_boot = get_ps_table_after_echo(stdout, "pre_boot")
+    ps_on_boot = get_ps_table_after_echo(stdout, "on_boot")
+
+    assert not ps_pre_boot, (
+        f"Expected ps table before boot to be empty, but found {len(ps_pre_boot)} row(s): "
+        + ", ".join(row["friendly_name"] for row in ps_pre_boot)
+    )
+
+    assert ps_on_boot, (
+        "Expected ps table after boot to contain processes, but it was empty."
+    )
+    for row in ps_on_boot:
+        assert UUID_RE.match(row["uuid"]), (
+            f"Expected a valid UUID for process '{row['friendly_name']}', got '{row['uuid']}'"
+        )
 
 
 def test_kill_removes_mlt_from_ps_table(run_dunerc) -> None:
@@ -208,8 +236,25 @@ def test_kill_removes_mlt_from_ps_table(run_dunerc) -> None:
         row for row in ps_after_kill if row["friendly_name"].strip() == "mlt"
     ]
 
-    assert mlt_before_kill, "Expected to find 'mlt' in ps table before kill, but it was missing."
-    assert not mlt_after_kill, "Expected 'mlt' to be absent from ps table after kill, but it is still present."
+    assert mlt_before_kill, (
+        "Expected to find 'mlt' in ps table before kill, but it was missing."
+    )
+    assert not mlt_after_kill, (
+        "Expected 'mlt' to be absent from ps table after kill, but it is still present."
+    )
+
+
+def test_mlt_recovers_after_kill(run_dunerc) -> None:
+    stdout = run_dunerc.completed_process.stdout
+
+    ps_after_recovery = get_ps_table_after_echo(stdout, "ps_after_recovery")
+
+    mlt_after_recovery = [
+        row for row in ps_after_recovery if row["friendly_name"].strip() == "mlt"
+    ]
+    assert mlt_after_recovery, (
+        "Expected 'mlt' to be present in ps table after recovery, but it was missing."
+    )
 
 
 def test_wait_command_duration_from_logs(run_dunerc) -> None:
@@ -224,23 +269,37 @@ def test_wait_command_duration_from_logs(run_dunerc) -> None:
         ),
         None,
     )
-    assert echo_idx is not None, "Could not find drunc.echo marker 'test_wait' in stdout."
+    assert echo_idx is not None, (
+        "Could not find drunc.echo marker 'test_wait' in stdout."
+    )
 
     running_pattern = re.compile(r"Command wait running for (\d+) seconds\.")
     ran_pattern = re.compile(r"Command wait ran for (\d+) seconds\.")
     timestamp_pattern = re.compile(r"^\[(\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}) UTC\]")
 
     running_idx = next(
-        (idx for idx in range(echo_idx + 1, len(lines)) if running_pattern.search(lines[idx])),
+        (
+            idx
+            for idx in range(echo_idx + 1, len(lines))
+            if running_pattern.search(lines[idx])
+        ),
         None,
     )
-    assert running_idx is not None, "Did not find 'Command wait running for ... seconds.' after test_wait marker."
+    assert running_idx is not None, (
+        "Did not find 'Command wait running for ... seconds.' after test_wait marker."
+    )
 
     ran_idx = next(
-        (idx for idx in range(running_idx + 1, len(lines)) if ran_pattern.search(lines[idx])),
+        (
+            idx
+            for idx in range(running_idx + 1, len(lines))
+            if ran_pattern.search(lines[idx])
+        ),
         None,
     )
-    assert ran_idx is not None, "Did not find 'Command wait ran for ... seconds.' after wait start log."
+    assert ran_idx is not None, (
+        "Did not find 'Command wait ran for ... seconds.' after wait start log."
+    )
 
     running_match = running_pattern.search(lines[running_idx])
     ran_match = ran_pattern.search(lines[ran_idx])
@@ -257,7 +316,9 @@ def test_wait_command_duration_from_logs(run_dunerc) -> None:
 
     start_ts_match = timestamp_pattern.search(lines[running_idx])
     end_ts_match = timestamp_pattern.search(lines[ran_idx])
-    assert start_ts_match is not None, "Could not parse timestamp in wait start log line."
+    assert start_ts_match is not None, (
+        "Could not parse timestamp in wait start log line."
+    )
     assert end_ts_match is not None, "Could not parse timestamp in wait end log line."
 
     start_ts = datetime.strptime(start_ts_match.group(1), "%Y/%m/%d %H:%M:%S")
@@ -271,10 +332,6 @@ def test_wait_command_duration_from_logs(run_dunerc) -> None:
     )
 
 
-
-
-
-
 def test_nanorc_success(run_dunerc):
     # print the name of the current test
     current_test = os.environ.get("PYTEST_CURRENT_TEST")
@@ -291,8 +348,10 @@ def test_nanorc_success(run_dunerc):
 
 
 def test_log_command(run_dunerc) -> None:
-    test_str = "Bad query for logs: The process corresponding to the query doesn't exist"
-    assert test_str in run_dunerc.completed_process.stdout 
+    test_str = (
+        "Bad query for logs: The process corresponding to the query doesn't exist"
+    )
+    assert test_str in run_dunerc.completed_process.stdout
 
 
 def test_root_controller_logs(run_dunerc) -> None:
@@ -317,16 +376,19 @@ def test_root_controller_logs(run_dunerc) -> None:
         None,
     )
 
-    assert header_idx is not None, "Did not find the 'root-controller logs' header line in stdout."
-    assert footer_idx is not None, "Did not find the 'root-controller end' footer line in stdout."
+    assert header_idx is not None, (
+        "Did not find the 'root-controller logs' header line in stdout."
+    )
+    assert footer_idx is not None, (
+        "Did not find the 'root-controller end' footer line in stdout."
+    )
     assert footer_idx > header_idx, "Footer appears before header in stdout."
 
     # 2) Check there are 5 lines between header and footer
     between = lines[header_idx + 1 : footer_idx]
-    assert (
-        len(between) == 5
-    ), f"Expected exactly 5 lines between header and footer, found {len(between)}.\nBetween:\n" + "\n".join(
-        between
+    assert len(between) == 5, (
+        f"Expected exactly 5 lines between header and footer, found {len(between)}.\nBetween:\n"
+        + "\n".join(between)
     )
 
     # 3) Check the init_controller line ends with "Controller ready"
@@ -337,14 +399,12 @@ def test_root_controller_logs(run_dunerc) -> None:
     )
 
     matches = [line for line in between if init_controller_ready_re.search(line)]
-    assert (
-        len(matches) >= 1
-    ), "Did not find an init_controller line ending with 'Controller ready' within the 5 lines.\nBetween:\n" + "\n".join(
-        between
+    assert len(matches) >= 1, (
+        "Did not find an init_controller line ending with 'Controller ready' within the 5 lines.\nBetween:\n"
+        + "\n".join(between)
     )
 
 
-
 # def test_restart_changes_process_uuid(run_dunerc) -> None:
 #     stdout = run_dunerc.completed_process.stdout
 
@@ -366,7 +426,6 @@ def test_root_controller_logs(run_dunerc) -> None:
 #     )
 
 
-
 def test_log_files(run_dunerc):
     # Check that at least some of the expected log files are present
     assert any(
@@ -387,6 +446,11 @@ def test_log_files(run_dunerc):
         # Check that there are no warnings or errors in the log files
         assert log_file_checks.logs_are_error_free(
             [
-                logname for logname in run_dunerc.log_files if "process_manager" in str(logname)
-            ], True, True, ignored_logfile_problems
+                logname
+                for logname in run_dunerc.log_files
+                if "process_manager" in str(logname)
+            ],
+            True,
+            True,
+            ignored_logfile_problems,
         )

From ad5c80e532198789460a9c534226685cf64fb945 Mon Sep 17 00:00:00 2001
From: Emir Muhammad <emir.muhammad@cern.ch>
Date: Fri, 13 Mar 2026 16:11:24 +0100
Subject: [PATCH 10/29] Added final set of tests, ultra janky now

---
 integtest/process_manager_test.py | 102 +++++++++++++++++++++++-------
 1 file changed, 80 insertions(+), 22 deletions(-)

diff --git a/integtest/process_manager_test.py b/integtest/process_manager_test.py
index 04f9731f1..16b2d6c6d 100644
--- a/integtest/process_manager_test.py
+++ b/integtest/process_manager_test.py
@@ -88,10 +88,11 @@
 echo test_wait
 wait 10
 
-echo WE_STILL_NEED_TO_TEST-RESTART
+echo pre_restart_mlt
 restart -n mlt
 restart -n root-controller
 wait 5
+echo post_restart_mlt
 
 
 echo pre_kill_mlt
@@ -332,6 +333,84 @@ def test_wait_command_duration_from_logs(run_dunerc) -> None:
     )
 
 
+def test_restart_mlt_logs(run_dunerc) -> None:
+    stdout = run_dunerc.completed_process.stdout
+    lines = strip_ansi(stdout).splitlines()
+
+    echo_idx = next(
+        (
+            idx
+            for idx, line in enumerate(lines)
+            if "drunc.echo" in line and line.rstrip().endswith("pre_restart_mlt")
+        ),
+        None,
+    )
+    assert echo_idx is not None, (
+        "Could not find drunc.echo marker 'pre_restart_mlt' in stdout."
+    )
+
+    post_restart_idx = next(
+        (
+            idx
+            for idx, line in enumerate(lines)
+            if idx > echo_idx
+            and "drunc.echo" in line
+            and line.rstrip().endswith("post_restart_mlt")
+        ),
+        None,
+    )
+    assert post_restart_idx is not None, (
+        "Could not find drunc.echo marker 'post_restart_mlt' in stdout."
+    )
+
+    restart_lines = lines[echo_idx + 1 : post_restart_idx]
+    restart_text = "\n".join(restart_lines)
+
+    restart_request_match = re.search(
+        r"process_manager restarting \['mlt'\] in session",
+        restart_text,
+    )
+    assert restart_request_match is not None, (
+        "Did not find the mlt restart request log line between restart markers."
+    )
+
+    graceful_termination_match = re.search(
+        r"Remote process .*?terminated gracefully following SIGQUIT signal\.",
+        restart_text[restart_request_match.end() :],
+        re.DOTALL,
+    )
+    assert graceful_termination_match is not None, (
+        "Did not find the graceful termination log line for mlt after restart request."
+    )
+
+    exit_code_search_text = restart_text[
+        restart_request_match.end() + graceful_termination_match.end() :
+    ]
+    exit_code_match = re.search(
+        r"Process 'mlt'.*?process exited\s+with exit code 0",
+        exit_code_search_text,
+        re.DOTALL,
+    )
+    assert exit_code_match is not None, (
+        "Did not find the mlt exit-code log line after graceful termination."
+    )
+
+    booted_search_text = exit_code_search_text[exit_code_match.end() :]
+    booted_match = re.search(
+        r"Booted 'mlt'.*?with UUID\s+([^\s\n]+)",
+        booted_search_text,
+        re.DOTALL,
+    )
+    assert booted_match is not None, (
+        "Did not find the mlt boot log line after the restart exit log."
+    )
+
+    booted_uuid = booted_match.group(1)
+    assert UUID_RE.match(booted_uuid), (
+        f"Expected the mlt boot log to contain a UUID, got: {booted_uuid}"
+    )
+
+
 def test_nanorc_success(run_dunerc):
     # print the name of the current test
     current_test = os.environ.get("PYTEST_CURRENT_TEST")
@@ -405,27 +484,6 @@ def test_root_controller_logs(run_dunerc) -> None:
     )
 
 
-# def test_restart_changes_process_uuid(run_dunerc) -> None:
-#     stdout = run_dunerc.completed_process.stdout
-
-#     ps_before_restart = get_ps_table_after_echo(stdout, "ps_before_restart")
-#     ps_after_restart = get_ps_table_after_echo(stdout, "ps_after_restart")
-
-#     root_before = get_uuid_for_friendly_name(ps_before_restart, "root-controller")
-#     root_after = get_uuid_for_friendly_name(ps_after_restart, "root-controller")
-#     assert root_before != root_after, (
-#         "Expected root-controller UUID to change after restart, "
-#         f"but it stayed the same ({root_before})."
-#     )
-
-#     mlt_before = get_uuid_for_friendly_name(ps_before_restart, "mlt")
-#     mlt_after = get_uuid_for_friendly_name(ps_after_restart, "mlt")
-#     assert mlt_before != mlt_after, (
-#         "Expected mlt UUID to change after restart, "
-#         f"but it stayed the same ({mlt_before})."
-#     )
-
-
 def test_log_files(run_dunerc):
     # Check that at least some of the expected log files are present
     assert any(

From 6c454c4d0939acdef9df7924248ddb335d9c6c16 Mon Sep 17 00:00:00 2001
From: Emir Muhammad <emir.muhammad@cern.ch>
Date: Fri, 13 Mar 2026 16:31:38 +0100
Subject: [PATCH 11/29] Reorder tests

---
 integtest/process_manager_test.py | 225 ++++++++++++++++--------------
 1 file changed, 117 insertions(+), 108 deletions(-)

diff --git a/integtest/process_manager_test.py b/integtest/process_manager_test.py
index 16b2d6c6d..fa03801c4 100644
--- a/integtest/process_manager_test.py
+++ b/integtest/process_manager_test.py
@@ -205,6 +205,7 @@ def get_uuid_for_friendly_name(
 
 
 def test_boot(run_dunerc) -> None:
+    """Checks that boot starts the managed processes and exposes UUIDs in ps."""
     stdout = run_dunerc.completed_process.stdout
 
     ps_pre_boot = get_ps_table_after_echo(stdout, "pre_boot")
@@ -224,41 +225,67 @@ def test_boot(run_dunerc) -> None:
         )
 
 
-def test_kill_removes_mlt_from_ps_table(run_dunerc) -> None:
-    stdout = run_dunerc.completed_process.stdout
+def test_log_command(run_dunerc) -> None:
+    """Checks that querying logs for an unknown process reports the expected error."""
+    test_str = (
+        "Bad query for logs: The process corresponding to the query doesn't exist"
+    )
+    assert test_str in run_dunerc.completed_process.stdout
 
-    ps_before_kill = get_ps_table_after_echo(stdout, "pre_kill_mlt")
-    ps_after_kill = get_ps_table_after_echo(stdout, "post_kill_mlt")
 
-    mlt_before_kill = [
-        row for row in ps_before_kill if row["friendly_name"].strip() == "mlt"
-    ]
-    mlt_after_kill = [
-        row for row in ps_after_kill if row["friendly_name"].strip() == "mlt"
-    ]
+def test_root_controller_logs(run_dunerc) -> None:
+    """
+    Verifies that:
+    - the stdout contains a "root-controller logs" header line and a "root-controller end" footer line
+    - there are exactly 5 lines between those two lines
+    - among those 5 lines, the one from "drunc.controller.core.init_controller" ends with "Controller ready"
+    """
+    stdout = run_dunerc.completed_process.stdout
+    assert isinstance(stdout, str)
 
-    assert mlt_before_kill, (
-        "Expected to find 'mlt' in ps table before kill, but it was missing."
+    lines = stdout.splitlines()
+
+    # 1) Find the header/footer lines
+    header_idx = next(
+        (i for i, line in enumerate(lines) if "root-controller logs" in line),
+        None,
     )
-    assert not mlt_after_kill, (
-        "Expected 'mlt' to be absent from ps table after kill, but it is still present."
+    footer_idx = next(
+        (i for i, line in enumerate(lines) if "root-controller end" in line),
+        None,
     )
 
+    assert header_idx is not None, (
+        "Did not find the 'root-controller logs' header line in stdout."
+    )
+    assert footer_idx is not None, (
+        "Did not find the 'root-controller end' footer line in stdout."
+    )
+    assert footer_idx > header_idx, "Footer appears before header in stdout."
 
-def test_mlt_recovers_after_kill(run_dunerc) -> None:
-    stdout = run_dunerc.completed_process.stdout
+    # 2) Check there are 5 lines between header and footer
+    between = lines[header_idx + 1 : footer_idx]
+    assert len(between) == 5, (
+        f"Expected exactly 5 lines between header and footer, found {len(between)}.\nBetween:\n"
+        + "\n".join(between)
+    )
 
-    ps_after_recovery = get_ps_table_after_echo(stdout, "ps_after_recovery")
+    # 3) Check the init_controller line ends with "Controller ready"
+    # Example line:
+    # [2026/03/13 08:17:47 UTC] INFO ... drunc.controller.core.init_controller ... Controller ready
+    init_controller_ready_re = re.compile(
+        r"drunc\.controller\.core\.init_controller.*Controller ready\s*$"
+    )
 
-    mlt_after_recovery = [
-        row for row in ps_after_recovery if row["friendly_name"].strip() == "mlt"
-    ]
-    assert mlt_after_recovery, (
-        "Expected 'mlt' to be present in ps table after recovery, but it was missing."
+    matches = [line for line in between if init_controller_ready_re.search(line)]
+    assert len(matches) >= 1, (
+        "Did not find an init_controller line ending with 'Controller ready' within the 5 lines.\nBetween:\n"
+        + "\n".join(between)
     )
 
 
 def test_wait_command_duration_from_logs(run_dunerc) -> None:
+    """Checks that the wait command logs the expected duration and elapsed time."""
     stdout = run_dunerc.completed_process.stdout
     lines = strip_ansi(stdout).splitlines()
 
@@ -334,6 +361,7 @@ def test_wait_command_duration_from_logs(run_dunerc) -> None:
 
 
 def test_restart_mlt_logs(run_dunerc) -> None:
+    """Checks that restarting mlt produces the expected restart, exit, and boot logs."""
     stdout = run_dunerc.completed_process.stdout
     lines = strip_ansi(stdout).splitlines()
 
@@ -374,44 +402,82 @@ def test_restart_mlt_logs(run_dunerc) -> None:
         "Did not find the mlt restart request log line between restart markers."
     )
 
-    graceful_termination_match = re.search(
-        r"Remote process .*?terminated gracefully following SIGQUIT signal\.",
-        restart_text[restart_request_match.end() :],
-        re.DOTALL,
-    )
-    assert graceful_termination_match is not None, (
-        "Did not find the graceful termination log line for mlt after restart request."
-    )
+    #! Reinsert this in the future, but this log-based thing is super janky
+    # graceful_termination_match = re.search(
+    #     r"Remote process .*?terminated gracefully following SIGQUIT signal\.",
+    #     restart_text[restart_request_match.end() :],
+    #     re.DOTALL,
+    # )
+    # assert graceful_termination_match is not None, (
+    #     "Did not find the graceful termination log line for mlt after restart request."
+    # )
+
+    # exit_code_search_text = restart_text[
+    #     restart_request_match.end() + graceful_termination_match.end() :
+    # ]
+    # exit_code_match = re.search(
+    #     r"Process 'mlt'.*?process exited\s+with exit code 0",
+    #     exit_code_search_text,
+    #     re.DOTALL,
+    # )
+    # assert exit_code_match is not None, (
+    #     "Did not find the mlt exit-code log line after graceful termination."
+    # )
+
+    # booted_search_text = exit_code_search_text[exit_code_match.end() :]
+    # booted_match = re.search(
+    #     r"Booted 'mlt'.*?with UUID\s+([^\s\n]+)",
+    #     booted_search_text,
+    #     re.DOTALL,
+    # )
+    # assert booted_match is not None, (
+    #     "Did not find the mlt boot log line after the restart exit log."
+    # )
+
+    # booted_uuid = booted_match.group(1)
+    # assert UUID_RE.match(booted_uuid), (
+    #     f"Expected the mlt boot log to contain a UUID, got: {booted_uuid}"
+    # )
 
-    exit_code_search_text = restart_text[
-        restart_request_match.end() + graceful_termination_match.end() :
+
+def test_kill_removes_mlt_from_ps_table(run_dunerc) -> None:
+    """Checks that killing mlt removes it from the subsequent ps table."""
+    stdout = run_dunerc.completed_process.stdout
+
+    ps_before_kill = get_ps_table_after_echo(stdout, "pre_kill_mlt")
+    ps_after_kill = get_ps_table_after_echo(stdout, "post_kill_mlt")
+
+    mlt_before_kill = [
+        row for row in ps_before_kill if row["friendly_name"].strip() == "mlt"
+    ]
+    mlt_after_kill = [
+        row for row in ps_after_kill if row["friendly_name"].strip() == "mlt"
     ]
-    exit_code_match = re.search(
-        r"Process 'mlt'.*?process exited\s+with exit code 0",
-        exit_code_search_text,
-        re.DOTALL,
-    )
-    assert exit_code_match is not None, (
-        "Did not find the mlt exit-code log line after graceful termination."
-    )
 
-    booted_search_text = exit_code_search_text[exit_code_match.end() :]
-    booted_match = re.search(
-        r"Booted 'mlt'.*?with UUID\s+([^\s\n]+)",
-        booted_search_text,
-        re.DOTALL,
+    assert mlt_before_kill, (
+        "Expected to find 'mlt' in ps table before kill, but it was missing."
     )
-    assert booted_match is not None, (
-        "Did not find the mlt boot log line after the restart exit log."
+    assert not mlt_after_kill, (
+        "Expected 'mlt' to be absent from ps table after kill, but it is still present."
     )
 
-    booted_uuid = booted_match.group(1)
-    assert UUID_RE.match(booted_uuid), (
-        f"Expected the mlt boot log to contain a UUID, got: {booted_uuid}"
+
+def test_mlt_recovers_after_kill(run_dunerc) -> None:
+    """Checks that mlt is present again after the recovery restart sequence."""
+    stdout = run_dunerc.completed_process.stdout
+
+    ps_after_recovery = get_ps_table_after_echo(stdout, "ps_after_recovery")
+
+    mlt_after_recovery = [
+        row for row in ps_after_recovery if row["friendly_name"].strip() == "mlt"
+    ]
+    assert mlt_after_recovery, (
+        "Expected 'mlt' to be present in ps table after recovery, but it was missing."
     )
 
 
 def test_nanorc_success(run_dunerc):
+    """Checks that the drunc integration command sequence completes successfully."""
     # print the name of the current test
     current_test = os.environ.get("PYTEST_CURRENT_TEST")
     match_obj = re.search(r".*\[(.+)-run_.*rc.*\d].*", current_test)
@@ -426,65 +492,8 @@ def test_nanorc_success(run_dunerc):
     assert run_dunerc.completed_process.returncode == 0
 
 
-def test_log_command(run_dunerc) -> None:
-    test_str = (
-        "Bad query for logs: The process corresponding to the query doesn't exist"
-    )
-    assert test_str in run_dunerc.completed_process.stdout
-
-
-def test_root_controller_logs(run_dunerc) -> None:
-    """
-    Verifies that:
-    - the stdout contains a "root-controller logs" header line and a "root-controller end" footer line
-    - there are exactly 5 lines between those two lines
-    - among those 5 lines, the one from "drunc.controller.core.init_controller" ends with "Controller ready"
-    """
-    stdout = run_dunerc.completed_process.stdout
-    assert isinstance(stdout, str)
-
-    lines = stdout.splitlines()
-
-    # 1) Find the header/footer lines
-    header_idx = next(
-        (i for i, line in enumerate(lines) if "root-controller logs" in line),
-        None,
-    )
-    footer_idx = next(
-        (i for i, line in enumerate(lines) if "root-controller end" in line),
-        None,
-    )
-
-    assert header_idx is not None, (
-        "Did not find the 'root-controller logs' header line in stdout."
-    )
-    assert footer_idx is not None, (
-        "Did not find the 'root-controller end' footer line in stdout."
-    )
-    assert footer_idx > header_idx, "Footer appears before header in stdout."
-
-    # 2) Check there are 5 lines between header and footer
-    between = lines[header_idx + 1 : footer_idx]
-    assert len(between) == 5, (
-        f"Expected exactly 5 lines between header and footer, found {len(between)}.\nBetween:\n"
-        + "\n".join(between)
-    )
-
-    # 3) Check the init_controller line ends with "Controller ready"
-    # Example line:
-    # [2026/03/13 08:17:47 UTC] INFO ... drunc.controller.core.init_controller ... Controller ready
-    init_controller_ready_re = re.compile(
-        r"drunc\.controller\.core\.init_controller.*Controller ready\s*$"
-    )
-
-    matches = [line for line in between if init_controller_ready_re.search(line)]
-    assert len(matches) >= 1, (
-        "Did not find an init_controller line ending with 'Controller ready' within the 5 lines.\nBetween:\n"
-        + "\n".join(between)
-    )
-
-
 def test_log_files(run_dunerc):
+    """Checks that expected process-manager log files exist and are free of errors."""
     # Check that at least some of the expected log files are present
     assert any(
         f"{run_dunerc.session}_df-01" in str(logname)

From 1287a9d0a7edae052d1de30677e81f4771b56e81 Mon Sep 17 00:00:00 2001
From: Emir Muhammad <emir.muhammad@cern.ch>
Date: Fri, 13 Mar 2026 16:47:29 +0100
Subject: [PATCH 12/29] Add log echo (forgot to commit this)

---
 src/drunc/controller/interface/commands.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/src/drunc/controller/interface/commands.py b/src/drunc/controller/interface/commands.py
index 566f350c5..24d555d62 100644
--- a/src/drunc/controller/interface/commands.py
+++ b/src/drunc/controller/interface/commands.py
@@ -8,6 +8,7 @@
 from drunc.utils.utils import get_logger
 
 log = get_logger("controller.iface", rich_handler=True)
+log_echo = get_logger("echo", rich_handler=True)
 
 
 @click.command("list-transitions")
@@ -245,22 +246,26 @@ def who_am_i(obj: ControllerContext) -> None:
 
 # click_shell/_cmd.py, line 23. identchars only accepts ascii letters + digits + _
 # Can't really be used by the integ test tho..
-@click.command("comment", 
+@click.command(
+    "comment",
     hidden=True,
     context_settings=dict(
-    ignore_unknown_options=True,
-    allow_extra_args=True,
-))
+        ignore_unknown_options=True,
+        allow_extra_args=True,
+    ),
+)
 def comment_handler():
     """Ignore this line"""
     pass
 
+
 @click.command("echo")
 @click.argument("text", required=False)
 @click.pass_obj
 def echo(obj, text: str | None) -> None:
-    log.info(text or "")
-    
+    log_echo.info(text or "")
+
+
 @click.command("who-is-in-charge")
 @click.option("--target", type=str, help="The target to address", default="")
 @click.option(

From 1163ebb4469d9e4faa130af8a44f4bd419b3481f Mon Sep 17 00:00:00 2001
From: Emir Muhammad <emir.muhammad@cern.ch>
Date: Fri, 13 Mar 2026 16:53:25 +0100
Subject: [PATCH 13/29] fix ruff

---
 src/drunc/controller/interface/shell.py | 4 ++--
 src/drunc/unified_shell/shell.py        | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/drunc/controller/interface/shell.py b/src/drunc/controller/interface/shell.py
index 25373004c..a7135c127 100644
--- a/src/drunc/controller/interface/shell.py
+++ b/src/drunc/controller/interface/shell.py
@@ -6,8 +6,10 @@
 from daqpytools.logging import logging_log_levels
 
 from drunc.controller.interface.commands import (
+    comment_handler,
     connect,
     disconnect,
+    echo,
     exclude,
     expert_command,
     include,
@@ -17,8 +19,6 @@
     take_control,
     wait,
     who_am_i,
-    echo,
-    comment_handler,
     who_is_in_charge,
 )
 from drunc.controller.interface.shell_utils import (
diff --git a/src/drunc/unified_shell/shell.py b/src/drunc/unified_shell/shell.py
index 3d9efa1d7..ed068491b 100644
--- a/src/drunc/unified_shell/shell.py
+++ b/src/drunc/unified_shell/shell.py
@@ -18,8 +18,10 @@
 from drunc.connectivity_service.client import ConnectivityServiceClient
 from drunc.controller.configuration import ControllerConfHandler
 from drunc.controller.interface.commands import (
+    comment_handler,
     connect,
     disconnect,
+    echo,
     exclude,
     expert_command,
     include,
@@ -30,8 +32,6 @@
     to_error,
     wait,
     who_am_i,
-    echo,
-    comment_handler,
     who_is_in_charge,
 )
 from drunc.controller.interface.shell_utils import generate_fsm_command

From 8259ff02fb2dd8f653222bb1e6c847cc4dbb4f10 Mon Sep 17 00:00:00 2001
From: Emir Muhammad <emir.muhammad@cern.ch>
Date: Thu, 19 Mar 2026 15:09:09 +0100
Subject: [PATCH 14/29] Cleanup on repetition

---
 integtest/process_manager_test.py | 173 ++++++++++++++++--------------
 1 file changed, 90 insertions(+), 83 deletions(-)

diff --git a/integtest/process_manager_test.py b/integtest/process_manager_test.py
index fa03801c4..8572db54f 100644
--- a/integtest/process_manager_test.py
+++ b/integtest/process_manager_test.py
@@ -1,6 +1,7 @@
 import getpass
 import os
 import re
+from collections.abc import Callable
 from datetime import datetime
 
 import integrationtest.data_classes as data_classes
@@ -84,12 +85,15 @@
 logs --name unknown
 logs --name root-controller --how-far 5
 logs --name mlt --how-far 5
+echo testing_logs_done
 
 echo test_wait
 wait 10
+echo test_wait_done
 
 echo pre_restart_mlt
 restart -n mlt
+echo fixture_1
 restart -n root-controller
 wait 5
 echo post_restart_mlt
@@ -101,14 +105,17 @@
 wait 2
 echo post_kill_mlt
 ps -u {getpass.getuser()}
+echo kill_mlt_done
 
 
+echo ps_recovery
 restart -n mlt
 restart -n trg-controller
 wait 5
 
 echo ps_after_recovery
 ps -u {getpass.getuser()}
+echo ps_recovery_done
 
 
 flush
@@ -128,6 +135,58 @@ def strip_ansi(text: str) -> str:
     return ANSI_ESCAPE_RE.sub("", text)
 
 
+def find_line_index(
+    lines: list[str],
+    predicate: Callable[[str], bool],
+    *,
+    start_idx: int = 0,
+) -> int | None:
+    return next(
+        (idx for idx in range(start_idx, len(lines)) if predicate(lines[idx])),
+        None,
+    )
+
+
+def require_line_index(
+    lines: list[str],
+    predicate: Callable[[str], bool],
+    *,
+    error_message: str,
+    start_idx: int = 0,
+) -> int:
+    line_idx = find_line_index(lines, predicate, start_idx=start_idx)
+    assert line_idx is not None, error_message
+    return line_idx
+
+
+def require_line_containing(
+    lines: list[str],
+    text: str,
+    *,
+    error_message: str,
+    start_idx: int = 0,
+) -> int:
+    return require_line_index(
+        lines,
+        lambda line: text in line,
+        error_message=error_message,
+        start_idx=start_idx,
+    )
+
+
+def require_echo_marker_index(
+    lines: list[str], echo_marker: str, *, start_idx: int = 0
+) -> int:
+    return require_line_index(
+        lines,
+        lambda line: "drunc.echo" in line and line.rstrip().endswith(echo_marker),
+        error_message=(
+            f"Could not find drunc.echo marker '{echo_marker}' in stdout."
+        ),
+        start_idx=start_idx,
+    )
+
+
 def _parse_ps_table_from_index(
     lines: list[str], start_idx: int
 ) -> list[dict[str, str]]:
@@ -164,25 +223,12 @@ def _parse_ps_table_from_index(
 def get_ps_table_after_echo(stdout: str, echo_marker: str) -> list[dict[str, str]]:
     lines = strip_ansi(stdout).splitlines()
 
-    echo_idx = next(
-        (
-            idx
-            for idx, line in enumerate(lines)
-            if "drunc.echo" in line and line.rstrip().endswith(echo_marker)
-        ),
-        None,
-    )
-    assert echo_idx is not None, (
-        f"Could not find drunc.echo marker '{echo_marker}' in stdout."
-    )
+    echo_idx = require_echo_marker_index(lines, echo_marker)
 
-    table_start_idx = next(
-        (
-            idx
-            for idx in range(echo_idx + 1, len(lines))
-            if "Processes running" in lines[idx]
-        ),
-        None,
+    table_start_idx = find_line_index(
+        lines,
+        lambda line: "Processes running" in line,
+        start_idx=echo_idx + 1,
     )
     if table_start_idx is None:
         return []
@@ -246,20 +292,15 @@ def test_root_controller_logs(run_dunerc) -> None:
     lines = stdout.splitlines()
 
     # 1) Find the header/footer lines
-    header_idx = next(
-        (i for i, line in enumerate(lines) if "root-controller logs" in line),
-        None,
-    )
-    footer_idx = next(
-        (i for i, line in enumerate(lines) if "root-controller end" in line),
-        None,
+    header_idx = require_line_containing(
+        lines,
+        "root-controller logs",
+        error_message="Did not find the 'root-controller logs' header line in stdout.",
     )
-
-    assert header_idx is not None, (
-        "Did not find the 'root-controller logs' header line in stdout."
-    )
-    assert footer_idx is not None, (
-        "Did not find the 'root-controller end' footer line in stdout."
+    footer_idx = require_line_containing(
+        lines,
+        "root-controller end",
+        error_message="Did not find the 'root-controller end' footer line in stdout.",
     )
     assert footer_idx > header_idx, "Footer appears before header in stdout."
 
@@ -284,49 +325,34 @@ def test_root_controller_logs(run_dunerc) -> None:
     )
 
 
+#! This you need to take a look at more
 def test_wait_command_duration_from_logs(run_dunerc) -> None:
     """Checks that the wait command logs the expected duration and elapsed time."""
     stdout = run_dunerc.completed_process.stdout
     lines = strip_ansi(stdout).splitlines()
 
-    echo_idx = next(
-        (
-            idx
-            for idx, line in enumerate(lines)
-            if "drunc.echo" in line and line.rstrip().endswith("test_wait")
-        ),
-        None,
-    )
-    assert echo_idx is not None, (
-        "Could not find drunc.echo marker 'test_wait' in stdout."
-    )
+    echo_idx = require_echo_marker_index(lines, "test_wait")
 
     running_pattern = re.compile(r"Command wait running for (\d+) seconds\.")
     ran_pattern = re.compile(r"Command wait ran for (\d+) seconds\.")
     timestamp_pattern = re.compile(r"^\[(\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}) UTC\]")
 
-    running_idx = next(
-        (
-            idx
-            for idx in range(echo_idx + 1, len(lines))
-            if running_pattern.search(lines[idx])
+    running_idx = require_line_index(
+        lines,
+        lambda line: running_pattern.search(line) is not None,
+        error_message=(
+            "Did not find 'Command wait running for ... seconds.' after test_wait marker."
         ),
-        None,
-    )
-    assert running_idx is not None, (
-        "Did not find 'Command wait running for ... seconds.' after test_wait marker."
+        start_idx=echo_idx + 1,
     )
 
-    ran_idx = next(
-        (
-            idx
-            for idx in range(running_idx + 1, len(lines))
-            if ran_pattern.search(lines[idx])
+    ran_idx = require_line_index(
+        lines,
+        lambda line: ran_pattern.search(line) is not None,
+        error_message=(
+            "Did not find 'Command wait ran for ... seconds.' after wait start log."
         ),
-        None,
-    )
-    assert ran_idx is not None, (
-        "Did not find 'Command wait ran for ... seconds.' after wait start log."
+        start_idx=running_idx + 1,
     )
 
     running_match = running_pattern.search(lines[running_idx])
@@ -360,35 +386,16 @@ def test_wait_command_duration_from_logs(run_dunerc) -> None:
     )
 
 
+#! This you need to take a look at more
 def test_restart_mlt_logs(run_dunerc) -> None:
     """Checks that restarting mlt produces the expected restart, exit, and boot logs."""
     stdout = run_dunerc.completed_process.stdout
     lines = strip_ansi(stdout).splitlines()
 
-    echo_idx = next(
-        (
-            idx
-            for idx, line in enumerate(lines)
-            if "drunc.echo" in line and line.rstrip().endswith("pre_restart_mlt")
-        ),
-        None,
-    )
-    assert echo_idx is not None, (
-        "Could not find drunc.echo marker 'pre_restart_mlt' in stdout."
-    )
+    echo_idx = require_echo_marker_index(lines, "pre_restart_mlt")
 
-    post_restart_idx = next(
-        (
-            idx
-            for idx, line in enumerate(lines)
-            if idx > echo_idx
-            and "drunc.echo" in line
-            and line.rstrip().endswith("post_restart_mlt")
-        ),
-        None,
-    )
-    assert post_restart_idx is not None, (
-        "Could not find drunc.echo marker 'post_restart_mlt' in stdout."
+    post_restart_idx = require_echo_marker_index(
+        lines, "post_restart_mlt", start_idx=echo_idx + 1
     )
 
     restart_lines = lines[echo_idx + 1 : post_restart_idx]

From a075cf68c902d4ce13d3fd06237d661d9c6fb485 Mon Sep 17 00:00:00 2001
From: Emir Muhammad <emir.muhammad@cern.ch>
Date: Mon, 16 Mar 2026 16:39:14 +0100
Subject: [PATCH 15/29] More cleanup with helper functions

---
 integtest/process_manager_test.py | 133 ++++++++++++++++++++----------
 1 file changed, 90 insertions(+), 43 deletions(-)

diff --git a/integtest/process_manager_test.py b/integtest/process_manager_test.py
index 8572db54f..688bee78c 100644
--- a/integtest/process_manager_test.py
+++ b/integtest/process_manager_test.py
@@ -180,11 +180,38 @@ def require_echo_marker_index(
     return require_line_index(
         lines,
         lambda line: "drunc.echo" in line and line.rstrip().endswith(echo_marker),
-        error_message=(
-            f"Could not find drunc.echo marker '{echo_marker}' in stdout."
-        ),
+        error_message=(f"Could not find drunc.echo marker '{echo_marker}' in stdout."),
+        start_idx=start_idx,
+    )
+
+
+def require_pattern_match_index(
+    lines: list[str],
+    pattern: re.Pattern[str],
+    *,
+    error_message: str,
+    start_idx: int = 0,
+) -> tuple[int, re.Match[str]]:
+    line_idx = require_line_index(
+        lines,
+        lambda line: pattern.search(line) is not None,
+        error_message=error_message,
         start_idx=start_idx,
     )
+    match = pattern.search(lines[line_idx])
+    assert match is not None
+    return line_idx, match
+
+
+def require_pattern_match(
+    text: str,
+    pattern: re.Pattern[str],
+    *,
+    error_message: str,
+) -> re.Match[str]:
+    match = pattern.search(text)
+    assert match is not None, error_message
+    return match
 
 
 def _parse_ps_table_from_index(
@@ -250,6 +277,47 @@ def get_uuid_for_friendly_name(
     )
 
 
+def get_rows_for_friendly_name(
+    ps_table: list[dict[str, str]], friendly_name: str
+) -> list[dict[str, str]]:
+    return [row for row in ps_table if row["friendly_name"].strip() == friendly_name]
+
+
+def assert_process_presence(
+    ps_table: list[dict[str, str]],
+    friendly_name: str,
+    *,
+    expected_present: bool,
+    context: str,
+) -> None:
+    matching_rows = get_rows_for_friendly_name(ps_table, friendly_name)
+
+    if expected_present:
+        assert matching_rows, (
+            f"Expected to find '{friendly_name}' in ps table {context}, but it was missing."
+        )
+        return
+
+    assert not matching_rows, (
+        f"Expected '{friendly_name}' to be absent from ps table {context}, but it is still present."
+    )
+
+
+def assert_process(
+    ps_table: list[dict[str, str]],
+    friendly_name: str,
+    *,
+    context: str,
+    expected_present=True,
+) -> None:
+    assert_process_presence(
+        ps_table,
+        friendly_name,
+        expected_present=expected_present,
+        context=context,
+    )
+
+
 def test_boot(run_dunerc) -> None:
     """Checks that boot starts the managed processes and exposes UUIDs in ps."""
     stdout = run_dunerc.completed_process.stdout
@@ -337,29 +405,24 @@ def test_wait_command_duration_from_logs(run_dunerc) -> None:
     ran_pattern = re.compile(r"Command wait ran for (\d+) seconds\.")
     timestamp_pattern = re.compile(r"^\[(\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}) UTC\]")
 
-    running_idx = require_line_index(
+    running_idx, running_match = require_pattern_match_index(
         lines,
-        lambda line: running_pattern.search(line) is not None,
+        running_pattern,
         error_message=(
             "Did not find 'Command wait running for ... seconds.' after test_wait marker."
         ),
         start_idx=echo_idx + 1,
     )
 
-    ran_idx = require_line_index(
+    ran_idx, ran_match = require_pattern_match_index(
         lines,
-        lambda line: ran_pattern.search(line) is not None,
+        ran_pattern,
         error_message=(
             "Did not find 'Command wait ran for ... seconds.' after wait start log."
         ),
         start_idx=running_idx + 1,
     )
 
-    running_match = running_pattern.search(lines[running_idx])
-    ran_match = ran_pattern.search(lines[ran_idx])
-    assert running_match is not None
-    assert ran_match is not None
-
     expected_seconds = 10
     assert int(running_match.group(1)) == expected_seconds, (
         f"Expected wait start log to report {expected_seconds} seconds, got {running_match.group(1)}."
@@ -368,12 +431,16 @@ def test_wait_command_duration_from_logs(run_dunerc) -> None:
         f"Expected wait end log to report {expected_seconds} seconds, got {ran_match.group(1)}."
     )
 
-    start_ts_match = timestamp_pattern.search(lines[running_idx])
-    end_ts_match = timestamp_pattern.search(lines[ran_idx])
-    assert start_ts_match is not None, (
-        "Could not parse timestamp in wait start log line."
+    start_ts_match = require_pattern_match(
+        lines[running_idx],
+        timestamp_pattern,
+        error_message="Could not parse timestamp in wait start log line.",
+    )
+    end_ts_match = require_pattern_match(
+        lines[ran_idx],
+        timestamp_pattern,
+        error_message="Could not parse timestamp in wait end log line.",
     )
-    assert end_ts_match is not None, "Could not parse timestamp in wait end log line."
 
     start_ts = datetime.strptime(start_ts_match.group(1), "%Y/%m/%d %H:%M:%S")
     end_ts = datetime.strptime(end_ts_match.group(1), "%Y/%m/%d %H:%M:%S")
@@ -401,12 +468,10 @@ def test_restart_mlt_logs(run_dunerc) -> None:
     restart_lines = lines[echo_idx + 1 : post_restart_idx]
     restart_text = "\n".join(restart_lines)
 
-    restart_request_match = re.search(
-        r"process_manager restarting \['mlt'\] in session",
+    restart_request_match = require_pattern_match(
         restart_text,
-    )
-    assert restart_request_match is not None, (
-        "Did not find the mlt restart request log line between restart markers."
+        re.compile(r"process_manager restarting \['mlt'\] in session"),
+        error_message="Did not find the mlt restart request log line between restart markers.",
     )
 
     #! Reinsert this in the future, but this log-based thing is super janky
@@ -454,33 +519,15 @@ def test_kill_removes_mlt_from_ps_table(run_dunerc) -> None:
     ps_before_kill = get_ps_table_after_echo(stdout, "pre_kill_mlt")
     ps_after_kill = get_ps_table_after_echo(stdout, "post_kill_mlt")
 
-    mlt_before_kill = [
-        row for row in ps_before_kill if row["friendly_name"].strip() == "mlt"
-    ]
-    mlt_after_kill = [
-        row for row in ps_after_kill if row["friendly_name"].strip() == "mlt"
-    ]
-
-    assert mlt_before_kill, (
-        "Expected to find 'mlt' in ps table before kill, but it was missing."
-    )
-    assert not mlt_after_kill, (
-        "Expected 'mlt' to be absent from ps table after kill, but it is still present."
-    )
+    assert_process(ps_before_kill, "mlt", context="before kill")
+    assert_process(ps_after_kill, "mlt", context="after kill", expected_present=False)
 
 
 def test_mlt_recovers_after_kill(run_dunerc) -> None:
     """Checks that mlt is present again after the recovery restart sequence."""
     stdout = run_dunerc.completed_process.stdout
-
     ps_after_recovery = get_ps_table_after_echo(stdout, "ps_after_recovery")
-
-    mlt_after_recovery = [
-        row for row in ps_after_recovery if row["friendly_name"].strip() == "mlt"
-    ]
-    assert mlt_after_recovery, (
-        "Expected 'mlt' to be present in ps table after recovery, but it was missing."
-    )
+    assert_process(ps_after_recovery, "mlt", context="after recovery")
 
 
 def test_nanorc_success(run_dunerc):

From 1e73641fdc397444a2da5fef216c735c03556efd Mon Sep 17 00:00:00 2001
From: Emir Muhammad <emir.muhammad@cern.ch>
Date: Mon, 16 Mar 2026 16:48:57 +0100
Subject: [PATCH 16/29] move helper functions to general integ test utils

---
 integtest/integ_test_utils.py     | 246 ++++++++++++++++++++++++++++++
 integtest/process_manager_test.py | 197 ++----------------------
 2 files changed, 255 insertions(+), 188 deletions(-)
 create mode 100644 integtest/integ_test_utils.py

diff --git a/integtest/integ_test_utils.py b/integtest/integ_test_utils.py
new file mode 100644
index 000000000..f88baeb52
--- /dev/null
+++ b/integtest/integ_test_utils.py
@@ -0,0 +1,246 @@
+"""Shared helpers for drunc integration tests.
+
+This module centralizes common patterns used by process-manager integration tests.
+Importantly, most of these are defined to help with processing the stdout log outputs
+of the integ tests.
+
+Common functions include:
+- searching ordered log output for marker lines,
+- requiring regex/string matches with informative assertion errors,
+- extracting process-table rows from `ps` command output,
+- asserting process presence/absence by friendly name.
+
+The helpers are intentionally lightweight and pytest-friendly: failures are
+reported through `assert` with context-rich messages.
+"""
+
+import re
+from collections.abc import Callable
+
+ANSI_ESCAPE_RE = re.compile(r"\x1B\[[0-9;]*[A-Za-z]")
+
+
+def strip_ansi(text: str) -> str:
+    """Remove ANSI escape codes from a text block."""
+    return ANSI_ESCAPE_RE.sub("", text)
+
+
+def find_line_index(
+    lines: list[str],
+    predicate: Callable[[str], bool],
+    *,
+    start_idx: int = 0,
+) -> int | None:
+    """Return the first line index at or after `start_idx` matching `predicate`.
+
+    Returns `None` when no line matches.
+    """
+    return next(
+        (idx for idx in range(start_idx, len(lines)) if predicate(lines[idx])),
+        None,
+    )
+
+
+def require_line_index(
+    lines: list[str],
+    predicate: Callable[[str], bool],
+    *,
+    error_message: str,
+    start_idx: int = 0,
+) -> int:
+    """Like `find_line_index`, but assert a match exists and return its index."""
+    line_idx = find_line_index(lines, predicate, start_idx=start_idx)
+    assert line_idx is not None, error_message
+    return line_idx
+
+
+def require_line_containing(
+    lines: list[str],
+    text: str,
+    *,
+    error_message: str,
+    start_idx: int = 0,
+) -> int:
+    """Assert and return index of the first line containing `text`."""
+    return require_line_index(
+        lines,
+        lambda line: text in line,
+        error_message=error_message,
+        start_idx=start_idx,
+    )
+
+
+def require_echo_marker_index(
+    lines: list[str], echo_marker: str, *, start_idx: int = 0
+) -> int:
+    """Assert and return index of a `drunc.echo` line ending with `echo_marker`.
+    The `drunc.echo` match is hardcoded since echo is a specific callable function with its own logger.
+    """
+    return require_line_index(
+        lines,
+        lambda line: "drunc.echo" in line and line.rstrip().endswith(echo_marker),
+        error_message=(f"Could not find drunc.echo marker '{echo_marker}' in stdout."),
+        start_idx=start_idx,
+    )
+
+
+def require_pattern_match_index(
+    lines: list[str],
+    pattern: re.Pattern[str],
+    *,
+    error_message: str,
+    start_idx: int = 0,
+) -> tuple[int, re.Match[str]]:
+    """Assert and return `(index, match)` for first line matching `pattern`."""
+    line_idx = require_line_index(
+        lines,
+        lambda line: pattern.search(line) is not None,
+        error_message=error_message,
+        start_idx=start_idx,
+    )
+    match = pattern.search(lines[line_idx])
+    assert match is not None
+    return line_idx, match
+
+
+def require_pattern_match(
+    text: str,
+    pattern: re.Pattern[str],
+    *,
+    error_message: str,
+) -> re.Match[str]:
+    """Assert `pattern` matches `text` and return the `re.Match` object."""
+    match = pattern.search(text)
+    assert match is not None, error_message
+    return match
+
+
+def _parse_ps_table_from_index(
+    lines: list[str], start_idx: int
+) -> list[dict[str, str]]:
+    """Parse a Unicode table of processes starting after `start_idx`.
+
+    The parser expects rows that start with `│` and stops at a line starting
+    with `└`. It returns dictionaries with normalized column names.
+    """
+    table_rows: list[dict[str, str]] = []
+
+    for line in lines[start_idx + 1 :]:
+        stripped = line.strip()
+
+        if stripped.startswith("└"):
+            break
+
+        if not stripped.startswith("│"):
+            continue
+
+        cells = [cell.strip() for cell in stripped.strip("│").split("│")]
+        if len(cells) < 7:
+            continue
+
+        table_rows.append(
+            {
+                "session": cells[0],
+                "friendly_name": cells[1],
+                "user": cells[2],
+                "host": cells[3],
+                "uuid": cells[4],
+                "alive": cells[5],
+                "exit_code": cells[6],
+            }
+        )
+
+    return table_rows
+
+
+def get_ps_table_after_echo(stdout: str, echo_marker: str) -> list[dict[str, str]]:
+    """Return parsed process-table rows found after a specific echo marker.
+
+    If no process table is found after the marker, returns an empty list.
+    """
+    lines = strip_ansi(stdout).splitlines()
+
+    echo_idx = require_echo_marker_index(lines, echo_marker)
+
+    table_start_idx = find_line_index(
+        lines,
+        lambda line: "Processes running" in line,
+        start_idx=echo_idx + 1,
+    )
+    if table_start_idx is None:
+        return []
+
+    return _parse_ps_table_from_index(lines, table_start_idx)
+
+
+def get_uuid_for_friendly_name(
+    ps_table: list[dict[str, str]], friendly_name: str
+) -> str:
+    """Return UUID for `friendly_name` from a parsed process table.
+
+    Raises:
+        AssertionError: if the friendly name is absent.
+    """
+    for row in ps_table:
+        if row["friendly_name"].strip() == friendly_name:
+            return row["uuid"]
+
+    available_names = ", ".join(row["friendly_name"].strip() for row in ps_table)
+    raise AssertionError(
+        f"Could not find friendly name '{friendly_name}' in ps table. "
+        f"Available names: {available_names}"
+    )
+
+
+def get_rows_for_friendly_name(
+    ps_table: list[dict[str, str]], friendly_name: str
+) -> list[dict[str, str]]:
+    """Return all rows whose `friendly_name` matches exactly after stripping."""
+    return [row for row in ps_table if row["friendly_name"].strip() == friendly_name]
+
+
+def assert_process_presence(
+    ps_table: list[dict[str, str]],
+    friendly_name: str,
+    *,
+    expected_present: bool,
+    context: str,
+) -> None:
+    """Assert whether a process is present/absent in a process table.
+
+    Args:
+        ps_table: Parsed process rows.
+        friendly_name: Process name to check.
+        expected_present: `True` if process should exist, `False` otherwise.
+        context: Short phrase appended to error text (e.g. "before kill").
+    """
+    matching_rows = get_rows_for_friendly_name(ps_table, friendly_name)
+
+    if expected_present:
+        assert matching_rows, (
+            f"Expected to find '{friendly_name}' in ps table {context}, but it was missing."
+        )
+        return
+
+    assert not matching_rows, (
+        f"Expected '{friendly_name}' to be absent from ps table {context}, but it is still present."
+    )
+
+
+def assert_process(
+    ps_table: list[dict[str, str]],
+    friendly_name: str,
+    *,
+    context: str,
+    expected_present: bool = True,
+) -> None:
+    """Convenience wrapper around `assert_process_presence`.
+
+    By default, asserts that the process is present.
+    """
+    assert_process_presence(
+        ps_table,
+        friendly_name,
+        expected_present=expected_present,
+        context=context,
+    )
\ No newline at end of file
diff --git a/integtest/process_manager_test.py b/integtest/process_manager_test.py
index 688bee78c..4d8dc73ec 100644
--- a/integtest/process_manager_test.py
+++ b/integtest/process_manager_test.py
@@ -1,11 +1,19 @@
 import getpass
 import os
 import re
-from collections.abc import Callable
 from datetime import datetime
 
 import integrationtest.data_classes as data_classes
 import integrationtest.log_file_checks as log_file_checks
+from integ_test_utils import (
+    assert_process,
+    get_ps_table_after_echo,
+    require_echo_marker_index,
+    require_line_containing,
+    require_pattern_match,
+    require_pattern_match_index,
+    strip_ansi,
+)
 
 pytest_plugins = "integrationtest.integrationtest_drunc"
 
@@ -128,194 +136,7 @@
     r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$"
     r"|^[0-9a-fA-F]{8}-[-0-9a-fA-F]*\u2026"  # truncated by Rich table column width
 )
-ANSI_ESCAPE_RE = re.compile(r"\x1B\[[0-9;]*[A-Za-z]")
-
-
-def strip_ansi(text: str) -> str:
-    return ANSI_ESCAPE_RE.sub("", text)
-
-
-def find_line_index(
-    lines: list[str],
-    predicate: Callable[[str], bool],
-    *,
-    start_idx: int = 0,
-) -> int | None:
-    return next(
-        (idx for idx in range(start_idx, len(lines)) if predicate(lines[idx])),
-        None,
-    )
-
-
-def require_line_index(
-    lines: list[str],
-    predicate: Callable[[str], bool],
-    *,
-    error_message: str,
-    start_idx: int = 0,
-) -> int:
-    line_idx = find_line_index(lines, predicate, start_idx=start_idx)
-    assert line_idx is not None, error_message
-    return line_idx
-
-
-def require_line_containing(
-    lines: list[str],
-    text: str,
-    *,
-    error_message: str,
-    start_idx: int = 0,
-) -> int:
-    return require_line_index(
-        lines,
-        lambda line: text in line,
-        error_message=error_message,
-        start_idx=start_idx,
-    )
-
 
-def require_echo_marker_index(
-    lines: list[str], echo_marker: str, *, start_idx: int = 0
-) -> int:
-    return require_line_index(
-        lines,
-        lambda line: "drunc.echo" in line and line.rstrip().endswith(echo_marker),
-        error_message=(f"Could not find drunc.echo marker '{echo_marker}' in stdout."),
-        start_idx=start_idx,
-    )
-
-
-def require_pattern_match_index(
-    lines: list[str],
-    pattern: re.Pattern[str],
-    *,
-    error_message: str,
-    start_idx: int = 0,
-) -> tuple[int, re.Match[str]]:
-    line_idx = require_line_index(
-        lines,
-        lambda line: pattern.search(line) is not None,
-        error_message=error_message,
-        start_idx=start_idx,
-    )
-    match = pattern.search(lines[line_idx])
-    assert match is not None
-    return line_idx, match
-
-
-def require_pattern_match(
-    text: str,
-    pattern: re.Pattern[str],
-    *,
-    error_message: str,
-) -> re.Match[str]:
-    match = pattern.search(text)
-    assert match is not None, error_message
-    return match
-
-
-def _parse_ps_table_from_index(
-    lines: list[str], start_idx: int
-) -> list[dict[str, str]]:
-    table_rows: list[dict[str, str]] = []
-
-    for line in lines[start_idx + 1 :]:
-        stripped = line.strip()
-
-        if stripped.startswith("└"):
-            break
-
-        if not stripped.startswith("│"):
-            continue
-
-        cells = [cell.strip() for cell in stripped.strip("│").split("│")]
-        if len(cells) < 7:
-            continue
-
-        table_rows.append(
-            {
-                "session": cells[0],
-                "friendly_name": cells[1],
-                "user": cells[2],
-                "host": cells[3],
-                "uuid": cells[4],
-                "alive": cells[5],
-                "exit_code": cells[6],
-            }
-        )
-
-    return table_rows
-
-
-def get_ps_table_after_echo(stdout: str, echo_marker: str) -> list[dict[str, str]]:
-    lines = strip_ansi(stdout).splitlines()
-
-    echo_idx = require_echo_marker_index(lines, echo_marker)
-
-    table_start_idx = find_line_index(
-        lines,
-        lambda line: "Processes running" in line,
-        start_idx=echo_idx + 1,
-    )
-    if table_start_idx is None:
-        return []
-
-    return _parse_ps_table_from_index(lines, table_start_idx)
-
-
-def get_uuid_for_friendly_name(
-    ps_table: list[dict[str, str]], friendly_name: str
-) -> str:
-    for row in ps_table:
-        if row["friendly_name"].strip() == friendly_name:
-            return row["uuid"]
-
-    available_names = ", ".join(row["friendly_name"].strip() for row in ps_table)
-    raise AssertionError(
-        f"Could not find friendly name '{friendly_name}' in ps table. "
-        f"Available names: {available_names}"
-    )
-
-
-def get_rows_for_friendly_name(
-    ps_table: list[dict[str, str]], friendly_name: str
-) -> list[dict[str, str]]:
-    return [row for row in ps_table if row["friendly_name"].strip() == friendly_name]
-
-
-def assert_process_presence(
-    ps_table: list[dict[str, str]],
-    friendly_name: str,
-    *,
-    expected_present: bool,
-    context: str,
-) -> None:
-    matching_rows = get_rows_for_friendly_name(ps_table, friendly_name)
-
-    if expected_present:
-        assert matching_rows, (
-            f"Expected to find '{friendly_name}' in ps table {context}, but it was missing."
-        )
-        return
-
-    assert not matching_rows, (
-        f"Expected '{friendly_name}' to be absent from ps table {context}, but it is still present."
-    )
-
-
-def assert_process(
-    ps_table: list[dict[str, str]],
-    friendly_name: str,
-    *,
-    context: str,
-    expected_present=True,
-) -> None:
-    assert_process_presence(
-        ps_table,
-        friendly_name,
-        expected_present=expected_present,
-        context=context,
-    )
 
 
 def test_boot(run_dunerc) -> None:

From 30aa804431455d632707be7bc4b4ea23861aa762 Mon Sep 17 00:00:00 2001
From: Emir Muhammad <emir.muhammad@cern.ch>
Date: Mon, 16 Mar 2026 18:15:54 +0100
Subject: [PATCH 17/29] Fix mlt logs and minor cleanup

---
 integtest/process_manager_test.py | 176 +++++++++++++-----------------
 1 file changed, 76 insertions(+), 100 deletions(-)

diff --git a/integtest/process_manager_test.py b/integtest/process_manager_test.py
index 4d8dc73ec..b486ec6c3 100644
--- a/integtest/process_manager_test.py
+++ b/integtest/process_manager_test.py
@@ -76,7 +76,7 @@
 
 
 confgen_arguments = {"MinimalSystem": conf_dict}
-# The commands to run in nanorc, as a list
+# The commands to run in dunerc
 # NOTE THAT WE HAVE NOT TESTED FLUSH BECAUSE IT IS BROKEN
 # see #821
 
@@ -120,7 +120,6 @@
 restart -n mlt
 restart -n trg-controller
 wait 5
-
 echo ps_after_recovery
 ps -u {getpass.getuser()}
 echo ps_recovery_done
@@ -138,6 +137,52 @@
 )
 
 
+def test_nanorc_success(run_dunerc) -> None:
+    """Checks that the drunc integration command sequence completes successfully."""
+    # print the name of the current test
+    current_test = os.environ.get("PYTEST_CURRENT_TEST")
+    match_obj = re.search(r".*\[(.+)-run_.*rc.*\d].*", current_test)
+    if match_obj:
+        current_test = match_obj.group(1)
+    banner_line = re.sub(".", "=", current_test)
+    print(banner_line)
+    print(current_test)
+    print(banner_line)
+
+    # Check that nanorc completed correctly
+    assert run_dunerc.completed_process.returncode == 0
+
+
+def test_log_files(run_dunerc) -> None:
+    """Checks that expected process-manager log files exist and are free of errors."""
+    # Check that at least some of the expected log files are present
+    assert any(
+        f"{run_dunerc.session}_df-01" in str(logname)
+        for logname in run_dunerc.log_files
+    )
+    assert any(
+        f"{run_dunerc.session}_dfo" in str(logname) for logname in run_dunerc.log_files
+    )
+    assert any(
+        f"{run_dunerc.session}_mlt" in str(logname) for logname in run_dunerc.log_files
+    )
+    assert any(
+        f"{run_dunerc.session}_ru" in str(logname) for logname in run_dunerc.log_files
+    )
+
+    if check_for_logfile_errors:
+        # Check that there are no warnings or errors in the log files
+        assert log_file_checks.logs_are_error_free(
+            [
+                logname
+                for logname in run_dunerc.log_files
+                if "process_manager" in str(logname)
+            ],
+            True,
+            True,
+            ignored_logfile_problems,
+        )
+
 
 def test_boot(run_dunerc) -> None:
     """Checks that boot starts the managed processes and exposes UUIDs in ps."""
@@ -160,7 +205,7 @@ def test_boot(run_dunerc) -> None:
         )
 
 
-def test_log_command(run_dunerc) -> None:
+def test_unknown_log_command(run_dunerc) -> None:
     """Checks that querying logs for an unknown process reports the expected error."""
     test_str = (
         "Bad query for logs: The process corresponding to the query doesn't exist"
@@ -175,10 +220,7 @@ def test_root_controller_logs(run_dunerc) -> None:
     - there are exactly 5 lines between those two lines
     - among those 5 lines, the one from "drunc.controller.core.init_controller" ends with "Controller ready"
     """
-    stdout = run_dunerc.completed_process.stdout
-    assert isinstance(stdout, str)
-
-    lines = stdout.splitlines()
+    lines = run_dunerc.completed_process.stdout.splitlines()
 
     # 1) Find the header/footer lines
     header_idx = require_line_containing(
@@ -200,7 +242,7 @@ def test_root_controller_logs(run_dunerc) -> None:
         + "\n".join(between)
     )
 
-    # 3) Check the init_controller line ends with "Controller ready"
+    # 3) Check that one of the init_controller lines ends with "Controller ready"
     # Example line:
     # [2026/03/13 08:17:47 UTC] INFO ... drunc.controller.core.init_controller ... Controller ready
     init_controller_ready_re = re.compile(
@@ -214,11 +256,9 @@ def test_root_controller_logs(run_dunerc) -> None:
     )
 
 
-#! This you need to take a look at more
 def test_wait_command_duration_from_logs(run_dunerc) -> None:
     """Checks that the wait command logs the expected duration and elapsed time."""
-    stdout = run_dunerc.completed_process.stdout
-    lines = strip_ansi(stdout).splitlines()
+    lines = strip_ansi(run_dunerc.completed_process.stdout).splitlines()
 
     echo_idx = require_echo_marker_index(lines, "test_wait")
 
@@ -263,8 +303,9 @@ def test_wait_command_duration_from_logs(run_dunerc) -> None:
         error_message="Could not parse timestamp in wait end log line.",
     )
 
-    start_ts = datetime.strptime(start_ts_match.group(1), "%Y/%m/%d %H:%M:%S")
-    end_ts = datetime.strptime(end_ts_match.group(1), "%Y/%m/%d %H:%M:%S")
+    ts_strp_pattern = "%Y/%m/%d %H:%M:%S"
+    start_ts = datetime.strptime(start_ts_match.group(1), ts_strp_pattern)
+    end_ts = datetime.strptime(end_ts_match.group(1), ts_strp_pattern)
     elapsed_seconds = (end_ts - start_ts).total_seconds()
 
     tolerance_seconds = 1
@@ -274,7 +315,6 @@ def test_wait_command_duration_from_logs(run_dunerc) -> None:
     )
 
 
-#! This you need to take a look at more
 def test_restart_mlt_logs(run_dunerc) -> None:
     """Checks that restarting mlt produces the expected restart, exit, and boot logs."""
     stdout = run_dunerc.completed_process.stdout
@@ -289,48 +329,31 @@ def test_restart_mlt_logs(run_dunerc) -> None:
     restart_lines = lines[echo_idx + 1 : post_restart_idx]
     restart_text = "\n".join(restart_lines)
 
-    restart_request_match = require_pattern_match(
+    require_pattern_match(
+        restart_text,
+        re.compile(
+            r"Remote process .*?terminated gracefully following SIGQUIT signal\.",
+            re.DOTALL,
+        ),
+        error_message="Did not find the graceful termination log line for mlt after restart request.",
+    )
+
+    require_pattern_match(
         restart_text,
-        re.compile(r"process_manager restarting \['mlt'\] in session"),
-        error_message="Did not find the mlt restart request log line between restart markers.",
+        re.compile(r"Process 'mlt'.*?process exited\s+with exit code 0", re.DOTALL),
+        error_message="Did not find the mlt exit-code log line after graceful termination.",
     )
 
-    #! Reinsert this in the future, but this log-based thing is super janky
-    # graceful_termination_match = re.search(
-    #     r"Remote process .*?terminated gracefully following SIGQUIT signal\.",
-    #     restart_text[restart_request_match.end() :],
-    #     re.DOTALL,
-    # )
-    # assert graceful_termination_match is not None, (
-    #     "Did not find the graceful termination log line for mlt after restart request."
-    # )
-
-    # exit_code_search_text = restart_text[
-    #     restart_request_match.end() + graceful_termination_match.end() :
-    # ]
-    # exit_code_match = re.search(
-    #     r"Process 'mlt'.*?process exited\s+with exit code 0",
-    #     exit_code_search_text,
-    #     re.DOTALL,
-    # )
-    # assert exit_code_match is not None, (
-    #     "Did not find the mlt exit-code log line after graceful termination."
-    # )
-
-    # booted_search_text = exit_code_search_text[exit_code_match.end() :]
-    # booted_match = re.search(
-    #     r"Booted 'mlt'.*?with UUID\s+([^\s\n]+)",
-    #     booted_search_text,
-    #     re.DOTALL,
-    # )
-    # assert booted_match is not None, (
-    #     "Did not find the mlt boot log line after the restart exit log."
-    # )
-
-    # booted_uuid = booted_match.group(1)
-    # assert UUID_RE.match(booted_uuid), (
-    #     f"Expected the mlt boot log to contain a UUID, got: {booted_uuid}"
-    # )
+    booted_match = require_pattern_match(
+        restart_text,
+        re.compile(r"Booted 'mlt'.*?with UUID\s+([^\s\n]+)", re.DOTALL),
+        error_message="Did not find the mlt boot log line after the restart exit log.",
+    )
+
+    booted_uuid = booted_match.group(1)
+    assert UUID_RE.match(booted_uuid), (
+        f"Expected the mlt boot log to contain a UUID, got: {booted_uuid}"
+    )
 
 
 def test_kill_removes_mlt_from_ps_table(run_dunerc) -> None:
@@ -349,50 +372,3 @@ def test_mlt_recovers_after_kill(run_dunerc) -> None:
     stdout = run_dunerc.completed_process.stdout
     ps_after_recovery = get_ps_table_after_echo(stdout, "ps_after_recovery")
     assert_process(ps_after_recovery, "mlt", context="after recovery")
-
-
-def test_nanorc_success(run_dunerc):
-    """Checks that the drunc integration command sequence completes successfully."""
-    # print the name of the current test
-    current_test = os.environ.get("PYTEST_CURRENT_TEST")
-    match_obj = re.search(r".*\[(.+)-run_.*rc.*\d].*", current_test)
-    if match_obj:
-        current_test = match_obj.group(1)
-    banner_line = re.sub(".", "=", current_test)
-    print(banner_line)
-    print(current_test)
-    print(banner_line)
-
-    # Check that nanorc completed correctly
-    assert run_dunerc.completed_process.returncode == 0
-
-
-def test_log_files(run_dunerc):
-    """Checks that expected process-manager log files exist and are free of errors."""
-    # Check that at least some of the expected log files are present
-    assert any(
-        f"{run_dunerc.session}_df-01" in str(logname)
-        for logname in run_dunerc.log_files
-    )
-    assert any(
-        f"{run_dunerc.session}_dfo" in str(logname) for logname in run_dunerc.log_files
-    )
-    assert any(
-        f"{run_dunerc.session}_mlt" in str(logname) for logname in run_dunerc.log_files
-    )
-    assert any(
-        f"{run_dunerc.session}_ru" in str(logname) for logname in run_dunerc.log_files
-    )
-
-    if check_for_logfile_errors:
-        # Check that there are no warnings or errors in the log files
-        assert log_file_checks.logs_are_error_free(
-            [
-                logname
-                for logname in run_dunerc.log_files
-                if "process_manager" in str(logname)
-            ],
-            True,
-            True,
-            ignored_logfile_problems,
-        )

From 999b99977b7988940aedd063ddeeea39734276b6 Mon Sep 17 00:00:00 2001
From: Emir Muhammad <emir.muhammad@cern.ch>
Date: Tue, 17 Mar 2026 11:42:53 +0100
Subject: [PATCH 18/29] Rename testing variables for clarity

---
 integtest/process_manager_test.py | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/integtest/process_manager_test.py b/integtest/process_manager_test.py
index b486ec6c3..f4a3fbf98 100644
--- a/integtest/process_manager_test.py
+++ b/integtest/process_manager_test.py
@@ -85,15 +85,15 @@
 echo pre_boot
 ps -u {getpass.getuser()}
 boot
-echo on_boot
+echo post_boot
 ps -u {getpass.getuser()}
 
 
-echo testing_logs
+echo test_logs
 logs --name unknown
 logs --name root-controller --how-far 5
 logs --name mlt --how-far 5
-echo testing_logs_done
+echo test_logs_done
 
 echo test_wait
 wait 10
@@ -101,28 +101,27 @@
 
 echo pre_restart_mlt
 restart -n mlt
-echo fixture_1
 restart -n root-controller
 wait 5
 echo post_restart_mlt
 
 
-echo pre_kill_mlt
+echo test_kill_mlt
 ps -u {getpass.getuser()}
 kill -n mlt
 wait 2
-echo post_kill_mlt
+echo test_kill_mlt_post
 ps -u {getpass.getuser()}
-echo kill_mlt_done
+echo test_kill_mlt_done
 
 
-echo ps_recovery
+echo test_recovery
 restart -n mlt
 restart -n trg-controller
 wait 5
-echo ps_after_recovery
+echo test_recovery_post
 ps -u {getpass.getuser()}
-echo ps_recovery_done
+echo test_recovery_done
 
 
 flush
@@ -189,17 +188,17 @@ def test_boot(run_dunerc) -> None:
     stdout = run_dunerc.completed_process.stdout
 
     ps_pre_boot = get_ps_table_after_echo(stdout, "pre_boot")
-    ps_on_boot = get_ps_table_after_echo(stdout, "on_boot")
+    ps_post_boot = get_ps_table_after_echo(stdout, "post_boot")
 
     assert not ps_pre_boot, (
         f"Expected ps table before boot to be empty, but found {len(ps_pre_boot)} row(s): "
         + ", ".join(row["friendly_name"] for row in ps_pre_boot)
     )
 
-    assert ps_on_boot, (
+    assert ps_post_boot, (
         "Expected ps table after boot to contain processes, but it was empty."
     )
-    for row in ps_on_boot:
+    for row in ps_post_boot:
         assert UUID_RE.match(row["uuid"]), (
             f"Expected a valid UUID for process '{row['friendly_name']}', got '{row['uuid']}'"
         )
@@ -360,8 +359,8 @@ def test_kill_removes_mlt_from_ps_table(run_dunerc) -> None:
     """Checks that killing mlt removes it from the subsequent ps table."""
     stdout = run_dunerc.completed_process.stdout
 
-    ps_before_kill = get_ps_table_after_echo(stdout, "pre_kill_mlt")
-    ps_after_kill = get_ps_table_after_echo(stdout, "post_kill_mlt")
+    ps_before_kill = get_ps_table_after_echo(stdout, "test_kill_mlt")
+    ps_after_kill = get_ps_table_after_echo(stdout, "test_kill_mlt_post")
 
     assert_process(ps_before_kill, "mlt", context="before kill")
     assert_process(ps_after_kill, "mlt", context="after kill", expected_present=False)
@@ -370,5 +369,5 @@ def test_kill_removes_mlt_from_ps_table(run_dunerc) -> None:
 def test_mlt_recovers_after_kill(run_dunerc) -> None:
     """Checks that mlt is present again after the recovery restart sequence."""
     stdout = run_dunerc.completed_process.stdout
-    ps_after_recovery = get_ps_table_after_echo(stdout, "ps_after_recovery")
+    ps_after_recovery = get_ps_table_after_echo(stdout, "test_recovery_post")
     assert_process(ps_after_recovery, "mlt", context="after recovery")

From da7e4ec31af0ca20fa794c3eade701e80dc94bad Mon Sep 17 00:00:00 2001
From: Emir Muhammad <emir.muhammad@cern.ch>
Date: Tue, 17 Mar 2026 12:21:07 +0100
Subject: [PATCH 19/29] add examples to docstrings

---
 integtest/integ_test_utils.py     | 142 ++++++++++++++++++++++++------
 integtest/process_manager_test.py |  10 ++-
 2 files changed, 122 insertions(+), 30 deletions(-)

diff --git a/integtest/integ_test_utils.py b/integtest/integ_test_utils.py
index f88baeb52..bebc870a9 100644
--- a/integtest/integ_test_utils.py
+++ b/integtest/integ_test_utils.py
@@ -1,8 +1,8 @@
 """Shared helpers for drunc integration tests.
 
-This module centralizes commoon patterns used by process-manager integration tests. 
+This module centralizes commoon patterns used by process-manager integration tests.
 Importantly, most of these are defined to help with processing the stdout log outputs
-of the integ tests. 
+of the integ tests.
 
 Common functions include:
 - searching ordered log output for marker lines,
@@ -34,6 +34,17 @@ def find_line_index(
     """Return the first line index at or after `start_idx` matching `predicate`.
 
     Returns `None` when no line matches.
+
+    Example:
+        >>> lines = [
+        ...     "[2026/03/17 10:48:10 UTC] INFO drunc.controller.iface Command wait running for 5 seconds.",
+        ...     "[2026/03/17 10:48:15 UTC] INFO drunc.controller.iface Command wait ran for 5 seconds.",
+        ...     "[2026/03/17 10:48:15 UTC] INFO drunc.echo test_recovery_post",
+        ... ]
+        >>> find_line_index(lines, lambda line: "Command wait ran" in line)
+        1
+        >>> find_line_index(lines, lambda line: "test_wait_done" in line) is None
+        True
     """
     return next(
         (idx for idx in range(start_idx, len(lines)) if predicate(lines[idx])),
@@ -48,7 +59,20 @@ def require_line_index(
     error_message: str,
     start_idx: int = 0,
 ) -> int:
-    """Like `find_line_index`, but assert a match exists and return its index."""
+    """Like `find_line_index`, but assert a match exists and return its index.
+
+    Example:
+        >>> lines = [
+        ...     "[2026/03/17 10:47:38 UTC] INFO drunc.echo test_wait",
+        ...     "[2026/03/17 10:47:48 UTC] INFO drunc.echo test_wait_done",
+        ... ]
+        >>> require_line_index(
+        ...     lines,
+        ...     lambda line: "test_wait_done" in line,
+        ...     error_message="Could not find wait completion marker",
+        ... )
+        1
+    """
     line_idx = find_line_index(lines, predicate, start_idx=start_idx)
     assert line_idx is not None, error_message
     return line_idx
@@ -61,7 +85,21 @@ def require_line_containing(
     error_message: str,
     start_idx: int = 0,
 ) -> int:
-    """Assert and return index of the first line containing `text`."""
+    """Assert and return index of the first line containing `text`.
+
+    Example:
+        >>> lines = [
+        ...     "[2026/03/17] WARNING drunc.process_manager_driver Bad query for logs",
+        ...     "────────────────────────────── root-controller logs ──────────────────────────────",
+        ...     "[2026/03/17] INFO drunc.init_controller Taking control of trg-controller",
+        ... ]
+        >>> require_line_containing(
+        ...     lines,
+        ...     "root-controller logs",
+        ...     error_message="Did not find the 'root-controller logs' header line in stdout.",
+        ... )
+        1
+    """
     return require_line_index(
         lines,
         lambda line: text in line,
@@ -75,6 +113,14 @@ def require_echo_marker_index(
 ) -> int:
     """Assert and return index of a `drunc.echo` line ending with `echo_marker`.
     This is hardcoded since echo is a specific callable function with its own logger.
+
+    Example:
+        >>> lines = [
+        ...     "[2026/03/17 10:48:15 UTC] INFO drunc.echo test_recovery_post",
+        ...     "Processes running",
+        ... ]
+        >>> require_echo_marker_index(lines, "test_recovery_post")
+        0
     """
     return require_line_index(
         lines,
@@ -91,7 +137,22 @@ def require_pattern_match_index(
     error_message: str,
     start_idx: int = 0,
 ) -> tuple[int, re.Match[str]]:
-    """Assert and return `(index, match)` for first line matching `pattern`."""
+    """Assert and return `(index, match)` for first line matching `pattern`.
+
+    Example:
+        >>> lines = [
+        ...     "[2026/03/17] INFO drunc.iface Command wait running for 10 seconds.",
+        ...     "[2026/03/17] INFO drunc.iface Command wait ran for 10 seconds.",
+        ... ]
+        >>> pattern = re.compile(r"Command wait ran for (\\d+) seconds\\.")
+        >>> line_idx, match = require_pattern_match_index(
+        ...     lines,
+        ...     pattern,
+        ...     error_message="Did not find wait completion log line.",
+        ... )
+        >>> (line_idx, match.group(1))
+        (1, '10')
+    """
     line_idx = require_line_index(
         lines,
         lambda line: pattern.search(line) is not None,
@@ -109,7 +170,19 @@ def require_pattern_match(
     *,
     error_message: str,
 ) -> re.Match[str]:
-    """Assert `pattern` matches `text` and return the `re.Match` object."""
+    """Assert `pattern` matches `text` and return the `re.Match` object.
+
+    Example:
+        >>> line = "[2026/03/17] INFO Command wait ran for 10 seconds."
+        >>> pattern = re.compile(r"Command wait ran for (\\d+) seconds\\.")
+        >>> match = require_pattern_match(
+        ...     line,
+        ...     pattern,
+        ...     error_message="Did not find wait completion log line.",
+        ... )
+        >>> match.group(1)
+        '10'
+    """
     match = pattern.search(text)
     assert match is not None, error_message
     return match
@@ -157,6 +230,17 @@ def get_ps_table_after_echo(stdout: str, echo_marker: str) -> list[dict[str, str
     """Return parsed process-table rows found after a specific echo marker.
 
     If no process table is found after the marker, returns an empty list.
+
+    Example:
+        >>> stdout = (
+        ...     "[2026/03/17 10:48:15 UTC] INFO drunc.echo test_recovery_post\n"
+        ...     "Processes running\n"
+        ...     "│ minimal │ root-controller │ emmuhamm │ localhost │ f201f9c7-b910-4100-bd78-11765a4d2ee1 │ True │ 0 │\n"
+        ...     "└"
+        ... )
+        >>> table = get_ps_table_after_echo(stdout, "test_recovery_post")
+        >>> table[0]["friendly_name"]
+        'root-controller'
     """
     lines = strip_ansi(stdout).splitlines()
 
@@ -203,8 +287,8 @@ def assert_process_presence(
     ps_table: list[dict[str, str]],
     friendly_name: str,
     *,
-    expected_present: bool,
     context: str,
+    expected_present: bool = True,
 ) -> None:
     """Assert whether a process is present/absent in a process table.
 
@@ -213,6 +297,31 @@ def assert_process_presence(
         friendly_name: Process name to check.
         expected_present: `True` if process should exist, `False` otherwise.
         context: Short phrase appended to error text (e.g. "before kill").
+
+    Example:
+        >>> ps_table = [
+        ...     {
+        ...         "session": "minimal",
+        ...         "friendly_name": "root-controller",
+        ...         "user": "daq",
+        ...         "host": "localhost",
+        ...         "uuid": "f201f9c7-b910-4100-bd78-11765a4d2ee1",
+        ...         "alive": "True",
+        ...         "exit_code": "0",
+        ...     }
+        ... ]
+        >>> assert_process_presence(
+        ...     ps_table,
+        ...     "root-controller",
+        ...     context="before restart",
+        ...     expected_present=True,
+        ... )
+        >>> assert_process_presence(
+        ...     ps_table,
+        ...     "mlt",
+        ...     context="after restart",
+        ...     expected_present=False,
+        ... )
     """
     matching_rows = get_rows_for_friendly_name(ps_table, friendly_name)
 
@@ -225,22 +334,3 @@ def assert_process_presence(
     assert not matching_rows, (
         f"Expected '{friendly_name}' to be absent from ps table {context}, but it is still present."
     )
-
-
-def assert_process(
-    ps_table: list[dict[str, str]],
-    friendly_name: str,
-    *,
-    context: str,
-    expected_present: bool = True,
-) -> None:
-    """Convenience wrapper around `assert_process_presence`.
-
-    By default, asserts that the process is present.
-    """
-    assert_process_presence(
-        ps_table,
-        friendly_name,
-        expected_present=expected_present,
-        context=context,
-    )
\ No newline at end of file
diff --git a/integtest/process_manager_test.py b/integtest/process_manager_test.py
index f4a3fbf98..62fe20e79 100644
--- a/integtest/process_manager_test.py
+++ b/integtest/process_manager_test.py
@@ -6,7 +6,7 @@
 import integrationtest.data_classes as data_classes
 import integrationtest.log_file_checks as log_file_checks
 from integ_test_utils import (
-    assert_process,
+    assert_process_presence,
     get_ps_table_after_echo,
     require_echo_marker_index,
     require_line_containing,
@@ -362,12 +362,14 @@ def test_kill_removes_mlt_from_ps_table(run_dunerc) -> None:
     ps_before_kill = get_ps_table_after_echo(stdout, "test_kill_mlt")
     ps_after_kill = get_ps_table_after_echo(stdout, "test_kill_mlt_post")
 
-    assert_process(ps_before_kill, "mlt", context="before kill")
-    assert_process(ps_after_kill, "mlt", context="after kill", expected_present=False)
+    assert_process_presence(ps_before_kill, "mlt", context="before kill")
+    assert_process_presence(
+        ps_after_kill, "mlt", context="after kill", expected_present=False
+    )
 
 
 def test_mlt_recovers_after_kill(run_dunerc) -> None:
     """Checks that mlt is present again after the recovery restart sequence."""
     stdout = run_dunerc.completed_process.stdout
     ps_after_recovery = get_ps_table_after_echo(stdout, "test_recovery_post")
-    assert_process(ps_after_recovery, "mlt", context="after recovery")
+    assert_process_presence(ps_after_recovery, "mlt", context="after recovery")

From 2b2e878ac4c272fd48d6e42effb020374af90591 Mon Sep 17 00:00:00 2001
From: Emir Muhammad <emir.muhammad@cern.ch>
Date: Wed, 18 Mar 2026 14:45:41 +0100
Subject: [PATCH 20/29] Add width for tables

---
 .../process_manager/interface/commands.py     | 20 ++++++++++++++++---
 src/drunc/process_manager/utils.py            |  4 ++--
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/src/drunc/process_manager/interface/commands.py b/src/drunc/process_manager/interface/commands.py
index 2937515ca..a1287afdc 100644
--- a/src/drunc/process_manager/interface/commands.py
+++ b/src/drunc/process_manager/interface/commands.py
@@ -260,8 +260,20 @@ def restart(obj: ProcessManagerContext, query: ProcessQuery) -> None:
     default=False,
     help="Whether to have a long output",
 )
+@click.option(
+    "-w",
+    "--width",
+    type=int,
+    default=None,
+    help="Table width. Default is automatically calculated",
+)
 @click.pass_obj
-def ps(obj: ProcessManagerContext, query: ProcessQuery, long_format: bool) -> None:
+def ps(
+    obj: ProcessManagerContext,
+    query: ProcessQuery,
+    long_format: bool,
+    width: int | None,
+) -> None:
     log = get_logger("process_manager.shell")
     log.debug(f"Running ps with query {query}")
     results = obj.get_driver("process_manager").ps(query)
@@ -269,6 +281,8 @@ def ps(obj: ProcessManagerContext, query: ProcessQuery, long_format: bool) -> No
         return
     obj.print(
         tabulate_process_instance_list(
-            results, title="Processes running", long=long_format
-        )
+            results, title="Processes running", long=long_format, width=width
+        ),
+        overflow="fold",
+        soft_wrap=True,
     )
diff --git a/src/drunc/process_manager/utils.py b/src/drunc/process_manager/utils.py
index 602a2dab2..095d70df1 100644
--- a/src/drunc/process_manager/utils.py
+++ b/src/drunc/process_manager/utils.py
@@ -118,9 +118,9 @@ def walk(tree_id):
 
 
 def tabulate_process_instance_list(
-    pil: ProcessInstanceList, title: str, long: bool = False
+    pil: ProcessInstanceList, title: str, long: bool = False, width: int | None = None
 ):
-    t = Table(title=title)
+    t = Table(title=title, width=width)
     t.add_column("session")
     t.add_column("friendly name")
     t.add_column("user")

From 092da09c64e4f94eac0f07cac39b93d00344c711 Mon Sep 17 00:00:00 2001
From: Emir Muhammad <emir.muhammad@cern.ch>
Date: Wed, 18 Mar 2026 14:54:09 +0100
Subject: [PATCH 21/29] Fix table width bug in test

---
 integtest/process_manager_test.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/integtest/process_manager_test.py b/integtest/process_manager_test.py
index 62fe20e79..2e9fb7755 100644
--- a/integtest/process_manager_test.py
+++ b/integtest/process_manager_test.py
@@ -83,10 +83,10 @@
 dunerc_command_list = f"""
 
 echo pre_boot
-ps -u {getpass.getuser()}
+ps -u {getpass.getuser()} -w 180
 boot
 echo post_boot
-ps -u {getpass.getuser()}
+ps -u {getpass.getuser()} -w 180
 
 
 echo test_logs
@@ -107,11 +107,11 @@
 
 
 echo test_kill_mlt
-ps -u {getpass.getuser()}
+ps -u {getpass.getuser()} -w 180
 kill -n mlt
 wait 2
 echo test_kill_mlt_post
-ps -u {getpass.getuser()}
+ps -u {getpass.getuser()} -w 180
 echo test_kill_mlt_done
 
 
@@ -120,7 +120,7 @@
 restart -n trg-controller
 wait 5
 echo test_recovery_post
-ps -u {getpass.getuser()}
+ps -u {getpass.getuser()} -w 180
 echo test_recovery_done
 
 
@@ -132,7 +132,6 @@
 
 UUID_RE = re.compile(
     r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$"
-    r"|^[0-9a-fA-F]{8}-[-0-9a-fA-F]*\u2026"  # truncated by Rich table column width
 )
 
 

From bb80196e9bfbab13bdb048250cc8bbb514376fab Mon Sep 17 00:00:00 2001
From: Emir Muhammad <emir.muhammad@cern.ch>
Date: Wed, 18 Mar 2026 15:00:25 +0100
Subject: [PATCH 22/29] Propagate width to other table options

---
 .../process_manager/interface/commands.py     | 37 ++++++++++++++++---
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/src/drunc/process_manager/interface/commands.py b/src/drunc/process_manager/interface/commands.py
index a1287afdc..7fb1c1935 100644
--- a/src/drunc/process_manager/interface/commands.py
+++ b/src/drunc/process_manager/interface/commands.py
@@ -151,44 +151,69 @@ def dummy_boot(
 
 
 @click.command("terminate")
+@click.option(
+    "-w",
+    "--width",
+    type=int,
+    default=None,
+    help="Table width. Default is automatically calculated",
+)
 @click.pass_obj
-def terminate(obj: ProcessManagerContext) -> None:
+def terminate(obj: ProcessManagerContext, width: int | None) -> None:
     log = get_logger("process_manager.shell")
     log.debug("Terminating")
     result = obj.get_driver("process_manager").terminate()
     if not result:
         return
     obj.print(
-        tabulate_process_instance_list(result, "Terminated process", False)
+        tabulate_process_instance_list(result, "Terminated process", False, width=width)
     )  # rich tables require console printing
     obj.delete_driver("controller")
 
 
 @click.command("kill")
+@click.option(
+    "-w",
+    "--width",
+    type=int,
+    default=None,
+    help="Table width. Default is automatically calculated",
+)
 @add_query_options(at_least_one=True)
 @click.pass_obj
-def kill(obj: ProcessManagerContext, query: ProcessQuery) -> None:
+def kill(obj: ProcessManagerContext, query: ProcessQuery, width: int | None) -> None:
     log = get_logger("process_manager.shell")
     log.debug(f"Killing with query {query}")
     result = obj.get_driver("process_manager").kill(query)
     if not result:
         return
     obj.print(
-        tabulate_process_instance_list(result, "Killed process", False)
+        tabulate_process_instance_list(result, "Killed process", False, width=width)
     )  # rich tables require console printing
 
 
 @click.command("flush")
+@click.option(
+    "-w",
+    "--width",
+    type=int,
+    default=None,
+    help="Table width. Default is automatically calculated",
+)
 @add_query_options(at_least_one=False, all_processes_by_default=True)
 @click.pass_obj
-def flush(obj: ProcessManagerContext, query: ProcessQuery) -> None:
+def flush(
+    obj: ProcessManagerContext,
+    query: ProcessQuery,
+    width: int | None,
+) -> None:
     log = get_logger("process_manager.shell")
     log.debug(f"Flushing with query {query}")
     result = obj.get_driver("process_manager").flush(query)
     if not result:
         return
     obj.print(
-        tabulate_process_instance_list(result, "Flushed process", False)
+        tabulate_process_instance_list(result, "Flushed process", False, width=width)
     )  # rich tables require console printing
 
 

From 557d5c7db79ac934db96db9c7f0c53d752bc5548 Mon Sep 17 00:00:00 2001
From: Emir Muhammad <emir.muhammad@cern.ch>
Date: Wed, 18 Mar 2026 15:16:49 +0100
Subject: [PATCH 23/29] Fix pytest print mockcontext

---
 tests/process_manager/interface/test_commands.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/process_manager/interface/test_commands.py b/tests/process_manager/interface/test_commands.py
index 6b98b1544..3571690e2 100644
--- a/tests/process_manager/interface/test_commands.py
+++ b/tests/process_manager/interface/test_commands.py
@@ -115,7 +115,7 @@ def __init__(self, driver=None):
     def get_driver(self, name):
         return self.driver
 
-    def print(self, msg, justify=None):
+    def print(self, msg, justify=None, overflow=None, soft_wrap=None):
         self.output.append(str(msg))
 
 

From 2e0331ff43dc6477e2d37dd7d913cc04258e793d Mon Sep 17 00:00:00 2001
From: Emir Muhammad <emir.muhammad@cern.ch>
Date: Thu, 19 Mar 2026 15:04:32 +0100
Subject: [PATCH 24/29] drunc connsvc true, fix minor typos

---
 integtest/integ_test_utils.py     | 2 +-
 integtest/process_manager_test.py | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/integtest/integ_test_utils.py b/integtest/integ_test_utils.py
index bebc870a9..6dc9fe014 100644
--- a/integtest/integ_test_utils.py
+++ b/integtest/integ_test_utils.py
@@ -1,6 +1,6 @@
 """Shared helpers for drunc integration tests.
 
-This module centralizes commoon patterns used by process-manager integration tests.
+This module centralizes common patterns used by process-manager integration tests.
 Importantly, most of these are defined to help with processing the stdout log outputs
 of the integ tests.
 
diff --git a/integtest/process_manager_test.py b/integtest/process_manager_test.py
index 2e9fb7755..3a418c14e 100644
--- a/integtest/process_manager_test.py
+++ b/integtest/process_manager_test.py
@@ -53,7 +53,7 @@
 conf_dict.tpg_enabled = False
 
 # For testing, allow drunc to manage ConnectivityService (default is False, integrationtest manages Connectivity Service)
-# conf_dict.drunc_connsvc = True
+conf_dict.drunc_connsvc = True
 # For testing, specify connectivity service port (default is 0, a random port is chosen for the Connectivity Service)
 # conf_dict.connsvc_port = 12345
 
@@ -80,6 +80,10 @@
 # NOTE THAT WE HAVE NOT TESTED FLUSH BECAUSE IT IS BROKEN
 # see #821
 
+
+# The commands mostly come from the msqt, with a few minor changes
+# The entire format is a standard that is  basically copied over from the
+#  typical msqt tests, so they bear no direct effect on the scope of this test.
 dunerc_command_list = f"""
 
 echo pre_boot

From 74301e761dd971420b5180e85e096b0d2f3dd10a Mon Sep 17 00:00:00 2001
From: Emir Muhammad <emir.muhammad@cern.ch>
Date: Thu, 19 Mar 2026 15:04:53 +0100
Subject: [PATCH 25/29] remove comment command

---
 src/drunc/controller/interface/commands.py | 15 ---------------
 src/drunc/controller/interface/shell.py    |  2 --
 src/drunc/unified_shell/shell.py           |  2 --
 3 files changed, 19 deletions(-)

diff --git a/src/drunc/controller/interface/commands.py b/src/drunc/controller/interface/commands.py
index 24d555d62..f054a8e29 100644
--- a/src/drunc/controller/interface/commands.py
+++ b/src/drunc/controller/interface/commands.py
@@ -244,21 +244,6 @@ def who_am_i(obj: ControllerContext) -> None:
     log.info(obj.get_token().user_name)
 
 
-# click_shell/_cmd.py, line 23. identchars only accepts ascii letters + digits + _
-# Can't really be used by the integ test tho..
-@click.command(
-    "comment",
-    hidden=True,
-    context_settings=dict(
-        ignore_unknown_options=True,
-        allow_extra_args=True,
-    ),
-)
-def comment_handler():
-    """Ignore this line"""
-    pass
-
-
 @click.command("echo")
 @click.argument("text", required=False)
 @click.pass_obj
diff --git a/src/drunc/controller/interface/shell.py b/src/drunc/controller/interface/shell.py
index a7135c127..5c71bde1c 100644
--- a/src/drunc/controller/interface/shell.py
+++ b/src/drunc/controller/interface/shell.py
@@ -6,7 +6,6 @@
 from daqpytools.logging import logging_log_levels
 
 from drunc.controller.interface.commands import (
-    comment_handler,
     connect,
     disconnect,
     echo,
@@ -93,7 +92,6 @@ def controller_shell(ctx, controller_address: str, log_level: str) -> None:
     ctx.command.add_command(surrender_control, "surrender-control")
     ctx.command.add_command(who_am_i, "whoami")
     ctx.command.add_command(echo, "echo")
-    ctx.command.add_command(comment_handler, "comment-handler")
     ctx.command.add_command(who_is_in_charge, "who-is-in-charge")
     for transition in transitions.commands:
         ctx.command.add_command(*generate_fsm_command(ctx.obj, transition, desc.name))
diff --git a/src/drunc/unified_shell/shell.py b/src/drunc/unified_shell/shell.py
index ed068491b..a3dfd1678 100644
--- a/src/drunc/unified_shell/shell.py
+++ b/src/drunc/unified_shell/shell.py
@@ -18,7 +18,6 @@
 from drunc.connectivity_service.client import ConnectivityServiceClient
 from drunc.controller.configuration import ControllerConfHandler
 from drunc.controller.interface.commands import (
-    comment_handler,
     connect,
     disconnect,
     echo,
@@ -384,7 +383,6 @@ def unified_shell(
         surrender_control,
         who_am_i,
         echo,
-        comment_handler,
         who_is_in_charge,
         include,
         exclude,

From 73188d15308e69228bc140e7e8b641d6148965e0 Mon Sep 17 00:00:00 2001
From: Emir Muhammad <emir.muhammad@cern.ch>
Date: Tue, 24 Mar 2026 17:22:02 +0100
Subject: [PATCH 26/29] Add flush check

---
 integtest/integ_test_utils.py     |  8 ++++----
 integtest/process_manager_test.py | 30 ++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/integtest/integ_test_utils.py b/integtest/integ_test_utils.py
index 6dc9fe014..2f85ee3a9 100644
--- a/integtest/integ_test_utils.py
+++ b/integtest/integ_test_utils.py
@@ -257,17 +257,17 @@ def get_ps_table_after_echo(stdout: str, echo_marker: str) -> list[dict[str, str
     return _parse_ps_table_from_index(lines, table_start_idx)
 
 
-def get_uuid_for_friendly_name(
-    ps_table: list[dict[str, str]], friendly_name: str
+def get_column_for_friendly_name(
+    ps_table: list[dict[str, str]], friendly_name: str, column: str
 ) -> str:
-    """Return UUID for `friendly_name` from a parsed process table.
+    """Return the column for `friendly_name` from a parsed process table.
 
     Raises:
         AssertionError: if the friendly name is absent.
     """
     for row in ps_table:
         if row["friendly_name"].strip() == friendly_name:
-            return row["uuid"]
+            return row[column]
 
     available_names = ", ".join(row["friendly_name"].strip() for row in ps_table)
     raise AssertionError(
diff --git a/integtest/process_manager_test.py b/integtest/process_manager_test.py
index 3a418c14e..8ce13d44b 100644
--- a/integtest/process_manager_test.py
+++ b/integtest/process_manager_test.py
@@ -7,6 +7,7 @@
 import integrationtest.log_file_checks as log_file_checks
 from integ_test_utils import (
     assert_process_presence,
+    get_column_for_friendly_name,
     get_ps_table_after_echo,
     require_echo_marker_index,
     require_line_containing,
@@ -128,7 +129,18 @@
 echo test_recovery_done
 
 
+echo test_flush
+ps -u {getpass.getuser()} -w 180
+kill -n mlt --crash 
+wait 5
+echo after_crash
+ps -u {getpass.getuser()} -w 180
 flush
+echo after_flush
+ps -u {getpass.getuser()} -w 180
+echo test_flush_done
+
+
 terminate
 
 """.split()
@@ -376,3 +388,21 @@ def test_mlt_recovers_after_kill(run_dunerc) -> None:
     stdout = run_dunerc.completed_process.stdout
     ps_after_recovery = get_ps_table_after_echo(stdout, "test_recovery_post")
     assert_process_presence(ps_after_recovery, "mlt", context="after recovery")
+
+
+def test_flush(run_dunerc) -> None:
+    """Checks that flush works: after mlt is crashed its dead entry must still
+    be listed by ps (alive == "False"), and after flush the entry must be gone.
+    """
+    stdout = run_dunerc.completed_process.stdout
+    ps_initial = get_ps_table_after_echo(stdout, "test_flush")
+    assert_process_presence(ps_initial, "mlt", context="before crash")
+
+    ps_after_crash = get_ps_table_after_echo(stdout, "after_crash")
+    mlt_alive = get_column_for_friendly_name(ps_after_crash, "mlt", "alive")
+    assert mlt_alive == "False", "The mlt should have crashed"
+
+    ps_after_flush = get_ps_table_after_echo(stdout, "after_flush")
+    assert_process_presence(
+        ps_after_flush, "mlt", context="after flush", expected_present=False
+    )

From e57c098c8d2ca4e3fad5995e2cc7eeece23c3137 Mon Sep 17 00:00:00 2001
From: Emir Muhammad <emir.muhammad@cern.ch>
Date: Wed, 25 Mar 2026 17:15:31 +0100
Subject: [PATCH 27/29] Document width in the wiki

---
 docs/Unified-shell-reference.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/Unified-shell-reference.md b/docs/Unified-shell-reference.md
index dd05aa0f5..2cf1c9c6a 100644
--- a/docs/Unified-shell-reference.md
+++ b/docs/Unified-shell-reference.md
@@ -437,6 +437,7 @@ The `ps` command must take at least one the following options:
 * `-n/--name`, to select a process to flush based on its "friendly name".
 * `-s/--session`, to select the processes to flush based on a session name.
 * `--long-format/-l`, to get a long listing format.
+* `-w/--width`, to set the table to a fixed width (by default the width is calculated automatically).
 
 By default, `ps` list all the processes.
 

From f7fc2781f79fcc1dd1d96398e65515d669314c59 Mon Sep 17 00:00:00 2001
From: PawelPlesniak <plesniakpaul@gmail.com>
Date: Tue, 21 Apr 2026 17:27:10 +0200
Subject: [PATCH 28/29] Updating parameter name for tests to pass

---
 integtest/process_manager_test.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/integtest/process_manager_test.py b/integtest/process_manager_test.py
index 8ce13d44b..6e7bbdc9b 100644
--- a/integtest/process_manager_test.py
+++ b/integtest/process_manager_test.py
@@ -170,18 +170,24 @@ def test_nanorc_success(run_dunerc) -> None:
 def test_log_files(run_dunerc) -> None:
     """Checks that expected process-manager log files exist and are free of errors."""
     # Check that at least some of the expected log files are present
+    print(f"PP: TESTING {type(run_dunerc)=}")
+    print(f"PP: TESTING {dir(run_dunerc)=}")
+    print(f"PP: TESTING {run_dunerc.daq_session_name=}")
     assert any(
-        f"{run_dunerc.session}_df-01" in str(logname)
+        f"{run_dunerc.daq_session_name}_df-01" in str(logname)
         for logname in run_dunerc.log_files
     )
     assert any(
-        f"{run_dunerc.session}_dfo" in str(logname) for logname in run_dunerc.log_files
+        f"{run_dunerc.daq_session_name}_dfo" in str(logname)
+        for logname in run_dunerc.log_files
     )
     assert any(
-        f"{run_dunerc.session}_mlt" in str(logname) for logname in run_dunerc.log_files
+        f"{run_dunerc.daq_session_name}_mlt" in str(logname)
+        for logname in run_dunerc.log_files
     )
     assert any(
-        f"{run_dunerc.session}_ru" in str(logname) for logname in run_dunerc.log_files
+        f"{run_dunerc.daq_session_name}_ru" in str(logname)
+        for logname in run_dunerc.log_files
     )
 
     if check_for_logfile_errors:

From cb0ca9307655b11c71f67c5d6e316f9145882d5b Mon Sep 17 00:00:00 2001
From: PawelPlesniak <plesniakpaul@gmail.com>
Date: Wed, 22 Apr 2026 17:08:00 +0200
Subject: [PATCH 29/29] Comment removal

---
 integtest/process_manager_test.py | 9 +--------
 src/drunc/unified_shell/shell.py  | 4 ++++
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/integtest/process_manager_test.py b/integtest/process_manager_test.py
index 6e7bbdc9b..3284bddee 100644
--- a/integtest/process_manager_test.py
+++ b/integtest/process_manager_test.py
@@ -78,13 +78,9 @@
 
 confgen_arguments = {"MinimalSystem": conf_dict}
 # The commands to run in dunerc
-# NOTE THAT WE HAVE NOT TESTED FLUSH BECAUSE IT IS BROKEN
-# see #821
-
-
 # The commands mostly come from the msqt, with a few minor changes
 # The entire format is a standard that is  basically copied over from the
-#  typical msqt tests, so they bear no direct effect on the scope of this test.
+# typical msqt tests, so they bear no direct effect on the scope of this test.
 dunerc_command_list = f"""
 
 echo pre_boot
@@ -170,9 +166,6 @@ def test_nanorc_success(run_dunerc) -> None:
 def test_log_files(run_dunerc) -> None:
     """Checks that expected process-manager log files exist and are free of errors."""
     # Check that at least some of the expected log files are present
-    print(f"PP: TESTING {type(run_dunerc)=}")
-    print(f"PP: TESTING {dir(run_dunerc)=}")
-    print(f"PP: TESTING {run_dunerc.daq_session_name=}")
     assert any(
         f"{run_dunerc.daq_session_name}_df-01" in str(logname)
         for logname in run_dunerc.log_files
diff --git a/src/drunc/unified_shell/shell.py b/src/drunc/unified_shell/shell.py
index a3dfd1678..d007bfece 100644
--- a/src/drunc/unified_shell/shell.py
+++ b/src/drunc/unified_shell/shell.py
@@ -152,10 +152,12 @@ def unified_shell(
     unified_shell_log.debug("Setting up the [green]unified_shell[/green] logger")
 
     # Parse the process manager argument to determine if it's a config or an address
+    unified_shell_log.debug(f"Parsing the process manager argument: {process_manager}")
     process_manager_url: ParseResult = urlparse(process_manager)
     internal_pm: bool = True
     if process_manager_url.scheme == "grpc":  # i.e. if it's an address
         internal_pm = False
+    unified_shell_log.debug(f"{internal_pm=}, {process_manager_url=}")
 
     # If using a k8s process manager, validate the session name before proceeding
     if get_pm_type_from_name(