From f3481d5aaff1645ec1804bc520fee472fc28a966 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 8 Apr 2026 00:30:14 +0000
Subject: [PATCH 01/13] Add eval, dataset-qa, and unlearning CLI sub-commands
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement three new typer sub-apps following the skills.py pattern:

- `hirundo eval run/list/check` — launch/list/monitor LLM behavior eval runs
  via LlmBehaviorEval.launch_eval_run / check_run_by_id / list_runs
- `hirundo dataset-qa run/list/check` — launch/list/monitor Dataset QA runs
  via QADataset.launch_qa_run / check_run_by_id / list_runs
- `hirundo unlearning run/list/check` — launch/list/monitor LLM unlearning runs
  via LlmUnlearningRun.launch / check_run_by_id / list

Each `run` command accepts --wait/--no-wait (default: --wait). With --wait
it blocks and streams tqdm progress until the run completes; with --no-wait
it prints the run ID and exits immediately, for use in scripts.

Closes #220

https://claude.ai/code/session_01QGjQJb8G8DM8UaYt66Jewt
---
 hirundo/cli.py            |   7 ++
 hirundo/cli_dataset_qa.py |  96 +++++++++++++++++++++++
 hirundo/cli_eval.py       | 138 ++++++++++++++++++++++++++++++++
 hirundo/cli_unlearning.py | 161 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 402 insertions(+)
 create mode 100644 hirundo/cli_dataset_qa.py
 create mode 100644 hirundo/cli_eval.py
 create mode 100644 hirundo/cli_unlearning.py

diff --git a/hirundo/cli.py b/hirundo/cli.py
index 6714e596..76fbc439 100644
--- a/hirundo/cli.py
+++ b/hirundo/cli.py
@@ -10,6 +10,9 @@
 from rich.table import Table
 
 from hirundo._env import API_HOST, EnvLocation
+from hirundo.cli_dataset_qa import dataset_qa_app
+from hirundo.cli_eval import eval_app
+from hirundo.cli_unlearning import unlearning_app
 
 docs = "sphinx" in sys.modules
 hirundo_epilog = (
@@ -26,6 +29,10 @@
     epilog=hirundo_epilog,
 )
 
+app.add_typer(eval_app, name="eval")
+app.add_typer(dataset_qa_app, name="dataset-qa")
+app.add_typer(unlearning_app, name="unlearning")
+
 
 def _upsert_env(dotenv_filepath: str | Path, var_name: str, var_value: str):
     """
diff --git a/hirundo/cli_dataset_qa.py b/hirundo/cli_dataset_qa.py
new file mode 100644
index 00000000..c47f1224
--- /dev/null
+++ b/hirundo/cli_dataset_qa.py
@@ -0,0 +1,96 @@
+"""
+CLI sub-app for Dataset QA commands.
+
+Commands:
+    hirundo dataset-qa run    - Launch a Dataset QA run
+    hirundo dataset-qa list   - List Dataset QA runs
+    hirundo dataset-qa check  - Check the status of a Dataset QA run
+"""
+
+import sys
+from typing import Annotated
+
+import typer
+from rich.console import Console
+from rich.table import Table
+
+docs = "sphinx" in sys.modules
+dataset_qa_epilog = (
+    None
+    if docs
+    else "Made with ❤️ by Hirundo. Visit https://www.hirundo.io for more information."
+)
+
+console = Console()
+
+dataset_qa_app = typer.Typer(
+    name="dataset-qa",
+    no_args_is_help=True,
+    rich_markup_mode="rich",
+    epilog=dataset_qa_epilog,
+    help="Launch and monitor Dataset QA runs.",
+)
+
+
+@dataset_qa_app.command("run", epilog=dataset_qa_epilog)
+def dataset_qa_run(
+    dataset_id: Annotated[int, typer.Argument(help="ID of the dataset to run QA on.")],
+    wait: Annotated[
+        bool,
+        typer.Option("--wait/--no-wait", help="Wait for the run to complete and stream progress."),
+    ] = True,
+):
+    """
+    Launch a Dataset QA run on the dataset with the given ID.
+    """
+    from hirundo.dataset_qa import QADataset
+
+    run_id = QADataset.launch_qa_run(dataset_id)
+    console.print(f"Dataset QA run started. Run ID: [bold]{run_id}[/bold]")
+
+    if wait:
+        QADataset.check_run_by_id(run_id)
+    else:
+        console.print(
+            "Use [bold]hirundo dataset-qa check[/bold] [italic]<run_id>[/italic] to monitor progress."
+        )
+
+
+@dataset_qa_app.command("list", epilog=dataset_qa_epilog)
+def dataset_qa_list(
+    archived: Annotated[
+        bool,
+        typer.Option("--archived/--no-archived", help="Include archived runs."),
+    ] = False,
+):
+    """
+    List Dataset QA runs.
+    """
+    from hirundo.dataset_qa import QADataset
+
+    runs = QADataset.list_runs(archived=archived)
+
+    table = Table(title="Dataset QA Runs:", expand=True)
+    for col in ("Dataset Name", "Run ID", "Status", "Created At", "Run Args"):
+        table.add_column(col, overflow="fold")
+    for run in runs:
+        table.add_row(
+            str(run.name),
+            str(run.run_id),
+            str(run.status),
+            run.created_at.isoformat(),
+            run.run_args.model_dump_json() if run.run_args else None,
+        )
+    console.print(table)
+
+
+@dataset_qa_app.command("check", epilog=dataset_qa_epilog)
+def dataset_qa_check(
+    run_id: Annotated[str, typer.Argument(help="The run ID to check.")],
+):
+    """
+    Check the status of a Dataset QA run and stream progress.
+    """
+    from hirundo.dataset_qa import QADataset
+
+    QADataset.check_run_by_id(run_id)
diff --git a/hirundo/cli_eval.py b/hirundo/cli_eval.py
new file mode 100644
index 00000000..ab019a0f
--- /dev/null
+++ b/hirundo/cli_eval.py
@@ -0,0 +1,138 @@
+"""
+CLI sub-app for LLM behavior evaluation commands.
+
+Commands:
+    hirundo eval run    - Launch an LLM behavior evaluation run
+    hirundo eval list   - List evaluation runs
+    hirundo eval check  - Check the status of an evaluation run
+"""
+
+import sys
+from typing import Annotated, Optional
+
+import typer
+from rich.console import Console
+from rich.table import Table
+
+docs = "sphinx" in sys.modules
+eval_epilog = (
+    None
+    if docs
+    else "Made with ❤️ by Hirundo. Visit https://www.hirundo.io for more information."
+)
+
+console = Console()
+
+eval_app = typer.Typer(
+    name="eval",
+    no_args_is_help=True,
+    rich_markup_mode="rich",
+    epilog=eval_epilog,
+    help="Launch and monitor LLM behavior evaluation runs.",
+)
+
+
+@eval_app.command("run", epilog=eval_epilog)
+def eval_run(
+    preset: Annotated[
+        str,
+        typer.Option(
+            "--preset",
+            help="Evaluation preset. One of: BBQ_BIAS, BBQ_UNBIAS, UNQOVER_BIAS, HALU_EVAL, MED_HALLU, INJECTION_EVAL",
+        ),
+    ],
+    model_id: Annotated[
+        Optional[int],
+        typer.Option("--model-id", help="ID of the LLM model to evaluate."),
+    ] = None,
+    source_run_id: Annotated[
+        Optional[str],
+        typer.Option("--source-run-id", help="ID of the unlearning run to evaluate."),
+    ] = None,
+    name: Annotated[
+        Optional[str],
+        typer.Option("--name", help="Optional name for this evaluation run."),
+    ] = None,
+    wait: Annotated[
+        bool,
+        typer.Option("--wait/--no-wait", help="Wait for the run to complete and stream progress."),
+    ] = True,
+):
+    """
+    Launch an LLM behavior evaluation run.
+
+    Either --model-id or --source-run-id must be provided.
+    """
+    from hirundo.llm_behavior_eval import EvalRunInfo, LlmBehaviorEval, ModelOrRun, PresetType
+
+    if model_id is None and source_run_id is None:
+        console.print("[red]Error: either --model-id or --source-run-id must be provided.[/red]")
+        raise typer.Exit(code=1)
+    if model_id is not None and source_run_id is not None:
+        console.print("[red]Error: only one of --model-id or --source-run-id may be provided.[/red]")
+        raise typer.Exit(code=1)
+
+    try:
+        preset_type = PresetType(preset.upper())
+    except ValueError:
+        valid = ", ".join(p.value for p in PresetType)
+        console.print(f"[red]Invalid preset '{preset}'. Valid options: {valid}[/red]")
+        raise typer.Exit(code=1)
+
+    model_or_run = ModelOrRun.MODEL if model_id is not None else ModelOrRun.RUN
+    run_info = EvalRunInfo(
+        model_id=model_id,
+        source_run_id=source_run_id,
+        preset_type=preset_type,
+        name=name,
+    )
+
+    run_id = LlmBehaviorEval.launch_eval_run(model_or_run, run_info)
+    console.print(f"Eval run started. Run ID: [bold]{run_id}[/bold]")
+
+    if wait:
+        LlmBehaviorEval.check_run_by_id(run_id)
+    else:
+        console.print(
+            "Use [bold]hirundo eval check[/bold] [italic]<run_id>[/italic] to monitor progress."
+        )
+
+
+@eval_app.command("list", epilog=eval_epilog)
+def eval_list(
+    archived: Annotated[
+        bool,
+        typer.Option("--archived/--no-archived", help="Include archived runs."),
+    ] = False,
+):
+    """
+    List LLM behavior evaluation runs.
+    """
+    from hirundo.llm_behavior_eval import LlmBehaviorEval
+
+    runs = LlmBehaviorEval.list_runs(archived=archived)
+
+    table = Table(title="Eval Runs:", expand=True)
+    for col in ("Run ID", "Name", "Status", "Preset", "Created At"):
+        table.add_column(col, overflow="fold")
+    for run in runs:
+        table.add_row(
+            str(run.run_id),
+            str(run.name),
+            str(run.status),
+            run.preset_type.value if run.preset_type else None,
+            run.created_at.isoformat(),
+        )
+    console.print(table)
+
+
+@eval_app.command("check", epilog=eval_epilog)
+def eval_check(
+    run_id: Annotated[str, typer.Argument(help="The run ID to check.")],
+):
+    """
+    Check the status of an LLM behavior evaluation run and stream progress.
+    """
+    from hirundo.llm_behavior_eval import LlmBehaviorEval
+
+    LlmBehaviorEval.check_run_by_id(run_id)
diff --git a/hirundo/cli_unlearning.py b/hirundo/cli_unlearning.py
new file mode 100644
index 00000000..6be6fed2
--- /dev/null
+++ b/hirundo/cli_unlearning.py
@@ -0,0 +1,161 @@
+"""
+CLI sub-app for LLM unlearning commands.
+
+Commands:
+    hirundo unlearning run    - Launch an LLM unlearning run
+    hirundo unlearning list   - List LLM unlearning runs
+    hirundo unlearning check  - Check the status of an LLM unlearning run
+"""
+
+import sys
+from typing import Annotated, Optional
+
+import typer
+from rich.console import Console
+from rich.table import Table
+
+docs = "sphinx" in sys.modules
+unlearning_epilog = (
+    None
+    if docs
+    else "Made with ❤️ by Hirundo. Visit https://www.hirundo.io for more information."
+)
+
+console = Console()
+
+unlearning_app = typer.Typer(
+    name="unlearning",
+    no_args_is_help=True,
+    rich_markup_mode="rich",
+    epilog=unlearning_epilog,
+    help="Launch and monitor LLM unlearning runs.",
+)
+
+
+@unlearning_app.command("run", epilog=unlearning_epilog)
+def unlearning_run(
+    model_id: Annotated[int, typer.Argument(help="ID of the LLM model to unlearn.")],
+    bias_type: Annotated[
+        Optional[str],
+        typer.Option(
+            "--bias-type",
+            help="Bias type for unlearning. One of: ALL, RACE, NATIONALITY, GENDER, PHYSICAL_APPEARANCE, RELIGION, AGE",
+        ),
+    ] = None,
+    hallucination_type: Annotated[
+        Optional[str],
+        typer.Option(
+            "--hallucination-type",
+            help="Hallucination type for unlearning. One of: GENERAL, MEDICAL, LEGAL, DEFENSE",
+        ),
+    ] = None,
+    name: Annotated[
+        Optional[str],
+        typer.Option("--name", help="Optional name for this unlearning run."),
+    ] = None,
+    wait: Annotated[
+        bool,
+        typer.Option("--wait/--no-wait", help="Wait for the run to complete and stream progress."),
+    ] = True,
+):
+    """
+    Launch an LLM unlearning run.
+
+    Exactly one of --bias-type or --hallucination-type must be provided.
+    """
+    from hirundo.llm_bias_type import BBQBiasType
+    from hirundo.unlearning_llm import (
+        BiasBehavior,
+        DefaultUtility,
+        HallucinationBehavior,
+        HallucinationType,
+        LlmRunInfo,
+        LlmUnlearningRun,
+    )
+
+    if bias_type is None and hallucination_type is None:
+        console.print(
+            "[red]Error: either --bias-type or --hallucination-type must be provided.[/red]"
+        )
+        raise typer.Exit(code=1)
+    if bias_type is not None and hallucination_type is not None:
+        console.print(
+            "[red]Error: only one of --bias-type or --hallucination-type may be provided.[/red]"
+        )
+        raise typer.Exit(code=1)
+
+    if bias_type is not None:
+        try:
+            bias_type_enum = BBQBiasType(bias_type.upper())
+        except ValueError:
+            valid = ", ".join(b.value for b in BBQBiasType)
+            console.print(
+                f"[red]Invalid bias type '{bias_type}'. Valid options: {valid}[/red]"
+            )
+            raise typer.Exit(code=1)
+        target_behavior = BiasBehavior(bias_type=bias_type_enum)
+    else:
+        try:
+            hallucination_type_enum = HallucinationType(hallucination_type.upper())
+        except ValueError:
+            valid = ", ".join(h.value for h in HallucinationType)
+            console.print(
+                f"[red]Invalid hallucination type '{hallucination_type}'. Valid options: {valid}[/red]"
+            )
+            raise typer.Exit(code=1)
+        target_behavior = HallucinationBehavior(hallucination_type=hallucination_type_enum)
+
+    run_info = LlmRunInfo(
+        name=name,
+        target_behaviors=[target_behavior],
+        target_utilities=[DefaultUtility()],
+    )
+
+    run_id = LlmUnlearningRun.launch(model_id, run_info)
+    console.print(f"Unlearning run started. Run ID: [bold]{run_id}[/bold]")
+
+    if wait:
+        LlmUnlearningRun.check_run_by_id(run_id)
+    else:
+        console.print(
+            "Use [bold]hirundo unlearning check[/bold] [italic]<run_id>[/italic] to monitor progress."
+        )
+
+
+@unlearning_app.command("list", epilog=unlearning_epilog)
+def unlearning_list(
+    archived: Annotated[
+        bool,
+        typer.Option("--archived/--no-archived", help="Include archived runs."),
+    ] = False,
+):
+    """
+    List LLM unlearning runs.
+    """
+    from hirundo.unlearning_llm import LlmUnlearningRun
+
+    runs = LlmUnlearningRun.list(archived=archived)
+
+    table = Table(title="Unlearning Runs:", expand=True)
+    for col in ("Name", "Run ID", "Status", "Created At"):
+        table.add_column(col, overflow="fold")
+    for run in runs:
+        table.add_row(
+            str(run.name),
+            str(run.run_id),
+            str(run.status),
+            run.created_at.isoformat(),
+        )
+    console.print(table)
+
+
+@unlearning_app.command("check", epilog=unlearning_epilog)
+def unlearning_check(
+    run_id: Annotated[str, typer.Argument(help="The run ID to check.")],
+):
+    """
+    Check the status of an LLM unlearning run and stream progress.
+    """
+    from hirundo.unlearning_llm import LlmUnlearningRun
+
+    LlmUnlearningRun.check_run_by_id(run_id)

From 7b375da6420812a7ec4c0cfb80b29bffa4468ec4 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 10 Apr 2026 00:27:20 +0000
Subject: [PATCH 02/13] Deduplicate CLI boilerplate into _cli_common module
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extract repeated docs/epilog/console/Typer setup from the three
CLI sub-apps into hirundo/_cli_common.py:

- hirundo_epilog, docs, console — computed/instantiated once
- make_app() — factory for sub-app Typer instances
- validate_enum() — replaces copy-paste try/except enum coercion

Remove verbose module docstrings that duplicated typer --help output.
Remove local Console() instantiation from cli.py list_runs().

https://claude.ai/code/session_01QGjQJb8G8DM8UaYt66Jewt
---
 hirundo/_cli_common.py    | 32 ++++++++++++++++++++++
 hirundo/cli.py            | 11 +-------
 hirundo/cli_dataset_qa.py | 34 ++++--------------------
 hirundo/cli_eval.py       | 42 +++++------------------------
 hirundo/cli_unlearning.py | 56 +++++++--------------------------------
 5 files changed, 53 insertions(+), 122 deletions(-)
 create mode 100644 hirundo/_cli_common.py

diff --git a/hirundo/_cli_common.py b/hirundo/_cli_common.py
new file mode 100644
index 00000000..c564588f
--- /dev/null
+++ b/hirundo/_cli_common.py
@@ -0,0 +1,32 @@
+import sys
+
+import typer
+from rich.console import Console
+
+docs = "sphinx" in sys.modules
+hirundo_epilog = (
+    None
+    if docs
+    else "Made with ❤️ by Hirundo. Visit https://www.hirundo.io for more information."
+)
+
+console = Console()
+
+
+def make_app(name: str, help: str) -> typer.Typer:
+    return typer.Typer(
+        name=name,
+        no_args_is_help=True,
+        rich_markup_mode="rich",
+        epilog=hirundo_epilog,
+        help=help,
+    )
+
+
+def validate_enum(value: str, enum_cls, label: str):
+    try:
+        return enum_cls(value.upper())
+    except ValueError:
+        valid = ", ".join(e.value for e in enum_cls)
+        console.print(f"[red]Invalid {label} '{value}'. Valid options: {valid}[/red]")
+        raise typer.Exit(code=1)
diff --git a/hirundo/cli.py b/hirundo/cli.py
index 76fbc439..24e8bd60 100644
--- a/hirundo/cli.py
+++ b/hirundo/cli.py
@@ -1,26 +1,18 @@
 import os
 import re
-import sys
 from pathlib import Path
 from typing import Annotated
 from urllib.parse import urlparse
 
 import typer
-from rich.console import Console
 from rich.table import Table
 
+from hirundo._cli_common import console, docs, hirundo_epilog
 from hirundo._env import API_HOST, EnvLocation
 from hirundo.cli_dataset_qa import dataset_qa_app
 from hirundo.cli_eval import eval_app
 from hirundo.cli_unlearning import unlearning_app
 
-docs = "sphinx" in sys.modules
-hirundo_epilog = (
-    None
-    if docs
-    else "Made with ❤️ by Hirundo. Visit https://www.hirundo.io for more information."
-)
-
 
 app = typer.Typer(
     name="hirundo",
@@ -217,7 +209,6 @@ def list_runs():
 
     runs = QADataset.list_runs()
 
-    console = Console()
     table = Table(
         title="Runs:",
         expand=True,
diff --git a/hirundo/cli_dataset_qa.py b/hirundo/cli_dataset_qa.py
index c47f1224..45e43a95 100644
--- a/hirundo/cli_dataset_qa.py
+++ b/hirundo/cli_dataset_qa.py
@@ -1,38 +1,14 @@
-"""
-CLI sub-app for Dataset QA commands.
-
-Commands:
-    hirundo dataset-qa run    - Launch a Dataset QA run
-    hirundo dataset-qa list   - List Dataset QA runs
-    hirundo dataset-qa check  - Check the status of a Dataset QA run
-"""
-
-import sys
 from typing import Annotated
 
 import typer
-from rich.console import Console
 from rich.table import Table
 
-docs = "sphinx" in sys.modules
-dataset_qa_epilog = (
-    None
-    if docs
-    else "Made with ❤️ by Hirundo. Visit https://www.hirundo.io for more information."
-)
-
-console = Console()
+from hirundo._cli_common import console, hirundo_epilog, make_app
 
-dataset_qa_app = typer.Typer(
-    name="dataset-qa",
-    no_args_is_help=True,
-    rich_markup_mode="rich",
-    epilog=dataset_qa_epilog,
-    help="Launch and monitor Dataset QA runs.",
-)
+dataset_qa_app = make_app("dataset-qa", "Launch and monitor Dataset QA runs.")
 
 
-@dataset_qa_app.command("run", epilog=dataset_qa_epilog)
+@dataset_qa_app.command("run", epilog=hirundo_epilog)
 def dataset_qa_run(
     dataset_id: Annotated[int, typer.Argument(help="ID of the dataset to run QA on.")],
     wait: Annotated[
@@ -56,7 +32,7 @@ def dataset_qa_run(
         )
 
 
-@dataset_qa_app.command("list", epilog=dataset_qa_epilog)
+@dataset_qa_app.command("list", epilog=hirundo_epilog)
 def dataset_qa_list(
     archived: Annotated[
         bool,
@@ -84,7 +60,7 @@ def dataset_qa_list(
     console.print(table)
 
 
-@dataset_qa_app.command("check", epilog=dataset_qa_epilog)
+@dataset_qa_app.command("check", epilog=hirundo_epilog)
 def dataset_qa_check(
     run_id: Annotated[str, typer.Argument(help="The run ID to check.")],
 ):
diff --git a/hirundo/cli_eval.py b/hirundo/cli_eval.py
index ab019a0f..e54fa737 100644
--- a/hirundo/cli_eval.py
+++ b/hirundo/cli_eval.py
@@ -1,38 +1,14 @@
-"""
-CLI sub-app for LLM behavior evaluation commands.
-
-Commands:
-    hirundo eval run    - Launch an LLM behavior evaluation run
-    hirundo eval list   - List evaluation runs
-    hirundo eval check  - Check the status of an evaluation run
-"""
-
-import sys
 from typing import Annotated, Optional
 
 import typer
-from rich.console import Console
 from rich.table import Table
 
-docs = "sphinx" in sys.modules
-eval_epilog = (
-    None
-    if docs
-    else "Made with ❤️ by Hirundo. Visit https://www.hirundo.io for more information."
-)
-
-console = Console()
+from hirundo._cli_common import console, hirundo_epilog, make_app, validate_enum
 
-eval_app = typer.Typer(
-    name="eval",
-    no_args_is_help=True,
-    rich_markup_mode="rich",
-    epilog=eval_epilog,
-    help="Launch and monitor LLM behavior evaluation runs.",
-)
+eval_app = make_app("eval", "Launch and monitor LLM behavior evaluation runs.")
 
 
-@eval_app.command("run", epilog=eval_epilog)
+@eval_app.command("run", epilog=hirundo_epilog)
 def eval_run(
     preset: Annotated[
         str,
@@ -72,13 +48,7 @@ def eval_run(
         console.print("[red]Error: only one of --model-id or --source-run-id may be provided.[/red]")
         raise typer.Exit(code=1)
 
-    try:
-        preset_type = PresetType(preset.upper())
-    except ValueError:
-        valid = ", ".join(p.value for p in PresetType)
-        console.print(f"[red]Invalid preset '{preset}'. Valid options: {valid}[/red]")
-        raise typer.Exit(code=1)
-
+    preset_type = validate_enum(preset, PresetType, "preset")
     model_or_run = ModelOrRun.MODEL if model_id is not None else ModelOrRun.RUN
     run_info = EvalRunInfo(
         model_id=model_id,
@@ -98,7 +68,7 @@ def eval_run(
         )
 
 
-@eval_app.command("list", epilog=eval_epilog)
+@eval_app.command("list", epilog=hirundo_epilog)
 def eval_list(
     archived: Annotated[
         bool,
@@ -126,7 +96,7 @@ def eval_list(
     console.print(table)
 
 
-@eval_app.command("check", epilog=eval_epilog)
+@eval_app.command("check", epilog=hirundo_epilog)
 def eval_check(
     run_id: Annotated[str, typer.Argument(help="The run ID to check.")],
 ):
diff --git a/hirundo/cli_unlearning.py b/hirundo/cli_unlearning.py
index 6be6fed2..d920f36a 100644
--- a/hirundo/cli_unlearning.py
+++ b/hirundo/cli_unlearning.py
@@ -1,38 +1,14 @@
-"""
-CLI sub-app for LLM unlearning commands.
-
-Commands:
-    hirundo unlearning run    - Launch an LLM unlearning run
-    hirundo unlearning list   - List LLM unlearning runs
-    hirundo unlearning check  - Check the status of an LLM unlearning run
-"""
-
-import sys
 from typing import Annotated, Optional
 
 import typer
-from rich.console import Console
 from rich.table import Table
 
-docs = "sphinx" in sys.modules
-unlearning_epilog = (
-    None
-    if docs
-    else "Made with ❤️ by Hirundo. Visit https://www.hirundo.io for more information."
-)
+from hirundo._cli_common import console, hirundo_epilog, make_app, validate_enum
 
-console = Console()
+unlearning_app = make_app("unlearning", "Launch and monitor LLM unlearning runs.")
 
-unlearning_app = typer.Typer(
-    name="unlearning",
-    no_args_is_help=True,
-    rich_markup_mode="rich",
-    epilog=unlearning_epilog,
-    help="Launch and monitor LLM unlearning runs.",
-)
 
-
-@unlearning_app.command("run", epilog=unlearning_epilog)
+@unlearning_app.command("run", epilog=hirundo_epilog)
 def unlearning_run(
     model_id: Annotated[int, typer.Argument(help="ID of the LLM model to unlearn.")],
     bias_type: Annotated[
@@ -85,25 +61,11 @@ def unlearning_run(
         raise typer.Exit(code=1)
 
     if bias_type is not None:
-        try:
-            bias_type_enum = BBQBiasType(bias_type.upper())
-        except ValueError:
-            valid = ", ".join(b.value for b in BBQBiasType)
-            console.print(
-                f"[red]Invalid bias type '{bias_type}'. Valid options: {valid}[/red]"
-            )
-            raise typer.Exit(code=1)
-        target_behavior = BiasBehavior(bias_type=bias_type_enum)
+        target_behavior = BiasBehavior(bias_type=validate_enum(bias_type, BBQBiasType, "bias type"))
     else:
-        try:
-            hallucination_type_enum = HallucinationType(hallucination_type.upper())
-        except ValueError:
-            valid = ", ".join(h.value for h in HallucinationType)
-            console.print(
-                f"[red]Invalid hallucination type '{hallucination_type}'. Valid options: {valid}[/red]"
-            )
-            raise typer.Exit(code=1)
-        target_behavior = HallucinationBehavior(hallucination_type=hallucination_type_enum)
+        target_behavior = HallucinationBehavior(
+            hallucination_type=validate_enum(hallucination_type, HallucinationType, "hallucination type")
+        )
 
     run_info = LlmRunInfo(
         name=name,
@@ -122,7 +84,7 @@ def unlearning_run(
         )
 
 
-@unlearning_app.command("list", epilog=unlearning_epilog)
+@unlearning_app.command("list", epilog=hirundo_epilog)
 def unlearning_list(
     archived: Annotated[
         bool,
@@ -149,7 +111,7 @@ def unlearning_list(
     console.print(table)
 
 
-@unlearning_app.command("check", epilog=unlearning_epilog)
+@unlearning_app.command("check", epilog=hirundo_epilog)
 def unlearning_check(
     run_id: Annotated[str, typer.Argument(help="The run ID to check.")],
 ):

From 43afadcae526b1d174c42aba6f97ac6d79933a5c Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 9 May 2026 21:43:27 +0000
Subject: [PATCH 03/13] Fix CI: ruff, basedpyright, and module-docs failures

ruff:
- Rename make_app `help` param to `help_text` (A002 builtin shadow)
- Use `raise typer.Exit(code=1) from None` in validate_enum to suppress
  implicit exception chaining (B904)
- Remove unused Optional imports now replaced by X | None (F401)
- Fix import ordering (I001, auto-fixed)
- Convert Optional[X] to X | None (UP045, auto-fixed)

basedpyright:
- Replace the bare `else` with `elif hallucination_type is not None` plus a
  final defensive `else` (unreachable after the earlier argument checks) so
  pyright can narrow `str | None` to `str` before calling validate_enum

module-docs:
- Add .rst files for the three new public CLI modules

https://claude.ai/code/session_01QGjQJb8G8DM8UaYt66Jewt
---
 docs/hirundo.cli_dataset_qa.rst | 10 ++++++++++
 docs/hirundo.cli_eval.rst       | 10 ++++++++++
 docs/hirundo.cli_unlearning.rst | 10 ++++++++++
 hirundo/_cli_common.py          |  6 +++---
 hirundo/cli_eval.py             | 15 ++++++++++-----
 hirundo/cli_unlearning.py       | 12 +++++++-----
 6 files changed, 50 insertions(+), 13 deletions(-)
 create mode 100644 docs/hirundo.cli_dataset_qa.rst
 create mode 100644 docs/hirundo.cli_eval.rst
 create mode 100644 docs/hirundo.cli_unlearning.rst

diff --git a/docs/hirundo.cli_dataset_qa.rst b/docs/hirundo.cli_dataset_qa.rst
new file mode 100644
index 00000000..2144fdf0
--- /dev/null
+++ b/docs/hirundo.cli_dataset_qa.rst
@@ -0,0 +1,10 @@
+.. meta::
+   :http-equiv=Content-Security-Policy: default-src 'self', frame-ancestors 'none'
+
+hirundo.cli_dataset_qa module
+=============================
+
+.. automodule:: hirundo.cli_dataset_qa
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/hirundo.cli_eval.rst b/docs/hirundo.cli_eval.rst
new file mode 100644
index 00000000..85c429f3
--- /dev/null
+++ b/docs/hirundo.cli_eval.rst
@@ -0,0 +1,10 @@
+.. meta::
+   :http-equiv=Content-Security-Policy: default-src 'self', frame-ancestors 'none'
+
+hirundo.cli_eval module
+=======================
+
+.. automodule:: hirundo.cli_eval
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/hirundo.cli_unlearning.rst b/docs/hirundo.cli_unlearning.rst
new file mode 100644
index 00000000..1b54735e
--- /dev/null
+++ b/docs/hirundo.cli_unlearning.rst
@@ -0,0 +1,10 @@
+.. meta::
+   :http-equiv=Content-Security-Policy: default-src 'self', frame-ancestors 'none'
+
+hirundo.cli_unlearning module
+=============================
+
+.. automodule:: hirundo.cli_unlearning
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/hirundo/_cli_common.py b/hirundo/_cli_common.py
index c564588f..154bc891 100644
--- a/hirundo/_cli_common.py
+++ b/hirundo/_cli_common.py
@@ -13,13 +13,13 @@
 console = Console()
 
 
-def make_app(name: str, help: str) -> typer.Typer:
+def make_app(name: str, help_text: str) -> typer.Typer:
     return typer.Typer(
         name=name,
         no_args_is_help=True,
         rich_markup_mode="rich",
         epilog=hirundo_epilog,
-        help=help,
+        help=help_text,
     )
 
 
@@ -29,4 +29,4 @@ def validate_enum(value: str, enum_cls, label: str):
     except ValueError:
         valid = ", ".join(e.value for e in enum_cls)
         console.print(f"[red]Invalid {label} '{value}'. Valid options: {valid}[/red]")
-        raise typer.Exit(code=1)
+        raise typer.Exit(code=1) from None
diff --git a/hirundo/cli_eval.py b/hirundo/cli_eval.py
index e54fa737..ce2416ce 100644
--- a/hirundo/cli_eval.py
+++ b/hirundo/cli_eval.py
@@ -1,4 +1,4 @@
-from typing import Annotated, Optional
+from typing import Annotated
 
 import typer
 from rich.table import Table
@@ -18,15 +18,15 @@ def eval_run(
         ),
     ],
     model_id: Annotated[
-        Optional[int],
+        int | None,
         typer.Option("--model-id", help="ID of the LLM model to evaluate."),
     ] = None,
     source_run_id: Annotated[
-        Optional[str],
+        str | None,
         typer.Option("--source-run-id", help="ID of the unlearning run to evaluate."),
     ] = None,
     name: Annotated[
-        Optional[str],
+        str | None,
         typer.Option("--name", help="Optional name for this evaluation run."),
     ] = None,
     wait: Annotated[
@@ -39,7 +39,12 @@ def eval_run(
 
     Either --model-id or --source-run-id must be provided.
     """
-    from hirundo.llm_behavior_eval import EvalRunInfo, LlmBehaviorEval, ModelOrRun, PresetType
+    from hirundo.llm_behavior_eval import (
+        EvalRunInfo,
+        LlmBehaviorEval,
+        ModelOrRun,
+        PresetType,
+    )
 
     if model_id is None and source_run_id is None:
         console.print("[red]Error: either --model-id or --source-run-id must be provided.[/red]")
diff --git a/hirundo/cli_unlearning.py b/hirundo/cli_unlearning.py
index d920f36a..4c228af1 100644
--- a/hirundo/cli_unlearning.py
+++ b/hirundo/cli_unlearning.py
@@ -1,4 +1,4 @@
-from typing import Annotated, Optional
+from typing import Annotated
 
 import typer
 from rich.table import Table
@@ -12,21 +12,21 @@
 def unlearning_run(
     model_id: Annotated[int, typer.Argument(help="ID of the LLM model to unlearn.")],
     bias_type: Annotated[
-        Optional[str],
+        str | None,
         typer.Option(
             "--bias-type",
             help="Bias type for unlearning. One of: ALL, RACE, NATIONALITY, GENDER, PHYSICAL_APPEARANCE, RELIGION, AGE",
         ),
     ] = None,
     hallucination_type: Annotated[
-        Optional[str],
+        str | None,
         typer.Option(
             "--hallucination-type",
             help="Hallucination type for unlearning. One of: GENERAL, MEDICAL, LEGAL, DEFENSE",
         ),
     ] = None,
     name: Annotated[
-        Optional[str],
+        str | None,
         typer.Option("--name", help="Optional name for this unlearning run."),
     ] = None,
     wait: Annotated[
@@ -62,10 +62,12 @@ def unlearning_run(
 
     if bias_type is not None:
         target_behavior = BiasBehavior(bias_type=validate_enum(bias_type, BBQBiasType, "bias type"))
-    else:
+    elif hallucination_type is not None:
         target_behavior = HallucinationBehavior(
             hallucination_type=validate_enum(hallucination_type, HallucinationType, "hallucination type")
         )
+    else:
+        raise typer.Exit(code=1) from None
 
     run_info = LlmRunInfo(
         name=name,

From d235f6bf9c44dd667f5b730bda438ce374069340 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 9 May 2026 21:43:42 +0000
Subject: [PATCH 04/13] Remove extra blank line in cli.py (linter)

https://claude.ai/code/session_01QGjQJb8G8DM8UaYt66Jewt
---
 hirundo/cli.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/hirundo/cli.py b/hirundo/cli.py
index 24e8bd60..ea4d841e 100644
--- a/hirundo/cli.py
+++ b/hirundo/cli.py
@@ -13,7 +13,6 @@
 from hirundo.cli_eval import eval_app
 from hirundo.cli_unlearning import unlearning_app
 
-
 app = typer.Typer(
     name="hirundo",
     no_args_is_help=True,

From f1a4e0f5a901f40e712e8c6e95356a6b5197f6fc Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 9 May 2026 21:47:56 +0000
Subject: [PATCH 05/13] Fix ruff format failures

https://claude.ai/code/session_01QGjQJb8G8DM8UaYt66Jewt
---
 hirundo/cli_dataset_qa.py |  4 +++-
 hirundo/cli_eval.py       | 12 +++++++++---
 hirundo/cli_unlearning.py | 12 +++++++++---
 3 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/hirundo/cli_dataset_qa.py b/hirundo/cli_dataset_qa.py
index 45e43a95..89cf5209 100644
--- a/hirundo/cli_dataset_qa.py
+++ b/hirundo/cli_dataset_qa.py
@@ -13,7 +13,9 @@ def dataset_qa_run(
     dataset_id: Annotated[int, typer.Argument(help="ID of the dataset to run QA on.")],
     wait: Annotated[
         bool,
-        typer.Option("--wait/--no-wait", help="Wait for the run to complete and stream progress."),
+        typer.Option(
+            "--wait/--no-wait", help="Wait for the run to complete and stream progress."
+        ),
     ] = True,
 ):
     """
diff --git a/hirundo/cli_eval.py b/hirundo/cli_eval.py
index ce2416ce..2fdf2c2b 100644
--- a/hirundo/cli_eval.py
+++ b/hirundo/cli_eval.py
@@ -31,7 +31,9 @@ def eval_run(
     ] = None,
     wait: Annotated[
         bool,
-        typer.Option("--wait/--no-wait", help="Wait for the run to complete and stream progress."),
+        typer.Option(
+            "--wait/--no-wait", help="Wait for the run to complete and stream progress."
+        ),
     ] = True,
 ):
     """
@@ -47,10 +49,14 @@ def eval_run(
     )
 
     if model_id is None and source_run_id is None:
-        console.print("[red]Error: either --model-id or --source-run-id must be provided.[/red]")
+        console.print(
+            "[red]Error: either --model-id or --source-run-id must be provided.[/red]"
+        )
         raise typer.Exit(code=1)
     if model_id is not None and source_run_id is not None:
-        console.print("[red]Error: only one of --model-id or --source-run-id may be provided.[/red]")
+        console.print(
+            "[red]Error: only one of --model-id or --source-run-id may be provided.[/red]"
+        )
         raise typer.Exit(code=1)
 
     preset_type = validate_enum(preset, PresetType, "preset")
diff --git a/hirundo/cli_unlearning.py b/hirundo/cli_unlearning.py
index 4c228af1..987cbf4a 100644
--- a/hirundo/cli_unlearning.py
+++ b/hirundo/cli_unlearning.py
@@ -31,7 +31,9 @@ def unlearning_run(
     ] = None,
     wait: Annotated[
         bool,
-        typer.Option("--wait/--no-wait", help="Wait for the run to complete and stream progress."),
+        typer.Option(
+            "--wait/--no-wait", help="Wait for the run to complete and stream progress."
+        ),
     ] = True,
 ):
     """
@@ -61,10 +63,14 @@ def unlearning_run(
         raise typer.Exit(code=1)
 
     if bias_type is not None:
-        target_behavior = BiasBehavior(bias_type=validate_enum(bias_type, BBQBiasType, "bias type"))
+        target_behavior = BiasBehavior(
+            bias_type=validate_enum(bias_type, BBQBiasType, "bias type")
+        )
     elif hallucination_type is not None:
         target_behavior = HallucinationBehavior(
-            hallucination_type=validate_enum(hallucination_type, HallucinationType, "hallucination type")
+            hallucination_type=validate_enum(
+                hallucination_type, HallucinationType, "hallucination type"
+            )
         )
     else:
         raise typer.Exit(code=1) from None

From 23e9cc8bff1b2e229ba40e183efd295ac3e2eac0 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 9 May 2026 21:52:56 +0000
Subject: [PATCH 06/13] Validate run_id in check commands to prevent path
 traversal

run_id flows into cache_dir / f"{run_id}.zip" in unzip.py.
A malicious value like ../../evil would escape ~/.hirundo/cache/.

Add validate_run_id() to _cli_common. It rejects any run ID containing
characters outside [a-zA-Z0-9_-] (UUIDs and integer IDs still pass). Apply it
in the check command of each sub-app before passing run_id to the SDK.

https://claude.ai/code/session_01QGjQJb8G8DM8UaYt66Jewt
---
 hirundo/_cli_common.py    | 13 +++++++++++++
 hirundo/cli_dataset_qa.py |  4 ++--
 hirundo/cli_eval.py       | 10 ++++++++--
 hirundo/cli_unlearning.py | 10 ++++++++--
 4 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/hirundo/_cli_common.py b/hirundo/_cli_common.py
index 154bc891..183ee4e0 100644
--- a/hirundo/_cli_common.py
+++ b/hirundo/_cli_common.py
@@ -1,8 +1,11 @@
+import re
 import sys
 
 import typer
 from rich.console import Console
 
+_RUN_ID_RE = re.compile(r"^[a-zA-Z0-9_-]+$")
+
 docs = "sphinx" in sys.modules
 hirundo_epilog = (
     None
@@ -23,6 +26,16 @@ def make_app(name: str, help_text: str) -> typer.Typer:
     )
 
 
+def validate_run_id(run_id: str) -> str:
+    if not _RUN_ID_RE.match(run_id):
+        console.print(
+            f"[red]Invalid run ID '{run_id}'. "
+            "Run IDs may only contain alphanumeric characters, hyphens, and underscores.[/red]"
+        )
+        raise typer.Exit(code=1) from None
+    return run_id
+
+
 def validate_enum(value: str, enum_cls, label: str):
     try:
         return enum_cls(value.upper())
diff --git a/hirundo/cli_dataset_qa.py b/hirundo/cli_dataset_qa.py
index 89cf5209..0954323b 100644
--- a/hirundo/cli_dataset_qa.py
+++ b/hirundo/cli_dataset_qa.py
@@ -3,7 +3,7 @@
 import typer
 from rich.table import Table
 
-from hirundo._cli_common import console, hirundo_epilog, make_app
+from hirundo._cli_common import console, hirundo_epilog, make_app, validate_run_id
 
 dataset_qa_app = make_app("dataset-qa", "Launch and monitor Dataset QA runs.")
 
@@ -71,4 +71,4 @@ def dataset_qa_check(
     """
     from hirundo.dataset_qa import QADataset
 
-    QADataset.check_run_by_id(run_id)
+    QADataset.check_run_by_id(validate_run_id(run_id))
diff --git a/hirundo/cli_eval.py b/hirundo/cli_eval.py
index 2fdf2c2b..5e4b49e0 100644
--- a/hirundo/cli_eval.py
+++ b/hirundo/cli_eval.py
@@ -3,7 +3,13 @@
 import typer
 from rich.table import Table
 
-from hirundo._cli_common import console, hirundo_epilog, make_app, validate_enum
+from hirundo._cli_common import (
+    console,
+    hirundo_epilog,
+    make_app,
+    validate_enum,
+    validate_run_id,
+)
 
 eval_app = make_app("eval", "Launch and monitor LLM behavior evaluation runs.")
 
@@ -116,4 +122,4 @@ def eval_check(
     """
     from hirundo.llm_behavior_eval import LlmBehaviorEval
 
-    LlmBehaviorEval.check_run_by_id(run_id)
+    LlmBehaviorEval.check_run_by_id(validate_run_id(run_id))
diff --git a/hirundo/cli_unlearning.py b/hirundo/cli_unlearning.py
index 987cbf4a..955aaefd 100644
--- a/hirundo/cli_unlearning.py
+++ b/hirundo/cli_unlearning.py
@@ -3,7 +3,13 @@
 import typer
 from rich.table import Table
 
-from hirundo._cli_common import console, hirundo_epilog, make_app, validate_enum
+from hirundo._cli_common import (
+    console,
+    hirundo_epilog,
+    make_app,
+    validate_enum,
+    validate_run_id,
+)
 
 unlearning_app = make_app("unlearning", "Launch and monitor LLM unlearning runs.")
 
@@ -128,4 +134,4 @@ def unlearning_check(
     """
     from hirundo.unlearning_llm import LlmUnlearningRun
 
-    LlmUnlearningRun.check_run_by_id(run_id)
+    LlmUnlearningRun.check_run_by_id(validate_run_id(run_id))

From 36663cfa646f71f4d306518d47ef369761f16544 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 9 May 2026 22:04:14 +0000
Subject: [PATCH 07/13] refactor(cli): extract wait_or_notify, print_runs_table
 helpers; surface cached_zip_path

Adds two shared helpers to _cli_common to eliminate copy-paste across the
three CLI sub-apps: wait_or_notify centralises the wait/notify branching,
and print_runs_table centralises Rich table construction. Also captures and
prints results.cached_zip_path for dataset-qa and eval check/run commands,
matching the legacy check-run behaviour.

https://claude.ai/code/session_01QGjQJb8G8DM8UaYt66Jewt
---
 hirundo/_cli_common.py    | 29 ++++++++++++++++++++++-
 hirundo/cli_dataset_qa.py | 50 ++++++++++++++++++++++-----------------
 hirundo/cli_eval.py       | 43 +++++++++++++++++----------------
 hirundo/cli_unlearning.py | 35 +++++++++++++--------------
 4 files changed, 94 insertions(+), 63 deletions(-)

diff --git a/hirundo/_cli_common.py b/hirundo/_cli_common.py
index 183ee4e0..24d9738f 100644
--- a/hirundo/_cli_common.py
+++ b/hirundo/_cli_common.py
@@ -1,8 +1,11 @@
 import re
 import sys
+from collections.abc import Callable
+from typing import Any
 
 import typer
 from rich.console import Console
+from rich.table import Table
 
 _RUN_ID_RE = re.compile(r"^[a-zA-Z0-9_-]+$")
 
@@ -36,10 +39,34 @@ def validate_run_id(run_id: str) -> str:
     return run_id
 
 
-def validate_enum(value: str, enum_cls, label: str):
+def validate_enum(value: str, enum_cls: type, label: str) -> Any:
     try:
         return enum_cls(value.upper())
     except ValueError:
         valid = ", ".join(e.value for e in enum_cls)
         console.print(f"[red]Invalid {label} '{value}'. Valid options: {valid}[/red]")
         raise typer.Exit(code=1) from None
+
+
+def wait_or_notify(
+    run_id: str, check_fn: Callable[[str], Any], cmd_name: str, wait: bool
+) -> Any:
+    if wait:
+        return check_fn(run_id)
+    console.print(
+        f"Use [bold]hirundo {cmd_name} check[/bold] [italic]<run_id>[/italic] to monitor progress."
+    )
+    return None
+
+
+def print_runs_table(
+    title: str,
+    columns: tuple[str, ...],
+    rows: list[tuple[str | None, ...]],
+) -> None:
+    table = Table(title=title, expand=True)
+    for col in columns:
+        table.add_column(col, overflow="fold")
+    for row in rows:
+        table.add_row(*row)
+    console.print(table)
diff --git a/hirundo/cli_dataset_qa.py b/hirundo/cli_dataset_qa.py
index 0954323b..50c7b2e9 100644
--- a/hirundo/cli_dataset_qa.py
+++ b/hirundo/cli_dataset_qa.py
@@ -1,9 +1,15 @@
 from typing import Annotated
 
 import typer
-from rich.table import Table
 
-from hirundo._cli_common import console, hirundo_epilog, make_app, validate_run_id
+from hirundo._cli_common import (
+    console,
+    hirundo_epilog,
+    make_app,
+    print_runs_table,
+    validate_run_id,
+    wait_or_notify,
+)
 
 dataset_qa_app = make_app("dataset-qa", "Launch and monitor Dataset QA runs.")
 
@@ -26,12 +32,9 @@ def dataset_qa_run(
     run_id = QADataset.launch_qa_run(dataset_id)
     console.print(f"Dataset QA run started. Run ID: [bold]{run_id}[/bold]")
 
-    if wait:
-        QADataset.check_run_by_id(run_id)
-    else:
-        console.print(
-            "Use [bold]hirundo dataset-qa check[/bold] [italic]<run_id>[/italic] to monitor progress."
-        )
+    results = wait_or_notify(run_id, QADataset.check_run_by_id, "dataset-qa", wait)
+    if results is not None:
+        console.print(f"Run results saved to {results.cached_zip_path}")
 
 
 @dataset_qa_app.command("list", epilog=hirundo_epilog)
@@ -47,19 +50,20 @@ def dataset_qa_list(
     from hirundo.dataset_qa import QADataset
 
     runs = QADataset.list_runs(archived=archived)
-
-    table = Table(title="Dataset QA Runs:", expand=True)
-    for col in ("Dataset Name", "Run ID", "Status", "Created At", "Run Args"):
-        table.add_column(col, overflow="fold")
-    for run in runs:
-        table.add_row(
-            str(run.name),
-            str(run.run_id),
-            str(run.status),
-            run.created_at.isoformat(),
-            run.run_args.model_dump_json() if run.run_args else None,
-        )
-    console.print(table)
+    print_runs_table(
+        "Dataset QA Runs:",
+        ("Dataset Name", "Run ID", "Status", "Created At", "Run Args"),
+        [
+            (
+                str(run.name),
+                str(run.run_id),
+                str(run.status),
+                run.created_at.isoformat(),
+                run.run_args.model_dump_json() if run.run_args else None,
+            )
+            for run in runs
+        ],
+    )
 
 
 @dataset_qa_app.command("check", epilog=hirundo_epilog)
@@ -71,4 +75,6 @@ def dataset_qa_check(
     """
     from hirundo.dataset_qa import QADataset
 
-    QADataset.check_run_by_id(validate_run_id(run_id))
+    results = QADataset.check_run_by_id(validate_run_id(run_id))
+    if results is not None:
+        console.print(f"Run results saved to {results.cached_zip_path}")
diff --git a/hirundo/cli_eval.py b/hirundo/cli_eval.py
index 5e4b49e0..4889fc2c 100644
--- a/hirundo/cli_eval.py
+++ b/hirundo/cli_eval.py
@@ -1,14 +1,15 @@
 from typing import Annotated
 
 import typer
-from rich.table import Table
 
 from hirundo._cli_common import (
     console,
     hirundo_epilog,
     make_app,
+    print_runs_table,
     validate_enum,
     validate_run_id,
+    wait_or_notify,
 )
 
 eval_app = make_app("eval", "Launch and monitor LLM behavior evaluation runs.")
@@ -77,12 +78,9 @@ def eval_run(
     run_id = LlmBehaviorEval.launch_eval_run(model_or_run, run_info)
     console.print(f"Eval run started. Run ID: [bold]{run_id}[/bold]")
 
-    if wait:
-        LlmBehaviorEval.check_run_by_id(run_id)
-    else:
-        console.print(
-            "Use [bold]hirundo eval check[/bold] [italic]<run_id>[/italic] to monitor progress."
-        )
+    results = wait_or_notify(run_id, LlmBehaviorEval.check_run_by_id, "eval", wait)
+    if results is not None:
+        console.print(f"Run results saved to {results.cached_zip_path}")
 
 
 @eval_app.command("list", epilog=hirundo_epilog)
@@ -98,19 +96,20 @@ def eval_list(
     from hirundo.llm_behavior_eval import LlmBehaviorEval
 
     runs = LlmBehaviorEval.list_runs(archived=archived)
-
-    table = Table(title="Eval Runs:", expand=True)
-    for col in ("Run ID", "Name", "Status", "Preset", "Created At"):
-        table.add_column(col, overflow="fold")
-    for run in runs:
-        table.add_row(
-            str(run.run_id),
-            str(run.name),
-            str(run.status),
-            run.preset_type.value if run.preset_type else None,
-            run.created_at.isoformat(),
-        )
-    console.print(table)
+    print_runs_table(
+        "Eval Runs:",
+        ("Run ID", "Name", "Status", "Preset", "Created At"),
+        [
+            (
+                str(run.run_id),
+                str(run.name),
+                str(run.status),
+                run.preset_type.value if run.preset_type else None,
+                run.created_at.isoformat(),
+            )
+            for run in runs
+        ],
+    )
 
 
 @eval_app.command("check", epilog=hirundo_epilog)
@@ -122,4 +121,6 @@ def eval_check(
     """
     from hirundo.llm_behavior_eval import LlmBehaviorEval
 
-    LlmBehaviorEval.check_run_by_id(validate_run_id(run_id))
+    results = LlmBehaviorEval.check_run_by_id(validate_run_id(run_id))
+    if results is not None:
+        console.print(f"Run results saved to {results.cached_zip_path}")
diff --git a/hirundo/cli_unlearning.py b/hirundo/cli_unlearning.py
index 955aaefd..c7df1495 100644
--- a/hirundo/cli_unlearning.py
+++ b/hirundo/cli_unlearning.py
@@ -1,14 +1,15 @@
 from typing import Annotated
 
 import typer
-from rich.table import Table
 
 from hirundo._cli_common import (
     console,
     hirundo_epilog,
     make_app,
+    print_runs_table,
     validate_enum,
     validate_run_id,
+    wait_or_notify,
 )
 
 unlearning_app = make_app("unlearning", "Launch and monitor LLM unlearning runs.")
@@ -90,12 +91,7 @@ def unlearning_run(
     run_id = LlmUnlearningRun.launch(model_id, run_info)
     console.print(f"Unlearning run started. Run ID: [bold]{run_id}[/bold]")
 
-    if wait:
-        LlmUnlearningRun.check_run_by_id(run_id)
-    else:
-        console.print(
-            "Use [bold]hirundo unlearning check[/bold] [italic]<run_id>[/italic] to monitor progress."
-        )
+    wait_or_notify(run_id, LlmUnlearningRun.check_run_by_id, "unlearning", wait)
 
 
 @unlearning_app.command("list", epilog=hirundo_epilog)
@@ -111,18 +107,19 @@ def unlearning_list(
     from hirundo.unlearning_llm import LlmUnlearningRun
 
     runs = LlmUnlearningRun.list(archived=archived)
-
-    table = Table(title="Unlearning Runs:", expand=True)
-    for col in ("Name", "Run ID", "Status", "Created At"):
-        table.add_column(col, overflow="fold")
-    for run in runs:
-        table.add_row(
-            str(run.name),
-            str(run.run_id),
-            str(run.status),
-            run.created_at.isoformat(),
-        )
-    console.print(table)
+    print_runs_table(
+        "Unlearning Runs:",
+        ("Name", "Run ID", "Status", "Created At"),
+        [
+            (
+                str(run.name),
+                str(run.run_id),
+                str(run.status),
+                run.created_at.isoformat(),
+            )
+            for run in runs
+        ],
+    )
 
 
 @unlearning_app.command("check", epilog=hirundo_epilog)

From 63c27230b7ec36c26c4369bb2fbe70c45003cda0 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 9 May 2026 22:05:49 +0000
Subject: [PATCH 08/13] fix(cli): use run.run_id instead of run.id in legacy
 list-runs table

The "Run ID" column was displaying the numeric DB id (run.id) instead of
the string run identifier (run.run_id), inconsistent with the dataset-qa list command.

https://claude.ai/code/session_01QGjQJb8G8DM8UaYt66Jewt
---
 hirundo/cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hirundo/cli.py b/hirundo/cli.py
index ea4d841e..3c84cb7e 100644
--- a/hirundo/cli.py
+++ b/hirundo/cli.py
@@ -227,7 +227,7 @@ def list_runs():
     for run in runs:
         table.add_row(
             str(run.name),
-            str(run.id),
+            str(run.run_id),
             str(run.status),
             run.created_at.isoformat(),
             run.run_args.model_dump_json() if run.run_args else None,

From cc2e8bfdc845b31694e260d9218aa388572cda47 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 9 May 2026 22:06:36 +0000
Subject: [PATCH 09/13] fix(cli): validate --source-run-id in eval run command

Applies the same validate_run_id() check used by eval check/unlearning check
so invalid run ID formats are rejected consistently before launching.

https://claude.ai/code/session_01QGjQJb8G8DM8UaYt66Jewt
---
 hirundo/cli_eval.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hirundo/cli_eval.py b/hirundo/cli_eval.py
index 4889fc2c..6ddc7863 100644
--- a/hirundo/cli_eval.py
+++ b/hirundo/cli_eval.py
@@ -66,6 +66,9 @@ def eval_run(
         )
         raise typer.Exit(code=1)
 
+    if source_run_id is not None:
+        source_run_id = validate_run_id(source_run_id)
+
     preset_type = validate_enum(preset, PresetType, "preset")
     model_or_run = ModelOrRun.MODEL if model_id is not None else ModelOrRun.RUN
     run_info = EvalRunInfo(

From 67dc58a019a59395068cf8db3511ebdf04584cb7 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 9 May 2026 22:12:51 +0000
Subject: [PATCH 10/13] fix(cli): use fullmatch for run ID validation to reject
 trailing newlines

With re.match, the pattern's trailing $ also matches just before a final
newline, so an ID such as "abc\n" was accepted; fullmatch requires the entire
string to consist of only the allowed characters.

https://claude.ai/code/session_01QGjQJb8G8DM8UaYt66Jewt
---
 hirundo/_cli_common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hirundo/_cli_common.py b/hirundo/_cli_common.py
index 24d9738f..a306ba66 100644
--- a/hirundo/_cli_common.py
+++ b/hirundo/_cli_common.py
@@ -30,7 +30,7 @@ def make_app(name: str, help_text: str) -> typer.Typer:
 
 
 def validate_run_id(run_id: str) -> str:
-    if not _RUN_ID_RE.match(run_id):
+    if not _RUN_ID_RE.fullmatch(run_id):
         console.print(
             f"[red]Invalid run ID '{run_id}'. "
             "Run IDs may only contain alphanumeric characters, hyphens, and underscores.[/red]"

From 474d44afd1bec7cb602c2fa6171beda374e6c510 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 9 May 2026 22:14:59 +0000
Subject: [PATCH 11/13] fix(cli): type validate_enum enum_cls as type[Enum] for
 basedpyright

bare `type` is not iterable; `type[Enum]` lets basedpyright know the class
supports iteration via EnumMeta.

https://claude.ai/code/session_01QGjQJb8G8DM8UaYt66Jewt
---
 hirundo/_cli_common.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hirundo/_cli_common.py b/hirundo/_cli_common.py
index a306ba66..0c1e1b2c 100644
--- a/hirundo/_cli_common.py
+++ b/hirundo/_cli_common.py
@@ -1,6 +1,7 @@
 import re
 import sys
 from collections.abc import Callable
+from enum import Enum
 from typing import Any
 
 import typer
@@ -39,7 +40,7 @@ def validate_run_id(run_id: str) -> str:
     return run_id
 
 
-def validate_enum(value: str, enum_cls: type, label: str) -> Any:
+def validate_enum(value: str, enum_cls: type[Enum], label: str) -> Any:
     try:
         return enum_cls(value.upper())
     except ValueError:

From 95812ac67919ef014cda46c8d8d02930abb4d014 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 9 May 2026 22:17:57 +0000
Subject: [PATCH 12/13] refactor(cli): use print_runs_table in legacy list-runs
 command

Replaces inline Rich Table construction with the shared print_runs_table
helper, removing the last duplication between list-runs and dataset-qa list.

https://claude.ai/code/session_01QGjQJb8G8DM8UaYt66Jewt
---
 hirundo/cli.py | 41 ++++++++++++++---------------------------
 1 file changed, 14 insertions(+), 27 deletions(-)

diff --git a/hirundo/cli.py b/hirundo/cli.py
index 3c84cb7e..1260205a 100644
--- a/hirundo/cli.py
+++ b/hirundo/cli.py
@@ -5,9 +5,8 @@
 from urllib.parse import urlparse
 
 import typer
-from rich.table import Table
 
-from hirundo._cli_common import console, docs, hirundo_epilog
+from hirundo._cli_common import docs, hirundo_epilog, print_runs_table
 from hirundo._env import API_HOST, EnvLocation
 from hirundo.cli_dataset_qa import dataset_qa_app
 from hirundo.cli_eval import eval_app
@@ -207,32 +206,20 @@ def list_runs():
     from hirundo.dataset_qa import QADataset
 
     runs = QADataset.list_runs()
-
-    table = Table(
-        title="Runs:",
-        expand=True,
-    )
-    cols = (
-        "Dataset name",
-        "Run ID",
-        "Status",
-        "Created At",
-        "Run Args",
+    print_runs_table(
+        "Runs:",
+        ("Dataset name", "Run ID", "Status", "Created At", "Run Args"),
+        [
+            (
+                str(run.name),
+                str(run.run_id),
+                str(run.status),
+                run.created_at.isoformat(),
+                run.run_args.model_dump_json() if run.run_args else None,
+            )
+            for run in runs
+        ],
     )
-    for col in cols:
-        table.add_column(
-            col,
-            overflow="fold",
-        )
-    for run in runs:
-        table.add_row(
-            str(run.name),
-            str(run.run_id),
-            str(run.status),
-            run.created_at.isoformat(),
-            run.run_args.model_dump_json() if run.run_args else None,
-        )
-    console.print(table)
 
 
 typer_click_object = typer.main.get_command(app)

From 9a4feff51c1f7a2014fe3bcd81ab6a754959ac1c Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 10 May 2026 18:20:08 +0000
Subject: [PATCH 13/13] refactor(cli): clean up duplication, naming, and add
 unit tests

- Rename loop variable `e` -> `member` in validate_enum
- Delegate legacy list-runs to dataset_qa_list to eliminate duplicate table rendering
- Add validate_run_id + None guard to legacy check-run for consistency
- Add tests/test_cli_common.py covering validate_run_id (valid/invalid cases)
  and wait_or_notify (both wait=True/False branches)

https://claude.ai/code/session_01QGjQJb8G8DM8UaYt66Jewt
---
 hirundo/_cli_common.py   |  2 +-
 hirundo/cli.py           | 27 +++++------------------
 tests/test_cli_common.py | 47 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 54 insertions(+), 22 deletions(-)
 create mode 100644 tests/test_cli_common.py

diff --git a/hirundo/_cli_common.py b/hirundo/_cli_common.py
index 0c1e1b2c..c2557fb7 100644
--- a/hirundo/_cli_common.py
+++ b/hirundo/_cli_common.py
@@ -44,7 +44,7 @@ def validate_enum(value: str, enum_cls: type[Enum], label: str) -> Any:
     try:
         return enum_cls(value.upper())
     except ValueError:
-        valid = ", ".join(e.value for e in enum_cls)
+        valid = ", ".join(member.value for member in enum_cls)
         console.print(f"[red]Invalid {label} '{value}'. Valid options: {valid}[/red]")
         raise typer.Exit(code=1) from None
 
diff --git a/hirundo/cli.py b/hirundo/cli.py
index 1260205a..0744df49 100644
--- a/hirundo/cli.py
+++ b/hirundo/cli.py
@@ -6,9 +6,9 @@
 
 import typer
 
-from hirundo._cli_common import docs, hirundo_epilog, print_runs_table
+from hirundo._cli_common import docs, hirundo_epilog, validate_run_id
 from hirundo._env import API_HOST, EnvLocation
-from hirundo.cli_dataset_qa import dataset_qa_app
+from hirundo.cli_dataset_qa import dataset_qa_app, dataset_qa_list
 from hirundo.cli_eval import eval_app
 from hirundo.cli_unlearning import unlearning_app
 
@@ -194,8 +194,9 @@ def check_run(
     """
     from hirundo.dataset_qa import QADataset
 
-    results = QADataset.check_run_by_id(run_id)
-    print(f"Run results saved to {results.cached_zip_path}")
+    results = QADataset.check_run_by_id(validate_run_id(run_id))
+    if results is not None:
+        print(f"Run results saved to {results.cached_zip_path}")
 
 
 @app.command("list-runs", epilog=hirundo_epilog)
@@ -203,23 +204,7 @@ def list_runs():
     """
     List all runs available.
     """
-    from hirundo.dataset_qa import QADataset
-
-    runs = QADataset.list_runs()
-    print_runs_table(
-        "Runs:",
-        ("Dataset name", "Run ID", "Status", "Created At", "Run Args"),
-        [
-            (
-                str(run.name),
-                str(run.run_id),
-                str(run.status),
-                run.created_at.isoformat(),
-                run.run_args.model_dump_json() if run.run_args else None,
-            )
-            for run in runs
-        ],
-    )
+    dataset_qa_list(archived=False)
 
 
 typer_click_object = typer.main.get_command(app)
diff --git a/tests/test_cli_common.py b/tests/test_cli_common.py
new file mode 100644
index 00000000..3b18d59d
--- /dev/null
+++ b/tests/test_cli_common.py
@@ -0,0 +1,47 @@
+from unittest.mock import MagicMock, patch
+
+import hirundo._cli_common as cli_common  # noqa: E402
+import pytest
+import typer
+from hirundo._cli_common import validate_run_id, wait_or_notify
+
+
+class TestValidateRunId:
+    def test_valid_id_returned_unchanged(self):
+        assert validate_run_id("abc-123_XYZ") == "abc-123_XYZ"
+
+    @pytest.mark.parametrize(
+        "bad_id", ["run/id", "run\\id", "run id", "run\nid", "run.id", ""]
+    )
+    def test_invalid_id_exits(self, bad_id):
+        with pytest.raises(typer.Exit) as exc:
+            validate_run_id(bad_id)
+        assert exc.value.exit_code == 1
+
+    def test_invalid_id_prints_message(self, bad_id="bad id"):
+        with (
+            patch.object(cli_common.console, "print") as mock_print,
+            pytest.raises(typer.Exit),
+        ):
+            validate_run_id(bad_id)
+        output = mock_print.call_args[0][0]
+        assert "bad id" in output
+        assert "may only contain" in output
+
+
+class TestWaitOrNotify:
+    def test_wait_true_calls_check_fn_and_returns_result(self):
+        check_fn = MagicMock(return_value="result")
+        assert wait_or_notify("run-1", check_fn, "dataset-qa", wait=True) == "result"
+        check_fn.assert_called_once_with("run-1")
+
+    def test_wait_false_returns_none_without_calling_check_fn(self):
+        check_fn = MagicMock()
+        assert wait_or_notify("run-1", check_fn, "dataset-qa", wait=False) is None
+        check_fn.assert_not_called()
+
+    def test_wait_false_prints_check_hint(self):
+        with patch.object(cli_common.console, "print") as mock_print:
+            wait_or_notify("run-1", MagicMock(), "dataset-qa", wait=False)
+        output = mock_print.call_args[0][0]
+        assert "dataset-qa check" in output