From 1c091b48a6293bd3eebb54e0134a018f9c630a5b Mon Sep 17 00:00:00 2001
From: Ben Lewis <hello@blewis.me>
Date: Tue, 13 Jan 2026 15:44:50 +0200
Subject: [PATCH 01/24] Add LLM eval metric models

---
 hirundo/__init__.py          |  18 ++
 hirundo/llm_behavior_eval.py | 393 +++++++++++++++++++++++++++++++++++
 2 files changed, 411 insertions(+)
 create mode 100644 hirundo/llm_behavior_eval.py

diff --git a/hirundo/__init__.py b/hirundo/__init__.py
index 96c51f2e..3028857f 100644
--- a/hirundo/__init__.py
+++ b/hirundo/__init__.py
@@ -23,6 +23,16 @@
     KeylabsObjSegImages,
     KeylabsObjSegVideo,
 )
+from .llm_behavior_eval import (
+    BiasType,
+    EvalRunInfo,
+    EvalRunRecord,
+    JudgeModel,
+    LlmBehaviorEval,
+    LlmBehaviorEvalError,
+    ModelOrRun,
+    PresetType,
+)
 from .storage import (
     StorageConfig,
     StorageGCP,
@@ -42,8 +52,16 @@
     "KeylabsObjDetVideo",
     "KeylabsObjSegImages",
     "KeylabsObjSegVideo",
+    "BiasType",
     "QADataset",
+    "EvalRunInfo",
+    "EvalRunRecord",
+    "JudgeModel",
+    "LlmBehaviorEval",
+    "LlmBehaviorEvalError",
     "ModalityType",
+    "ModelOrRun",
+    "PresetType",
     "RunArgs",
     "ClassificationRunArgs",
     "ObjectDetectionRunArgs",
diff --git a/hirundo/llm_behavior_eval.py b/hirundo/llm_behavior_eval.py
new file mode 100644
index 00000000..8df92d12
--- /dev/null
+++ b/hirundo/llm_behavior_eval.py
@@ -0,0 +1,393 @@
+import datetime
+import json
+import typing
+from collections.abc import AsyncGenerator, Generator
+from enum import Enum
+
+import httpx
+from pydantic import BaseModel, ConfigDict
+
+from hirundo._env import API_HOST
+from hirundo._headers import get_headers
+from hirundo._http import raise_for_status_with_reason, requests
+from hirundo._iter_sse_retrying import aiter_sse_retrying, iter_sse_retrying
+from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
+from hirundo.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+class LlmBehaviorEvalError(Exception):
+    """
+    Custom exception used to indicate errors in `hirundo` LLM behavior eval runs.
+    """
+
+    pass
+
+
+class ModelOrRun(str, Enum):
+    MODEL = "model"
+    RUN = "run"
+
+
+class PresetType(str, Enum):
+    BBQ_BIAS = "BBQ_BIAS"
+    BBQ_UNBIAS = "BBQ_UNBIAS"
+    UNQOVER_BIAS = "UNQOVER_BIAS"
+    HALU_EVAL = "HALU_EVAL"
+    MED_HALLU = "MED_HALLU"
+    INJECTION_EVAL = "INJECTION_EVAL"
+
+
+class BiasType(str, Enum):
+    ALL = "ALL"
+    RACE = "RACE"
+    NATIONALITY = "NATIONALITY"
+    GENDER = "GENDER"
+    PHYSICAL_APPEARANCE = "PHYSICAL_APPEARANCE"
+    RELIGION = "RELIGION"
+    AGE = "AGE"
+
+
+class JudgeModel(BaseModel):
+    path_or_repo_id: str
+    token: typing.Optional[str] = None
+    batch_size: typing.Optional[int] = None
+    output_tokens: typing.Optional[int] = None
+    use_4bit: typing.Optional[bool] = None
+
+
+class EvalRunInfo(BaseModel):
+    organization_id: typing.Optional[int] = None
+    name: typing.Optional[str] = None
+    model_id: typing.Optional[int] = None
+    source_run_id: typing.Optional[str] = None
+    file_path: typing.Optional[str] = None
+    preset_type: typing.Optional[PresetType] = None
+    bias_type: typing.Optional[BiasType] = None
+    judge_model: typing.Optional[JudgeModel] = None
+
+
+class OutputLlm(BaseModel):
+    model_config = {"extra": "allow"}
+
+    id: int
+    organization_id: int
+    creator_id: int
+    creator_name: str
+    created_at: datetime.datetime
+    updated_at: datetime.datetime
+    model_name: str
+    model_source: dict
+
+
+class OutputUnlearningLlmRun(BaseModel):
+    model_config = {"extra": "allow"}
+
+    id: int
+    name: str
+    run_id: str
+    status: str
+    created_at: datetime.datetime
+
+
+class LlmEvalMetricRow(BaseModel):
+    model_config = ConfigDict(extra="allow")
+
+    benchmark: str
+    metric: str
+    original: typing.Optional[typing.Union[float, str]] = None
+    post_unlearning: typing.Optional[typing.Union[float, str]] = None
+    reduction_percent: typing.Optional[float] = None
+    subset: typing.Optional[str] = None
+
+
+class LlmEvalMetrics(BaseModel):
+    model_config = ConfigDict(extra="allow")
+
+    rows: list[LlmEvalMetricRow]
+
+
+class EvalRunRecord(BaseModel):
+    id: int
+    name: str
+    model_id: typing.Optional[int]
+    model: typing.Optional[OutputLlm]
+    source_run_id: typing.Optional[str]
+    source_run: typing.Optional[OutputUnlearningLlmRun]
+    file_path: typing.Optional[str]
+    preset_type: typing.Optional[PresetType]
+    bias_type: typing.Optional[BiasType]
+    judge_model: typing.Optional[JudgeModel]
+    run_id: str
+    mlflow_run_id: typing.Optional[str]
+    status: str
+    created_at: datetime.datetime
+    pre_process_progress: float
+    optimization_progress: float
+    post_process_progress: float
+    metrics: typing.Optional[LlmEvalMetrics] = None
+    responses_zip_url: typing.Optional[str] = None
+
+
+class LlmBehaviorEval:
+    def __init__(self, run_id: typing.Optional[str] = None):
+        self.run_id = run_id
+
+    @staticmethod
+    def _parse_eval_run_record(response_payload: dict) -> EvalRunRecord:
+        model_payload = response_payload.get("model")
+        source_run_payload = response_payload.get("source_run")
+        judge_model_payload = response_payload.get("judge_model")
+        metrics_payload = response_payload.get("metrics")
+
+        model = (
+            OutputLlm.model_validate(model_payload)
+            if isinstance(model_payload, dict)
+            else None
+        )
+        source_run = (
+            OutputUnlearningLlmRun.model_validate(source_run_payload)
+            if isinstance(source_run_payload, dict)
+            else None
+        )
+        judge_model = (
+            JudgeModel.model_validate(judge_model_payload)
+            if isinstance(judge_model_payload, dict)
+            else None
+        )
+        if isinstance(metrics_payload, dict):
+            metrics = LlmEvalMetrics.model_validate(metrics_payload)
+        elif isinstance(metrics_payload, list):
+            metric_rows = [
+                LlmEvalMetricRow.model_validate(metric_row)
+                for metric_row in metrics_payload
+                if isinstance(metric_row, dict)
+            ]
+            metrics = LlmEvalMetrics(rows=metric_rows)
+        else:
+            metrics = None
+
+        return EvalRunRecord(
+            id=response_payload["id"],
+            name=response_payload["name"],
+            model_id=response_payload.get("model_id"),
+            model=model,
+            source_run_id=response_payload.get("source_run_id"),
+            source_run=source_run,
+            file_path=response_payload.get("file_path"),
+            preset_type=response_payload.get("preset_type"),
+            bias_type=response_payload.get("bias_type"),
+            judge_model=judge_model,
+            run_id=response_payload["run_id"],
+            mlflow_run_id=response_payload.get("mlflow_run_id"),
+            status=response_payload["status"],
+            created_at=response_payload["created_at"],
+            pre_process_progress=response_payload.get("pre_process_progress", 0.0),
+            optimization_progress=response_payload.get("optimization_progress", 0.0),
+            post_process_progress=response_payload.get("post_process_progress", 0.0),
+            metrics=metrics,
+            responses_zip_url=response_payload.get("responses_zip_url"),
+        )
+
+    @staticmethod
+    def launch_eval_run(
+        model_or_run: typing.Union[ModelOrRun, str],
+        run_info: EvalRunInfo,
+    ) -> str:
+        """
+        Launch an LLM behavior evaluation run.
+
+        Args:
+            model_or_run: Whether the evaluation is based on a model or a run.
+            run_info: The evaluation run parameters.
+
+        Returns:
+            The ID of the created evaluation run.
+        """
+        if isinstance(model_or_run, str):
+            model_or_run_value = ModelOrRun(model_or_run)
+        else:
+            model_or_run_value = model_or_run
+
+        response = requests.post(
+            f"{API_HOST}/llm-behavior-eval/run/{model_or_run_value.value}",
+            json=run_info.model_dump(mode="json"),
+            headers=get_headers(),
+            timeout=MODIFY_TIMEOUT,
+        )
+        raise_for_status_with_reason(response)
+        response_payload = response.json()
+        run_identifier = (
+            response_payload.get("run_id")
+            or response_payload.get("hir_run_id")
+            or response_payload.get("id")
+        )
+        if not run_identifier:
+            raise LlmBehaviorEvalError(
+                "Unable to determine the run ID from the response payload."
+            )
+        return run_identifier
+
+    @staticmethod
+    def cancel_by_id(run_id: str) -> None:
+        """
+        Cancel a running evaluation.
+        """
+        response = requests.patch(
+            f"{API_HOST}/llm-behavior-eval/run/cancel/{run_id}",
+            headers=get_headers(),
+            timeout=MODIFY_TIMEOUT,
+        )
+        raise_for_status_with_reason(response)
+
+    def cancel(self) -> None:
+        if not self.run_id:
+            raise ValueError("No run has been started")
+        self.cancel_by_id(self.run_id)
+
+    @staticmethod
+    def rename_by_id(run_id: str, new_name: str) -> None:
+        """
+        Rename an evaluation run.
+        """
+        response = requests.patch(
+            f"{API_HOST}/llm-behavior-eval/run/rename/{run_id}",
+            json={"new_name": new_name},
+            headers=get_headers(),
+            timeout=MODIFY_TIMEOUT,
+        )
+        raise_for_status_with_reason(response)
+
+    def rename(self, new_name: str) -> None:
+        if not self.run_id:
+            raise ValueError("No run has been started")
+        self.rename_by_id(self.run_id, new_name)
+
+    @staticmethod
+    def archive_by_id(run_id: str) -> None:
+        """
+        Archive an evaluation run.
+        """
+        response = requests.patch(
+            f"{API_HOST}/llm-behavior-eval/run/archive/{run_id}",
+            headers=get_headers(),
+            timeout=MODIFY_TIMEOUT,
+        )
+        raise_for_status_with_reason(response)
+
+    def archive(self) -> None:
+        if not self.run_id:
+            raise ValueError("No run has been started")
+        self.archive_by_id(self.run_id)
+
+    @staticmethod
+    def restore_by_id(run_id: str) -> None:
+        """
+        Restore an archived evaluation run.
+        """
+        response = requests.patch(
+            f"{API_HOST}/llm-behavior-eval/run/restore/{run_id}",
+            headers=get_headers(),
+            timeout=MODIFY_TIMEOUT,
+        )
+        raise_for_status_with_reason(response)
+
+    def restore(self) -> None:
+        if not self.run_id:
+            raise ValueError("No run has been started")
+        self.restore_by_id(self.run_id)
+
+    @staticmethod
+    def get_run_info_by_id(run_id: str) -> EvalRunRecord:
+        """
+        Retrieve the metadata for an evaluation run.
+        """
+        response = requests.get(
+            f"{API_HOST}/llm-behavior-eval/run/info/{run_id}",
+            headers=get_headers(),
+            timeout=READ_TIMEOUT,
+        )
+        raise_for_status_with_reason(response)
+        response_payload = response.json()
+        return LlmBehaviorEval._parse_eval_run_record(response_payload)
+
+    @staticmethod
+    def list_runs(
+        organization_id: typing.Optional[int] = None,
+        archived: bool = False,
+    ) -> list[EvalRunRecord]:
+        """
+        List evaluation runs.
+        """
+        response = requests.get(
+            f"{API_HOST}/llm-behavior-eval/run/list",
+            params={
+                "eval_organization_id": organization_id,
+                "archived": archived,
+            },
+            headers=get_headers(),
+            timeout=READ_TIMEOUT,
+        )
+        raise_for_status_with_reason(response)
+        response_payload = response.json()
+        return [
+            LlmBehaviorEval._parse_eval_run_record(run_record)
+            for run_record in response_payload
+        ]
+
+    @staticmethod
+    def stream_results_by_id(run_id: str) -> Generator[dict, None, None]:
+        """
+        Stream evaluation results for a run.
+        """
+        with httpx.Client(timeout=httpx.Timeout(None, connect=5.0)) as client:
+            for sse_event in iter_sse_retrying(
+                client,
+                "GET",
+                f"{API_HOST}/llm-behavior-eval/run/{run_id}",
+                headers=get_headers(),
+            ):
+                if sse_event.event == "ping":
+                    continue
+                try:
+                    yield json.loads(sse_event.data)
+                except json.JSONDecodeError:
+                    logger.debug("Non-JSON SSE payload received: %s", sse_event.data)
+                    yield {"data": sse_event.data}
+
+    def stream_results(self) -> Generator[dict, None, None]:
+        if not self.run_id:
+            raise ValueError("No run has been started")
+        yield from self.stream_results_by_id(self.run_id)
+
+    @staticmethod
+    async def astream_results_by_id(run_id: str) -> AsyncGenerator[dict, None]:
+        """
+        Async stream evaluation results for a run.
+        """
+        async with httpx.AsyncClient(
+            timeout=httpx.Timeout(None, connect=5.0)
+        ) as client:
+            async_iterator = await aiter_sse_retrying(
+                client,
+                "GET",
+                f"{API_HOST}/llm-behavior-eval/run/{run_id}",
+                headers=get_headers(),
+            )
+            async for sse_event in async_iterator:
+                if sse_event.event == "ping":
+                    continue
+                try:
+                    yield json.loads(sse_event.data)
+                except json.JSONDecodeError:
+                    logger.debug(
+                        "Non-JSON SSE payload received: %s", sse_event.data
+                    )
+                    yield {"data": sse_event.data}
+
+    async def astream_results(self) -> AsyncGenerator[dict, None]:
+        if not self.run_id:
+            raise ValueError("No run has been started")
+        async for payload in self.astream_results_by_id(self.run_id):
+            yield payload

From 1a047bb3c566b333fbccb9f9acb43c083979ea97 Mon Sep 17 00:00:00 2001
From: Ben Lewis <hello@blewis.me>
Date: Tue, 13 Jan 2026 19:36:19 +0200
Subject: [PATCH 02/24] Format llm_behavior_eval

---
 hirundo/llm_behavior_eval.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/hirundo/llm_behavior_eval.py b/hirundo/llm_behavior_eval.py
index 8df92d12..5bdadba1 100644
--- a/hirundo/llm_behavior_eval.py
+++ b/hirundo/llm_behavior_eval.py
@@ -381,9 +381,7 @@ async def astream_results_by_id(run_id: str) -> AsyncGenerator[dict, None]:
                 try:
                     yield json.loads(sse_event.data)
                 except json.JSONDecodeError:
-                    logger.debug(
-                        "Non-JSON SSE payload received: %s", sse_event.data
-                    )
+                    logger.debug("Non-JSON SSE payload received: %s", sse_event.data)
                     yield {"data": sse_event.data}
 
     async def astream_results(self) -> AsyncGenerator[dict, None]:

From 06d040cbc93c742a289ccdd7ddea97e91f64a0f2 Mon Sep 17 00:00:00 2001
From: Ben Lewis <hello@blewis.me>
Date: Tue, 13 Jan 2026 23:43:06 +0200
Subject: [PATCH 03/24] Fix optional type hints in llm behavior eval

---
 hirundo/llm_behavior_eval.py | 61 ++++++++++++++++++------------------
 1 file changed, 30 insertions(+), 31 deletions(-)

diff --git a/hirundo/llm_behavior_eval.py b/hirundo/llm_behavior_eval.py
index 5bdadba1..0bd08650 100644
--- a/hirundo/llm_behavior_eval.py
+++ b/hirundo/llm_behavior_eval.py
@@ -1,6 +1,5 @@
 import datetime
 import json
-import typing
 from collections.abc import AsyncGenerator, Generator
 from enum import Enum
 
@@ -51,21 +50,21 @@ class BiasType(str, Enum):
 
 class JudgeModel(BaseModel):
     path_or_repo_id: str
-    token: typing.Optional[str] = None
-    batch_size: typing.Optional[int] = None
-    output_tokens: typing.Optional[int] = None
-    use_4bit: typing.Optional[bool] = None
+    token: str | None = None
+    batch_size: int | None = None
+    output_tokens: int | None = None
+    use_4bit: bool | None = None
 
 
 class EvalRunInfo(BaseModel):
-    organization_id: typing.Optional[int] = None
-    name: typing.Optional[str] = None
-    model_id: typing.Optional[int] = None
-    source_run_id: typing.Optional[str] = None
-    file_path: typing.Optional[str] = None
-    preset_type: typing.Optional[PresetType] = None
-    bias_type: typing.Optional[BiasType] = None
-    judge_model: typing.Optional[JudgeModel] = None
+    organization_id: int | None = None
+    name: str | None = None
+    model_id: int | None = None
+    source_run_id: str | None = None
+    file_path: str | None = None
+    preset_type: PresetType | None = None
+    bias_type: BiasType | None = None
+    judge_model: JudgeModel | None = None
 
 
 class OutputLlm(BaseModel):
@@ -96,10 +95,10 @@ class LlmEvalMetricRow(BaseModel):
 
     benchmark: str
     metric: str
-    original: typing.Optional[typing.Union[float, str]] = None
-    post_unlearning: typing.Optional[typing.Union[float, str]] = None
-    reduction_percent: typing.Optional[float] = None
-    subset: typing.Optional[str] = None
+    original: float | str | None = None
+    post_unlearning: float | str | None = None
+    reduction_percent: float | None = None
+    subset: str | None = None
 
 
 class LlmEvalMetrics(BaseModel):
@@ -111,27 +110,27 @@ class LlmEvalMetrics(BaseModel):
 class EvalRunRecord(BaseModel):
     id: int
     name: str
-    model_id: typing.Optional[int]
-    model: typing.Optional[OutputLlm]
-    source_run_id: typing.Optional[str]
-    source_run: typing.Optional[OutputUnlearningLlmRun]
-    file_path: typing.Optional[str]
-    preset_type: typing.Optional[PresetType]
-    bias_type: typing.Optional[BiasType]
-    judge_model: typing.Optional[JudgeModel]
+    model_id: int | None
+    model: OutputLlm | None
+    source_run_id: str | None
+    source_run: OutputUnlearningLlmRun | None
+    file_path: str | None
+    preset_type: PresetType | None
+    bias_type: BiasType | None
+    judge_model: JudgeModel | None
     run_id: str
-    mlflow_run_id: typing.Optional[str]
+    mlflow_run_id: str | None
     status: str
     created_at: datetime.datetime
     pre_process_progress: float
     optimization_progress: float
     post_process_progress: float
-    metrics: typing.Optional[LlmEvalMetrics] = None
-    responses_zip_url: typing.Optional[str] = None
+    metrics: LlmEvalMetrics | None = None
+    responses_zip_url: str | None = None
 
 
 class LlmBehaviorEval:
-    def __init__(self, run_id: typing.Optional[str] = None):
+    def __init__(self, run_id: str | None = None):
         self.run_id = run_id
 
     @staticmethod
@@ -192,7 +191,7 @@ def _parse_eval_run_record(response_payload: dict) -> EvalRunRecord:
 
     @staticmethod
     def launch_eval_run(
-        model_or_run: typing.Union[ModelOrRun, str],
+        model_or_run: ModelOrRun | str,
         run_info: EvalRunInfo,
     ) -> str:
         """
@@ -314,7 +313,7 @@ def get_run_info_by_id(run_id: str) -> EvalRunRecord:
 
     @staticmethod
     def list_runs(
-        organization_id: typing.Optional[int] = None,
+        organization_id: int | None = None,
         archived: bool = False,
     ) -> list[EvalRunRecord]:
         """

From 415f7e39e00a444a6ed76759bca57d9c5ed0a253 Mon Sep 17 00:00:00 2001
From: Ben Lewis <blewis@hirundo.io>
Date: Wed, 28 Jan 2026 21:49:11 +0200
Subject: [PATCH 04/24] Basic first implementation: use `check_run` naming
 instead of `stream_results`, and add `tqdm`, unzipping and loading of
 CSVs

This essentially matches the dataset QA behavior.
---
 hirundo/__init__.py                           |  30 ++-
 hirundo/_hirundo_error.py                     |   6 +
 hirundo/_llm_pipeline.py                      |   4 +-
 hirundo/_llm_sources.py                       |  43 ++++
 hirundo/_run_checking.py                      |  35 ++-
 hirundo/_sse_event_data.py                    |  24 ++
 hirundo/dataset_qa.py                         |   7 +-
 hirundo/llm_behavior_eval.py                  | 225 ++++++++++++++----
 hirundo/llm_behavior_eval_results.py          |  27 +++
 hirundo/llm_bias_type.py                      |  11 +
 hirundo/unlearning_llm.py                     |  51 +---
 hirundo/unzip.py                              |  84 ++++++-
 .../llm_behavior_eval_test.py                 |  40 ++++
 13 files changed, 460 insertions(+), 127 deletions(-)
 create mode 100644 hirundo/_hirundo_error.py
 create mode 100644 hirundo/_llm_sources.py
 create mode 100644 hirundo/_sse_event_data.py
 create mode 100644 hirundo/llm_behavior_eval_results.py
 create mode 100644 hirundo/llm_bias_type.py
 create mode 100644 tests/llm-behavior-eval/llm_behavior_eval_test.py

diff --git a/hirundo/__init__.py b/hirundo/__init__.py
index 83d1cca1..2cb0ca16 100644
--- a/hirundo/__init__.py
+++ b/hirundo/__init__.py
@@ -1,3 +1,12 @@
+from ._hirundo_error import HirundoError
+from ._llm_sources import (
+    HuggingFaceTransformersModel,
+    HuggingFaceTransformersModelOutput,
+    LlmSources,
+    LlmSourcesOutput,
+    LocalTransformersModel,
+    ModelSourceType,
+)
 from .dataset_enum import (
     DatasetMetadataType,
     LabelingType,
@@ -5,7 +14,7 @@
 )
 from .dataset_qa import (
     ClassificationRunArgs,
-    HirundoError,
+    HirundoDatasetQaError,
     ModalityType,
     ObjectDetectionRunArgs,
     QADataset,
@@ -24,15 +33,16 @@
     KeylabsObjSegVideo,
 )
 from .llm_behavior_eval import (
-    BiasType,
     EvalRunInfo,
     EvalRunRecord,
+    HirundoLlmBehaviorEvalError,
     JudgeModel,
     LlmBehaviorEval,
-    LlmBehaviorEvalError,
     ModelOrRun,
     PresetType,
 )
+from .llm_behavior_eval_results import LlmBehaviorEvalResults
+from .llm_bias_type import BiasType
 from .storage import (
     StorageConfig,
     StorageGCP,
@@ -42,20 +52,18 @@
 )
 from .unlearning_llm import (
     BiasRunInfo,
-    BiasType,
-    HuggingFaceTransformersModel,
     LlmModel,
-    LlmSources,
     LlmUnlearningRun,
-    LocalTransformersModel,
 )
 from .unzip import load_df, load_from_zip
 
 __all__ = [
     "COCO",
     "YOLO",
-    "HirundoError",
     "HirundoCSV",
+    "HirundoError",
+    "HirundoDatasetQaError",
+    "HirundoLlmBehaviorEvalError",
     "KeylabsAuth",
     "KeylabsObjDetImages",
     "KeylabsObjDetVideo",
@@ -67,7 +75,7 @@
     "EvalRunRecord",
     "JudgeModel",
     "LlmBehaviorEval",
-    "LlmBehaviorEvalError",
+    "LlmBehaviorEvalResults",
     "ModalityType",
     "ModelOrRun",
     "PresetType",
@@ -87,14 +95,16 @@
     "StorageConfig",
     "DatasetQAResults",
     "BiasRunInfo",
-    "BiasType",
     "HuggingFaceTransformersModel",
+    "HuggingFaceTransformersModelOutput",
     "LlmModel",
     "LlmSources",
+    "LlmSourcesOutput",
     "LlmUnlearningRun",
     "LocalTransformersModel",
     "load_df",
     "load_from_zip",
+    "ModelSourceType",
 ]
 
 __version__ = "0.1.22"
diff --git a/hirundo/_hirundo_error.py b/hirundo/_hirundo_error.py
new file mode 100644
index 00000000..5f798aff
--- /dev/null
+++ b/hirundo/_hirundo_error.py
@@ -0,0 +1,6 @@
+class HirundoError(Exception):
+    """
+    Base class for `hirundo` exceptions (e.g. dataset QA and LLM behavior eval errors)
+    """
+
+    pass
diff --git a/hirundo/_llm_pipeline.py b/hirundo/_llm_pipeline.py
index 5224e732..a78c6b39 100644
--- a/hirundo/_llm_pipeline.py
+++ b/hirundo/_llm_pipeline.py
@@ -48,9 +48,11 @@ def get_hf_pipeline_for_run_given_model(
     from transformers.models.auto.tokenization_auto import AutoTokenizer
     from transformers.pipelines import pipeline
 
-    from hirundo.unlearning_llm import (
+    from hirundo._llm_sources import (
         HuggingFaceTransformersModel,
         HuggingFaceTransformersModelOutput,
+    )
+    from hirundo.unlearning_llm import (
         LlmUnlearningRun,
     )
 
diff --git a/hirundo/_llm_sources.py b/hirundo/_llm_sources.py
new file mode 100644
index 00000000..03e1a112
--- /dev/null
+++ b/hirundo/_llm_sources.py
@@ -0,0 +1,43 @@
+from enum import Enum
+from typing import Literal
+
+from pydantic import BaseModel, ConfigDict
+
+
+class ModelSourceType(str, Enum):
+    HUGGINGFACE_TRANSFORMERS = "huggingface_transformers"
+    LOCAL_TRANSFORMERS = "local_transformers"
+
+
+class HuggingFaceTransformersModel(BaseModel):
+    model_config = ConfigDict(protected_namespaces=("model_validate", "model_dump"))
+
+    type: Literal[ModelSourceType.HUGGINGFACE_TRANSFORMERS] = (
+        ModelSourceType.HUGGINGFACE_TRANSFORMERS
+    )
+    revision: str | None = None
+    code_revision: str | None = None
+    model_name: str
+    token: str | None = None
+
+
+class HuggingFaceTransformersModelOutput(BaseModel):
+    model_config = ConfigDict(protected_namespaces=("model_validate", "model_dump"))
+
+    type: Literal[ModelSourceType.HUGGINGFACE_TRANSFORMERS] = (
+        ModelSourceType.HUGGINGFACE_TRANSFORMERS
+    )
+    model_name: str
+
+
+class LocalTransformersModel(BaseModel):
+    type: Literal[ModelSourceType.LOCAL_TRANSFORMERS] = (
+        ModelSourceType.LOCAL_TRANSFORMERS
+    )
+    revision: None = None
+    code_revision: None = None
+    local_path: str
+
+
+LlmSources = HuggingFaceTransformersModel | LocalTransformersModel
+LlmSourcesOutput = HuggingFaceTransformersModelOutput | LocalTransformersModel
diff --git a/hirundo/_run_checking.py b/hirundo/_run_checking.py
index 4866454b..9bf68622 100644
--- a/hirundo/_run_checking.py
+++ b/hirundo/_run_checking.py
@@ -6,6 +6,7 @@
 from tqdm import tqdm
 
 from hirundo._iter_sse_retrying import aiter_sse_retrying, iter_sse_retrying
+from hirundo._sse_event_data import SseRunEventData
 from hirundo.logger import get_logger
 
 _logger = get_logger(__name__)
@@ -62,7 +63,9 @@ def build_status_text_map(
     }
 
 
-def get_state(payload: dict, status_keys: tuple[str, ...]) -> str | None:
+def get_state(
+    payload: dict | SseRunEventData, status_keys: tuple[str, ...]
+) -> str | None:
     """
     Return the first non-null state value from a payload using a list of keys.
 
@@ -74,7 +77,7 @@ def get_state(payload: dict, status_keys: tuple[str, ...]) -> str | None:
         The first non-null state value, or None if none are present.
     """
     for key in status_keys:
-        value = payload.get(key)
+        value = payload.get(key) if isinstance(payload, dict) else getattr(payload, key)
         if value is not None:
             return value
     return None
@@ -222,7 +225,7 @@ async def aiter_run_events(
 
 
 def update_progress_from_result(
-    iteration: dict,
+    iteration: dict | SseRunEventData,
     progress: tqdm,
     *,
     uploading_text: str,
@@ -240,13 +243,15 @@ def update_progress_from_result(
     Returns:
         True if a progress update occurred, False otherwise.
     """
-    if (
-        iteration.get("result")
-        and isinstance(iteration["result"], dict)
-        and iteration["result"].get("result")
-        and isinstance(iteration["result"]["result"], str)
-    ):
-        result_info = iteration["result"]["result"].split(":")
+    result_outer = (
+        iteration.get("result") if isinstance(iteration, dict) else iteration.result
+    )
+    result_inner = (
+        result_outer.get("result") if isinstance(result_outer, dict) else result_outer
+    )
+    # Only a non-empty string can be split on ":"; skip any other result type.
+    if isinstance(result_inner, str) and result_inner:
+        result_info = result_inner.split(":")
         if len(result_info) > 1:
             stage = result_info[0]
             current_progress_percentage = float(
@@ -268,7 +273,7 @@ def update_progress_from_result(
 
 
 def handle_run_failure(
-    iteration: dict, *, error_cls: type[Exception], run_label: str
+    iteration: dict | SseRunEventData, *, error_cls: type[Exception], run_label: str
 ) -> None:
     """
     Raise a run-specific failure exception based on the iteration payload.
@@ -278,6 +283,10 @@ def handle_run_failure(
         error_cls: Exception type to raise.
         run_label: Human-readable label for the run type.
     """
-    if iteration.get("result"):
-        raise error_cls(f"{run_label} run failed with error: {iteration['result']}")
+    if (
+        result := iteration.get("result")
+        if isinstance(iteration, dict)
+        else iteration.result
+    ):
+        raise error_cls(f"{run_label} run failed with error: {result}")
     raise error_cls(f"{run_label} run failed with an unknown error")
diff --git a/hirundo/_sse_event_data.py b/hirundo/_sse_event_data.py
new file mode 100644
index 00000000..00155f6f
--- /dev/null
+++ b/hirundo/_sse_event_data.py
@@ -0,0 +1,24 @@
+from typing import Any
+
+from pydantic import BaseModel, TypeAdapter
+
+from hirundo._run_checking import RunStatus
+
+
+class SseRunEventData(BaseModel):
+    id: str
+    state: RunStatus
+    result: str | dict | None
+
+
+_SSE_RUN_EVENT_ADAPTER = TypeAdapter(SseRunEventData)
+
+
+def _parse_sse_payload(payload: Any) -> SseRunEventData:
+    if isinstance(payload, dict):
+        if "data" in payload:
+            data = payload["data"]
+            if isinstance(data, dict):
+                return _SSE_RUN_EVENT_ADAPTER.validate_python(data)
+
+    raise ValueError(f"Invalid SSE payload: {payload}")
diff --git a/hirundo/dataset_qa.py b/hirundo/dataset_qa.py
index 44f0bca9..99e211f6 100644
--- a/hirundo/dataset_qa.py
+++ b/hirundo/dataset_qa.py
@@ -11,6 +11,7 @@
 from hirundo._constraints import validate_labeling_info, validate_url
 from hirundo._env import API_HOST
 from hirundo._headers import get_headers
+from hirundo._hirundo_error import HirundoError
 from hirundo._http import raise_for_status_with_reason, requests
 from hirundo._run_checking import (
     STATUS_TO_PROGRESS_MAP,
@@ -34,11 +35,7 @@
 logger = get_logger(__name__)
 
 
-class HirundoError(Exception):
-    """
-    Custom exception used to indicate errors in `hirundo` dataset QA runs
-    """
-
+class HirundoDatasetQaError(HirundoError):
     pass
 
 
diff --git a/hirundo/llm_behavior_eval.py b/hirundo/llm_behavior_eval.py
index 0bd08650..72c27ba6 100644
--- a/hirundo/llm_behavior_eval.py
+++ b/hirundo/llm_behavior_eval.py
@@ -1,26 +1,42 @@
 import datetime
-import json
+import typing
 from collections.abc import AsyncGenerator, Generator
 from enum import Enum
+from typing import overload
 
 import httpx
 from pydantic import BaseModel, ConfigDict
+from tqdm import tqdm
+from tqdm.contrib.logging import logging_redirect_tqdm
 
 from hirundo._env import API_HOST
 from hirundo._headers import get_headers
+from hirundo._hirundo_error import HirundoError
 from hirundo._http import raise_for_status_with_reason, requests
 from hirundo._iter_sse_retrying import aiter_sse_retrying, iter_sse_retrying
+from hirundo._llm_sources import HuggingFaceTransformersModelOutput, LlmSourcesOutput
+from hirundo._run_checking import (
+    STATUS_TO_PROGRESS_MAP,
+    RunStatus,
+    build_status_text_map,
+    get_state,
+    handle_run_failure,
+    update_progress_from_result,
+)
+from hirundo._sse_event_data import SseRunEventData, _parse_sse_payload
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
+from hirundo.llm_behavior_eval_results import LlmBehaviorEvalResults
+from hirundo.llm_bias_type import BiasType
 from hirundo.logger import get_logger
+from hirundo.unzip import download_and_extract_llm_behavior_eval_zip
 
 logger = get_logger(__name__)
 
 
-class LlmBehaviorEvalError(Exception):
-    """
-    Custom exception used to indicate errors in `hirundo` LLM behavior eval runs.
-    """
+STATUS_TO_TEXT_MAP = build_status_text_map("LLM behavior eval")
 
+
+class HirundoLlmBehaviorEvalError(HirundoError):
     pass
 
 
@@ -38,16 +54,6 @@ class PresetType(str, Enum):
     INJECTION_EVAL = "INJECTION_EVAL"
 
 
-class BiasType(str, Enum):
-    ALL = "ALL"
-    RACE = "RACE"
-    NATIONALITY = "NATIONALITY"
-    GENDER = "GENDER"
-    PHYSICAL_APPEARANCE = "PHYSICAL_APPEARANCE"
-    RELIGION = "RELIGION"
-    AGE = "AGE"
-
-
 class JudgeModel(BaseModel):
     path_or_repo_id: str
     token: str | None = None
@@ -77,7 +83,7 @@ class OutputLlm(BaseModel):
     created_at: datetime.datetime
     updated_at: datetime.datetime
     model_name: str
-    model_source: dict
+    model_source: LlmSourcesOutput
 
 
 class OutputUnlearningLlmRun(BaseModel):
@@ -86,6 +92,7 @@ class OutputUnlearningLlmRun(BaseModel):
     id: int
     name: str
     run_id: str
+    model: OutputLlm | None = None
     status: str
     created_at: datetime.datetime
 
@@ -223,7 +230,7 @@ def launch_eval_run(
             or response_payload.get("id")
         )
         if not run_identifier:
-            raise LlmBehaviorEvalError(
+            raise HirundoLlmBehaviorEvalError(
                 "Unable to determine the run ID from the response payload."
             )
         return run_identifier
@@ -242,7 +249,7 @@ def cancel_by_id(run_id: str) -> None:
 
     def cancel(self) -> None:
         if not self.run_id:
-            raise ValueError("No run has been started")
+            raise HirundoLlmBehaviorEvalError("No run has been started")
         self.cancel_by_id(self.run_id)
 
     @staticmethod
@@ -260,7 +267,7 @@ def rename_by_id(run_id: str, new_name: str) -> None:
 
     def rename(self, new_name: str) -> None:
         if not self.run_id:
-            raise ValueError("No run has been started")
+            raise HirundoLlmBehaviorEvalError("No run has been started")
         self.rename_by_id(self.run_id, new_name)
 
     @staticmethod
@@ -277,7 +284,7 @@ def archive_by_id(run_id: str) -> None:
 
     def archive(self) -> None:
         if not self.run_id:
-            raise ValueError("No run has been started")
+            raise HirundoLlmBehaviorEvalError("No run has been started")
         self.archive_by_id(self.run_id)
 
     @staticmethod
@@ -294,7 +301,7 @@ def restore_by_id(run_id: str) -> None:
 
     def restore(self) -> None:
         if not self.run_id:
-            raise ValueError("No run has been started")
+            raise HirundoLlmBehaviorEvalError("No run has been started")
         self.restore_by_id(self.run_id)
 
     @staticmethod
@@ -336,10 +343,24 @@ def list_runs(
         ]
 
     @staticmethod
-    def stream_results_by_id(run_id: str) -> Generator[dict, None, None]:
-        """
-        Stream evaluation results for a run.
-        """
+    def _resolve_model_name(run_info: EvalRunRecord) -> str | None:
+        if run_info.model and isinstance(
+            run_info.model.model_source, HuggingFaceTransformersModelOutput
+        ):
+            return run_info.model.model_source.model_name
+        if (
+            run_info.source_run
+            and run_info.source_run.model
+            and isinstance(
+                run_info.source_run.model.model_source,
+                HuggingFaceTransformersModelOutput,
+            )
+        ):
+            return run_info.source_run.model.model_source.model_name
+        return None
+
+    @staticmethod
+    def _check_run_by_id(run_id: str) -> Generator[SseRunEventData, None, None]:
         with httpx.Client(timeout=httpx.Timeout(None, connect=5.0)) as client:
             for sse_event in iter_sse_retrying(
                 client,
@@ -349,22 +370,133 @@ def stream_results_by_id(run_id: str) -> Generator[dict, None, None]:
             ):
                 if sse_event.event == "ping":
                     continue
-                try:
-                    yield json.loads(sse_event.data)
-                except json.JSONDecodeError:
-                    logger.debug("Non-JSON SSE payload received: %s", sse_event.data)
-                    yield {"data": sse_event.data}
+                yield _parse_sse_payload(sse_event.data)
+
+    @staticmethod
+    @overload
+    def check_run_by_id(
+        run_id: str, stop_on_manual_approval: typing.Literal[True]
+    ) -> LlmBehaviorEvalResults | None: ...
+
+    @staticmethod
+    @overload
+    def check_run_by_id(
+        run_id: str, stop_on_manual_approval: typing.Literal[False] = False
+    ) -> LlmBehaviorEvalResults: ...
+
+    @staticmethod
+    @overload
+    def check_run_by_id(
+        run_id: str, stop_on_manual_approval: bool
+    ) -> LlmBehaviorEvalResults | None: ...
+
+    @staticmethod
+    def check_run_by_id(
+        run_id: str, stop_on_manual_approval: bool = False
+    ) -> LlmBehaviorEvalResults | None:
+        """
+        Check the status of a run given its ID
+
+        Args:
+            run_id: The `run_id` produced by a `launch_eval_run` call
+            stop_on_manual_approval: If True, the function will return `None` if the run is awaiting manual approval
 
-    def stream_results(self) -> Generator[dict, None, None]:
+        Returns:
+            An LlmBehaviorEvalResults object with the results of the evaluation run
+
+        Raises:
+            HirundoLlmBehaviorEvalError: If the maximum number of retries is reached or if the run fails
+        """
+        logger.debug("Checking run with ID: %s", run_id)
+        with logging_redirect_tqdm():
+            progress_bar = tqdm(total=100.0)
+            for iteration in LlmBehaviorEval._check_run_by_id(run_id):
+                state = get_state(iteration, ("state",))
+                if state in STATUS_TO_PROGRESS_MAP:
+                    progress_bar.set_description(STATUS_TO_TEXT_MAP[state])
+                    progress_bar.n = STATUS_TO_PROGRESS_MAP[state]
+                    logger.debug("Setting progress to %s", progress_bar.n)
+                    progress_bar.refresh()
+                    if state in [
+                        RunStatus.FAILURE.value,
+                        RunStatus.REJECTED.value,
+                        RunStatus.REVOKED.value,
+                    ]:
+                        logger.error(
+                            "State is failure, rejected, or revoked: %s",
+                            state,
+                        )
+                        handle_run_failure(
+                            iteration,
+                            error_cls=HirundoLlmBehaviorEvalError,
+                            run_label="LLM behavior eval",
+                        )
+                    elif state == RunStatus.SUCCESS.value:
+                        progress_bar.close()
+                        zip_temporary_url = iteration.result
+                        if not zip_temporary_url or not isinstance(
+                            zip_temporary_url, str
+                        ):
+                            raise HirundoLlmBehaviorEvalError(
+                                "LLM behavior eval run completed without a results URL."
+                            )
+                        run_info = LlmBehaviorEval.get_run_info_by_id(run_id)
+                        model_name = LlmBehaviorEval._resolve_model_name(run_info)
+                        return download_and_extract_llm_behavior_eval_zip(
+                            run_id,
+                            zip_temporary_url,
+                            model_name,
+                        )
+                    elif (
+                        state == RunStatus.AWAITING_MANUAL_APPROVAL.value
+                        and stop_on_manual_approval
+                    ):
+                        progress_bar.close()
+                        return None
+                elif state is None:
+                    update_progress_from_result(
+                        iteration,
+                        progress_bar,
+                        uploading_text="LLM behavior eval run completed. Uploading results",
+                        log=logger,
+                    )
+        raise HirundoLlmBehaviorEvalError(
+            "LLM behavior eval run failed with an unknown error in check_run_by_id"
+        )
+
+    @overload
+    def check_run(
+        self, stop_on_manual_approval: typing.Literal[True]
+    ) -> LlmBehaviorEvalResults | None: ...
+
+    @overload
+    def check_run(
+        self, stop_on_manual_approval: typing.Literal[False] = False
+    ) -> LlmBehaviorEvalResults: ...
+
+    def check_run(
+        self, stop_on_manual_approval: bool = False
+    ) -> LlmBehaviorEvalResults | None:
+        """
+        Check the status of the current active instance's run.
+
+        Returns:
+            An LlmBehaviorEvalResults object with the results of the evaluation run
+        """
         if not self.run_id:
-            raise ValueError("No run has been started")
-        yield from self.stream_results_by_id(self.run_id)
+            raise HirundoLlmBehaviorEvalError("No run has been started")
+        return self.check_run_by_id(self.run_id, stop_on_manual_approval)
 
     @staticmethod
-    async def astream_results_by_id(run_id: str) -> AsyncGenerator[dict, None]:
+    async def acheck_run_by_id(run_id: str) -> AsyncGenerator[SseRunEventData, None]:
         """
-        Async stream evaluation results for a run.
+        Async version of :func:`check_run_by_id`
+
+        Check the status of a run given its ID.
+
+        This generator will produce values to show progress of the run.
         """
+        logger.debug("Checking run with ID: %s", run_id)
         async with httpx.AsyncClient(
             timeout=httpx.Timeout(None, connect=5.0)
         ) as client:
@@ -377,14 +509,19 @@ async def astream_results_by_id(run_id: str) -> AsyncGenerator[dict, None]:
             async for sse_event in async_iterator:
                 if sse_event.event == "ping":
                     continue
-                try:
-                    yield json.loads(sse_event.data)
-                except json.JSONDecodeError:
-                    logger.debug("Non-JSON SSE payload received: %s", sse_event.data)
-                    yield {"data": sse_event.data}
+                yield _parse_sse_payload(sse_event.data)
 
-    async def astream_results(self) -> AsyncGenerator[dict, None]:
+    async def acheck_run(self) -> AsyncGenerator[SseRunEventData, None]:
+        """
+        Async version of :func:`check_run`
+
+        Check the status of the current active instance's run.
+
+        This generator will produce values to show progress of the run.
+
+        Note: This function does not handle errors nor show progress. It is expected that you do that.
+        """
         if not self.run_id:
-            raise ValueError("No run has been started")
-        async for payload in self.astream_results_by_id(self.run_id):
-            yield payload
+            raise HirundoLlmBehaviorEvalError("No run has been started")
+        async for iteration in self.acheck_run_by_id(self.run_id):
+            yield iteration
diff --git a/hirundo/llm_behavior_eval_results.py b/hirundo/llm_behavior_eval_results.py
new file mode 100644
index 00000000..f13e9e0b
--- /dev/null
+++ b/hirundo/llm_behavior_eval_results.py
@@ -0,0 +1,27 @@
+import typing
+from pathlib import Path
+
+from pydantic import BaseModel
+
+T = typing.TypeVar("T")
+
+
+class LlmBehaviorEvalResults(BaseModel, typing.Generic[T]):
+    model_config = {"arbitrary_types_allowed": True}
+
+    cached_zip_path: Path
+    """
+    The path to the cached zip file of the results
+    """
+    model_name: str | None = None
+    """
+    The model name used to locate results in the zip file
+    """
+    summary_brief: T
+    """
+    A polars/pandas DataFrame containing the summary_brief CSV
+    """
+    summary_full: T
+    """
+    A polars/pandas DataFrame containing the summary_full CSV
+    """
diff --git a/hirundo/llm_bias_type.py b/hirundo/llm_bias_type.py
new file mode 100644
index 00000000..bd45501d
--- /dev/null
+++ b/hirundo/llm_bias_type.py
@@ -0,0 +1,11 @@
+from enum import Enum
+
+
+class BiasType(str, Enum):
+    ALL = "ALL"
+    RACE = "RACE"
+    NATIONALITY = "NATIONALITY"
+    GENDER = "GENDER"
+    PHYSICAL_APPEARANCE = "PHYSICAL_APPEARANCE"
+    RELIGION = "RELIGION"
+    AGE = "AGE"
diff --git a/hirundo/unlearning_llm.py b/hirundo/unlearning_llm.py
index b36ea14f..19a58e04 100644
--- a/hirundo/unlearning_llm.py
+++ b/hirundo/unlearning_llm.py
@@ -12,6 +12,7 @@
 from hirundo._headers import get_headers
 from hirundo._http import raise_for_status_with_reason, requests
 from hirundo._llm_pipeline import get_hf_pipeline_for_run_given_model
+from hirundo._llm_sources import LlmSources, LlmSourcesOutput
 from hirundo._run_checking import (
     STATUS_TO_PROGRESS_MAP,
     RunStatus,
@@ -24,6 +25,7 @@
 )
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
 from hirundo.dataset_qa import HirundoError
+from hirundo.llm_bias_type import BiasType
 from hirundo.logger import get_logger
 
 if TYPE_CHECKING:
@@ -34,45 +36,6 @@
 logger = get_logger(__name__)
 
 
-class ModelSourceType(str, Enum):
-    HUGGINGFACE_TRANSFORMERS = "huggingface_transformers"
-    LOCAL_TRANSFORMERS = "local_transformers"
-
-
-class HuggingFaceTransformersModel(BaseModel):
-    model_config = ConfigDict(protected_namespaces=("model_validate", "model_dump"))
-
-    type: Literal[ModelSourceType.HUGGINGFACE_TRANSFORMERS] = (
-        ModelSourceType.HUGGINGFACE_TRANSFORMERS
-    )
-    revision: str | None = None
-    code_revision: str | None = None
-    model_name: str
-    token: str | None = None
-
-
-class HuggingFaceTransformersModelOutput(BaseModel):
-    model_config = ConfigDict(protected_namespaces=("model_validate", "model_dump"))
-
-    type: Literal[ModelSourceType.HUGGINGFACE_TRANSFORMERS] = (
-        ModelSourceType.HUGGINGFACE_TRANSFORMERS
-    )
-    model_name: str
-
-
-class LocalTransformersModel(BaseModel):
-    type: Literal[ModelSourceType.LOCAL_TRANSFORMERS] = (
-        ModelSourceType.LOCAL_TRANSFORMERS
-    )
-    revision: None = None
-    code_revision: None = None
-    local_path: str
-
-
-LlmSources = HuggingFaceTransformersModel | LocalTransformersModel
-LlmSourcesOutput = HuggingFaceTransformersModelOutput | LocalTransformersModel
-
-
 class LlmModel(BaseModel):
     model_config = ConfigDict(protected_namespaces=("model_validate", "model_dump"))
 
@@ -235,16 +198,6 @@ class UnlearningLlmAdvancedOptions(BaseModel):
     max_tokens_for_model: dict[DatasetType, int] | int | None = None
 
 
-class BiasType(str, Enum):
-    ALL = "ALL"
-    RACE = "RACE"
-    NATIONALITY = "NATIONALITY"
-    GENDER = "GENDER"
-    PHYSICAL_APPEARANCE = "PHYSICAL_APPEARANCE"
-    RELIGION = "RELIGION"
-    AGE = "AGE"
-
-
 class UtilityType(str, Enum):
     DEFAULT = "DEFAULT"
     CUSTOM = "CUSTOM"
diff --git a/hirundo/unzip.py b/hirundo/unzip.py
index 113f3be7..f0cf3e22 100644
--- a/hirundo/unzip.py
+++ b/hirundo/unzip.py
@@ -23,6 +23,7 @@
     DataFrameType,
     DatasetQAResults,
 )
+from hirundo.llm_behavior_eval_results import LlmBehaviorEvalResults
 from hirundo.logger import get_logger
 
 ZIP_FILE_CHUNK_SIZE = 50 * 1024 * 1024  # 50 MB
@@ -150,11 +151,11 @@ def download_and_extract_zip(
         headers=headers,
         timeout=DOWNLOAD_READ_TIMEOUT,
         stream=True,
-    ) as r:
-        r.raise_for_status()
-        with open(zip_file_path, "wb") as f:
-            for chunk in r.iter_content(chunk_size=ZIP_FILE_CHUNK_SIZE):
-                f.write(chunk)
+    ) as response:
+        response.raise_for_status()
+        with open(zip_file_path, "wb") as output_file:
+            for chunk in response.iter_content(chunk_size=ZIP_FILE_CHUNK_SIZE):
+                output_file.write(chunk)
         logger.info(
             "Successfully downloaded the result zip file for run ID %s to %s",
             run_id,
@@ -224,6 +225,79 @@ def download_and_extract_zip(
             )
 
 
+def download_and_extract_llm_behavior_eval_zip(
+    run_id: str,
+    zip_url: str,
+    model_name: str | None = None,
+) -> LlmBehaviorEvalResults[DataFrameType]:
+    """
+    Download and extract the LLM behavior evaluation results zip file.
+
+    Args:
+        run_id: The ID of the LLM behavior eval run.
+        zip_url: The URL of the zip file to download.
+        model_name (optional): The full model name to resolve the folder within the zip.
+
+    Returns:
+        The LLM behavior eval results object.
+    """
+    cache_dir = Path.home() / ".hirundo" / "cache"
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    zip_file_path = cache_dir / f"{run_id}.zip"
+
+    headers = None
+    if Url(zip_url).scheme == "file":
+        zip_url = f"{API_HOST}/llm-behavior-eval/run/local-download" + zip_url.replace(
+            "file://", ""
+        )
+        headers = _get_auth_headers()
+    with requests.get(
+        zip_url,
+        headers=headers,
+        timeout=DOWNLOAD_READ_TIMEOUT,
+        stream=True,
+    ) as r:
+        r.raise_for_status()
+        with open(zip_file_path, "wb") as f:
+            for chunk in r.iter_content(chunk_size=ZIP_FILE_CHUNK_SIZE):
+                f.write(chunk)
+        logger.info(
+            "Successfully downloaded the LLM behavior eval result zip file for run ID %s to %s",
+            run_id,
+            zip_file_path,
+        )
+
+        if model_name:
+            model_folder = model_name.split("/")[-1]
+            summary_brief_name = f"{model_folder}/summary_brief.csv"
+            summary_full_name = f"{model_folder}/summary_full.csv"
+
+            with zipfile.ZipFile(zip_file_path, "r") as zip_file:
+                filenames = [file.filename for file in zip_file.filelist]
+                if summary_brief_name not in filenames:
+                    raise ValueError(
+                        f"Missing {summary_brief_name} in LLM behavior eval zip for run {run_id}"
+                    )
+                if summary_full_name not in filenames:
+                    raise ValueError(
+                        f"Missing {summary_full_name} in LLM behavior eval zip for run {run_id}"
+                    )
+                with zip_file.open(summary_brief_name) as summary_brief_file:
+                    summary_brief_df = load_df(summary_brief_file)
+                with zip_file.open(summary_full_name) as summary_full_file:
+                    summary_full_df = load_df(summary_full_file)
+        else:
+            summary_brief_df = None
+            summary_full_df = None
+
+        return LlmBehaviorEvalResults[DataFrameType](
+            cached_zip_path=zip_file_path,
+            model_name=model_name,
+            summary_brief=summary_brief_df,
+            summary_full=summary_full_df,
+        )
+
+
 def load_from_zip(
     zip_path: Path, file_name: str
 ) -> "pd.DataFrame | pl.DataFrame | None":
diff --git a/tests/llm-behavior-eval/llm_behavior_eval_test.py b/tests/llm-behavior-eval/llm_behavior_eval_test.py
new file mode 100644
index 00000000..a7de894a
--- /dev/null
+++ b/tests/llm-behavior-eval/llm_behavior_eval_test.py
@@ -0,0 +1,40 @@
+import logging
+import os
+
+from hirundo import (
+    BiasType,
+    EvalRunInfo,
+    HuggingFaceTransformersModel,
+    LlmBehaviorEval,
+    LlmModel,
+    ModelOrRun,
+    PresetType,
+)
+from tests.testing_utils import get_unique_id
+
+logger = logging.getLogger(__name__)
+
+unique_id = get_unique_id()
+
+
+def test_llm_behavior_eval():
+    llm = LlmModel(
+        model_name=f"TEST-LLM-BEHAVIOR-EVAL-Granite-4-micro-{unique_id}",
+        model_source=HuggingFaceTransformersModel(
+            model_name="ibm-granite/granite-4.0-micro",
+        ),
+    )
+    llm_id = llm.create()
+    run_info = EvalRunInfo(
+        name=f"TEST-LLM-BEHAVIOR-EVAL-RUN-{unique_id}",
+        model_id=llm_id,
+        preset_type=PresetType.BBQ_BIAS,
+        bias_type=BiasType.ALL,
+    )
+    assert llm_id is not None
+    if os.getenv("FULL_TEST", "false") == "true":
+        run_id = LlmBehaviorEval.launch_eval_run(ModelOrRun.MODEL, run_info)
+        assert run_id is not None
+        results = LlmBehaviorEval.check_run_by_id(run_id)
+        assert results is not None
+        assert results.cached_zip_path is not None

From b9027d8020c7ff32d939ed6a370a83b1bae6e57e Mon Sep 17 00:00:00 2001
From: Ben Lewis <blewis@hirundo.io>
Date: Wed, 28 Jan 2026 21:50:34 +0200
Subject: [PATCH 05/24] Add AGENTS.md and new `dependency_groups` entry of
 `dev` for development

---
 .envrc                  |  2 +-
 .pre-commit-config.yaml |  2 +-
 AGENTS.md               | 39 +++++++++++++++++++++++++++++++++++++++
 pyproject.toml          |  3 +++
 uv.lock                 |  8 ++++++++
 5 files changed, 52 insertions(+), 2 deletions(-)
 create mode 100644 AGENTS.md

diff --git a/.envrc b/.envrc
index b2d2268b..550b4cae 100644
--- a/.envrc
+++ b/.envrc
@@ -1,2 +1,2 @@
 watch_file uv.lock
-uv sync --all-extras && source .venv/bin/activate
+uv sync --group dev && source .venv/bin/activate
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 25c82ebb..314b9f2a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -75,4 +75,4 @@ repos:
     hooks:
       - id: uv-lock
       - id: uv-sync
-        args: ["--extra", "dev", "--extra", "docs", "--extra", "pandas", "--extra", "polars", "--extra", "transformers"]
+        args: ["--group", "dev"]
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 00000000..c2cd2baa
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,39 @@
+# Repository Guidelines
+
+## Project Structure & Module Organization
+
+- `hirundo/` holds the SDK source (CLI entry point is `hirundo.cli:app`).
+- `tests/` contains pytest-based test coverage.
+- `docs/` and `source/` contain Sphinx documentation assets.
+- `notebooks/` and `on_prem_test_notebook.ipynb` provide example workflows.
+- `requirements/` stores compiled dependency sets (for dev, docs, pandas, polars, transformers).
+
+## Build, Test, and Development Commands
+
+- `uv sync --group dev`: sync the `dev` dependency group, which installs all extras (dev, docs, pandas, polars, transformers).
+- `ruff check` / `ruff format`: lint and auto-format (run before PRs).
+- `pytest`: run the test suite.
+- `python -m build`: build the package artifacts.
+- `pre-commit install`: enable git hooks (optional, but recommended).
+
+## Coding Style & Naming Conventions
+
+- Python 3.10+ codebase, 4-space indentation, line length 88 (Ruff defaults).
+- Follow Ruff linting rules (`pyproject.toml`), with tests allowing `assert` usage.
+- Prefer descriptive names; avoid short, cryptic identifiers in new code.
+
+## Testing Guidelines
+
+- Frameworks: `pytest` and `pytest-asyncio`.
+- Place tests in `tests/`; name test files `*_test.py` (e.g., `llm_behavior_eval_test.py`).
+- Run locally with `pytest` before opening a PR (CI runs lint + integration tests).
+
+## Commit & Pull Request Guidelines
+
+- Recent commit history favors `SDK-<id>: <summary>` (e.g., `SDK-78: Migrate to basedpyright`).
+- Include issue/PR references when available (e.g., `(#190)`).
+- PRs should describe changes clearly and confirm `ruff check` and `ruff format` passed.
+
+## Security & Configuration Tips
+
+- Supported Python versions: CPython 3.10–3.13.
diff --git a/pyproject.toml b/pyproject.toml
index 0af15c33..544e0a03 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -102,6 +102,9 @@ pandas = ["pandas>=2.2.3"]
 polars = ["polars>=1.0.0"]
 transformers = ["transformers>=4.57.3", "peft>=0.18.1", "accelerate>=1.12.0"]
 
+[dependency-groups]
+dev = ["hirundo[dev,docs,pandas,polars,transformers]"]
+
 [tool.bumpver]
 current_version = "0.1.3b1"
 version_pattern = "MAJOR.MINOR.PATCH[PYTAGNUM]"
diff --git a/uv.lock b/uv.lock
index 82686b7a..bd1e3f97 100644
--- a/uv.lock
+++ b/uv.lock
@@ -619,6 +619,11 @@ transformers = [
     { name = "transformers" },
 ]
 
+[package.dev-dependencies]
+dev = [
+    { name = "hirundo", extra = ["dev", "docs", "pandas", "polars", "transformers"] },
+]
+
 [package.metadata]
 requires-dist = [
     { name = "accelerate", marker = "extra == 'transformers'", specifier = ">=1.12.0" },
@@ -680,6 +685,9 @@ requires-dist = [
 ]
 provides-extras = ["dev", "docs", "pandas", "polars", "transformers"]
 
+[package.metadata.requires-dev]
+dev = [{ name = "hirundo", extras = ["dev", "docs", "pandas", "polars", "transformers"] }]
+
 [[package]]
 name = "httpcore"
 version = "1.0.9"

From d141b9d1fc174068c5af0b93f4e9c33834485b98 Mon Sep 17 00:00:00 2001
From: Ben Lewis <blewis@hirundo.io>
Date: Wed, 28 Jan 2026 21:54:23 +0200
Subject: [PATCH 06/24] Fix `RunStatus` circular dependency

---
 hirundo/__init__.py               |  2 ++
 hirundo/_run_checking.py          | 13 +------------
 hirundo/_run_status.py            | 12 ++++++++++++
 hirundo/_sse_event_data.py        |  2 +-
 hirundo/dataset_qa.py             |  2 +-
 hirundo/llm_behavior_eval.py      |  2 +-
 hirundo/unlearning_llm.py         |  2 +-
 scripts/cleanup_test_artifacts.py |  3 ++-
 tests/dataset_qa_shared.py        |  2 +-
 9 files changed, 22 insertions(+), 18 deletions(-)
 create mode 100644 hirundo/_run_status.py

diff --git a/hirundo/__init__.py b/hirundo/__init__.py
index 2cb0ca16..7a7b0981 100644
--- a/hirundo/__init__.py
+++ b/hirundo/__init__.py
@@ -7,6 +7,7 @@
     LocalTransformersModel,
     ModelSourceType,
 )
+from ._run_status import RunStatus
 from .dataset_enum import (
     DatasetMetadataType,
     LabelingType,
@@ -105,6 +106,7 @@
     "load_df",
     "load_from_zip",
     "ModelSourceType",
+    "RunStatus",
 ]
 
 __version__ = "0.1.22"
diff --git a/hirundo/_run_checking.py b/hirundo/_run_checking.py
index 9bf68622..b36f2232 100644
--- a/hirundo/_run_checking.py
+++ b/hirundo/_run_checking.py
@@ -1,11 +1,11 @@
 import json
 from collections.abc import AsyncGenerator, Generator
-from enum import Enum
 
 import httpx
 from tqdm import tqdm
 
 from hirundo._iter_sse_retrying import aiter_sse_retrying, iter_sse_retrying
+from hirundo._run_status import RunStatus
 from hirundo._sse_event_data import SseRunEventData
 from hirundo.logger import get_logger
 
@@ -14,17 +14,6 @@
 DEFAULT_MAX_RETRIES = 200
 
 
-class RunStatus(Enum):
-    PENDING = "PENDING"
-    STARTED = "STARTED"
-    SUCCESS = "SUCCESS"
-    FAILURE = "FAILURE"
-    AWAITING_MANUAL_APPROVAL = "AWAITING MANUAL APPROVAL"
-    REVOKED = "REVOKED"
-    REJECTED = "REJECTED"
-    RETRY = "RETRY"
-
-
 STATUS_TO_PROGRESS_MAP = {
     RunStatus.STARTED.value: 0.0,
     RunStatus.PENDING.value: 0.0,
diff --git a/hirundo/_run_status.py b/hirundo/_run_status.py
new file mode 100644
index 00000000..1cc78f53
--- /dev/null
+++ b/hirundo/_run_status.py
@@ -0,0 +1,12 @@
+from enum import Enum
+
+
+class RunStatus(str, Enum):
+    PENDING = "PENDING"
+    STARTED = "STARTED"
+    SUCCESS = "SUCCESS"
+    FAILURE = "FAILURE"
+    AWAITING_MANUAL_APPROVAL = "AWAITING MANUAL APPROVAL"
+    REVOKED = "REVOKED"
+    REJECTED = "REJECTED"
+    RETRY = "RETRY"
diff --git a/hirundo/_sse_event_data.py b/hirundo/_sse_event_data.py
index 00155f6f..877051db 100644
--- a/hirundo/_sse_event_data.py
+++ b/hirundo/_sse_event_data.py
@@ -2,7 +2,7 @@
 
 from pydantic import BaseModel, TypeAdapter
 
-from hirundo._run_checking import RunStatus
+from hirundo._run_status import RunStatus
 
 
 class SseRunEventData(BaseModel):
diff --git a/hirundo/dataset_qa.py b/hirundo/dataset_qa.py
index 99e211f6..8428b69e 100644
--- a/hirundo/dataset_qa.py
+++ b/hirundo/dataset_qa.py
@@ -15,7 +15,6 @@
 from hirundo._http import raise_for_status_with_reason, requests
 from hirundo._run_checking import (
     STATUS_TO_PROGRESS_MAP,
-    RunStatus,
     aiter_run_events,
     build_status_text_map,
     get_state,
@@ -23,6 +22,7 @@
     iter_run_events,
     update_progress_from_result,
 )
+from hirundo._run_status import RunStatus
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
 from hirundo._urls import HirundoUrl
 from hirundo.dataset_enum import DatasetMetadataType, LabelingType
diff --git a/hirundo/llm_behavior_eval.py b/hirundo/llm_behavior_eval.py
index 72c27ba6..979443f6 100644
--- a/hirundo/llm_behavior_eval.py
+++ b/hirundo/llm_behavior_eval.py
@@ -17,12 +17,12 @@
 from hirundo._llm_sources import HuggingFaceTransformersModelOutput, LlmSourcesOutput
 from hirundo._run_checking import (
     STATUS_TO_PROGRESS_MAP,
-    RunStatus,
     build_status_text_map,
     get_state,
     handle_run_failure,
     update_progress_from_result,
 )
+from hirundo._run_status import RunStatus
 from hirundo._sse_event_data import SseRunEventData, _parse_sse_payload
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
 from hirundo.llm_behavior_eval_results import LlmBehaviorEvalResults
diff --git a/hirundo/unlearning_llm.py b/hirundo/unlearning_llm.py
index 19a58e04..a56bd7e4 100644
--- a/hirundo/unlearning_llm.py
+++ b/hirundo/unlearning_llm.py
@@ -15,7 +15,6 @@
 from hirundo._llm_sources import LlmSources, LlmSourcesOutput
 from hirundo._run_checking import (
     STATUS_TO_PROGRESS_MAP,
-    RunStatus,
     aiter_run_events,
     build_status_text_map,
     get_state,
@@ -23,6 +22,7 @@
     iter_run_events,
     update_progress_from_result,
 )
+from hirundo._run_status import RunStatus
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
 from hirundo.dataset_qa import HirundoError
 from hirundo.llm_bias_type import BiasType
diff --git a/scripts/cleanup_test_artifacts.py b/scripts/cleanup_test_artifacts.py
index 1307075e..e6357583 100644
--- a/scripts/cleanup_test_artifacts.py
+++ b/scripts/cleanup_test_artifacts.py
@@ -3,7 +3,8 @@
 
 import requests
 from hirundo import GitRepo, QADataset, StorageConfig
-from hirundo.dataset_qa import DataQARunOut, HirundoError, QADatasetOut, RunStatus
+from hirundo._run_status import RunStatus
+from hirundo.dataset_qa import DataQARunOut, HirundoError, QADatasetOut
 from hirundo.logger import get_logger
 from hirundo.storage import ResponseStorageConfig
 from hirundo.unlearning_llm import (
diff --git a/tests/dataset_qa_shared.py b/tests/dataset_qa_shared.py
index 126789fb..014b3957 100644
--- a/tests/dataset_qa_shared.py
+++ b/tests/dataset_qa_shared.py
@@ -9,7 +9,7 @@
     RunArgs,
     StorageConfig,
 )
-from hirundo.dataset_qa import RunStatus
+from hirundo._run_status import RunStatus
 from hirundo.logger import get_logger
 
 logger = get_logger(__name__)

From c62aad36e65d485482207037f57c9e281ae05106 Mon Sep 17 00:00:00 2001
From: Ben Lewis <blewis@hirundo.io>
Date: Wed, 28 Jan 2026 21:56:34 +0200
Subject: [PATCH 07/24] Drop unnecessary `TypeAdapter` and add error log for
 invalid SSE payload to make it more debuggable

---
 hirundo/_sse_event_data.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/hirundo/_sse_event_data.py b/hirundo/_sse_event_data.py
index 877051db..48675392 100644
--- a/hirundo/_sse_event_data.py
+++ b/hirundo/_sse_event_data.py
@@ -1,8 +1,11 @@
 from typing import Any
 
-from pydantic import BaseModel, TypeAdapter
+from pydantic import BaseModel
 
 from hirundo._run_status import RunStatus
+from hirundo.logger import get_logger
+
+logger = get_logger(__name__)
 
 
 class SseRunEventData(BaseModel):
@@ -11,14 +14,12 @@ class SseRunEventData(BaseModel):
     result: str | dict | None
 
 
-_SSE_RUN_EVENT_ADAPTER = TypeAdapter(SseRunEventData)
-
-
 def _parse_sse_payload(payload: Any) -> SseRunEventData:
     if isinstance(payload, dict):
         if "data" in payload:
             data = payload["data"]
             if isinstance(data, dict):
-                return _SSE_RUN_EVENT_ADAPTER.validate_python(data)
+                return SseRunEventData.model_validate(data)
 
+    logger.error("Invalid SSE payload: %s", payload)
     raise ValueError(f"Invalid SSE payload: {payload}")

From d42c9454192f736309d23b3d6869bbe8fa399e19 Mon Sep 17 00:00:00 2001
From: Ben Lewis <blewis@hirundo.io>
Date: Wed, 28 Jan 2026 22:48:14 +0200
Subject: [PATCH 08/24] Update `AGENTS.md` to use context7 and not use 1-3
 character variable names

---
 AGENTS.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/AGENTS.md b/AGENTS.md
index c2cd2baa..e1d12ee7 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -1,5 +1,11 @@
 # Repository Guidelines
 
+## Instructions
+
+- Always use context7 when I need code generation, setup or configuration steps, or
+  library/API documentation. This means you should automatically use the Context7 MCP
+  tools to resolve library id and get library docs without me having to explicitly ask.
+
 ## Project Structure & Module Organization
 
 - `hirundo/` holds the SDK source (CLI entry point is `hirundo.cli:app`).
@@ -21,6 +27,8 @@
 - Python 3.10+ codebase, 4-space indentation, line length 88 (Ruff defaults).
 - Follow Ruff linting rules (`pyproject.toml`), with tests allowing `assert` usage.
 - Prefer descriptive names; avoid short, cryptic identifiers in new code.
+- Avoid 1-3 character variable names in new or refactored code. Use descriptive names
+  even in small scopes.
 
 ## Testing Guidelines
 

From 808aed79158cd6509b5f2735dbb37ba5f47dda6b Mon Sep 17 00:00:00 2001
From: Ben Lewis <blewis@hirundo.io>
Date: Wed, 28 Jan 2026 22:52:53 +0200
Subject: [PATCH 09/24] Add assertion for `summary_brief` and `summary_full` to
 LLM behavior eval test

---
 tests/llm-behavior-eval/llm_behavior_eval_test.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/llm-behavior-eval/llm_behavior_eval_test.py b/tests/llm-behavior-eval/llm_behavior_eval_test.py
index a7de894a..d4a8359c 100644
--- a/tests/llm-behavior-eval/llm_behavior_eval_test.py
+++ b/tests/llm-behavior-eval/llm_behavior_eval_test.py
@@ -38,3 +38,5 @@ def test_llm_behavior_eval():
         results = LlmBehaviorEval.check_run_by_id(run_id)
         assert results is not None
         assert results.cached_zip_path is not None
+        assert results.summary_brief is not None
+        assert results.summary_full is not None

From c872f890b7096449b6ecc72174581cad80724ad6 Mon Sep 17 00:00:00 2001
From: Ben Lewis <blewis@hirundo.io>
Date: Wed, 28 Jan 2026 22:53:12 +0200
Subject: [PATCH 10/24] Fix SSE payload parsing

---
 hirundo/_sse_event_data.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/hirundo/_sse_event_data.py b/hirundo/_sse_event_data.py
index 48675392..34b704b8 100644
--- a/hirundo/_sse_event_data.py
+++ b/hirundo/_sse_event_data.py
@@ -1,7 +1,6 @@
-from typing import Any
-
-from pydantic import BaseModel
+from pydantic import BaseModel, ValidationError
 
+from hirundo import HirundoError
 from hirundo._run_status import RunStatus
 from hirundo.logger import get_logger
 
@@ -14,12 +13,13 @@ class SseRunEventData(BaseModel):
     result: str | dict | None
 
 
-def _parse_sse_payload(payload: Any) -> SseRunEventData:
-    if isinstance(payload, dict):
-        if "data" in payload:
-            data = payload["data"]
-            if isinstance(data, dict):
-                return SseRunEventData.model_validate(data)
+class SseRunEventDataPayload(BaseModel):
+    data: SseRunEventData
+
 
-    logger.error("Invalid SSE payload: %s", payload)
-    raise ValueError(f"Invalid SSE payload: {payload}")
+def _parse_sse_payload(payload: str) -> SseRunEventData:
+    try:
+        return SseRunEventDataPayload.model_validate_json(payload).data
+    except ValidationError as e:
+        logger.error("Invalid SSE payload: %s: %s", payload, exc_info=e)
+        raise HirundoError(f"Invalid SSE payload: {payload}") from e

From be354c44a427d9a822344fb03f03c00181828533 Mon Sep 17 00:00:00 2001
From: Ben Lewis <blewis@hirundo.io>
Date: Wed, 28 Jan 2026 23:36:30 +0200
Subject: [PATCH 11/24] Try to fix `unzip` for LLM behavior eval results

---
 hirundo/unzip.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hirundo/unzip.py b/hirundo/unzip.py
index f0cf3e22..8fdad079 100644
--- a/hirundo/unzip.py
+++ b/hirundo/unzip.py
@@ -269,8 +269,8 @@ def download_and_extract_llm_behavior_eval_zip(
 
         if model_name:
             model_folder = model_name.split("/")[-1]
-            summary_brief_name = f"{model_folder}/summary_brief.csv"
-            summary_full_name = f"{model_folder}/summary_full.csv"
+            summary_brief_name = f"responses/{model_folder}/summary_brief.csv"
+            summary_full_name = f"responses/{model_folder}/summary_full.csv"
 
             with zipfile.ZipFile(zip_file_path, "r") as zip_file:
                 filenames = [file.filename for file in zip_file.filelist]

From 5391f9637b6f325294ed9714ef2528e3daf10ac5 Mon Sep 17 00:00:00 2001
From: Ben Lewis <blewis@hirundo.io>
Date: Tue, 3 Feb 2026 13:57:38 +0200
Subject: [PATCH 12/24] Apply Greptile suggestions from code review

These look good to me :)

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
Signed-off-by: Ben Lewis <hello@blewis.me>
---
 hirundo/_hirundo_error.py  |  2 +-
 hirundo/_run_checking.py   |  2 +-
 hirundo/_sse_event_data.py |  2 +-
 hirundo/unzip.py           | 10 +++++-----
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/hirundo/_hirundo_error.py b/hirundo/_hirundo_error.py
index 5f798aff..9b67978a 100644
--- a/hirundo/_hirundo_error.py
+++ b/hirundo/_hirundo_error.py
@@ -1,6 +1,6 @@
 class HirundoError(Exception):
     """
-    Custom exception used to indicate errors in `hirundo` dataset QA runs
+    Base exception class for Hirundo SDK errors
     """
 
     pass
diff --git a/hirundo/_run_checking.py b/hirundo/_run_checking.py
index b36f2232..10dcce1b 100644
--- a/hirundo/_run_checking.py
+++ b/hirundo/_run_checking.py
@@ -66,7 +66,7 @@ def get_state(
         The first non-null state value, or None if none are present.
     """
     for key in status_keys:
-        value = payload.get(key) if isinstance(payload, dict) else getattr(payload, key)
+        value = payload.get(key) if isinstance(payload, dict) else getattr(payload, key, None)
         if value is not None:
             return value
     return None
diff --git a/hirundo/_sse_event_data.py b/hirundo/_sse_event_data.py
index 34b704b8..940d8675 100644
--- a/hirundo/_sse_event_data.py
+++ b/hirundo/_sse_event_data.py
@@ -21,5 +21,5 @@ def _parse_sse_payload(payload: str) -> SseRunEventData:
     try:
         return SseRunEventDataPayload.model_validate_json(payload).data
     except ValidationError as e:
-        logger.error("Invalid SSE payload: %s: %s", payload, exc_info=e)
+        logger.error("Invalid SSE payload: %s", payload, exc_info=True)
         raise HirundoError(f"Invalid SSE payload: {payload}") from e
diff --git a/hirundo/unzip.py b/hirundo/unzip.py
index 8fdad079..3c2f0880 100644
--- a/hirundo/unzip.py
+++ b/hirundo/unzip.py
@@ -256,11 +256,11 @@ def download_and_extract_llm_behavior_eval_zip(
         headers=headers,
         timeout=DOWNLOAD_READ_TIMEOUT,
         stream=True,
-    ) as r:
-        r.raise_for_status()
-        with open(zip_file_path, "wb") as f:
-            for chunk in r.iter_content(chunk_size=ZIP_FILE_CHUNK_SIZE):
-                f.write(chunk)
+    ) as response:
+        response.raise_for_status()
+        with open(zip_file_path, "wb") as output_file:
+            for chunk in response.iter_content(chunk_size=ZIP_FILE_CHUNK_SIZE):
+                output_file.write(chunk)
         logger.info(
             "Successfully downloaded the LLM behavior eval result zip file for run ID %s to %s",
             run_id,

From 7612832d40166eb51c2cd786d1968f3b18a447ef Mon Sep 17 00:00:00 2001
From: Ben Lewis <hello@blewis.me>
Date: Tue, 3 Feb 2026 14:33:58 +0200
Subject: [PATCH 13/24] SDK-79: Guard SSE progress and retry eval stream

---
 hirundo/_run_checking.py     | 29 ++++++++++++++++++++++++++-
 hirundo/llm_behavior_eval.py | 39 ++++++++++++++++++++++++++----------
 2 files changed, 56 insertions(+), 12 deletions(-)

diff --git a/hirundo/_run_checking.py b/hirundo/_run_checking.py
index 10dcce1b..a2b123c8 100644
--- a/hirundo/_run_checking.py
+++ b/hirundo/_run_checking.py
@@ -239,7 +239,7 @@ def update_progress_from_result(
         result_outer.get("result") if isinstance(result_outer, dict) else result_outer
     )
 
-    if result_inner:
+    if isinstance(result_inner, str):
         result_info = result_inner.split(":")
         if len(result_info) > 1:
             stage = result_info[0]
@@ -258,6 +258,33 @@ def update_progress_from_result(
         log.debug("Setting progress to %s", progress.n)
         progress.refresh()
         return True
+    if isinstance(result_inner, dict):
+        stage = (
+            result_inner.get("stage")
+            or result_inner.get("state")
+            or result_inner.get("status")
+        )
+        progress_value = result_inner.get("progress")
+        if progress_value is None:
+            progress_value = result_inner.get("percentage")
+        if progress_value is None:
+            progress_value = result_inner.get("percent")
+        if isinstance(progress_value, str):
+            progress_value = progress_value.strip().removesuffix("%")
+        if isinstance(progress_value, (int, float, str)):
+            try:
+                current_progress_percentage = float(progress_value)
+            except (TypeError, ValueError):
+                current_progress_percentage = progress.n
+            else:
+                desc = uploading_text if current_progress_percentage == 100.0 else stage
+                if desc:
+                    progress.set_description(desc)
+                progress.n = current_progress_percentage
+                log.debug("Setting progress to %s", progress.n)
+                progress.refresh()
+                return True
+        log.debug("Skipping non-string progress result payload: %s", result_inner)
     return False
 
 
diff --git a/hirundo/llm_behavior_eval.py b/hirundo/llm_behavior_eval.py
index 979443f6..0bf29d94 100644
--- a/hirundo/llm_behavior_eval.py
+++ b/hirundo/llm_behavior_eval.py
@@ -16,6 +16,7 @@
 from hirundo._iter_sse_retrying import aiter_sse_retrying, iter_sse_retrying
 from hirundo._llm_sources import HuggingFaceTransformersModelOutput, LlmSourcesOutput
 from hirundo._run_checking import (
+    DEFAULT_MAX_RETRIES,
     STATUS_TO_PROGRESS_MAP,
     build_status_text_map,
     get_state,
@@ -360,17 +361,33 @@ def _resolve_model_name(run_info: EvalRunRecord) -> str | None:
         return None
 
     @staticmethod
-    def _check_run_by_id(run_id: str) -> Generator[SseRunEventData, None, None]:
-        with httpx.Client(timeout=httpx.Timeout(None, connect=5.0)) as client:
-            for sse_event in iter_sse_retrying(
-                client,
-                "GET",
-                f"{API_HOST}/llm-behavior-eval/run/{run_id}",
-                headers=get_headers(),
-            ):
-                if sse_event.event == "ping":
-                    continue
-                yield _parse_sse_payload(sse_event.data)
+    def _check_run_by_id(
+        run_id: str, *, max_retries: int = DEFAULT_MAX_RETRIES
+    ) -> Generator[SseRunEventData, None, None]:
+        retry_count = 0
+        while True:
+            if retry_count > max_retries:
+                raise HirundoLlmBehaviorEvalError("Max retries reached")
+            last_payload = None
+            with httpx.Client(timeout=httpx.Timeout(None, connect=5.0)) as client:
+                for sse_event in iter_sse_retrying(
+                    client,
+                    "GET",
+                    f"{API_HOST}/llm-behavior-eval/run/{run_id}",
+                    headers=get_headers(),
+                ):
+                    if sse_event.event == "ping":
+                        continue
+                    payload = _parse_sse_payload(sse_event.data)
+                    last_payload = payload
+                    yield payload
+            last_state = (
+                get_state(last_payload, ("state",)) if last_payload else None
+            )
+            if last_payload is None or last_state == RunStatus.PENDING.value:
+                retry_count += 1
+                continue
+            return
 
     @staticmethod
     @overload

From bc1d309db088450c5a9e6cd557efa84cebc9fdf9 Mon Sep 17 00:00:00 2001
From: Ben Lewis <hello@blewis.me>
Date: Tue, 3 Feb 2026 14:34:05 +0200
Subject: [PATCH 14/24] SDK-79: Skip non-string SSE progress

---
 hirundo/_run_checking.py | 33 ++++++---------------------------
 1 file changed, 6 insertions(+), 27 deletions(-)

diff --git a/hirundo/_run_checking.py b/hirundo/_run_checking.py
index a2b123c8..8d493055 100644
--- a/hirundo/_run_checking.py
+++ b/hirundo/_run_checking.py
@@ -66,7 +66,11 @@ def get_state(
         The first non-null state value, or None if none are present.
     """
     for key in status_keys:
-        value = payload.get(key) if isinstance(payload, dict) else getattr(payload, key, None)
+        value = (
+            payload.get(key)
+            if isinstance(payload, dict)
+            else getattr(payload, key, None)
+        )
         if value is not None:
             return value
     return None
@@ -258,32 +262,7 @@ def update_progress_from_result(
         log.debug("Setting progress to %s", progress.n)
         progress.refresh()
         return True
-    if isinstance(result_inner, dict):
-        stage = (
-            result_inner.get("stage")
-            or result_inner.get("state")
-            or result_inner.get("status")
-        )
-        progress_value = result_inner.get("progress")
-        if progress_value is None:
-            progress_value = result_inner.get("percentage")
-        if progress_value is None:
-            progress_value = result_inner.get("percent")
-        if isinstance(progress_value, str):
-            progress_value = progress_value.strip().removesuffix("%")
-        if isinstance(progress_value, (int, float, str)):
-            try:
-                current_progress_percentage = float(progress_value)
-            except (TypeError, ValueError):
-                current_progress_percentage = progress.n
-            else:
-                desc = uploading_text if current_progress_percentage == 100.0 else stage
-                if desc:
-                    progress.set_description(desc)
-                progress.n = current_progress_percentage
-                log.debug("Setting progress to %s", progress.n)
-                progress.refresh()
-                return True
+    if result_inner is not None:
         log.debug("Skipping non-string progress result payload: %s", result_inner)
     return False
 

From 4374468c7967df62bb7e7eae290dd774119dd25e Mon Sep 17 00:00:00 2001
From: Ben Lewis <hello@blewis.me>
Date: Tue, 3 Feb 2026 17:21:26 +0200
Subject: [PATCH 15/24] SDK-79: Apply ruff format to llm behavior eval

---
 hirundo/llm_behavior_eval.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/hirundo/llm_behavior_eval.py b/hirundo/llm_behavior_eval.py
index 0bf29d94..23971d9e 100644
--- a/hirundo/llm_behavior_eval.py
+++ b/hirundo/llm_behavior_eval.py
@@ -381,9 +381,7 @@ def _check_run_by_id(
                     payload = _parse_sse_payload(sse_event.data)
                     last_payload = payload
                     yield payload
-            last_state = (
-                get_state(last_payload, ("state",)) if last_payload else None
-            )
+            last_state = get_state(last_payload, ("state",)) if last_payload else None
             if last_payload is None or last_state == RunStatus.PENDING.value:
                 retry_count += 1
                 continue

From 6bc31260fade12f3c88c1513d501ab7376311d0e Mon Sep 17 00:00:00 2001
From: Ben Lewis <blewis@hirundo.io>
Date: Wed, 4 Feb 2026 16:29:51 +0200
Subject: [PATCH 16/24] Update `README.md` and documentation (docs)

---
 README.md                                  | 40 +++++++++++++++++++++-
 docs/hirundo.llm_behavior_eval.rst         | 10 ++++++
 docs/hirundo.llm_behavior_eval_results.rst | 10 ++++++
 docs/hirundo.llm_bias_type.rst             | 10 ++++++
 docs/hirundo.rst                           |  3 ++
 docs/index.rst                             | 12 +++++++
 docs/llm_behavior_eval_example.py          | 32 +++++++++++++++++
 7 files changed, 116 insertions(+), 1 deletion(-)
 create mode 100644 docs/hirundo.llm_behavior_eval.rst
 create mode 100644 docs/hirundo.llm_behavior_eval_results.rst
 create mode 100644 docs/hirundo.llm_bias_type.rst
 create mode 100644 docs/llm_behavior_eval_example.py

diff --git a/README.md b/README.md
index aec8d5af..adb54aca 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,7 @@
 The Hirundo Python SDK lets you:
 
 - Launch and monitor LLM behavior unlearning runs.
+- Run LLM behavior evaluations for bias, hallucination, and prompt injection.
 - Run dataset QA for ML datasets (classification, object detection, and more).
 - Fetch QA results as `pandas` or `polars` DataFrames.
 
@@ -22,7 +23,7 @@ pip install hirundo
 Optional extras:
 
 - LLM behavior unlearning (Transformers + PEFT): `pip install hirundo[transformers]`
-- Dataset QA results as DataFrames: `pip install hirundo[pandas]` or `pip install hirundo[polars]`
+- Dataset QA or LLM behavior eval results as DataFrames: `pip install hirundo[pandas]` or `pip install hirundo[polars]`
 
 If you want to install from source, clone this repository and run:
 
@@ -70,6 +71,43 @@ result = LlmUnlearningRun.check_run(run_id)
 new_adapter = llm.get_hf_pipeline_for_run(run_id)
 ```
 
+## Quickstart: LLM behavior eval
+
+If you want results as DataFrames, install `hirundo[pandas]` or `hirundo[polars]`.
+
+```python
+from hirundo import (
+    BiasType,
+    EvalRunInfo,
+    HuggingFaceTransformersModel,
+    LlmBehaviorEval,
+    LlmModel,
+    ModelOrRun,
+    PresetType,
+)
+
+llm = LlmModel(
+    model_name="Nemotron-Flash-1B",
+    model_source=HuggingFaceTransformersModel(
+        model_name="nvidia/Nemotron-Flash-1B",
+    ),
+)
+llm_id = llm.create()
+
+run_id = LlmBehaviorEval.launch_eval_run(
+    ModelOrRun.MODEL,
+    EvalRunInfo(
+        name="Nemotron BBQ bias eval",
+        model_id=llm_id,
+        preset_type=PresetType.BBQ_BIAS,
+        bias_type=BiasType.ALL,
+    ),
+)
+
+results = LlmBehaviorEval.check_run_by_id(run_id)
+print(results.summary_brief)
+```
+
 ## Quickstart: Dataset QA
 
 ### Classification
diff --git a/docs/hirundo.llm_behavior_eval.rst b/docs/hirundo.llm_behavior_eval.rst
new file mode 100644
index 00000000..500b4c4f
--- /dev/null
+++ b/docs/hirundo.llm_behavior_eval.rst
@@ -0,0 +1,10 @@
+.. meta::
+   :http-equiv=Content-Security-Policy: default-src 'self', frame-ancestors 'none'
+
+hirundo.llm_behavior_eval module
+================================
+
+.. automodule:: hirundo.llm_behavior_eval
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/hirundo.llm_behavior_eval_results.rst b/docs/hirundo.llm_behavior_eval_results.rst
new file mode 100644
index 00000000..61a30892
--- /dev/null
+++ b/docs/hirundo.llm_behavior_eval_results.rst
@@ -0,0 +1,10 @@
+.. meta::
+   :http-equiv=Content-Security-Policy: default-src 'self', frame-ancestors 'none'
+
+hirundo.llm_behavior_eval_results module
+========================================
+
+.. automodule:: hirundo.llm_behavior_eval_results
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/hirundo.llm_bias_type.rst b/docs/hirundo.llm_bias_type.rst
new file mode 100644
index 00000000..54a39208
--- /dev/null
+++ b/docs/hirundo.llm_bias_type.rst
@@ -0,0 +1,10 @@
+.. meta::
+   :http-equiv=Content-Security-Policy: default-src 'self', frame-ancestors 'none'
+
+hirundo.llm_bias_type module
+=============================
+
+.. automodule:: hirundo.llm_bias_type
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/hirundo.rst b/docs/hirundo.rst
index abb3ad42..214fa564 100644
--- a/docs/hirundo.rst
+++ b/docs/hirundo.rst
@@ -17,6 +17,9 @@ Submodules
    hirundo.git
    hirundo.labeling
    hirundo.logger
+   hirundo.llm_behavior_eval
+   hirundo.llm_behavior_eval_results
+   hirundo.llm_bias_type
    hirundo.storage
    hirundo.unlearning_llm
    hirundo.unzip
diff --git a/docs/index.rst b/docs/index.rst
index 887f7bd7..9fa8877b 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -13,6 +13,7 @@ Welcome to the ``hirundo`` client library documentation. This SDK connects to th
 Hirundo platform and provides APIs for:
 
 - LLM behavior unlearning runs (reducing bias, prompt injections and other unwanted behaviors).
+- LLM behavior eval runs (measuring bias, hallucination, prompt injection, and more).
 - Dataset QA for machine learning datasets.
 
 Getting started
@@ -45,6 +46,17 @@ Example:
 .. literalinclude:: llm_unlearning_example.py
    :language: python
 
+LLM behavior eval
+-----------------
+
+Run standardized evaluations over an LLM or an unlearning run to quantify
+behavior changes (bias, hallucination, prompt injections, and more).
+
+Example:
+
+.. literalinclude:: llm_behavior_eval_example.py
+   :language: python
+
 Dataset QA
 ----------
 
diff --git a/docs/llm_behavior_eval_example.py b/docs/llm_behavior_eval_example.py
new file mode 100644
index 00000000..0cce0338
--- /dev/null
+++ b/docs/llm_behavior_eval_example.py
@@ -0,0 +1,32 @@
+"""Examples for docs/index.rst literalinclude blocks."""
+
+from hirundo import (
+    BiasType,
+    EvalRunInfo,
+    HuggingFaceTransformersModel,
+    LlmBehaviorEval,
+    LlmModel,
+    ModelOrRun,
+    PresetType,
+)
+
+llm = LlmModel(
+    model_name="Nemotron-Flash-1B",
+    model_source=HuggingFaceTransformersModel(
+        model_name="nvidia/Nemotron-Flash-1B",
+    ),
+)
+llm_id = llm.create()
+
+run_id = LlmBehaviorEval.launch_eval_run(
+    ModelOrRun.MODEL,
+    EvalRunInfo(
+        name="Nemotron BBQ bias eval",
+        model_id=llm_id,
+        preset_type=PresetType.BBQ_BIAS,
+        bias_type=BiasType.ALL,
+    ),
+)
+
+results = LlmBehaviorEval.check_run_by_id(run_id)
+print(results.summary_brief)

From 263f19fae980ccbc45990054fb539c8747b88c4f Mon Sep 17 00:00:00 2001
From: Ben Lewis <blewis@hirundo.io>
Date: Wed, 4 Feb 2026 16:33:25 +0200
Subject: [PATCH 17/24] Drop Python code from `README.md`

---
 README.md                                     | 154 +-----------------
 ...y => dataset_qa_classification_example.py} |   0
 docs/dataset_qa_object_detection_example.py   |  41 +++++
 docs/index.rst                                |   9 +-
 4 files changed, 52 insertions(+), 152 deletions(-)
 rename docs/{dataset_qa_example.py => dataset_qa_classification_example.py} (100%)
 create mode 100644 docs/dataset_qa_object_detection_example.py

diff --git a/README.md b/README.md
index adb54aca..ff27cf46 100644
--- a/README.md
+++ b/README.md
@@ -41,157 +41,11 @@ hirundo setup
 
 This writes `API_KEY` (and optionally `API_HOST`) to `.env` in the current directory or `~/.hirundo.conf`.
 
-## Quickstart: LLM behavior unlearning
-
-Make sure you have the `transformers` extra installed (`pip install hirundo[transformers]`).
-
-```python
-from hirundo import (
-    BiasRunInfo,
-    BiasType,
-    HuggingFaceTransformersModel,
-    LlmModel,
-    LlmUnlearningRun,
-)
-
-llm = LlmModel(
-    model_name="Nemotron-Flash-1B",
-    model_source=HuggingFaceTransformersModel(
-        model_name="nvidia/Nemotron-Flash-1B",
-    ),
-)
-llm_id = llm.create()
-
-run_id = LlmUnlearningRun.launch(
-    llm_id,
-    BiasRunInfo(bias_type=BiasType.ALL),
-)
-
-result = LlmUnlearningRun.check_run(run_id)
-new_adapter = llm.get_hf_pipeline_for_run(run_id)
-```
-
-## Quickstart: LLM behavior eval
-
-If you want results as DataFrames, install `hirundo[pandas]` or `hirundo[polars]`.
-
-```python
-from hirundo import (
-    BiasType,
-    EvalRunInfo,
-    HuggingFaceTransformersModel,
-    LlmBehaviorEval,
-    LlmModel,
-    ModelOrRun,
-    PresetType,
-)
-
-llm = LlmModel(
-    model_name="Nemotron-Flash-1B",
-    model_source=HuggingFaceTransformersModel(
-        model_name="nvidia/Nemotron-Flash-1B",
-    ),
-)
-llm_id = llm.create()
-
-run_id = LlmBehaviorEval.launch_eval_run(
-    ModelOrRun.MODEL,
-    EvalRunInfo(
-        name="Nemotron BBQ bias eval",
-        model_id=llm_id,
-        preset_type=PresetType.BBQ_BIAS,
-        bias_type=BiasType.ALL,
-    ),
-)
-
-results = LlmBehaviorEval.check_run_by_id(run_id)
-print(results.summary_brief)
-```
+## Quickstart examples
 
-## Quickstart: Dataset QA
-
-### Classification
-
-```python
-import json
-import os
-
-from hirundo import (
-    HirundoCSV,
-    LabelingType,
-    QADataset,
-    StorageConfig,
-    StorageGCP,
-    StorageTypes,
-)
-
-gcp_bucket = StorageGCP(
-    bucket_name="cifar100bucket",
-    project="Hirundo-global",
-    credentials_json=json.loads(os.environ["GCP_CREDENTIALS"]),
-)
-
-test_dataset = QADataset(
-    name="TEST-GCP cifar 100 classification dataset",
-    labeling_type=LabelingType.SINGLE_LABEL_CLASSIFICATION,
-    storage_config=StorageConfig(
-        name="cifar100bucket",
-        type=StorageTypes.GCP,
-        gcp=gcp_bucket,
-    ),
-    data_root_url=gcp_bucket.get_url(path="/pytorch-cifar/data"),
-    labeling_info=HirundoCSV(
-        csv_url=gcp_bucket.get_url(path="/pytorch-cifar/data/cifar100.csv"),
-    ),
-    classes=cifar100_classes,
-)
-
-test_dataset.run_qa()
-results = test_dataset.check_run()
-print(results)
-```
-
-### Object detection
-
-```python
-from hirundo import (
-    GitRepo,
-    HirundoCSV,
-    LabelingType,
-    QADataset,
-    StorageConfig,
-    StorageGit,
-    StorageTypes,
-)
-
-git_storage = StorageGit(
-    repo=GitRepo(
-        name="BDD-100k-validation-dataset",
-        repository_url="https://huggingface.co/datasets/hirundo-io/bdd100k-validation-only",
-    ),
-    branch="main",
-)
-
-test_dataset = QADataset(
-    name="TEST-HuggingFace-BDD-100k-validation-OD-validation-dataset",
-    labeling_type=LabelingType.OBJECT_DETECTION,
-    storage_config=StorageConfig(
-        name="BDD-100k-validation-dataset",
-        type=StorageTypes.GIT,
-        git=git_storage,
-    ),
-    data_root_url=git_storage.get_url(path="/BDD100K Val from Hirundo.zip/bdd100k"),
-    labeling_info=HirundoCSV(
-        csv_url=git_storage.get_url(
-            path="/BDD100K Val from Hirundo.zip/bdd100k/bdd100k.csv"
-        ),
-    ),
-)
-
-test_dataset.run_qa()
-results = test_dataset.check_run()
-print(results)
-```
+The full quickstart examples now live in the Sphinx docs so they can be linted,
+formatted, and type-checked as real Python files. See the examples embedded in
+`docs/index.rst`, which are sourced from `docs/*.py` files.
 
 ## Supported dataset storage
 
diff --git a/docs/dataset_qa_example.py b/docs/dataset_qa_classification_example.py
similarity index 100%
rename from docs/dataset_qa_example.py
rename to docs/dataset_qa_classification_example.py
diff --git a/docs/dataset_qa_object_detection_example.py b/docs/dataset_qa_object_detection_example.py
new file mode 100644
index 00000000..7dc464dc
--- /dev/null
+++ b/docs/dataset_qa_object_detection_example.py
@@ -0,0 +1,41 @@
+"""Examples for docs/index.rst literalinclude blocks."""
+
+from hirundo import (
+    GitRepo,
+    HirundoCSV,
+    LabelingType,
+    QADataset,
+    StorageConfig,
+    StorageGit,
+    StorageTypes,
+)
+
+git_storage = StorageGit(
+    repo=GitRepo(
+        name="BDD-100k-validation-dataset",
+        repository_url=(
+            "https://huggingface.co/datasets/hirundo-io/bdd100k-validation-only"
+        ),
+    ),
+    branch="main",
+)
+
+test_dataset = QADataset(
+    name="TEST-HuggingFace-BDD-100k-validation-OD-validation-dataset",
+    labeling_type=LabelingType.OBJECT_DETECTION,
+    storage_config=StorageConfig(
+        name="BDD-100k-validation-dataset",
+        type=StorageTypes.GIT,
+        git=git_storage,
+    ),
+    data_root_url=git_storage.get_url(path="/BDD100K Val from Hirundo.zip/bdd100k"),
+    labeling_info=HirundoCSV(
+        csv_url=git_storage.get_url(
+            path="/BDD100K Val from Hirundo.zip/bdd100k/bdd100k.csv"
+        ),
+    ),
+)
+
+test_dataset.run_qa()
+results = test_dataset.check_run()
+print(results)
diff --git a/docs/index.rst b/docs/index.rst
index 9fa8877b..0841e123 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -75,9 +75,14 @@ Supported storage backends include:
 - Google Cloud Storage (GCS)
 - Git repositories with LFS (GitHub, Hugging Face)
 
-Example:
+Classification example:
+
+.. literalinclude:: dataset_qa_classification_example.py
+   :language: python
+
+Object detection example:
 
-.. literalinclude:: dataset_qa_example.py
+.. literalinclude:: dataset_qa_object_detection_example.py
    :language: python
 
 API reference

From 6a96100ba220a935bf00ec5fb7183e73b6f8c25c Mon Sep 17 00:00:00 2001
From: Ben Lewis <blewis@hirundo.io>
Date: Wed, 4 Feb 2026 16:34:24 +0200
Subject: [PATCH 18/24] Fix circular import

Thank you Cursor (bugbot)
---
 hirundo/_sse_event_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hirundo/_sse_event_data.py b/hirundo/_sse_event_data.py
index 940d8675..131d1c45 100644
--- a/hirundo/_sse_event_data.py
+++ b/hirundo/_sse_event_data.py
@@ -1,6 +1,6 @@
 from pydantic import BaseModel, ValidationError
 
-from hirundo import HirundoError
+from hirundo._hirundo_error import HirundoError
 from hirundo._run_status import RunStatus
 from hirundo.logger import get_logger
 

From 97b24318c8bfa3a8844bda2e9356530bd45b2586 Mon Sep 17 00:00:00 2001
From: Ben Lewis <blewis@hirundo.io>
Date: Wed, 4 Feb 2026 16:41:30 +0200
Subject: [PATCH 19/24] Rename `BiasType` to `BBQBiasType`

---
 docs/llm_behavior_eval_example.py                 | 4 ++--
 docs/llm_unlearning_example.py                    | 4 ++--
 hirundo/__init__.py                               | 4 ++--
 hirundo/llm_behavior_eval.py                      | 6 +++---
 hirundo/llm_bias_type.py                          | 2 +-
 hirundo/unlearning_llm.py                         | 6 +++---
 tests/llm-behavior-eval/llm_behavior_eval_test.py | 4 ++--
 tests/unlearning-llm/unlearn_llm_behavior_test.py | 4 ++--
 8 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/docs/llm_behavior_eval_example.py b/docs/llm_behavior_eval_example.py
index 0cce0338..7db7ee98 100644
--- a/docs/llm_behavior_eval_example.py
+++ b/docs/llm_behavior_eval_example.py
@@ -1,7 +1,7 @@
 """Examples for docs/index.rst literalinclude blocks."""
 
 from hirundo import (
-    BiasType,
+    BBQBiasType,
     EvalRunInfo,
     HuggingFaceTransformersModel,
     LlmBehaviorEval,
@@ -24,7 +24,7 @@
         name="Nemotron BBQ bias eval",
         model_id=llm_id,
         preset_type=PresetType.BBQ_BIAS,
-        bias_type=BiasType.ALL,
+        bias_type=BBQBiasType.ALL,
     ),
 )
 
diff --git a/docs/llm_unlearning_example.py b/docs/llm_unlearning_example.py
index 2d9c545c..813a4f72 100644
--- a/docs/llm_unlearning_example.py
+++ b/docs/llm_unlearning_example.py
@@ -1,8 +1,8 @@
 """Examples for docs/index.rst literalinclude blocks."""
 
 from hirundo import (
+    BBQBiasType,
     BiasRunInfo,
-    BiasType,
     HuggingFaceTransformersModel,
     LlmModel,
     LlmUnlearningRun,
@@ -17,7 +17,7 @@
 llm_id = llm.create()
 run_id = LlmUnlearningRun.launch(
     llm_id,
-    BiasRunInfo(bias_type=BiasType.ALL),
+    BiasRunInfo(bias_type=BBQBiasType.ALL),
 )
 result = LlmUnlearningRun.check_run(run_id)
 new_adapter = llm.get_hf_pipeline_for_run(run_id)
diff --git a/hirundo/__init__.py b/hirundo/__init__.py
index 88a77577..c6be677c 100644
--- a/hirundo/__init__.py
+++ b/hirundo/__init__.py
@@ -43,7 +43,7 @@
     PresetType,
 )
 from .llm_behavior_eval_results import LlmBehaviorEvalResults
-from .llm_bias_type import BiasType
+from .llm_bias_type import BBQBiasType
 from .storage import (
     StorageConfig,
     StorageGCP,
@@ -70,7 +70,7 @@
     "KeylabsObjDetVideo",
     "KeylabsObjSegImages",
     "KeylabsObjSegVideo",
-    "BiasType",
+    "BBQBiasType",
     "QADataset",
     "EvalRunInfo",
     "EvalRunRecord",
diff --git a/hirundo/llm_behavior_eval.py b/hirundo/llm_behavior_eval.py
index 23971d9e..ca130873 100644
--- a/hirundo/llm_behavior_eval.py
+++ b/hirundo/llm_behavior_eval.py
@@ -27,7 +27,7 @@
 from hirundo._sse_event_data import SseRunEventData, _parse_sse_payload
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
 from hirundo.llm_behavior_eval_results import LlmBehaviorEvalResults
-from hirundo.llm_bias_type import BiasType
+from hirundo.llm_bias_type import BBQBiasType
 from hirundo.logger import get_logger
 from hirundo.unzip import download_and_extract_llm_behavior_eval_zip
 
@@ -70,7 +70,7 @@ class EvalRunInfo(BaseModel):
     source_run_id: str | None = None
     file_path: str | None = None
     preset_type: PresetType | None = None
-    bias_type: BiasType | None = None
+    bias_type: BBQBiasType | None = None
     judge_model: JudgeModel | None = None
 
 
@@ -124,7 +124,7 @@ class EvalRunRecord(BaseModel):
     source_run: OutputUnlearningLlmRun | None
     file_path: str | None
     preset_type: PresetType | None
-    bias_type: BiasType | None
+    bias_type: BBQBiasType | None
     judge_model: JudgeModel | None
     run_id: str
     mlflow_run_id: str | None
diff --git a/hirundo/llm_bias_type.py b/hirundo/llm_bias_type.py
index bd45501d..5b0b72cc 100644
--- a/hirundo/llm_bias_type.py
+++ b/hirundo/llm_bias_type.py
@@ -1,7 +1,7 @@
 from enum import Enum
 
 
-class BiasType(str, Enum):
+class BBQBiasType(str, Enum):
     ALL = "ALL"
     RACE = "RACE"
     NATIONALITY = "NATIONALITY"
diff --git a/hirundo/unlearning_llm.py b/hirundo/unlearning_llm.py
index a56bd7e4..00375dc1 100644
--- a/hirundo/unlearning_llm.py
+++ b/hirundo/unlearning_llm.py
@@ -25,7 +25,7 @@
 from hirundo._run_status import RunStatus
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
 from hirundo.dataset_qa import HirundoError
-from hirundo.llm_bias_type import BiasType
+from hirundo.llm_bias_type import BBQBiasType
 from hirundo.logger import get_logger
 
 if TYPE_CHECKING:
@@ -227,7 +227,7 @@ class CustomUtility(BaseModel):
 
 class BiasBehavior(BaseModel):
     type: Literal["BIAS"] = "BIAS"
-    bias_type: BiasType
+    bias_type: BBQBiasType
 
 
 class HallucinationType(str, Enum):
@@ -270,7 +270,7 @@ class LlmRunInfo(BaseModel):
 
 
 class BiasRunInfo(BaseModel):
-    bias_type: BiasType
+    bias_type: BBQBiasType
     organization_id: int | None = None
     name: str | None = None
     target_utilities: list[TargetUtility] | None = None
diff --git a/tests/llm-behavior-eval/llm_behavior_eval_test.py b/tests/llm-behavior-eval/llm_behavior_eval_test.py
index d4a8359c..090d1a82 100644
--- a/tests/llm-behavior-eval/llm_behavior_eval_test.py
+++ b/tests/llm-behavior-eval/llm_behavior_eval_test.py
@@ -2,7 +2,7 @@
 import os
 
 from hirundo import (
-    BiasType,
+    BBQBiasType,
     EvalRunInfo,
     HuggingFaceTransformersModel,
     LlmBehaviorEval,
@@ -29,7 +29,7 @@ def test_llm_behavior_eval():
         name=f"TEST-LLM-BEHAVIOR-EVAL-RUN-{unique_id}",
         model_id=llm_id,
         preset_type=PresetType.BBQ_BIAS,
-        bias_type=BiasType.ALL,
+        bias_type=BBQBiasType.ALL,
     )
     assert llm_id is not None
     if os.getenv("FULL_TEST", "false") == "true":
diff --git a/tests/unlearning-llm/unlearn_llm_behavior_test.py b/tests/unlearning-llm/unlearn_llm_behavior_test.py
index 13144936..3506c982 100644
--- a/tests/unlearning-llm/unlearn_llm_behavior_test.py
+++ b/tests/unlearning-llm/unlearn_llm_behavior_test.py
@@ -2,8 +2,8 @@
 import os
 
 from hirundo import (
+    BBQBiasType,
     BiasRunInfo,
-    BiasType,
     HuggingFaceTransformersModel,
     LlmModel,
     LlmUnlearningRun,
@@ -25,7 +25,7 @@ def test_unlearn_llm_behavior():
     )
     llm_id = llm.create()
     run_info = BiasRunInfo(
-        bias_type=BiasType.ALL,
+        bias_type=BBQBiasType.ALL,
     )
     assert llm_id is not None
     if os.getenv("FULL_TEST", "false") == "true":

From ba0392a45646089b1241cc34850258bfd86d0d90 Mon Sep 17 00:00:00 2001
From: Ben Lewis <blewis@hirundo.io>
Date: Wed, 4 Feb 2026 16:47:46 +0200
Subject: [PATCH 20/24] Fix `progress_bar` not being closed if there is an
 error with the run

---
 hirundo/llm_behavior_eval.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hirundo/llm_behavior_eval.py b/hirundo/llm_behavior_eval.py
index ca130873..e2563805 100644
--- a/hirundo/llm_behavior_eval.py
+++ b/hirundo/llm_behavior_eval.py
@@ -441,6 +441,7 @@ def check_run_by_id(
                             "State is failure, rejected, or revoked: %s",
                             state,
                         )
+                        progress_bar.close()
                         handle_run_failure(
                             iteration,
                             error_cls=HirundoLlmBehaviorEvalError,

From beb20a6fd1b49152132544c5ef8bc55a23a87a05 Mon Sep 17 00:00:00 2001
From: Ben Lewis <blewis@hirundo.io>
Date: Wed, 4 Feb 2026 16:48:40 +0200
Subject: [PATCH 21/24] Add `deleted_at` to `EvalRunRecord`

---
 hirundo/llm_behavior_eval.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hirundo/llm_behavior_eval.py b/hirundo/llm_behavior_eval.py
index e2563805..662cdcae 100644
--- a/hirundo/llm_behavior_eval.py
+++ b/hirundo/llm_behavior_eval.py
@@ -136,6 +136,8 @@ class EvalRunRecord(BaseModel):
     metrics: LlmEvalMetrics | None = None
     responses_zip_url: str | None = None
 
+    deleted_at: datetime.datetime | None = None
+
 
 class LlmBehaviorEval:
     def __init__(self, run_id: str | None = None):

From 56b538f362018b32680b1db30167f9070a79ffad Mon Sep 17 00:00:00 2001
From: Ben Lewis <blewis@hirundo.io>
Date: Wed, 4 Feb 2026 16:55:29 +0200
Subject: [PATCH 22/24] Add cleanup for LLM behavior eval runs

---
 scripts/cleanup_test_artifacts.py | 35 +++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/scripts/cleanup_test_artifacts.py b/scripts/cleanup_test_artifacts.py
index e6357583..6d65b6d3 100644
--- a/scripts/cleanup_test_artifacts.py
+++ b/scripts/cleanup_test_artifacts.py
@@ -2,9 +2,10 @@
 from datetime import timedelta, timezone
 
 import requests
-from hirundo import GitRepo, QADataset, StorageConfig
+from hirundo import GitRepo, LlmBehaviorEval, QADataset, StorageConfig
 from hirundo._run_status import RunStatus
 from hirundo.dataset_qa import DataQARunOut, HirundoError, QADatasetOut
+from hirundo.llm_behavior_eval import EvalRunRecord
 from hirundo.logger import get_logger
 from hirundo.storage import ResponseStorageConfig
 from hirundo.unlearning_llm import (
@@ -70,7 +71,7 @@ def _delete_llm(
 
 def _should_delete_resource(
     resource_name: str,
-    runs: list[DataQARunOut | OutputUnlearningLlmRun],
+    runs: list[DataQARunOut | OutputUnlearningLlmRun | EvalRunRecord],
     expiry_date: datetime.datetime,
 ) -> bool:
     """
@@ -291,12 +292,42 @@ def _handle_llm_cleanup(one_week_ago: datetime.datetime):
         )
 
 
+def _handle_llm_behavior_eval_cleanup(one_week_ago: datetime.datetime) -> None:
+    archived_runs = set[str]()
+    trying_to_archive_runs = set[str]()
+    live_runs = LlmBehaviorEval.list_runs(archived=False)
+    for run in live_runs:
+        if _should_delete_resource(run.name, [run], one_week_ago):
+            trying_to_archive_runs.add(run.run_id)
+            try:
+                LlmBehaviorEval.archive_by_id(run.run_id)
+                archived_runs.add(run.run_id)
+            except (HirundoError, requests.HTTPError) as exc:
+                logger.warning(
+                    "Failed to archive LLM behavior eval run with ID %s: %s",
+                    run.run_id,
+                    exc,
+                )
+    logger.info(
+        "Archived %s (%s) LLM behavior eval runs",
+        archived_runs,
+        len(archived_runs),
+    )
+    if trying_to_archive_runs != archived_runs:
+        logger.warning(
+            "Tried to archive %s LLM behavior eval runs, but only archived %s runs",
+            trying_to_archive_runs,
+            archived_runs,
+        )
+
+
 def main() -> None:
     now = datetime.datetime.now(timezone.utc)
     one_week_ago = now - timedelta(days=7)
 
     _handle_datasets_cleanup(one_week_ago)
     _handle_llm_cleanup(one_week_ago)
+    _handle_llm_behavior_eval_cleanup(one_week_ago)
     _cleanup_storage_configs(one_week_ago)
 
 

From 6c641ec5ac7ca4f57c1072027295a1dce51eac27 Mon Sep 17 00:00:00 2001
From: Ben Lewis <blewis@hirundo.io>
Date: Wed, 4 Feb 2026 17:35:43 +0200
Subject: [PATCH 23/24] Add `UnqoverBiasType` as per @mishana's PR comment

---
 hirundo/__init__.py          | 3 ++-
 hirundo/llm_behavior_eval.py | 6 +++---
 hirundo/llm_bias_type.py     | 8 ++++++++
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/hirundo/__init__.py b/hirundo/__init__.py
index c6be677c..c47c2fbc 100644
--- a/hirundo/__init__.py
+++ b/hirundo/__init__.py
@@ -43,7 +43,7 @@
     PresetType,
 )
 from .llm_behavior_eval_results import LlmBehaviorEvalResults
-from .llm_bias_type import BBQBiasType
+from .llm_bias_type import BBQBiasType, UnqoverBiasType
 from .storage import (
     StorageConfig,
     StorageGCP,
@@ -71,6 +71,7 @@
     "KeylabsObjSegImages",
     "KeylabsObjSegVideo",
     "BBQBiasType",
+    "UnqoverBiasType",
     "QADataset",
     "EvalRunInfo",
     "EvalRunRecord",
diff --git a/hirundo/llm_behavior_eval.py b/hirundo/llm_behavior_eval.py
index 662cdcae..561204f2 100644
--- a/hirundo/llm_behavior_eval.py
+++ b/hirundo/llm_behavior_eval.py
@@ -27,7 +27,7 @@
 from hirundo._sse_event_data import SseRunEventData, _parse_sse_payload
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
 from hirundo.llm_behavior_eval_results import LlmBehaviorEvalResults
-from hirundo.llm_bias_type import BBQBiasType
+from hirundo.llm_bias_type import BBQBiasType, UnqoverBiasType
 from hirundo.logger import get_logger
 from hirundo.unzip import download_and_extract_llm_behavior_eval_zip
 
@@ -70,7 +70,7 @@ class EvalRunInfo(BaseModel):
     source_run_id: str | None = None
     file_path: str | None = None
     preset_type: PresetType | None = None
-    bias_type: BBQBiasType | None = None
+    bias_type: BBQBiasType | UnqoverBiasType | None = None
     judge_model: JudgeModel | None = None
 
 
@@ -124,7 +124,7 @@ class EvalRunRecord(BaseModel):
     source_run: OutputUnlearningLlmRun | None
     file_path: str | None
     preset_type: PresetType | None
-    bias_type: BBQBiasType | None
+    bias_type: BBQBiasType | UnqoverBiasType | None
     judge_model: JudgeModel | None
     run_id: str
     mlflow_run_id: str | None
diff --git a/hirundo/llm_bias_type.py b/hirundo/llm_bias_type.py
index 5b0b72cc..e5fc94dd 100644
--- a/hirundo/llm_bias_type.py
+++ b/hirundo/llm_bias_type.py
@@ -9,3 +9,11 @@ class BBQBiasType(str, Enum):
     PHYSICAL_APPEARANCE = "PHYSICAL_APPEARANCE"
     RELIGION = "RELIGION"
     AGE = "AGE"
+
+
+class UnqoverBiasType(str, Enum):
+    ALL = "ALL"
+    RACE = "RACE"
+    NATIONALITY = "NATIONALITY"
+    GENDER = "GENDER"
+    RELIGION = "RELIGION"

From c1eba15515c12b8a8c0101f0fc4db920283aeb69 Mon Sep 17 00:00:00 2001
From: Ben Lewis <blewis@hirundo.io>
Date: Wed, 4 Feb 2026 17:38:00 +0200
Subject: [PATCH 24/24] Allow `SseRunEventData.state` to be `None` (Cursor bugbot comment)

---
 hirundo/_sse_event_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hirundo/_sse_event_data.py b/hirundo/_sse_event_data.py
index 131d1c45..429c942b 100644
--- a/hirundo/_sse_event_data.py
+++ b/hirundo/_sse_event_data.py
@@ -9,7 +9,7 @@
 
 class SseRunEventData(BaseModel):
     id: str
-    state: RunStatus
+    state: RunStatus | None
     result: str | dict | None