diff --git a/.envrc b/.envrc
index b2d2268b..550b4cae 100644
--- a/.envrc
+++ b/.envrc
@@ -1,2 +1,2 @@
 watch_file uv.lock
-uv sync --all-extras && source .venv/bin/activate
+uv sync --group dev && source .venv/bin/activate
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 25c82ebb..314b9f2a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -75,4 +75,4 @@ repos:
     hooks:
       - id: uv-lock
       - id: uv-sync
-        args: ["--extra", "dev", "--extra", "docs", "--extra", "pandas", "--extra", "polars", "--extra", "transformers"]
+        args: ["--group", "dev"]
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 00000000..e1d12ee7
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,47 @@
+# Repository Guidelines
+
+## Instructions
+
+- Always use context7 when I need code generation, setup or configuration steps, or
+  library/API documentation. This means you should automatically use the Context7 MCP
+  tools to resolve library id and get library docs without me having to explicitly ask.
+
+## Project Structure & Module Organization
+
+- `hirundo/` holds the SDK source (CLI entry point is `hirundo.cli:app`).
+- `tests/` contains pytest-based test coverage.
+- `docs/` and `source/` contain Sphinx documentation assets.
+- `notebooks/` and `on_prem_test_notebook.ipynb` provide example workflows.
+- `requirements/` stores compiled dependency sets (for dev, docs, pandas, polars, transformers).
+
+## Build, Test, and Development Commands
+
+- `uv sync --group dev`: fast dependency sync, including the `dev` dependency group.
+- `ruff check` / `ruff format`: lint and auto-format (run before PRs).
+- `pytest`: run the test suite.
+- `python -m build`: build the package artifacts.
+- `pre-commit install`: enable git hooks (optional, but recommended).
+
+## Coding Style & Naming Conventions
+
+- Python 3.10+ codebase, 4-space indentation, line length 88 (Ruff defaults).
+- Follow Ruff linting rules (`pyproject.toml`), with tests allowing `assert` usage.
+- Prefer descriptive names; avoid short, cryptic identifiers in new code.
+- Avoid 1-3 character variable names in new or refactored code. Use descriptive names
+  even in small scopes.
+
+## Testing Guidelines
+
+- Frameworks: `pytest` and `pytest-asyncio`.
+- Place tests in `tests/`; name files `test_*.py`.
+- Run locally with `pytest` before opening a PR (CI runs lint + integration tests).
+
+## Commit & Pull Request Guidelines
+
+- Recent commit history favors `SDK-<id>: <summary>` (e.g., `SDK-78: Migrate to basedpyright`).
+- Include issue/PR references when available (e.g., `(#190)`).
+- PRs should describe changes clearly and confirm `ruff check` and `ruff format` passed.
+
+## Security & Configuration Tips
+
+- Supported Python versions: CPython 3.10–3.13.
diff --git a/README.md b/README.md
index aec8d5af..ff27cf46 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,7 @@
 The Hirundo Python SDK lets you:
 
 - Launch and monitor LLM behavior unlearning runs.
+- Run LLM behavior evaluations for bias, hallucination, and prompt injection.
 - Run dataset QA for ML datasets (classification, object detection, and more).
 - Fetch QA results as `pandas` or `polars` DataFrames.
 
@@ -22,7 +23,7 @@ pip install hirundo
 Optional extras:
 
 - LLM behavior unlearning (Transformers + PEFT): `pip install hirundo[transformers]`
-- Dataset QA results as DataFrames: `pip install hirundo[pandas]` or `pip install hirundo[polars]`
+- Dataset QA or LLM behavior eval results as DataFrames: `pip install hirundo[pandas]` or `pip install hirundo[polars]`
 
 If you want to install from source, clone this repository and run:
 
@@ -40,120 +41,11 @@ hirundo setup
 
 This writes `API_KEY` (and optionally `API_HOST`) to `.env` in the current directory or `~/.hirundo.conf`.
 
-## Quickstart: LLM behavior unlearning
-
-Make sure you have the `transformers` extra installed (`pip install hirundo[transformers]`).
-
-```python
-from hirundo import (
-    BiasRunInfo,
-    BiasType,
-    HuggingFaceTransformersModel,
-    LlmModel,
-    LlmUnlearningRun,
-)
-
-llm = LlmModel(
-    model_name="Nemotron-Flash-1B",
-    model_source=HuggingFaceTransformersModel(
-        model_name="nvidia/Nemotron-Flash-1B",
-    ),
-)
-llm_id = llm.create()
-
-run_id = LlmUnlearningRun.launch(
-    llm_id,
-    BiasRunInfo(bias_type=BiasType.ALL),
-)
-
-result = LlmUnlearningRun.check_run(run_id)
-new_adapter = llm.get_hf_pipeline_for_run(run_id)
-```
-
-## Quickstart: Dataset QA
-
-### Classification
-
-```python
-import json
-import os
-
-from hirundo import (
-    HirundoCSV,
-    LabelingType,
-    QADataset,
-    StorageConfig,
-    StorageGCP,
-    StorageTypes,
-)
-
-gcp_bucket = StorageGCP(
-    bucket_name="cifar100bucket",
-    project="Hirundo-global",
-    credentials_json=json.loads(os.environ["GCP_CREDENTIALS"]),
-)
-
-test_dataset = QADataset(
-    name="TEST-GCP cifar 100 classification dataset",
-    labeling_type=LabelingType.SINGLE_LABEL_CLASSIFICATION,
-    storage_config=StorageConfig(
-        name="cifar100bucket",
-        type=StorageTypes.GCP,
-        gcp=gcp_bucket,
-    ),
-    data_root_url=gcp_bucket.get_url(path="/pytorch-cifar/data"),
-    labeling_info=HirundoCSV(
-        csv_url=gcp_bucket.get_url(path="/pytorch-cifar/data/cifar100.csv"),
-    ),
-    classes=cifar100_classes,
-)
-
-test_dataset.run_qa()
-results = test_dataset.check_run()
-print(results)
-```
+## Quickstart examples
 
-### Object detection
-
-```python
-from hirundo import (
-    GitRepo,
-    HirundoCSV,
-    LabelingType,
-    QADataset,
-    StorageConfig,
-    StorageGit,
-    StorageTypes,
-)
-
-git_storage = StorageGit(
-    repo=GitRepo(
-        name="BDD-100k-validation-dataset",
-        repository_url="https://huggingface.co/datasets/hirundo-io/bdd100k-validation-only",
-    ),
-    branch="main",
-)
-
-test_dataset = QADataset(
-    name="TEST-HuggingFace-BDD-100k-validation-OD-validation-dataset",
-    labeling_type=LabelingType.OBJECT_DETECTION,
-    storage_config=StorageConfig(
-        name="BDD-100k-validation-dataset",
-        type=StorageTypes.GIT,
-        git=git_storage,
-    ),
-    data_root_url=git_storage.get_url(path="/BDD100K Val from Hirundo.zip/bdd100k"),
-    labeling_info=HirundoCSV(
-        csv_url=git_storage.get_url(
-            path="/BDD100K Val from Hirundo.zip/bdd100k/bdd100k.csv"
-        ),
-    ),
-)
-
-test_dataset.run_qa()
-results = test_dataset.check_run()
-print(results)
-```
+The full quickstart examples now live in the Sphinx docs so they can be linted,
+formatted, and type-checked as real Python files. See the examples embedded in
+`docs/index.rst`, which are sourced from `docs/*.py` files.
 
 ## Supported dataset storage
 
diff --git a/docs/dataset_qa_example.py b/docs/dataset_qa_classification_example.py
similarity index 100%
rename from docs/dataset_qa_example.py
rename to docs/dataset_qa_classification_example.py
diff --git a/docs/dataset_qa_object_detection_example.py b/docs/dataset_qa_object_detection_example.py
new file mode 100644
index 00000000..7dc464dc
--- /dev/null
+++ b/docs/dataset_qa_object_detection_example.py
@@ -0,0 +1,44 @@
+"""Examples for docs/index.rst literalinclude blocks."""
+
+from hirundo import (
+    GitRepo,
+    HirundoCSV,
+    LabelingType,
+    QADataset,
+    StorageConfig,
+    StorageGit,
+    StorageTypes,
+)
+
+# Storage definition: a Git repository (hosted on Hugging Face), "main" branch.
+git_storage = StorageGit(
+    repo=GitRepo(
+        name="BDD-100k-validation-dataset",
+        repository_url=(
+            "https://huggingface.co/datasets/hirundo-io/bdd100k-validation-only"
+        ),
+    ),
+    branch="main",
+)
+
+# Object-detection dataset; data root and labels CSV are URLs built by git_storage.
+test_dataset = QADataset(
+    name="TEST-HuggingFace-BDD-100k-validation-OD-validation-dataset",
+    labeling_type=LabelingType.OBJECT_DETECTION,
+    storage_config=StorageConfig(
+        name="BDD-100k-validation-dataset",
+        type=StorageTypes.GIT,
+        git=git_storage,
+    ),
+    data_root_url=git_storage.get_url(path="/BDD100K Val from Hirundo.zip/bdd100k"),
+    labeling_info=HirundoCSV(
+        csv_url=git_storage.get_url(
+            path="/BDD100K Val from Hirundo.zip/bdd100k/bdd100k.csv"
+        ),
+    ),
+)
+
+# Run dataset QA, then retrieve and print the results of that run.
+test_dataset.run_qa()
+results = test_dataset.check_run()
+print(results)
diff --git a/docs/hirundo.llm_behavior_eval.rst b/docs/hirundo.llm_behavior_eval.rst
new file mode 100644
index 00000000..500b4c4f
--- /dev/null
+++ b/docs/hirundo.llm_behavior_eval.rst
@@ -0,0 +1,10 @@
+.. meta::
+   :http-equiv=Content-Security-Policy: default-src 'self', frame-ancestors 'none'
+
+hirundo.llm_behavior_eval module
+================================
+
+.. automodule:: hirundo.llm_behavior_eval
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/hirundo.llm_behavior_eval_results.rst b/docs/hirundo.llm_behavior_eval_results.rst
new file mode 100644
index 00000000..61a30892
--- /dev/null
+++ b/docs/hirundo.llm_behavior_eval_results.rst
@@ -0,0 +1,10 @@
+.. meta::
+   :http-equiv=Content-Security-Policy: default-src 'self', frame-ancestors 'none'
+
+hirundo.llm_behavior_eval_results module
+========================================
+
+.. automodule:: hirundo.llm_behavior_eval_results
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/hirundo.llm_bias_type.rst b/docs/hirundo.llm_bias_type.rst
new file mode 100644
index 00000000..54a39208
--- /dev/null
+++ b/docs/hirundo.llm_bias_type.rst
@@ -0,0 +1,10 @@
+.. meta::
+   :http-equiv=Content-Security-Policy: default-src 'self', frame-ancestors 'none'
+
+hirundo.llm_bias_type module
+=============================
+
+.. automodule:: hirundo.llm_bias_type
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/hirundo.rst b/docs/hirundo.rst
index abb3ad42..214fa564 100644
--- a/docs/hirundo.rst
+++ b/docs/hirundo.rst
@@ -17,6 +17,9 @@ Submodules
    hirundo.git
    hirundo.labeling
    hirundo.logger
+   hirundo.llm_behavior_eval
+   hirundo.llm_behavior_eval_results
+   hirundo.llm_bias_type
    hirundo.storage
    hirundo.unlearning_llm
    hirundo.unzip
diff --git a/docs/index.rst b/docs/index.rst
index 887f7bd7..0841e123 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -13,6 +13,7 @@ Welcome to the ``hirundo`` client library documentation. This SDK connects to th
 Hirundo platform and provides APIs for:
 
 - LLM behavior unlearning runs (reducing bias, prompt injections and other unwanted behaviors).
+- LLM behavior eval runs (measuring bias, hallucination, prompt injection, and more).
 - Dataset QA for machine learning datasets.
 
 Getting started
@@ -45,6 +46,17 @@ Example:
 .. literalinclude:: llm_unlearning_example.py
    :language: python
 
+LLM behavior eval
+-----------------
+
+Run standardized evaluations over an LLM or an unlearning run to quantify
+behavior changes (bias, hallucination, prompt injections, and more).
+
+Example:
+
+.. literalinclude:: llm_behavior_eval_example.py
+   :language: python
+
 Dataset QA
 ----------
 
@@ -63,9 +75,14 @@ Supported storage backends include:
 - Google Cloud Storage (GCS)
 - Git repositories with LFS (GitHub, Hugging Face)
 
-Example:
+Classification example:
+
+.. literalinclude:: dataset_qa_classification_example.py
+   :language: python
+
+Object detection example:
 
-.. literalinclude:: dataset_qa_example.py
+.. literalinclude:: dataset_qa_object_detection_example.py
    :language: python
 
 API reference
diff --git a/docs/llm_behavior_eval_example.py b/docs/llm_behavior_eval_example.py
new file mode 100644
index 00000000..7db7ee98
--- /dev/null
+++ b/docs/llm_behavior_eval_example.py
@@ -0,0 +1,36 @@
+"""Examples for docs/index.rst literalinclude blocks."""
+
+from hirundo import (
+    BBQBiasType,
+    EvalRunInfo,
+    HuggingFaceTransformersModel,
+    LlmBehaviorEval,
+    LlmModel,
+    ModelOrRun,
+    PresetType,
+)
+
+# Describe the model to evaluate: a Hugging Face Transformers model source.
+llm = LlmModel(
+    model_name="Nemotron-Flash-1B",
+    model_source=HuggingFaceTransformersModel(
+        model_name="nvidia/Nemotron-Flash-1B",
+    ),
+)
+# create() returns the model ID that the eval run references through model_id.
+llm_id = llm.create()
+
+# Launch a model-based eval (ModelOrRun.MODEL) with the BBQ bias preset.
+run_id = LlmBehaviorEval.launch_eval_run(
+    ModelOrRun.MODEL,
+    EvalRunInfo(
+        name="Nemotron BBQ bias eval",
+        model_id=llm_id,
+        preset_type=PresetType.BBQ_BIAS,
+        bias_type=BBQBiasType.ALL,
+    ),
+)
+
+# Retrieve the results of the launched run and print their brief summary.
+results = LlmBehaviorEval.check_run_by_id(run_id)
+print(results.summary_brief)
diff --git a/docs/llm_unlearning_example.py b/docs/llm_unlearning_example.py
index 2d9c545c..813a4f72 100644
--- a/docs/llm_unlearning_example.py
+++ b/docs/llm_unlearning_example.py
@@ -1,8 +1,8 @@
 """Examples for docs/index.rst literalinclude blocks."""
 
 from hirundo import (
+    BBQBiasType,
     BiasRunInfo,
-    BiasType,
     HuggingFaceTransformersModel,
     LlmModel,
     LlmUnlearningRun,
@@ -17,7 +17,7 @@
 llm_id = llm.create()
 run_id = LlmUnlearningRun.launch(
     llm_id,
-    BiasRunInfo(bias_type=BiasType.ALL),
+    BiasRunInfo(bias_type=BBQBiasType.ALL),
 )
 result = LlmUnlearningRun.check_run(run_id)
 new_adapter = llm.get_hf_pipeline_for_run(run_id)
diff --git a/hirundo/__init__.py b/hirundo/__init__.py
index d18f5ba3..c47c2fbc 100644
--- a/hirundo/__init__.py
+++ b/hirundo/__init__.py
@@ -1,3 +1,13 @@
+from ._hirundo_error import HirundoError
+from ._llm_sources import (
+    HuggingFaceTransformersModel,
+    HuggingFaceTransformersModelOutput,
+    LlmSources,
+    LlmSourcesOutput,
+    LocalTransformersModel,
+    ModelSourceType,
+)
+from ._run_status import RunStatus
 from .dataset_enum import (
     DatasetMetadataType,
     LabelingType,
@@ -5,7 +15,7 @@
 )
 from .dataset_qa import (
     ClassificationRunArgs,
-    HirundoError,
+    HirundoDatasetQaError,
     ModalityType,
     ObjectDetectionRunArgs,
     QADataset,
@@ -23,6 +33,17 @@
     KeylabsObjSegImages,
     KeylabsObjSegVideo,
 )
+from .llm_behavior_eval import (
+    EvalRunInfo,
+    EvalRunRecord,
+    HirundoLlmBehaviorEvalError,
+    JudgeModel,
+    LlmBehaviorEval,
+    ModelOrRun,
+    PresetType,
+)
+from .llm_behavior_eval_results import LlmBehaviorEvalResults
+from .llm_bias_type import BBQBiasType, UnqoverBiasType
 from .storage import (
     StorageConfig,
     StorageGCP,
@@ -32,27 +53,34 @@
 )
 from .unlearning_llm import (
     BiasRunInfo,
-    BiasType,
-    HuggingFaceTransformersModel,
     LlmModel,
-    LlmSources,
     LlmUnlearningRun,
-    LocalTransformersModel,
 )
 from .unzip import load_df, load_from_zip
 
 __all__ = [
     "COCO",
     "YOLO",
-    "HirundoError",
     "HirundoCSV",
+    "HirundoError",
+    "HirundoDatasetQaError",
+    "HirundoLlmBehaviorEvalError",
     "KeylabsAuth",
     "KeylabsObjDetImages",
     "KeylabsObjDetVideo",
     "KeylabsObjSegImages",
     "KeylabsObjSegVideo",
+    "BBQBiasType",
+    "UnqoverBiasType",
     "QADataset",
+    "EvalRunInfo",
+    "EvalRunRecord",
+    "JudgeModel",
+    "LlmBehaviorEval",
+    "LlmBehaviorEvalResults",
     "ModalityType",
+    "ModelOrRun",
+    "PresetType",
     "RunArgs",
     "ClassificationRunArgs",
     "ObjectDetectionRunArgs",
@@ -69,14 +97,17 @@
     "StorageConfig",
     "DatasetQAResults",
     "BiasRunInfo",
-    "BiasType",
     "HuggingFaceTransformersModel",
+    "HuggingFaceTransformersModelOutput",
     "LlmModel",
     "LlmSources",
+    "LlmSourcesOutput",
     "LlmUnlearningRun",
     "LocalTransformersModel",
     "load_df",
     "load_from_zip",
+    "ModelSourceType",
+    "RunStatus",
 ]
 
 __version__ = "0.2.3.post1"
diff --git a/hirundo/_hirundo_error.py b/hirundo/_hirundo_error.py
new file mode 100644
index 00000000..9b67978a
--- /dev/null
+++ b/hirundo/_hirundo_error.py
@@ -0,0 +1,6 @@
+class HirundoError(Exception):
+    """
+    Base exception class for Hirundo SDK errors; feature-specific errors subclass it.
+    """
+
+    pass
diff --git a/hirundo/_llm_pipeline.py b/hirundo/_llm_pipeline.py
index 5224e732..a78c6b39 100644
--- a/hirundo/_llm_pipeline.py
+++ b/hirundo/_llm_pipeline.py
@@ -48,9 +48,11 @@ def get_hf_pipeline_for_run_given_model(
     from transformers.models.auto.tokenization_auto import AutoTokenizer
     from transformers.pipelines import pipeline
 
-    from hirundo.unlearning_llm import (
+    from hirundo._llm_sources import (
         HuggingFaceTransformersModel,
         HuggingFaceTransformersModelOutput,
+    )
+    from hirundo.unlearning_llm import (
         LlmUnlearningRun,
     )
 
diff --git a/hirundo/_llm_sources.py b/hirundo/_llm_sources.py
new file mode 100644
index 00000000..03e1a112
--- /dev/null
+++ b/hirundo/_llm_sources.py
@@ -0,0 +1,56 @@
+from enum import Enum
+from typing import Literal
+
+from pydantic import BaseModel, ConfigDict
+
+
+class ModelSourceType(str, Enum):
+    """Values used for the ``type`` field of the LLM source models below."""
+
+    HUGGINGFACE_TRANSFORMERS = "huggingface_transformers"
+    LOCAL_TRANSFORMERS = "local_transformers"
+
+
+class HuggingFaceTransformersModel(BaseModel):
+    """LLM source identified by a Hugging Face ``model_name``."""
+
+    # Narrows pydantic's protected namespaces; presumably so that the ``model_name``
+    # field does not clash with the ``model_`` prefix -- TODO confirm.
+    model_config = ConfigDict(protected_namespaces=("model_validate", "model_dump"))
+
+    type: Literal[ModelSourceType.HUGGINGFACE_TRANSFORMERS] = (
+        ModelSourceType.HUGGINGFACE_TRANSFORMERS
+    )
+    revision: str | None = None
+    code_revision: str | None = None
+    model_name: str
+    token: str | None = None
+
+
+class HuggingFaceTransformersModelOutput(BaseModel):
+    """Reduced Hugging Face source carrying only ``type`` and ``model_name``."""
+
+    model_config = ConfigDict(protected_namespaces=("model_validate", "model_dump"))
+
+    type: Literal[ModelSourceType.HUGGINGFACE_TRANSFORMERS] = (
+        ModelSourceType.HUGGINGFACE_TRANSFORMERS
+    )
+    model_name: str
+
+
+class LocalTransformersModel(BaseModel):
+    """LLM source located at ``local_path``."""
+
+    type: Literal[ModelSourceType.LOCAL_TRANSFORMERS] = (
+        ModelSourceType.LOCAL_TRANSFORMERS
+    )
+    # Typed as ``None``: these fields accept no value other than ``None``.
+    revision: None = None
+    code_revision: None = None
+    local_path: str
+
+
+# Source variants that carry the full input fields.
+LlmSources = HuggingFaceTransformersModel | LocalTransformersModel
+# Source variants using the reduced Hugging Face output model.
+LlmSourcesOutput = HuggingFaceTransformersModelOutput | LocalTransformersModel
diff --git a/hirundo/_run_checking.py b/hirundo/_run_checking.py
index 4866454b..8d493055 100644
--- a/hirundo/_run_checking.py
+++ b/hirundo/_run_checking.py
@@ -1,11 +1,12 @@
 import json
 from collections.abc import AsyncGenerator, Generator
-from enum import Enum
 
 import httpx
 from tqdm import tqdm
 
 from hirundo._iter_sse_retrying import aiter_sse_retrying, iter_sse_retrying
+from hirundo._run_status import RunStatus
+from hirundo._sse_event_data import SseRunEventData
 from hirundo.logger import get_logger
 
 _logger = get_logger(__name__)
@@ -13,17 +14,6 @@
 DEFAULT_MAX_RETRIES = 200
 
 
-class RunStatus(Enum):
-    PENDING = "PENDING"
-    STARTED = "STARTED"
-    SUCCESS = "SUCCESS"
-    FAILURE = "FAILURE"
-    AWAITING_MANUAL_APPROVAL = "AWAITING MANUAL APPROVAL"
-    REVOKED = "REVOKED"
-    REJECTED = "REJECTED"
-    RETRY = "RETRY"
-
-
 STATUS_TO_PROGRESS_MAP = {
     RunStatus.STARTED.value: 0.0,
     RunStatus.PENDING.value: 0.0,
@@ -62,7 +52,9 @@ def build_status_text_map(
     }
 
 
-def get_state(payload: dict, status_keys: tuple[str, ...]) -> str | None:
+def get_state(
+    payload: dict | SseRunEventData, status_keys: tuple[str, ...]
+) -> str | None:
     """
     Return the first non-null state value from a payload using a list of keys.
 
@@ -74,7 +66,11 @@ def get_state(payload: dict, status_keys: tuple[str, ...]) -> str | None:
         The first non-null state value, or None if none are present.
     """
     for key in status_keys:
-        value = payload.get(key)
+        value = (
+            payload.get(key)
+            if isinstance(payload, dict)
+            else getattr(payload, key, None)
+        )
         if value is not None:
             return value
     return None
@@ -222,7 +218,7 @@ async def aiter_run_events(
 
 
 def update_progress_from_result(
-    iteration: dict,
+    iteration: dict | SseRunEventData,
     progress: tqdm,
     *,
     uploading_text: str,
@@ -240,13 +236,15 @@ def update_progress_from_result(
     Returns:
         True if a progress update occurred, False otherwise.
     """
-    if (
-        iteration.get("result")
-        and isinstance(iteration["result"], dict)
-        and iteration["result"].get("result")
-        and isinstance(iteration["result"]["result"], str)
-    ):
-        result_info = iteration["result"]["result"].split(":")
+    result_outer = (
+        iteration.get("result") if isinstance(iteration, dict) else iteration.result
+    )
+    result_inner = (
+        result_outer.get("result") if isinstance(result_outer, dict) else result_outer
+    )
+
+    if isinstance(result_inner, str):
+        result_info = result_inner.split(":")
         if len(result_info) > 1:
             stage = result_info[0]
             current_progress_percentage = float(
@@ -264,11 +262,13 @@ def update_progress_from_result(
         log.debug("Setting progress to %s", progress.n)
         progress.refresh()
         return True
+    if result_inner is not None:
+        log.debug("Skipping non-string progress result payload: %s", result_inner)
     return False
 
 
 def handle_run_failure(
-    iteration: dict, *, error_cls: type[Exception], run_label: str
+    iteration: dict | SseRunEventData, *, error_cls: type[Exception], run_label: str
 ) -> None:
     """
     Raise a run-specific failure exception based on the iteration payload.
@@ -278,6 +278,10 @@ def handle_run_failure(
         error_cls: Exception type to raise.
         run_label: Human-readable label for the run type.
     """
-    if iteration.get("result"):
-        raise error_cls(f"{run_label} run failed with error: {iteration['result']}")
+    if (
+        result := iteration.get("result")
+        if isinstance(iteration, dict)
+        else iteration.result
+    ):
+        raise error_cls(f"{run_label} run failed with error: {result}")
     raise error_cls(f"{run_label} run failed with an unknown error")
diff --git a/hirundo/_run_status.py b/hirundo/_run_status.py
new file mode 100644
index 00000000..1cc78f53
--- /dev/null
+++ b/hirundo/_run_status.py
@@ -0,0 +1,19 @@
+from enum import Enum
+
+
+class RunStatus(str, Enum):
+    """
+    States a run can be reported in.
+
+    Members subclass ``str``, so each one compares equal to its raw string value.
+    The value of ``AWAITING_MANUAL_APPROVAL`` contains spaces, not underscores.
+    """
+
+    PENDING = "PENDING"
+    STARTED = "STARTED"
+    SUCCESS = "SUCCESS"
+    FAILURE = "FAILURE"
+    AWAITING_MANUAL_APPROVAL = "AWAITING MANUAL APPROVAL"
+    REVOKED = "REVOKED"
+    REJECTED = "REJECTED"
+    RETRY = "RETRY"
diff --git a/hirundo/_sse_event_data.py b/hirundo/_sse_event_data.py
new file mode 100644
index 00000000..429c942b
--- /dev/null
+++ b/hirundo/_sse_event_data.py
@@ -0,0 +1,46 @@
+from pydantic import BaseModel, ValidationError
+
+from hirundo._hirundo_error import HirundoError
+from hirundo._run_status import RunStatus
+from hirundo.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+class SseRunEventData(BaseModel):
+    """
+    Typed form of the ``data`` object of a run's SSE event.
+
+    ``state`` and ``result`` default to ``None`` so that an event omitting either
+    key still validates instead of being rejected as an invalid payload.
+    """
+
+    id: str
+    state: RunStatus | None = None
+    result: str | dict | None = None
+
+
+class SseRunEventDataPayload(BaseModel):
+    """Envelope of an SSE message; the event data sits under the ``data`` key."""
+
+    data: SseRunEventData
+
+
+def _parse_sse_payload(payload: str) -> SseRunEventData:
+    """
+    Parse a raw SSE JSON payload into its ``SseRunEventData``.
+
+    Args:
+        payload: JSON text expected to hold the event under a ``data`` key.
+
+    Returns:
+        The validated event data.
+
+    Raises:
+        HirundoError: If pydantic validation of the payload fails.
+    """
+    try:
+        return SseRunEventDataPayload.model_validate_json(payload).data
+    except ValidationError as e:
+        logger.error("Invalid SSE payload: %s", payload, exc_info=True)
+        raise HirundoError(f"Invalid SSE payload: {payload}") from e
diff --git a/hirundo/dataset_qa.py b/hirundo/dataset_qa.py
index 44f0bca9..8428b69e 100644
--- a/hirundo/dataset_qa.py
+++ b/hirundo/dataset_qa.py
@@ -11,10 +11,10 @@
 from hirundo._constraints import validate_labeling_info, validate_url
 from hirundo._env import API_HOST
 from hirundo._headers import get_headers
+from hirundo._hirundo_error import HirundoError
 from hirundo._http import raise_for_status_with_reason, requests
 from hirundo._run_checking import (
     STATUS_TO_PROGRESS_MAP,
-    RunStatus,
     aiter_run_events,
     build_status_text_map,
     get_state,
@@ -22,6 +22,7 @@
     iter_run_events,
     update_progress_from_result,
 )
+from hirundo._run_status import RunStatus
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
 from hirundo._urls import HirundoUrl
 from hirundo.dataset_enum import DatasetMetadataType, LabelingType
@@ -34,11 +35,7 @@
 logger = get_logger(__name__)
 
 
-class HirundoError(Exception):
-    """
-    Custom exception used to indicate errors in `hirundo` dataset QA runs
-    """
-
+class HirundoDatasetQaError(HirundoError):
     pass
 
 
diff --git a/hirundo/llm_behavior_eval.py b/hirundo/llm_behavior_eval.py
new file mode 100644
index 00000000..561204f2
--- /dev/null
+++ b/hirundo/llm_behavior_eval.py
@@ -0,0 +1,545 @@
+import datetime
+import typing
+from collections.abc import AsyncGenerator, Generator
+from enum import Enum
+from typing import overload
+
+import httpx
+from pydantic import BaseModel, ConfigDict
+from tqdm import tqdm
+from tqdm.contrib.logging import logging_redirect_tqdm
+
+from hirundo._env import API_HOST
+from hirundo._headers import get_headers
+from hirundo._hirundo_error import HirundoError
+from hirundo._http import raise_for_status_with_reason, requests
+from hirundo._iter_sse_retrying import aiter_sse_retrying, iter_sse_retrying
+from hirundo._llm_sources import HuggingFaceTransformersModelOutput, LlmSourcesOutput
+from hirundo._run_checking import (
+    DEFAULT_MAX_RETRIES,
+    STATUS_TO_PROGRESS_MAP,
+    build_status_text_map,
+    get_state,
+    handle_run_failure,
+    update_progress_from_result,
+)
+from hirundo._run_status import RunStatus
+from hirundo._sse_event_data import SseRunEventData, _parse_sse_payload
+from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
+from hirundo.llm_behavior_eval_results import LlmBehaviorEvalResults
+from hirundo.llm_bias_type import BBQBiasType, UnqoverBiasType
+from hirundo.logger import get_logger
+from hirundo.unzip import download_and_extract_llm_behavior_eval_zip
+
+logger = get_logger(__name__)
+
+
+STATUS_TO_TEXT_MAP = build_status_text_map("LLM behavior eval")
+
+
+class HirundoLlmBehaviorEvalError(HirundoError):
+    """Error raised by the LLM behavior eval API of the Hirundo SDK."""
+
+
+class ModelOrRun(str, Enum):
+    MODEL = "model"
+    RUN = "run"
+
+
+class PresetType(str, Enum):
+    BBQ_BIAS = "BBQ_BIAS"
+    BBQ_UNBIAS = "BBQ_UNBIAS"
+    UNQOVER_BIAS = "UNQOVER_BIAS"
+    HALU_EVAL = "HALU_EVAL"
+    MED_HALLU = "MED_HALLU"
+    INJECTION_EVAL = "INJECTION_EVAL"
+
+
+class JudgeModel(BaseModel):
+    path_or_repo_id: str
+    token: str | None = None
+    batch_size: int | None = None
+    output_tokens: int | None = None
+    use_4bit: bool | None = None
+
+
+class EvalRunInfo(BaseModel):
+    organization_id: int | None = None
+    name: str | None = None
+    model_id: int | None = None
+    source_run_id: str | None = None
+    file_path: str | None = None
+    preset_type: PresetType | None = None
+    bias_type: BBQBiasType | UnqoverBiasType | None = None
+    judge_model: JudgeModel | None = None
+
+
+class OutputLlm(BaseModel):
+    model_config = ConfigDict(extra="allow")  # keep fields not declared below
+
+    id: int
+    organization_id: int
+    creator_id: int
+    creator_name: str
+    created_at: datetime.datetime
+    updated_at: datetime.datetime
+    model_name: str
+    model_source: LlmSourcesOutput
+
+
+class OutputUnlearningLlmRun(BaseModel):
+    model_config = ConfigDict(extra="allow")  # keep fields not declared below
+
+    id: int
+    name: str
+    run_id: str
+    model: OutputLlm | None = None
+    status: str
+    created_at: datetime.datetime
+
+
+class LlmEvalMetricRow(BaseModel):
+    model_config = ConfigDict(extra="allow")
+
+    benchmark: str
+    metric: str
+    original: float | str | None = None
+    post_unlearning: float | str | None = None
+    reduction_percent: float | None = None
+    subset: str | None = None
+
+
+class LlmEvalMetrics(BaseModel):
+    model_config = ConfigDict(extra="allow")
+
+    rows: list[LlmEvalMetricRow]
+
+
+class EvalRunRecord(BaseModel):
+    id: int
+    name: str
+    model_id: int | None
+    model: OutputLlm | None
+    source_run_id: str | None
+    source_run: OutputUnlearningLlmRun | None
+    file_path: str | None
+    preset_type: PresetType | None
+    bias_type: BBQBiasType | UnqoverBiasType | None
+    judge_model: JudgeModel | None
+    run_id: str
+    mlflow_run_id: str | None
+    status: str
+    created_at: datetime.datetime
+    pre_process_progress: float
+    optimization_progress: float
+    post_process_progress: float
+    metrics: LlmEvalMetrics | None = None
+    responses_zip_url: str | None = None
+
+    deleted_at: datetime.datetime | None = None
+
+
+class LlmBehaviorEval:
+    def __init__(self, run_id: str | None = None):
+        self.run_id = run_id
+
+    @staticmethod
+    def _parse_eval_run_record(response_payload: dict) -> EvalRunRecord:
+        """
+        Build an ``EvalRunRecord`` from a raw response dictionary.
+
+        Nested ``model``, ``source_run`` and ``judge_model`` values are validated
+        only when they are dictionaries and are otherwise set to ``None``.
+        ``metrics`` may be a dictionary or a list of row dictionaries (non-dict
+        rows are skipped); any other value yields ``None``.
+        """
+
+        def validate_if_dict(model_class, nested_payload):
+            if not isinstance(nested_payload, dict):
+                return None
+            return model_class.model_validate(nested_payload)
+
+        model = validate_if_dict(OutputLlm, response_payload.get("model"))
+        source_run = validate_if_dict(
+            OutputUnlearningLlmRun, response_payload.get("source_run")
+        )
+        judge_model = validate_if_dict(JudgeModel, response_payload.get("judge_model"))
+
+        raw_metrics = response_payload.get("metrics")
+        # A bare list is treated as the rows of an ``LlmEvalMetrics`` object.
+        if isinstance(raw_metrics, list):
+            metric_rows = [
+                LlmEvalMetricRow.model_validate(row_payload)
+                for row_payload in raw_metrics
+                if isinstance(row_payload, dict)
+            ]
+            metrics = LlmEvalMetrics(rows=metric_rows)
+        else:
+            metrics = validate_if_dict(LlmEvalMetrics, raw_metrics)
+
+        return EvalRunRecord(
+            id=response_payload["id"],
+            name=response_payload["name"],
+            model_id=response_payload.get("model_id"),
+            model=model,
+            source_run_id=response_payload.get("source_run_id"),
+            source_run=source_run,
+            file_path=response_payload.get("file_path"),
+            preset_type=response_payload.get("preset_type"),
+            bias_type=response_payload.get("bias_type"),
+            judge_model=judge_model,
+            run_id=response_payload["run_id"],
+            mlflow_run_id=response_payload.get("mlflow_run_id"),
+            status=response_payload["status"],
+            created_at=response_payload["created_at"],
+            pre_process_progress=response_payload.get("pre_process_progress", 0.0),
+            optimization_progress=response_payload.get("optimization_progress", 0.0),
+            post_process_progress=response_payload.get("post_process_progress", 0.0),
+            metrics=metrics,
+            responses_zip_url=response_payload.get("responses_zip_url"),
+        )
+
+    @staticmethod
+    def launch_eval_run(
+        model_or_run: ModelOrRun | str,
+        run_info: EvalRunInfo,
+    ) -> str:
+        """
+        Start an LLM behavior evaluation run through the API.
+
+        Args:
+            model_or_run: Whether the evaluation targets a model or a run.
+            run_info: Parameters describing the evaluation run.
+
+        Returns:
+            The run identifier found in the response payload.
+
+        Raises:
+            HirundoLlmBehaviorEvalError: If the response carries no run identifier.
+        """
+        target_kind = (
+            ModelOrRun(model_or_run) if isinstance(model_or_run, str) else model_or_run
+        )
+
+        launch_response = requests.post(
+            f"{API_HOST}/llm-behavior-eval/run/{target_kind.value}",
+            json=run_info.model_dump(mode="json"),
+            headers=get_headers(),
+            timeout=MODIFY_TIMEOUT,
+        )
+        raise_for_status_with_reason(launch_response)
+        launch_payload = launch_response.json()
+
+        # The first truthy value among these keys is used as the run identifier.
+        for identifier_key in ("run_id", "hir_run_id", "id"):
+            if found_identifier := launch_payload.get(identifier_key):
+                return found_identifier
+        raise HirundoLlmBehaviorEvalError(
+            "Unable to determine the run ID from the response payload."
+        )
+
+    @staticmethod
+    def cancel_by_id(run_id: str) -> None:
+        """
+        Cancel the evaluation run `run_id` via a PATCH to the cancel endpoint.
+        """
+        response = requests.patch(
+            f"{API_HOST}/llm-behavior-eval/run/cancel/{run_id}",
+            headers=get_headers(),
+            timeout=MODIFY_TIMEOUT,
+        )
+        raise_for_status_with_reason(response)
+
+    def cancel(self) -> None:  # cancels the run stored in `self.run_id`
+        if not self.run_id:  # nothing to cancel without a run ID
+            raise HirundoLlmBehaviorEvalError("No run has been started")
+        self.cancel_by_id(self.run_id)
+
+    @staticmethod
+    def rename_by_id(run_id: str, new_name: str) -> None:
+        """
+        Rename the evaluation run `run_id`; `new_name` is sent in the JSON body of a PATCH.
+        """
+        response = requests.patch(
+            f"{API_HOST}/llm-behavior-eval/run/rename/{run_id}",
+            json={"new_name": new_name},
+            headers=get_headers(),
+            timeout=MODIFY_TIMEOUT,
+        )
+        raise_for_status_with_reason(response)
+
+    def rename(self, new_name: str) -> None:  # renames the run in `self.run_id`
+        if not self.run_id:  # nothing to rename without a run ID
+            raise HirundoLlmBehaviorEvalError("No run has been started")
+        self.rename_by_id(self.run_id, new_name)
+
+    @staticmethod
+    def archive_by_id(run_id: str) -> None:
+        """
+        Archive the evaluation run `run_id` via a PATCH to the archive endpoint.
+        """
+        response = requests.patch(
+            f"{API_HOST}/llm-behavior-eval/run/archive/{run_id}",
+            headers=get_headers(),
+            timeout=MODIFY_TIMEOUT,
+        )
+        raise_for_status_with_reason(response)
+
+    def archive(self) -> None:  # archives the run stored in `self.run_id`
+        if not self.run_id:  # nothing to archive without a run ID
+            raise HirundoLlmBehaviorEvalError("No run has been started")
+        self.archive_by_id(self.run_id)
+
+    @staticmethod
+    def restore_by_id(run_id: str) -> None:
+        """
+        Restore the archived evaluation run `run_id` via a PATCH to the restore endpoint.
+        """
+        response = requests.patch(
+            f"{API_HOST}/llm-behavior-eval/run/restore/{run_id}",
+            headers=get_headers(),
+            timeout=MODIFY_TIMEOUT,
+        )
+        raise_for_status_with_reason(response)
+
+    def restore(self) -> None:  # restores the run stored in `self.run_id`
+        if not self.run_id:  # nothing to restore without a run ID
+            raise HirundoLlmBehaviorEvalError("No run has been started")
+        self.restore_by_id(self.run_id)
+
+    @staticmethod
+    def get_run_info_by_id(run_id: str) -> EvalRunRecord:
+        """
+        Fetch the run-info endpoint for `run_id` and parse the JSON into an `EvalRunRecord`.
+        """
+        response = requests.get(
+            f"{API_HOST}/llm-behavior-eval/run/info/{run_id}",
+            headers=get_headers(),
+            timeout=READ_TIMEOUT,
+        )
+        raise_for_status_with_reason(response)
+        response_payload = response.json()
+        return LlmBehaviorEval._parse_eval_run_record(response_payload)
+
+    @staticmethod
+    def list_runs(
+        organization_id: int | None = None,
+        archived: bool = False,
+    ) -> list[EvalRunRecord]:
+        """
+        List evaluation runs as `EvalRunRecord`s; both arguments are sent as query parameters.
+        """
+        response = requests.get(
+            f"{API_HOST}/llm-behavior-eval/run/list",
+            params={
+                "eval_organization_id": organization_id,
+                "archived": archived,
+            },
+            headers=get_headers(),
+            timeout=READ_TIMEOUT,
+        )
+        raise_for_status_with_reason(response)
+        response_payload = response.json()  # iterated below, one entry per run
+        return [
+            LlmBehaviorEval._parse_eval_run_record(run_record)
+            for run_record in response_payload
+        ]
+
+    @staticmethod
+    def _resolve_model_name(run_info: EvalRunRecord) -> str | None:
+        if run_info.model and isinstance(
+            run_info.model.model_source, HuggingFaceTransformersModelOutput
+        ):
+            return run_info.model.model_source.model_name  # the run's own model
+        if (
+            run_info.source_run
+            and run_info.source_run.model
+            and isinstance(
+                run_info.source_run.model.model_source,
+                HuggingFaceTransformersModelOutput,
+            )
+        ):
+            return run_info.source_run.model.model_source.model_name  # via source run
+        return None  # neither path has a Hugging Face model source
+
+    @staticmethod
+    def _check_run_by_id(
+        run_id: str, *, max_retries: int = DEFAULT_MAX_RETRIES
+    ) -> Generator[SseRunEventData, None, None]:
+        retry_count = 0  # reconnects made after a stream ended empty or pending
+        while True:
+            if retry_count > max_retries:
+                raise HirundoLlmBehaviorEvalError("Max retries reached")
+            last_payload = None  # last non-ping event seen on this connection
+            with httpx.Client(timeout=httpx.Timeout(None, connect=5.0)) as client:
+                for sse_event in iter_sse_retrying(
+                    client,
+                    "GET",
+                    f"{API_HOST}/llm-behavior-eval/run/{run_id}",
+                    headers=get_headers(),
+                ):
+                    if sse_event.event == "ping":  # pings carry no run data
+                        continue
+                    payload = _parse_sse_payload(sse_event.data)
+                    last_payload = payload
+                    yield payload
+            last_state = get_state(last_payload, ("state",)) if last_payload else None
+            if last_payload is None or last_state == RunStatus.PENDING.value:
+                retry_count += 1  # stream closed with no event or still pending
+                continue
+            return  # stream ended after a non-pending event; stop iterating
+
+    @staticmethod
+    @overload
+    def check_run_by_id(
+        run_id: str, stop_on_manual_approval: typing.Literal[True]
+    ) -> LlmBehaviorEvalResults | None: ...
+
+    @staticmethod
+    @overload
+    def check_run_by_id(
+        run_id: str, stop_on_manual_approval: typing.Literal[False] = False
+    ) -> LlmBehaviorEvalResults: ...
+
+    @staticmethod
+    @overload
+    def check_run_by_id(
+        run_id: str, stop_on_manual_approval: bool
+    ) -> LlmBehaviorEvalResults | None: ...
+
+    @staticmethod
+    def check_run_by_id(
+        run_id: str, stop_on_manual_approval: bool = False
+    ) -> LlmBehaviorEvalResults | None:
+        """
+        Check the status of a run given its ID
+
+        Args:
+            run_id: The `run_id` produced by a `launch_eval_run` call
+            stop_on_manual_approval: If True, the function will return `None` if the run is awaiting manual approval
+
+        Returns:
+            An LlmBehaviorEvalResults object with the results of the evaluation run, or `None` when stopped on manual approval
+
+        Raises:
+            HirundoLlmBehaviorEvalError: If retries are exhausted, the run fails, or the stream ends without a final state
+        """
+        logger.debug("Checking run with ID: %s", run_id)
+        with logging_redirect_tqdm(), tqdm(total=100.0) as progress_bar:
+            # The bar is now closed on every exit path, including exceptions.
+            for iteration in LlmBehaviorEval._check_run_by_id(run_id):
+                state = get_state(iteration, ("state",))
+                if state in STATUS_TO_PROGRESS_MAP:
+                    progress_bar.set_description(STATUS_TO_TEXT_MAP[state])
+                    progress_bar.n = STATUS_TO_PROGRESS_MAP[state]
+                    logger.debug("Setting progress to %s", progress_bar.n)
+                    progress_bar.refresh()
+                    if state in [
+                        RunStatus.FAILURE.value,
+                        RunStatus.REJECTED.value,
+                        RunStatus.REVOKED.value,
+                    ]:
+                        logger.error(
+                            "State is failure, rejected, or revoked: %s",
+                            state,
+                        )
+                        progress_bar.close()  # early close; repeated close is harmless
+                        handle_run_failure(
+                            iteration,
+                            error_cls=HirundoLlmBehaviorEvalError,
+                            run_label="LLM behavior eval",
+                        )
+                    elif state == RunStatus.SUCCESS.value:
+                        progress_bar.close()  # finish the bar before downloading
+                        zip_temporary_url = iteration.result
+                        if not zip_temporary_url or not isinstance(
+                            zip_temporary_url, str
+                        ):
+                            raise HirundoLlmBehaviorEvalError(
+                                "LLM behavior eval run completed without a results URL."
+                            )
+                        run_info = LlmBehaviorEval.get_run_info_by_id(run_id)
+                        model_name = LlmBehaviorEval._resolve_model_name(run_info)
+                        return download_and_extract_llm_behavior_eval_zip(
+                            run_id,
+                            zip_temporary_url,
+                            model_name,
+                        )
+                    elif (
+                        state == RunStatus.AWAITING_MANUAL_APPROVAL.value
+                        and stop_on_manual_approval
+                    ):
+                        progress_bar.close()
+                        return None
+                elif state is None:  # no state on the event: treat as a progress update
+                    update_progress_from_result(
+                        iteration,
+                        progress_bar,
+                        uploading_text="LLM behavior eval run completed. Uploading results",
+                        log=logger,
+                    )
+        raise HirundoLlmBehaviorEvalError(
+            "LLM behavior eval run failed with an unknown error in check_run_by_id"
+        )
+
+    @overload
+    def check_run(
+        self, stop_on_manual_approval: typing.Literal[True]
+    ) -> LlmBehaviorEvalResults | None: ...
+
+    @overload
+    def check_run(
+        self, stop_on_manual_approval: typing.Literal[False] = False
+    ) -> LlmBehaviorEvalResults: ...
+
+    def check_run(
+        self, stop_on_manual_approval: bool = False
+    ) -> LlmBehaviorEvalResults | None:
+        """
+        Check the status of the current active instance's run (delegates to `check_run_by_id`).
+
+        Returns:
+            An LlmBehaviorEvalResults object, or `None` if `stop_on_manual_approval` is set and the run awaits approval
+        """
+        if not self.run_id:
+            raise HirundoLlmBehaviorEvalError("No run has been started")
+        return self.check_run_by_id(self.run_id, stop_on_manual_approval)
+
+    @staticmethod
+    async def acheck_run_by_id(run_id: str) -> AsyncGenerator[SseRunEventData, None]:
+        """
+        Async version of :func:`check_run_by_id`
+
+        Check the status of a run given its ID.
+
+        This generator yields every parsed non-ping SSE event; it does not interpret states or download results.
+        """
+        logger.debug("Checking run with ID: %s", run_id)
+        async with httpx.AsyncClient(
+            timeout=httpx.Timeout(None, connect=5.0)
+        ) as client:
+            async_iterator = await aiter_sse_retrying(
+                client,
+                "GET",
+                f"{API_HOST}/llm-behavior-eval/run/{run_id}",
+                headers=get_headers(),
+            )
+            async for sse_event in async_iterator:
+                if sse_event.event == "ping":  # pings carry no run data
+                    continue
+                yield _parse_sse_payload(sse_event.data)
+
+    async def acheck_run(self) -> AsyncGenerator[SseRunEventData, None]:
+        """
+        Async version of :func:`check_run`
+
+        Check the status of the current active instance's run.
+
+        This generator re-yields every event produced by `acheck_run_by_id` for `self.run_id`.
+
+        Note: This function does not handle run errors nor show progress (you are expected to); it only raises if no run has been started.
+        """
+        if not self.run_id:
+            raise HirundoLlmBehaviorEvalError("No run has been started")
+        async for iteration in self.acheck_run_by_id(self.run_id):
+            yield iteration
diff --git a/hirundo/llm_behavior_eval_results.py b/hirundo/llm_behavior_eval_results.py
new file mode 100644
index 00000000..f13e9e0b
--- /dev/null
+++ b/hirundo/llm_behavior_eval_results.py
@@ -0,0 +1,27 @@
+import typing
+from pathlib import Path
+
+from pydantic import BaseModel
+
+T = typing.TypeVar("T")
+
+
+class LlmBehaviorEvalResults(BaseModel, typing.Generic[T]):  # T: DataFrame type
+    model_config = {"arbitrary_types_allowed": True}
+
+    cached_zip_path: Path
+    """
+    The path to the cached zip file of the results
+    """
+    model_name: str | None = None
+    """
+    The model name used to locate results in the zip file (`None` when it was not known)
+    """
+    summary_brief: T
+    """
+    A polars/pandas DataFrame containing the summary_brief CSV (the zip loader passes `None` without a model name)
+    """
+    summary_full: T
+    """
+    A polars/pandas DataFrame containing the summary_full CSV (the zip loader passes `None` without a model name)
+    """
diff --git a/hirundo/llm_bias_type.py b/hirundo/llm_bias_type.py
new file mode 100644
index 00000000..e5fc94dd
--- /dev/null
+++ b/hirundo/llm_bias_type.py
@@ -0,0 +1,19 @@
+from enum import Enum
+
+
+class BBQBiasType(str, Enum):  # str-valued; each value equals its member name
+    ALL = "ALL"
+    RACE = "RACE"
+    NATIONALITY = "NATIONALITY"
+    GENDER = "GENDER"
+    PHYSICAL_APPEARANCE = "PHYSICAL_APPEARANCE"
+    RELIGION = "RELIGION"
+    AGE = "AGE"
+
+
+class UnqoverBiasType(str, Enum):  # BBQBiasType minus PHYSICAL_APPEARANCE and AGE
+    ALL = "ALL"
+    RACE = "RACE"
+    NATIONALITY = "NATIONALITY"
+    GENDER = "GENDER"
+    RELIGION = "RELIGION"
diff --git a/hirundo/unlearning_llm.py b/hirundo/unlearning_llm.py
index b36ea14f..00375dc1 100644
--- a/hirundo/unlearning_llm.py
+++ b/hirundo/unlearning_llm.py
@@ -12,9 +12,9 @@
 from hirundo._headers import get_headers
 from hirundo._http import raise_for_status_with_reason, requests
 from hirundo._llm_pipeline import get_hf_pipeline_for_run_given_model
+from hirundo._llm_sources import LlmSources, LlmSourcesOutput
 from hirundo._run_checking import (
     STATUS_TO_PROGRESS_MAP,
-    RunStatus,
     aiter_run_events,
     build_status_text_map,
     get_state,
@@ -22,8 +22,10 @@
     iter_run_events,
     update_progress_from_result,
 )
+from hirundo._run_status import RunStatus
 from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT
 from hirundo.dataset_qa import HirundoError
+from hirundo.llm_bias_type import BBQBiasType
 from hirundo.logger import get_logger
 
 if TYPE_CHECKING:
@@ -34,45 +36,6 @@
 logger = get_logger(__name__)
 
 
-class ModelSourceType(str, Enum):
-    HUGGINGFACE_TRANSFORMERS = "huggingface_transformers"
-    LOCAL_TRANSFORMERS = "local_transformers"
-
-
-class HuggingFaceTransformersModel(BaseModel):
-    model_config = ConfigDict(protected_namespaces=("model_validate", "model_dump"))
-
-    type: Literal[ModelSourceType.HUGGINGFACE_TRANSFORMERS] = (
-        ModelSourceType.HUGGINGFACE_TRANSFORMERS
-    )
-    revision: str | None = None
-    code_revision: str | None = None
-    model_name: str
-    token: str | None = None
-
-
-class HuggingFaceTransformersModelOutput(BaseModel):
-    model_config = ConfigDict(protected_namespaces=("model_validate", "model_dump"))
-
-    type: Literal[ModelSourceType.HUGGINGFACE_TRANSFORMERS] = (
-        ModelSourceType.HUGGINGFACE_TRANSFORMERS
-    )
-    model_name: str
-
-
-class LocalTransformersModel(BaseModel):
-    type: Literal[ModelSourceType.LOCAL_TRANSFORMERS] = (
-        ModelSourceType.LOCAL_TRANSFORMERS
-    )
-    revision: None = None
-    code_revision: None = None
-    local_path: str
-
-
-LlmSources = HuggingFaceTransformersModel | LocalTransformersModel
-LlmSourcesOutput = HuggingFaceTransformersModelOutput | LocalTransformersModel
-
-
 class LlmModel(BaseModel):
     model_config = ConfigDict(protected_namespaces=("model_validate", "model_dump"))
 
@@ -235,16 +198,6 @@ class UnlearningLlmAdvancedOptions(BaseModel):
     max_tokens_for_model: dict[DatasetType, int] | int | None = None
 
 
-class BiasType(str, Enum):
-    ALL = "ALL"
-    RACE = "RACE"
-    NATIONALITY = "NATIONALITY"
-    GENDER = "GENDER"
-    PHYSICAL_APPEARANCE = "PHYSICAL_APPEARANCE"
-    RELIGION = "RELIGION"
-    AGE = "AGE"
-
-
 class UtilityType(str, Enum):
     DEFAULT = "DEFAULT"
     CUSTOM = "CUSTOM"
@@ -274,7 +227,7 @@ class CustomUtility(BaseModel):
 
 class BiasBehavior(BaseModel):
     type: Literal["BIAS"] = "BIAS"
-    bias_type: BiasType
+    bias_type: BBQBiasType
 
 
 class HallucinationType(str, Enum):
@@ -317,7 +270,7 @@ class LlmRunInfo(BaseModel):
 
 
 class BiasRunInfo(BaseModel):
-    bias_type: BiasType
+    bias_type: BBQBiasType
     organization_id: int | None = None
     name: str | None = None
     target_utilities: list[TargetUtility] | None = None
diff --git a/hirundo/unzip.py b/hirundo/unzip.py
index 113f3be7..3c2f0880 100644
--- a/hirundo/unzip.py
+++ b/hirundo/unzip.py
@@ -23,6 +23,7 @@
     DataFrameType,
     DatasetQAResults,
 )
+from hirundo.llm_behavior_eval_results import LlmBehaviorEvalResults
 from hirundo.logger import get_logger
 
 ZIP_FILE_CHUNK_SIZE = 50 * 1024 * 1024  # 50 MB
@@ -150,11 +151,11 @@ def download_and_extract_zip(
         headers=headers,
         timeout=DOWNLOAD_READ_TIMEOUT,
         stream=True,
-    ) as r:
-        r.raise_for_status()
-        with open(zip_file_path, "wb") as f:
-            for chunk in r.iter_content(chunk_size=ZIP_FILE_CHUNK_SIZE):
-                f.write(chunk)
+    ) as response:
+        response.raise_for_status()
+        with open(zip_file_path, "wb") as output_file:
+            for chunk in response.iter_content(chunk_size=ZIP_FILE_CHUNK_SIZE):
+                output_file.write(chunk)
         logger.info(
             "Successfully downloaded the result zip file for run ID %s to %s",
             run_id,
@@ -224,6 +225,79 @@ def download_and_extract_zip(
             )
 
 
+def download_and_extract_llm_behavior_eval_zip(
+    run_id: str,
+    zip_url: str,
+    model_name: str | None = None,
+) -> LlmBehaviorEvalResults[DataFrameType]:
+    """
+    Download and extract the LLM behavior evaluation results zip file.
+
+    Args:
+        run_id: The ID of the LLM behavior eval run.
+        zip_url: The URL of the zip file to download.
+        model_name (optional): The full model name; its last `/`-separated part is the folder within the zip.
+
+    Returns:
+        The LLM behavior eval results object; both summaries are `None` when `model_name` is not given.
+    """
+    cache_dir = Path.home() / ".hirundo" / "cache"
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    zip_file_path = cache_dir / f"{run_id}.zip"
+
+    headers = None
+    if Url(zip_url).scheme == "file":
+        # Strip only the leading scheme; a later "file://" in the path must survive.
+        local_path = zip_url.removeprefix("file://")
+        zip_url = f"{API_HOST}/llm-behavior-eval/run/local-download{local_path}"
+        headers = _get_auth_headers()
+    with requests.get(
+        zip_url,
+        headers=headers,
+        timeout=DOWNLOAD_READ_TIMEOUT,
+        stream=True,
+    ) as response:
+        response.raise_for_status()
+        with open(zip_file_path, "wb") as output_file:
+            for chunk in response.iter_content(chunk_size=ZIP_FILE_CHUNK_SIZE):
+                output_file.write(chunk)
+        logger.info(
+            "Successfully downloaded the LLM behavior eval result zip file for run ID %s to %s",
+            run_id,
+            zip_file_path,
+        )
+
+        if model_name:
+            model_folder = model_name.split("/")[-1]
+            summary_brief_name = f"responses/{model_folder}/summary_brief.csv"
+            summary_full_name = f"responses/{model_folder}/summary_full.csv"
+
+            with zipfile.ZipFile(zip_file_path, "r") as zip_file:
+                filenames = [file.filename for file in zip_file.filelist]
+                if summary_brief_name not in filenames:
+                    raise ValueError(
+                        f"Missing {summary_brief_name} in LLM behavior eval zip for run {run_id}"
+                    )
+                if summary_full_name not in filenames:
+                    raise ValueError(
+                        f"Missing {summary_full_name} in LLM behavior eval zip for run {run_id}"
+                    )
+                with zip_file.open(summary_brief_name) as summary_brief_file:
+                    summary_brief_df = load_df(summary_brief_file)
+                with zip_file.open(summary_full_name) as summary_full_file:
+                    summary_full_df = load_df(summary_full_file)
+        else:  # without a model name the summary CSVs cannot be located
+            summary_brief_df = None
+            summary_full_df = None
+
+        return LlmBehaviorEvalResults[DataFrameType](
+            cached_zip_path=zip_file_path,
+            model_name=model_name,
+            summary_brief=summary_brief_df,
+            summary_full=summary_full_df,
+        )
+
+
 def load_from_zip(
     zip_path: Path, file_name: str
 ) -> "pd.DataFrame | pl.DataFrame | None":
diff --git a/pyproject.toml b/pyproject.toml
index c70a17f5..23894fe3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -102,6 +102,9 @@ pandas = ["pandas>=2.2.3"]
 polars = ["polars>=1.0.0"]
 transformers = ["transformers>=4.57.3", "peft>=0.18.1", "accelerate>=1.12.0"]
 
+[dependency-groups]
+dev = ["hirundo[dev,docs,pandas,polars,transformers]"]
+
 [tool.bumpver]
 current_version = "0.2.3.post1"
 version_pattern = "MAJOR.MINOR.PATCH[.PYTAGNUM]"
diff --git a/scripts/cleanup_test_artifacts.py b/scripts/cleanup_test_artifacts.py
index 1307075e..6d65b6d3 100644
--- a/scripts/cleanup_test_artifacts.py
+++ b/scripts/cleanup_test_artifacts.py
@@ -2,8 +2,10 @@
 from datetime import timedelta, timezone
 
 import requests
-from hirundo import GitRepo, QADataset, StorageConfig
-from hirundo.dataset_qa import DataQARunOut, HirundoError, QADatasetOut, RunStatus
+from hirundo import GitRepo, LlmBehaviorEval, QADataset, StorageConfig
+from hirundo._run_status import RunStatus
+from hirundo.dataset_qa import DataQARunOut, HirundoError, QADatasetOut
+from hirundo.llm_behavior_eval import EvalRunRecord
 from hirundo.logger import get_logger
 from hirundo.storage import ResponseStorageConfig
 from hirundo.unlearning_llm import (
@@ -69,7 +71,7 @@ def _delete_llm(
 
 def _should_delete_resource(
     resource_name: str,
-    runs: list[DataQARunOut | OutputUnlearningLlmRun],
+    runs: list[DataQARunOut | OutputUnlearningLlmRun | EvalRunRecord],
     expiry_date: datetime.datetime,
 ) -> bool:
     """
@@ -290,12 +292,42 @@ def _handle_llm_cleanup(one_week_ago: datetime.datetime):
         )
 
 
+def _handle_llm_behavior_eval_cleanup(one_week_ago: datetime.datetime) -> None:
+    archived_runs = set[str]()  # run IDs whose archive call succeeded
+    trying_to_archive_runs = set[str]()  # run IDs selected for archiving
+    live_runs = LlmBehaviorEval.list_runs(archived=False)
+    for run in live_runs:
+        if _should_delete_resource(run.name, [run], one_week_ago):
+            trying_to_archive_runs.add(run.run_id)
+            try:
+                LlmBehaviorEval.archive_by_id(run.run_id)
+                archived_runs.add(run.run_id)
+            except (HirundoError, requests.HTTPError) as exc:  # log and keep going
+                logger.warning(
+                    "Failed to archive LLM behavior eval run with ID %s: %s",
+                    run.run_id,
+                    exc,
+                )
+    logger.info(
+        "Archived %s (%s) LLM behavior eval runs",
+        archived_runs,
+        len(archived_runs),
+    )
+    if trying_to_archive_runs != archived_runs:  # at least one archive call failed
+        logger.warning(
+            "Tried to archive %s LLM behavior eval runs, but only archived %s runs",
+            trying_to_archive_runs,
+            archived_runs,
+        )
+
+
 def main() -> None:
     now = datetime.datetime.now(timezone.utc)
     one_week_ago = now - timedelta(days=7)
 
     _handle_datasets_cleanup(one_week_ago)
     _handle_llm_cleanup(one_week_ago)
+    _handle_llm_behavior_eval_cleanup(one_week_ago)
     _cleanup_storage_configs(one_week_ago)
 
 
diff --git a/tests/dataset_qa_shared.py b/tests/dataset_qa_shared.py
index 126789fb..014b3957 100644
--- a/tests/dataset_qa_shared.py
+++ b/tests/dataset_qa_shared.py
@@ -9,7 +9,7 @@
     RunArgs,
     StorageConfig,
 )
-from hirundo.dataset_qa import RunStatus
+from hirundo._run_status import RunStatus
 from hirundo.logger import get_logger
 
 logger = get_logger(__name__)
diff --git a/tests/llm-behavior-eval/llm_behavior_eval_test.py b/tests/llm-behavior-eval/llm_behavior_eval_test.py
new file mode 100644
index 00000000..090d1a82
--- /dev/null
+++ b/tests/llm-behavior-eval/llm_behavior_eval_test.py
@@ -0,0 +1,42 @@
+import logging
+import os
+
+from hirundo import (
+    BBQBiasType,
+    EvalRunInfo,
+    HuggingFaceTransformersModel,
+    LlmBehaviorEval,
+    LlmModel,
+    ModelOrRun,
+    PresetType,
+)
+from tests.testing_utils import get_unique_id
+
+logger = logging.getLogger(__name__)
+
+unique_id = get_unique_id()
+
+
+def test_llm_behavior_eval():
+    llm = LlmModel(
+        model_name=f"TEST-LLM-BEHAVIOR-EVAL-Granite-4-micro-{unique_id}",
+        model_source=HuggingFaceTransformersModel(
+            model_name="ibm-granite/granite-4.0-micro",
+        ),
+    )
+    llm_id = llm.create()
+    run_info = EvalRunInfo(
+        name=f"TEST-LLM-BEHAVIOR-EVAL-RUN-{unique_id}",
+        model_id=llm_id,
+        preset_type=PresetType.BBQ_BIAS,
+        bias_type=BBQBiasType.ALL,
+    )
+    assert llm_id is not None
+    if os.getenv("FULL_TEST", "false") == "true":  # launch a run only when opted in
+        run_id = LlmBehaviorEval.launch_eval_run(ModelOrRun.MODEL, run_info)
+        assert run_id is not None
+        results = LlmBehaviorEval.check_run_by_id(run_id)
+        assert results is not None
+        assert results.cached_zip_path is not None
+        assert results.summary_brief is not None
+        assert results.summary_full is not None
diff --git a/tests/unlearning-llm/unlearn_llm_behavior_test.py b/tests/unlearning-llm/unlearn_llm_behavior_test.py
index 13144936..3506c982 100644
--- a/tests/unlearning-llm/unlearn_llm_behavior_test.py
+++ b/tests/unlearning-llm/unlearn_llm_behavior_test.py
@@ -2,8 +2,8 @@
 import os
 
 from hirundo import (
+    BBQBiasType,
     BiasRunInfo,
-    BiasType,
     HuggingFaceTransformersModel,
     LlmModel,
     LlmUnlearningRun,
@@ -25,7 +25,7 @@ def test_unlearn_llm_behavior():
     )
     llm_id = llm.create()
     run_info = BiasRunInfo(
-        bias_type=BiasType.ALL,
+        bias_type=BBQBiasType.ALL,
     )
     assert llm_id is not None
     if os.getenv("FULL_TEST", "false") == "true":
diff --git a/uv.lock b/uv.lock
index 0139e8f1..b1f2c0af 100644
--- a/uv.lock
+++ b/uv.lock
@@ -619,6 +619,11 @@ transformers = [
     { name = "transformers" },
 ]
 
+[package.dev-dependencies]
+dev = [
+    { name = "hirundo", extra = ["dev", "docs", "pandas", "polars", "transformers"] },
+]
+
 [package.metadata]
 requires-dist = [
     { name = "accelerate", marker = "extra == 'transformers'", specifier = ">=1.12.0" },
@@ -680,6 +685,9 @@ requires-dist = [
 ]
 provides-extras = ["dev", "docs", "pandas", "polars", "transformers"]
 
+[package.metadata.requires-dev]
+dev = [{ name = "hirundo", extras = ["dev", "docs", "pandas", "polars", "transformers"] }]
+
 [[package]]
 name = "httpcore"
 version = "1.0.9"