diff --git a/.envrc b/.envrc index b2d2268b..550b4cae 100644 --- a/.envrc +++ b/.envrc @@ -1,2 +1,2 @@ watch_file uv.lock -uv sync --all-extras && source .venv/bin/activate +uv sync --group dev && source .venv/bin/activate diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 25c82ebb..314b9f2a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -75,4 +75,4 @@ repos: hooks: - id: uv-lock - id: uv-sync - args: ["--extra", "dev", "--extra", "docs", "--extra", "pandas", "--extra", "polars", "--extra", "transformers"] + args: ["--group", "dev"] diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 00000000..e1d12ee7 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,47 @@ +# Repository Guidelines + +## Instructions + +- Always use context7 when I need code generation, setup or configuration steps, or + library/API documentation. This means you should automatically use the Context7 MCP + tools to resolve library id and get library docs without me having to explicitly ask. + +## Project Structure & Module Organization + +- `hirundo/` holds the SDK source (CLI entry point is `hirundo.cli:app`). +- `tests/` contains pytest-based test coverage. +- `docs/` and `source/` contain Sphinx documentation assets. +- `notebooks/` and `on_prem_test_notebook.ipynb` provide example workflows. +- `requirements/` stores compiled dependency sets (for dev, docs, pandas, polars, transformers). + +## Build, Test, and Development Commands + +- `uv sync --group dev`: fast dependency sync of the `dev` dependency group. +- `ruff check` / `ruff format`: lint and auto-format (run before PRs). +- `pytest`: run the test suite. +- `python -m build`: build the package artifacts. +- `pre-commit install`: enable git hooks (optional, but recommended). + +## Coding Style & Naming Conventions + +- Python 3.10+ codebase, 4-space indentation, line length 88 (Ruff defaults). +- Follow Ruff linting rules (`pyproject.toml`), with tests allowing `assert` usage. 
+- Prefer descriptive names; avoid short, cryptic identifiers in new code. +- Avoid 1-3 character variable names in new or refactored code. Use descriptive names + even in small scopes. + +## Testing Guidelines + +- Frameworks: `pytest` and `pytest-asyncio`. +- Place tests in `tests/`; name files `test_*.py`. +- Run locally with `pytest` before opening a PR (CI runs lint + integration tests). + +## Commit & Pull Request Guidelines + +- Recent commit history favors `SDK-NNN: Summary` (e.g., `SDK-78: Migrate to basedpyright`). +- Include issue/PR references when available (e.g., `(#190)`). +- PRs should describe changes clearly and confirm `ruff check` and `ruff format` passed. + +## Security & Configuration Tips + +- Supported Python versions: CPython 3.10–3.13. diff --git a/README.md b/README.md index aec8d5af..ff27cf46 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ The Hirundo Python SDK lets you: - Launch and monitor LLM behavior unlearning runs. +- Run LLM behavior evaluations for bias, hallucination, and prompt injection. - Run dataset QA for ML datasets (classification, object detection, and more). - Fetch QA results as `pandas` or `polars` DataFrames. @@ -22,7 +23,7 @@ pip install hirundo Optional extras: - LLM behavior unlearning (Transformers + PEFT): `pip install hirundo[transformers]` -- Dataset QA results as DataFrames: `pip install hirundo[pandas]` or `pip install hirundo[polars]` +- Dataset QA or LLM behavior eval results as DataFrames: `pip install hirundo[pandas]` or `pip install hirundo[polars]` If you want to install from source, clone this repository and run: @@ -40,120 +41,11 @@ hirundo setup This writes `API_KEY` (and optionally `API_HOST`) to `.env` in the current directory or `~/.hirundo.conf`. -## Quickstart: LLM behavior unlearning - -Make sure you have the `transformers` extra installed (`pip install hirundo[transformers]`). 
- -```python -from hirundo import ( - BiasRunInfo, - BiasType, - HuggingFaceTransformersModel, - LlmModel, - LlmUnlearningRun, -) - -llm = LlmModel( - model_name="Nemotron-Flash-1B", - model_source=HuggingFaceTransformersModel( - model_name="nvidia/Nemotron-Flash-1B", - ), -) -llm_id = llm.create() - -run_id = LlmUnlearningRun.launch( - llm_id, - BiasRunInfo(bias_type=BiasType.ALL), -) - -result = LlmUnlearningRun.check_run(run_id) -new_adapter = llm.get_hf_pipeline_for_run(run_id) -``` - -## Quickstart: Dataset QA - -### Classification - -```python -import json -import os - -from hirundo import ( - HirundoCSV, - LabelingType, - QADataset, - StorageConfig, - StorageGCP, - StorageTypes, -) - -gcp_bucket = StorageGCP( - bucket_name="cifar100bucket", - project="Hirundo-global", - credentials_json=json.loads(os.environ["GCP_CREDENTIALS"]), -) - -test_dataset = QADataset( - name="TEST-GCP cifar 100 classification dataset", - labeling_type=LabelingType.SINGLE_LABEL_CLASSIFICATION, - storage_config=StorageConfig( - name="cifar100bucket", - type=StorageTypes.GCP, - gcp=gcp_bucket, - ), - data_root_url=gcp_bucket.get_url(path="/pytorch-cifar/data"), - labeling_info=HirundoCSV( - csv_url=gcp_bucket.get_url(path="/pytorch-cifar/data/cifar100.csv"), - ), - classes=cifar100_classes, -) - -test_dataset.run_qa() -results = test_dataset.check_run() -print(results) -``` +## Quickstart examples -### Object detection - -```python -from hirundo import ( - GitRepo, - HirundoCSV, - LabelingType, - QADataset, - StorageConfig, - StorageGit, - StorageTypes, -) - -git_storage = StorageGit( - repo=GitRepo( - name="BDD-100k-validation-dataset", - repository_url="https://huggingface.co/datasets/hirundo-io/bdd100k-validation-only", - ), - branch="main", -) - -test_dataset = QADataset( - name="TEST-HuggingFace-BDD-100k-validation-OD-validation-dataset", - labeling_type=LabelingType.OBJECT_DETECTION, - storage_config=StorageConfig( - name="BDD-100k-validation-dataset", - type=StorageTypes.GIT, - 
git=git_storage, - ), - data_root_url=git_storage.get_url(path="/BDD100K Val from Hirundo.zip/bdd100k"), - labeling_info=HirundoCSV( - csv_url=git_storage.get_url( - path="/BDD100K Val from Hirundo.zip/bdd100k/bdd100k.csv" - ), - ), -) - -test_dataset.run_qa() -results = test_dataset.check_run() -print(results) -``` +The full quickstart examples now live in the Sphinx docs so they can be linted, +formatted, and type-checked as real Python files. See the examples embedded in +`docs/index.rst`, which are sourced from `docs/*.py` files. ## Supported dataset storage diff --git a/docs/dataset_qa_example.py b/docs/dataset_qa_classification_example.py similarity index 100% rename from docs/dataset_qa_example.py rename to docs/dataset_qa_classification_example.py diff --git a/docs/dataset_qa_object_detection_example.py b/docs/dataset_qa_object_detection_example.py new file mode 100644 index 00000000..7dc464dc --- /dev/null +++ b/docs/dataset_qa_object_detection_example.py @@ -0,0 +1,41 @@ +"""Examples for docs/index.rst literalinclude blocks.""" + +from hirundo import ( + GitRepo, + HirundoCSV, + LabelingType, + QADataset, + StorageConfig, + StorageGit, + StorageTypes, +) + +git_storage = StorageGit( + repo=GitRepo( + name="BDD-100k-validation-dataset", + repository_url=( + "https://huggingface.co/datasets/hirundo-io/bdd100k-validation-only" + ), + ), + branch="main", +) + +test_dataset = QADataset( + name="TEST-HuggingFace-BDD-100k-validation-OD-validation-dataset", + labeling_type=LabelingType.OBJECT_DETECTION, + storage_config=StorageConfig( + name="BDD-100k-validation-dataset", + type=StorageTypes.GIT, + git=git_storage, + ), + data_root_url=git_storage.get_url(path="/BDD100K Val from Hirundo.zip/bdd100k"), + labeling_info=HirundoCSV( + csv_url=git_storage.get_url( + path="/BDD100K Val from Hirundo.zip/bdd100k/bdd100k.csv" + ), + ), +) + +test_dataset.run_qa() +results = test_dataset.check_run() +print(results) diff --git a/docs/hirundo.llm_behavior_eval.rst 
b/docs/hirundo.llm_behavior_eval.rst new file mode 100644 index 00000000..500b4c4f --- /dev/null +++ b/docs/hirundo.llm_behavior_eval.rst @@ -0,0 +1,10 @@ +.. meta:: + :http-equiv=Content-Security-Policy: default-src 'self', frame-ancestors 'none' + +hirundo.llm_behavior_eval module +================================ + +.. automodule:: hirundo.llm_behavior_eval + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/hirundo.llm_behavior_eval_results.rst b/docs/hirundo.llm_behavior_eval_results.rst new file mode 100644 index 00000000..61a30892 --- /dev/null +++ b/docs/hirundo.llm_behavior_eval_results.rst @@ -0,0 +1,10 @@ +.. meta:: + :http-equiv=Content-Security-Policy: default-src 'self', frame-ancestors 'none' + +hirundo.llm_behavior_eval_results module +======================================== + +.. automodule:: hirundo.llm_behavior_eval_results + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/hirundo.llm_bias_type.rst b/docs/hirundo.llm_bias_type.rst new file mode 100644 index 00000000..54a39208 --- /dev/null +++ b/docs/hirundo.llm_bias_type.rst @@ -0,0 +1,10 @@ +.. meta:: + :http-equiv=Content-Security-Policy: default-src 'self', frame-ancestors 'none' + +hirundo.llm_bias_type module +============================= + +.. automodule:: hirundo.llm_bias_type + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/hirundo.rst b/docs/hirundo.rst index abb3ad42..214fa564 100644 --- a/docs/hirundo.rst +++ b/docs/hirundo.rst @@ -17,6 +17,9 @@ Submodules hirundo.git hirundo.labeling hirundo.logger + hirundo.llm_behavior_eval + hirundo.llm_behavior_eval_results + hirundo.llm_bias_type hirundo.storage hirundo.unlearning_llm hirundo.unzip diff --git a/docs/index.rst b/docs/index.rst index 887f7bd7..0841e123 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -13,6 +13,7 @@ Welcome to the ``hirundo`` client library documentation. 
This SDK connects to th Hirundo platform and provides APIs for: - LLM behavior unlearning runs (reducing bias, prompt injections and other unwanted behaviors). +- LLM behavior eval runs (measuring bias, hallucination, prompt injection, and more). - Dataset QA for machine learning datasets. Getting started @@ -45,6 +46,17 @@ Example: .. literalinclude:: llm_unlearning_example.py :language: python +LLM behavior eval +----------------- + +Run standardized evaluations over an LLM or an unlearning run to quantify +behavior changes (bias, hallucination, prompt injections, and more). + +Example: + +.. literalinclude:: llm_behavior_eval_example.py + :language: python + Dataset QA ---------- @@ -63,9 +75,14 @@ Supported storage backends include: - Google Cloud Storage (GCS) - Git repositories with LFS (GitHub, Hugging Face) -Example: +Classification example: + +.. literalinclude:: dataset_qa_classification_example.py + :language: python + +Object detection example: -.. literalinclude:: dataset_qa_example.py +.. 
literalinclude:: dataset_qa_object_detection_example.py :language: python API reference diff --git a/docs/llm_behavior_eval_example.py b/docs/llm_behavior_eval_example.py new file mode 100644 index 00000000..7db7ee98 --- /dev/null +++ b/docs/llm_behavior_eval_example.py @@ -0,0 +1,32 @@ +"""Examples for docs/index.rst literalinclude blocks.""" + +from hirundo import ( + BBQBiasType, + EvalRunInfo, + HuggingFaceTransformersModel, + LlmBehaviorEval, + LlmModel, + ModelOrRun, + PresetType, +) + +llm = LlmModel( + model_name="Nemotron-Flash-1B", + model_source=HuggingFaceTransformersModel( + model_name="nvidia/Nemotron-Flash-1B", + ), +) +llm_id = llm.create() + +run_id = LlmBehaviorEval.launch_eval_run( + ModelOrRun.MODEL, + EvalRunInfo( + name="Nemotron BBQ bias eval", + model_id=llm_id, + preset_type=PresetType.BBQ_BIAS, + bias_type=BBQBiasType.ALL, + ), +) + +results = LlmBehaviorEval.check_run_by_id(run_id) +print(results.summary_brief) diff --git a/docs/llm_unlearning_example.py b/docs/llm_unlearning_example.py index 2d9c545c..813a4f72 100644 --- a/docs/llm_unlearning_example.py +++ b/docs/llm_unlearning_example.py @@ -1,8 +1,8 @@ """Examples for docs/index.rst literalinclude blocks.""" from hirundo import ( + BBQBiasType, BiasRunInfo, - BiasType, HuggingFaceTransformersModel, LlmModel, LlmUnlearningRun, @@ -17,7 +17,7 @@ llm_id = llm.create() run_id = LlmUnlearningRun.launch( llm_id, - BiasRunInfo(bias_type=BiasType.ALL), + BiasRunInfo(bias_type=BBQBiasType.ALL), ) result = LlmUnlearningRun.check_run(run_id) new_adapter = llm.get_hf_pipeline_for_run(run_id) diff --git a/hirundo/__init__.py b/hirundo/__init__.py index d18f5ba3..c47c2fbc 100644 --- a/hirundo/__init__.py +++ b/hirundo/__init__.py @@ -1,3 +1,13 @@ +from ._hirundo_error import HirundoError +from ._llm_sources import ( + HuggingFaceTransformersModel, + HuggingFaceTransformersModelOutput, + LlmSources, + LlmSourcesOutput, + LocalTransformersModel, + ModelSourceType, +) +from ._run_status import 
RunStatus from .dataset_enum import ( DatasetMetadataType, LabelingType, @@ -5,7 +15,7 @@ ) from .dataset_qa import ( ClassificationRunArgs, - HirundoError, + HirundoDatasetQaError, ModalityType, ObjectDetectionRunArgs, QADataset, @@ -23,6 +33,17 @@ KeylabsObjSegImages, KeylabsObjSegVideo, ) +from .llm_behavior_eval import ( + EvalRunInfo, + EvalRunRecord, + HirundoLlmBehaviorEvalError, + JudgeModel, + LlmBehaviorEval, + ModelOrRun, + PresetType, +) +from .llm_behavior_eval_results import LlmBehaviorEvalResults +from .llm_bias_type import BBQBiasType, UnqoverBiasType from .storage import ( StorageConfig, StorageGCP, @@ -32,27 +53,34 @@ ) from .unlearning_llm import ( BiasRunInfo, - BiasType, - HuggingFaceTransformersModel, LlmModel, - LlmSources, LlmUnlearningRun, - LocalTransformersModel, ) from .unzip import load_df, load_from_zip __all__ = [ "COCO", "YOLO", - "HirundoError", "HirundoCSV", + "HirundoError", + "HirundoDatasetQaError", + "HirundoLlmBehaviorEvalError", "KeylabsAuth", "KeylabsObjDetImages", "KeylabsObjDetVideo", "KeylabsObjSegImages", "KeylabsObjSegVideo", + "BBQBiasType", + "UnqoverBiasType", "QADataset", + "EvalRunInfo", + "EvalRunRecord", + "JudgeModel", + "LlmBehaviorEval", + "LlmBehaviorEvalResults", "ModalityType", + "ModelOrRun", + "PresetType", "RunArgs", "ClassificationRunArgs", "ObjectDetectionRunArgs", @@ -69,14 +97,17 @@ "StorageConfig", "DatasetQAResults", "BiasRunInfo", - "BiasType", "HuggingFaceTransformersModel", + "HuggingFaceTransformersModelOutput", "LlmModel", "LlmSources", + "LlmSourcesOutput", "LlmUnlearningRun", "LocalTransformersModel", "load_df", "load_from_zip", + "ModelSourceType", + "RunStatus", ] __version__ = "0.2.3.post1" diff --git a/hirundo/_hirundo_error.py b/hirundo/_hirundo_error.py new file mode 100644 index 00000000..9b67978a --- /dev/null +++ b/hirundo/_hirundo_error.py @@ -0,0 +1,6 @@ +class HirundoError(Exception): + """ + Base exception class for Hirundo SDK errors + """ + + pass diff --git 
a/hirundo/_llm_pipeline.py b/hirundo/_llm_pipeline.py index 5224e732..a78c6b39 100644 --- a/hirundo/_llm_pipeline.py +++ b/hirundo/_llm_pipeline.py @@ -48,9 +48,11 @@ def get_hf_pipeline_for_run_given_model( from transformers.models.auto.tokenization_auto import AutoTokenizer from transformers.pipelines import pipeline - from hirundo.unlearning_llm import ( + from hirundo._llm_sources import ( HuggingFaceTransformersModel, HuggingFaceTransformersModelOutput, + ) + from hirundo.unlearning_llm import ( LlmUnlearningRun, ) diff --git a/hirundo/_llm_sources.py b/hirundo/_llm_sources.py new file mode 100644 index 00000000..03e1a112 --- /dev/null +++ b/hirundo/_llm_sources.py @@ -0,0 +1,43 @@ +from enum import Enum +from typing import Literal + +from pydantic import BaseModel, ConfigDict + + +class ModelSourceType(str, Enum): + HUGGINGFACE_TRANSFORMERS = "huggingface_transformers" + LOCAL_TRANSFORMERS = "local_transformers" + + +class HuggingFaceTransformersModel(BaseModel): + model_config = ConfigDict(protected_namespaces=("model_validate", "model_dump")) + + type: Literal[ModelSourceType.HUGGINGFACE_TRANSFORMERS] = ( + ModelSourceType.HUGGINGFACE_TRANSFORMERS + ) + revision: str | None = None + code_revision: str | None = None + model_name: str + token: str | None = None + + +class HuggingFaceTransformersModelOutput(BaseModel): + model_config = ConfigDict(protected_namespaces=("model_validate", "model_dump")) + + type: Literal[ModelSourceType.HUGGINGFACE_TRANSFORMERS] = ( + ModelSourceType.HUGGINGFACE_TRANSFORMERS + ) + model_name: str + + +class LocalTransformersModel(BaseModel): + type: Literal[ModelSourceType.LOCAL_TRANSFORMERS] = ( + ModelSourceType.LOCAL_TRANSFORMERS + ) + revision: None = None + code_revision: None = None + local_path: str + + +LlmSources = HuggingFaceTransformersModel | LocalTransformersModel +LlmSourcesOutput = HuggingFaceTransformersModelOutput | LocalTransformersModel diff --git a/hirundo/_run_checking.py b/hirundo/_run_checking.py index 
4866454b..8d493055 100644 --- a/hirundo/_run_checking.py +++ b/hirundo/_run_checking.py @@ -1,11 +1,12 @@ import json from collections.abc import AsyncGenerator, Generator -from enum import Enum import httpx from tqdm import tqdm from hirundo._iter_sse_retrying import aiter_sse_retrying, iter_sse_retrying +from hirundo._run_status import RunStatus +from hirundo._sse_event_data import SseRunEventData from hirundo.logger import get_logger _logger = get_logger(__name__) @@ -13,17 +14,6 @@ DEFAULT_MAX_RETRIES = 200 -class RunStatus(Enum): - PENDING = "PENDING" - STARTED = "STARTED" - SUCCESS = "SUCCESS" - FAILURE = "FAILURE" - AWAITING_MANUAL_APPROVAL = "AWAITING MANUAL APPROVAL" - REVOKED = "REVOKED" - REJECTED = "REJECTED" - RETRY = "RETRY" - - STATUS_TO_PROGRESS_MAP = { RunStatus.STARTED.value: 0.0, RunStatus.PENDING.value: 0.0, @@ -62,7 +52,9 @@ def build_status_text_map( } -def get_state(payload: dict, status_keys: tuple[str, ...]) -> str | None: +def get_state( + payload: dict | SseRunEventData, status_keys: tuple[str, ...] +) -> str | None: """ Return the first non-null state value from a payload using a list of keys. @@ -74,7 +66,11 @@ def get_state(payload: dict, status_keys: tuple[str, ...]) -> str | None: The first non-null state value, or None if none are present. """ for key in status_keys: - value = payload.get(key) + value = ( + payload.get(key) + if isinstance(payload, dict) + else getattr(payload, key, None) + ) if value is not None: return value return None @@ -222,7 +218,7 @@ async def aiter_run_events( def update_progress_from_result( - iteration: dict, + iteration: dict | SseRunEventData, progress: tqdm, *, uploading_text: str, @@ -240,13 +236,15 @@ def update_progress_from_result( Returns: True if a progress update occurred, False otherwise. 
""" - if ( - iteration.get("result") - and isinstance(iteration["result"], dict) - and iteration["result"].get("result") - and isinstance(iteration["result"]["result"], str) - ): - result_info = iteration["result"]["result"].split(":") + result_outer = ( + iteration.get("result") if isinstance(iteration, dict) else iteration.result + ) + result_inner = ( + result_outer.get("result") if isinstance(result_outer, dict) else result_outer + ) + + if isinstance(result_inner, str): + result_info = result_inner.split(":") if len(result_info) > 1: stage = result_info[0] current_progress_percentage = float( @@ -264,11 +262,13 @@ def update_progress_from_result( log.debug("Setting progress to %s", progress.n) progress.refresh() return True + if result_inner is not None: + log.debug("Skipping non-string progress result payload: %s", result_inner) return False def handle_run_failure( - iteration: dict, *, error_cls: type[Exception], run_label: str + iteration: dict | SseRunEventData, *, error_cls: type[Exception], run_label: str ) -> None: """ Raise a run-specific failure exception based on the iteration payload. @@ -278,6 +278,10 @@ def handle_run_failure( error_cls: Exception type to raise. run_label: Human-readable label for the run type. 
""" - if iteration.get("result"): - raise error_cls(f"{run_label} run failed with error: {iteration['result']}") + if ( + result := iteration.get("result") + if isinstance(iteration, dict) + else iteration.result + ): + raise error_cls(f"{run_label} run failed with error: {result}") raise error_cls(f"{run_label} run failed with an unknown error") diff --git a/hirundo/_run_status.py b/hirundo/_run_status.py new file mode 100644 index 00000000..1cc78f53 --- /dev/null +++ b/hirundo/_run_status.py @@ -0,0 +1,12 @@ +from enum import Enum + + +class RunStatus(str, Enum): + PENDING = "PENDING" + STARTED = "STARTED" + SUCCESS = "SUCCESS" + FAILURE = "FAILURE" + AWAITING_MANUAL_APPROVAL = "AWAITING MANUAL APPROVAL" + REVOKED = "REVOKED" + REJECTED = "REJECTED" + RETRY = "RETRY" diff --git a/hirundo/_sse_event_data.py b/hirundo/_sse_event_data.py new file mode 100644 index 00000000..429c942b --- /dev/null +++ b/hirundo/_sse_event_data.py @@ -0,0 +1,25 @@ +from pydantic import BaseModel, ValidationError + +from hirundo._hirundo_error import HirundoError +from hirundo._run_status import RunStatus +from hirundo.logger import get_logger + +logger = get_logger(__name__) + + +class SseRunEventData(BaseModel): + id: str + state: RunStatus | None + result: str | dict | None + + +class SseRunEventDataPayload(BaseModel): + data: SseRunEventData + + +def _parse_sse_payload(payload: str) -> SseRunEventData: + try: + return SseRunEventDataPayload.model_validate_json(payload).data + except ValidationError as e: + logger.error("Invalid SSE payload: %s", payload, exc_info=True) + raise HirundoError(f"Invalid SSE payload: {payload}") from e diff --git a/hirundo/dataset_qa.py b/hirundo/dataset_qa.py index 44f0bca9..8428b69e 100644 --- a/hirundo/dataset_qa.py +++ b/hirundo/dataset_qa.py @@ -11,10 +11,10 @@ from hirundo._constraints import validate_labeling_info, validate_url from hirundo._env import API_HOST from hirundo._headers import get_headers +from hirundo._hirundo_error import 
HirundoError from hirundo._http import raise_for_status_with_reason, requests from hirundo._run_checking import ( STATUS_TO_PROGRESS_MAP, - RunStatus, aiter_run_events, build_status_text_map, get_state, @@ -22,6 +22,7 @@ iter_run_events, update_progress_from_result, ) +from hirundo._run_status import RunStatus from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT from hirundo._urls import HirundoUrl from hirundo.dataset_enum import DatasetMetadataType, LabelingType @@ -34,11 +35,7 @@ logger = get_logger(__name__) -class HirundoError(Exception): - """ - Custom exception used to indicate errors in `hirundo` dataset QA runs - """ - +class HirundoDatasetQaError(HirundoError): pass diff --git a/hirundo/llm_behavior_eval.py b/hirundo/llm_behavior_eval.py new file mode 100644 index 00000000..561204f2 --- /dev/null +++ b/hirundo/llm_behavior_eval.py @@ -0,0 +1,545 @@ +import datetime +import typing +from collections.abc import AsyncGenerator, Generator +from enum import Enum +from typing import overload + +import httpx +from pydantic import BaseModel, ConfigDict +from tqdm import tqdm +from tqdm.contrib.logging import logging_redirect_tqdm + +from hirundo._env import API_HOST +from hirundo._headers import get_headers +from hirundo._hirundo_error import HirundoError +from hirundo._http import raise_for_status_with_reason, requests +from hirundo._iter_sse_retrying import aiter_sse_retrying, iter_sse_retrying +from hirundo._llm_sources import HuggingFaceTransformersModelOutput, LlmSourcesOutput +from hirundo._run_checking import ( + DEFAULT_MAX_RETRIES, + STATUS_TO_PROGRESS_MAP, + build_status_text_map, + get_state, + handle_run_failure, + update_progress_from_result, +) +from hirundo._run_status import RunStatus +from hirundo._sse_event_data import SseRunEventData, _parse_sse_payload +from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT +from hirundo.llm_behavior_eval_results import LlmBehaviorEvalResults +from hirundo.llm_bias_type import BBQBiasType, 
UnqoverBiasType +from hirundo.logger import get_logger +from hirundo.unzip import download_and_extract_llm_behavior_eval_zip + +logger = get_logger(__name__) + + +STATUS_TO_TEXT_MAP = build_status_text_map("LLM behavior eval") + + +class HirundoLlmBehaviorEvalError(HirundoError): + pass + + +class ModelOrRun(str, Enum): + MODEL = "model" + RUN = "run" + + +class PresetType(str, Enum): + BBQ_BIAS = "BBQ_BIAS" + BBQ_UNBIAS = "BBQ_UNBIAS" + UNQOVER_BIAS = "UNQOVER_BIAS" + HALU_EVAL = "HALU_EVAL" + MED_HALLU = "MED_HALLU" + INJECTION_EVAL = "INJECTION_EVAL" + + +class JudgeModel(BaseModel): + path_or_repo_id: str + token: str | None = None + batch_size: int | None = None + output_tokens: int | None = None + use_4bit: bool | None = None + + +class EvalRunInfo(BaseModel): + organization_id: int | None = None + name: str | None = None + model_id: int | None = None + source_run_id: str | None = None + file_path: str | None = None + preset_type: PresetType | None = None + bias_type: BBQBiasType | UnqoverBiasType | None = None + judge_model: JudgeModel | None = None + + +class OutputLlm(BaseModel): + model_config = {"extra": "allow"} + + id: int + organization_id: int + creator_id: int + creator_name: str + created_at: datetime.datetime + updated_at: datetime.datetime + model_name: str + model_source: LlmSourcesOutput + + +class OutputUnlearningLlmRun(BaseModel): + model_config = {"extra": "allow"} + + id: int + name: str + run_id: str + model: OutputLlm | None = None + status: str + created_at: datetime.datetime + + +class LlmEvalMetricRow(BaseModel): + model_config = ConfigDict(extra="allow") + + benchmark: str + metric: str + original: float | str | None = None + post_unlearning: float | str | None = None + reduction_percent: float | None = None + subset: str | None = None + + +class LlmEvalMetrics(BaseModel): + model_config = ConfigDict(extra="allow") + + rows: list[LlmEvalMetricRow] + + +class EvalRunRecord(BaseModel): + id: int + name: str + model_id: int | None + 
model: OutputLlm | None + source_run_id: str | None + source_run: OutputUnlearningLlmRun | None + file_path: str | None + preset_type: PresetType | None + bias_type: BBQBiasType | UnqoverBiasType | None + judge_model: JudgeModel | None + run_id: str + mlflow_run_id: str | None + status: str + created_at: datetime.datetime + pre_process_progress: float + optimization_progress: float + post_process_progress: float + metrics: LlmEvalMetrics | None = None + responses_zip_url: str | None = None + + deleted_at: datetime.datetime | None = None + + +class LlmBehaviorEval: + def __init__(self, run_id: str | None = None): + self.run_id = run_id + + @staticmethod + def _parse_eval_run_record(response_payload: dict) -> EvalRunRecord: + model_payload = response_payload.get("model") + source_run_payload = response_payload.get("source_run") + judge_model_payload = response_payload.get("judge_model") + metrics_payload = response_payload.get("metrics") + + model = ( + OutputLlm.model_validate(model_payload) + if isinstance(model_payload, dict) + else None + ) + source_run = ( + OutputUnlearningLlmRun.model_validate(source_run_payload) + if isinstance(source_run_payload, dict) + else None + ) + judge_model = ( + JudgeModel.model_validate(judge_model_payload) + if isinstance(judge_model_payload, dict) + else None + ) + if isinstance(metrics_payload, dict): + metrics = LlmEvalMetrics.model_validate(metrics_payload) + elif isinstance(metrics_payload, list): + metric_rows = [ + LlmEvalMetricRow.model_validate(metric_row) + for metric_row in metrics_payload + if isinstance(metric_row, dict) + ] + metrics = LlmEvalMetrics(rows=metric_rows) + else: + metrics = None + + return EvalRunRecord( + id=response_payload["id"], + name=response_payload["name"], + model_id=response_payload.get("model_id"), + model=model, + source_run_id=response_payload.get("source_run_id"), + source_run=source_run, + file_path=response_payload.get("file_path"), + preset_type=response_payload.get("preset_type"), + 
bias_type=response_payload.get("bias_type"), + judge_model=judge_model, + run_id=response_payload["run_id"], + mlflow_run_id=response_payload.get("mlflow_run_id"), + status=response_payload["status"], + created_at=response_payload["created_at"], + pre_process_progress=response_payload.get("pre_process_progress", 0.0), + optimization_progress=response_payload.get("optimization_progress", 0.0), + post_process_progress=response_payload.get("post_process_progress", 0.0), + metrics=metrics, + responses_zip_url=response_payload.get("responses_zip_url"), + ) + + @staticmethod + def launch_eval_run( + model_or_run: ModelOrRun | str, + run_info: EvalRunInfo, + ) -> str: + """ + Launch an LLM behavior evaluation run. + + Args: + model_or_run: Whether the evaluation is based on a model or a run. + run_info: The evaluation run parameters. + + Returns: + The ID of the created evaluation run. + """ + if isinstance(model_or_run, str): + model_or_run_value = ModelOrRun(model_or_run) + else: + model_or_run_value = model_or_run + + response = requests.post( + f"{API_HOST}/llm-behavior-eval/run/{model_or_run_value.value}", + json=run_info.model_dump(mode="json"), + headers=get_headers(), + timeout=MODIFY_TIMEOUT, + ) + raise_for_status_with_reason(response) + response_payload = response.json() + run_identifier = ( + response_payload.get("run_id") + or response_payload.get("hir_run_id") + or response_payload.get("id") + ) + if not run_identifier: + raise HirundoLlmBehaviorEvalError( + "Unable to determine the run ID from the response payload." + ) + return run_identifier + + @staticmethod + def cancel_by_id(run_id: str) -> None: + """ + Cancel a running evaluation. 
+ """ + response = requests.patch( + f"{API_HOST}/llm-behavior-eval/run/cancel/{run_id}", + headers=get_headers(), + timeout=MODIFY_TIMEOUT, + ) + raise_for_status_with_reason(response) + + def cancel(self) -> None: + if not self.run_id: + raise HirundoLlmBehaviorEvalError("No run has been started") + self.cancel_by_id(self.run_id) + + @staticmethod + def rename_by_id(run_id: str, new_name: str) -> None: + """ + Rename an evaluation run. + """ + response = requests.patch( + f"{API_HOST}/llm-behavior-eval/run/rename/{run_id}", + json={"new_name": new_name}, + headers=get_headers(), + timeout=MODIFY_TIMEOUT, + ) + raise_for_status_with_reason(response) + + def rename(self, new_name: str) -> None: + if not self.run_id: + raise HirundoLlmBehaviorEvalError("No run has been started") + self.rename_by_id(self.run_id, new_name) + + @staticmethod + def archive_by_id(run_id: str) -> None: + """ + Archive an evaluation run. + """ + response = requests.patch( + f"{API_HOST}/llm-behavior-eval/run/archive/{run_id}", + headers=get_headers(), + timeout=MODIFY_TIMEOUT, + ) + raise_for_status_with_reason(response) + + def archive(self) -> None: + if not self.run_id: + raise HirundoLlmBehaviorEvalError("No run has been started") + self.archive_by_id(self.run_id) + + @staticmethod + def restore_by_id(run_id: str) -> None: + """ + Restore an archived evaluation run. + """ + response = requests.patch( + f"{API_HOST}/llm-behavior-eval/run/restore/{run_id}", + headers=get_headers(), + timeout=MODIFY_TIMEOUT, + ) + raise_for_status_with_reason(response) + + def restore(self) -> None: + if not self.run_id: + raise HirundoLlmBehaviorEvalError("No run has been started") + self.restore_by_id(self.run_id) + + @staticmethod + def get_run_info_by_id(run_id: str) -> EvalRunRecord: + """ + Retrieve the metadata for an evaluation run. 
+ """ + response = requests.get( + f"{API_HOST}/llm-behavior-eval/run/info/{run_id}", + headers=get_headers(), + timeout=READ_TIMEOUT, + ) + raise_for_status_with_reason(response) + response_payload = response.json() + return LlmBehaviorEval._parse_eval_run_record(response_payload) + + @staticmethod + def list_runs( + organization_id: int | None = None, + archived: bool = False, + ) -> list[EvalRunRecord]: + """ + List evaluation runs. + """ + response = requests.get( + f"{API_HOST}/llm-behavior-eval/run/list", + params={ + "eval_organization_id": organization_id, + "archived": archived, + }, + headers=get_headers(), + timeout=READ_TIMEOUT, + ) + raise_for_status_with_reason(response) + response_payload = response.json() + return [ + LlmBehaviorEval._parse_eval_run_record(run_record) + for run_record in response_payload + ] + + @staticmethod + def _resolve_model_name(run_info: EvalRunRecord) -> str | None: + if run_info.model and isinstance( + run_info.model.model_source, HuggingFaceTransformersModelOutput + ): + return run_info.model.model_source.model_name + if ( + run_info.source_run + and run_info.source_run.model + and isinstance( + run_info.source_run.model.model_source, + HuggingFaceTransformersModelOutput, + ) + ): + return run_info.source_run.model.model_source.model_name + return None + + @staticmethod + def _check_run_by_id( + run_id: str, *, max_retries: int = DEFAULT_MAX_RETRIES + ) -> Generator[SseRunEventData, None, None]: + retry_count = 0 + while True: + if retry_count > max_retries: + raise HirundoLlmBehaviorEvalError("Max retries reached") + last_payload = None + with httpx.Client(timeout=httpx.Timeout(None, connect=5.0)) as client: + for sse_event in iter_sse_retrying( + client, + "GET", + f"{API_HOST}/llm-behavior-eval/run/{run_id}", + headers=get_headers(), + ): + if sse_event.event == "ping": + continue + payload = _parse_sse_payload(sse_event.data) + last_payload = payload + yield payload + last_state = get_state(last_payload, ("state",)) if 
@staticmethod
def check_run_by_id(
    run_id: str, stop_on_manual_approval: bool = False
) -> LlmBehaviorEvalResults | None:
    """
    Check the status of a run given its ID

    Consumes the events produced by `_check_run_by_id`, mirrors the reported
    state on a tqdm progress bar and, once the run succeeds, downloads the
    results zip.

    Args:
        run_id: The `run_id` produced by a `launch_eval_run` call
        stop_on_manual_approval: If True, the function will return `None` if the run is awaiting manual approval

    Returns:
        An LlmBehaviorEvalResults object with the results of the evaluation run,
        or `None` when `stop_on_manual_approval` is True and the run is awaiting
        manual approval

    Raises:
        HirundoLlmBehaviorEvalError: If the maximum number of retries is reached,
            if the run fails, if a successful run carries no results URL, or if
            the event stream ends without reaching a handled final state
    """
    failure_states = (
        RunStatus.FAILURE.value,
        RunStatus.REJECTED.value,
        RunStatus.REVOKED.value,
    )
    logger.debug("Checking run with ID: %s", run_id)
    # Using the bar as a context manager guarantees it is closed on every exit
    # path, including exceptions raised while streaming events.
    with logging_redirect_tqdm(), tqdm(total=100.0) as progress_bar:
        for iteration in LlmBehaviorEval._check_run_by_id(run_id):
            state = get_state(iteration, ("state",))
            if state in STATUS_TO_PROGRESS_MAP:
                progress_bar.set_description(STATUS_TO_TEXT_MAP[state])
                progress_bar.n = STATUS_TO_PROGRESS_MAP[state]
                logger.debug("Setting progress to %s", progress_bar.n)
                progress_bar.refresh()
            if state in failure_states:
                logger.error(
                    "State is failure, rejected, or revoked: %s",
                    state,
                )
                # Closed explicitly before the failure is handled; the later
                # close performed by the `with` block is then a no-op.
                progress_bar.close()
                handle_run_failure(
                    iteration,
                    error_cls=HirundoLlmBehaviorEvalError,
                    run_label="LLM behavior eval",
                )
            elif state == RunStatus.SUCCESS.value:
                progress_bar.close()
                zip_temporary_url = iteration.result
                if not zip_temporary_url or not isinstance(zip_temporary_url, str):
                    raise HirundoLlmBehaviorEvalError(
                        "LLM behavior eval run completed without a results URL."
                    )
                run_info = LlmBehaviorEval.get_run_info_by_id(run_id)
                model_name = LlmBehaviorEval._resolve_model_name(run_info)
                return download_and_extract_llm_behavior_eval_zip(
                    run_id,
                    zip_temporary_url,
                    model_name,
                )
            elif (
                state == RunStatus.AWAITING_MANUAL_APPROVAL.value
                and stop_on_manual_approval
            ):
                return None
            elif state is None:
                update_progress_from_result(
                    iteration,
                    progress_bar,
                    uploading_text="LLM behavior eval run completed. Uploading results",
                    log=logger,
                )
    # Reached only when the event stream ended without any branch above
    # returning or raising.
    raise HirundoLlmBehaviorEvalError(
        "LLM behavior eval run failed with an unknown error in check_run_by_id"
    )
class LlmBehaviorEvalResults(BaseModel, typing.Generic[T]):
    """
    Results of an LLM behavior evaluation run.

    Generic over the DataFrame type (`T`) used for the loaded summary CSVs.
    Both summaries are optional because
    `download_and_extract_llm_behavior_eval_zip` passes `None` for them when it
    has no model name to locate the CSVs with.
    """

    # `protected_namespaces` is narrowed so that the `model_name` field does not
    # collide with pydantic's default protected `model_` prefix, matching the
    # other models in this package that declare `model_*` fields.
    model_config = {
        "arbitrary_types_allowed": True,
        "protected_namespaces": ("model_validate", "model_dump"),
    }

    cached_zip_path: Path
    """
    The path to the cached zip file of the results
    """
    model_name: str | None = None
    """
    The model name used to locate results in the zip file
    """
    summary_brief: T | None = None
    """
    A polars/pandas DataFrame containing the summary_brief CSV, or `None` when it was not loaded
    """
    summary_full: T | None = None
    """
    A polars/pandas DataFrame containing the summary_full CSV, or `None` when it was not loaded
    """
"NATIONALITY" + GENDER = "GENDER" + PHYSICAL_APPEARANCE = "PHYSICAL_APPEARANCE" + RELIGION = "RELIGION" + AGE = "AGE" + + +class UnqoverBiasType(str, Enum): + ALL = "ALL" + RACE = "RACE" + NATIONALITY = "NATIONALITY" + GENDER = "GENDER" + RELIGION = "RELIGION" diff --git a/hirundo/unlearning_llm.py b/hirundo/unlearning_llm.py index b36ea14f..00375dc1 100644 --- a/hirundo/unlearning_llm.py +++ b/hirundo/unlearning_llm.py @@ -12,9 +12,9 @@ from hirundo._headers import get_headers from hirundo._http import raise_for_status_with_reason, requests from hirundo._llm_pipeline import get_hf_pipeline_for_run_given_model +from hirundo._llm_sources import LlmSources, LlmSourcesOutput from hirundo._run_checking import ( STATUS_TO_PROGRESS_MAP, - RunStatus, aiter_run_events, build_status_text_map, get_state, @@ -22,8 +22,10 @@ iter_run_events, update_progress_from_result, ) +from hirundo._run_status import RunStatus from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT from hirundo.dataset_qa import HirundoError +from hirundo.llm_bias_type import BBQBiasType from hirundo.logger import get_logger if TYPE_CHECKING: @@ -34,45 +36,6 @@ logger = get_logger(__name__) -class ModelSourceType(str, Enum): - HUGGINGFACE_TRANSFORMERS = "huggingface_transformers" - LOCAL_TRANSFORMERS = "local_transformers" - - -class HuggingFaceTransformersModel(BaseModel): - model_config = ConfigDict(protected_namespaces=("model_validate", "model_dump")) - - type: Literal[ModelSourceType.HUGGINGFACE_TRANSFORMERS] = ( - ModelSourceType.HUGGINGFACE_TRANSFORMERS - ) - revision: str | None = None - code_revision: str | None = None - model_name: str - token: str | None = None - - -class HuggingFaceTransformersModelOutput(BaseModel): - model_config = ConfigDict(protected_namespaces=("model_validate", "model_dump")) - - type: Literal[ModelSourceType.HUGGINGFACE_TRANSFORMERS] = ( - ModelSourceType.HUGGINGFACE_TRANSFORMERS - ) - model_name: str - - -class LocalTransformersModel(BaseModel): - type: 
Literal[ModelSourceType.LOCAL_TRANSFORMERS] = ( - ModelSourceType.LOCAL_TRANSFORMERS - ) - revision: None = None - code_revision: None = None - local_path: str - - -LlmSources = HuggingFaceTransformersModel | LocalTransformersModel -LlmSourcesOutput = HuggingFaceTransformersModelOutput | LocalTransformersModel - - class LlmModel(BaseModel): model_config = ConfigDict(protected_namespaces=("model_validate", "model_dump")) @@ -235,16 +198,6 @@ class UnlearningLlmAdvancedOptions(BaseModel): max_tokens_for_model: dict[DatasetType, int] | int | None = None -class BiasType(str, Enum): - ALL = "ALL" - RACE = "RACE" - NATIONALITY = "NATIONALITY" - GENDER = "GENDER" - PHYSICAL_APPEARANCE = "PHYSICAL_APPEARANCE" - RELIGION = "RELIGION" - AGE = "AGE" - - class UtilityType(str, Enum): DEFAULT = "DEFAULT" CUSTOM = "CUSTOM" @@ -274,7 +227,7 @@ class CustomUtility(BaseModel): class BiasBehavior(BaseModel): type: Literal["BIAS"] = "BIAS" - bias_type: BiasType + bias_type: BBQBiasType class HallucinationType(str, Enum): @@ -317,7 +270,7 @@ class LlmRunInfo(BaseModel): class BiasRunInfo(BaseModel): - bias_type: BiasType + bias_type: BBQBiasType organization_id: int | None = None name: str | None = None target_utilities: list[TargetUtility] | None = None diff --git a/hirundo/unzip.py b/hirundo/unzip.py index 113f3be7..3c2f0880 100644 --- a/hirundo/unzip.py +++ b/hirundo/unzip.py @@ -23,6 +23,7 @@ DataFrameType, DatasetQAResults, ) +from hirundo.llm_behavior_eval_results import LlmBehaviorEvalResults from hirundo.logger import get_logger ZIP_FILE_CHUNK_SIZE = 50 * 1024 * 1024 # 50 MB @@ -150,11 +151,11 @@ def download_and_extract_zip( headers=headers, timeout=DOWNLOAD_READ_TIMEOUT, stream=True, - ) as r: - r.raise_for_status() - with open(zip_file_path, "wb") as f: - for chunk in r.iter_content(chunk_size=ZIP_FILE_CHUNK_SIZE): - f.write(chunk) + ) as response: + response.raise_for_status() + with open(zip_file_path, "wb") as output_file: + for chunk in 
def download_and_extract_llm_behavior_eval_zip(
    run_id: str,
    zip_url: str,
    model_name: str | None = None,
) -> LlmBehaviorEvalResults[DataFrameType]:
    """
    Download and extract the LLM behavior evaluation results zip file.

    The archive is stored as `~/.hirundo/cache/<run_id>.zip`. It is streamed to
    a sibling `.part` file first and only moved into place once the download
    has completed, so a failed download never leaves a truncated zip at the
    final cache path.

    Args:
        run_id: The ID of the LLM behavior eval run.
        zip_url: The URL of the zip file to download. `file://` URLs are fetched
            through the API's `local-download` endpoint with auth headers.
        model_name (optional): The full model name to resolve the folder within the zip.
            Only the part after the last `/` is used. When omitted, no CSV is
            read and both summaries are passed as `None`.

    Returns:
        The LLM behavior eval results object.

    Raises:
        ValueError: If `model_name` is given and either summary CSV is missing
            from the zip.
    """
    cache_dir = Path.home() / ".hirundo" / "cache"
    cache_dir.mkdir(parents=True, exist_ok=True)
    zip_file_path = cache_dir / f"{run_id}.zip"
    partial_zip_path = cache_dir / f"{run_id}.zip.part"

    headers = None
    if Url(zip_url).scheme == "file":
        # Strip only the first `file://` so that a later occurrence inside the
        # path itself is left untouched.
        local_path = zip_url.replace("file://", "", 1)
        zip_url = f"{API_HOST}/llm-behavior-eval/run/local-download" + local_path
        headers = _get_auth_headers()

    try:
        with requests.get(
            zip_url,
            headers=headers,
            timeout=DOWNLOAD_READ_TIMEOUT,
            stream=True,
        ) as response:
            response.raise_for_status()
            with open(partial_zip_path, "wb") as output_file:
                for chunk in response.iter_content(chunk_size=ZIP_FILE_CHUNK_SIZE):
                    output_file.write(chunk)
        # Atomic move: the final path only ever holds a complete download.
        partial_zip_path.replace(zip_file_path)
    finally:
        # No-op after a successful move; removes the leftover on failure.
        partial_zip_path.unlink(missing_ok=True)
    logger.info(
        "Successfully downloaded the LLM behavior eval result zip file for run ID %s to %s",
        run_id,
        zip_file_path,
    )

    summary_brief_df = None
    summary_full_df = None
    if model_name:
        model_folder = model_name.split("/")[-1]
        summary_brief_name = f"responses/{model_folder}/summary_brief.csv"
        summary_full_name = f"responses/{model_folder}/summary_full.csv"

        with zipfile.ZipFile(zip_file_path, "r") as zip_file:
            available_names = set(zip_file.namelist())
            # Checked in this order so the brief summary is reported first when
            # both files are missing.
            for required_name in (summary_brief_name, summary_full_name):
                if required_name not in available_names:
                    raise ValueError(
                        f"Missing {required_name} in LLM behavior eval zip for run {run_id}"
                    )
            with zip_file.open(summary_brief_name) as summary_brief_file:
                summary_brief_df = load_df(summary_brief_file)
            with zip_file.open(summary_full_name) as summary_full_file:
                summary_full_df = load_df(summary_full_file)

    return LlmBehaviorEvalResults[DataFrameType](
        cached_zip_path=zip_file_path,
        model_name=model_name,
        summary_brief=summary_brief_df,
        summary_full=summary_full_df,
    )
def _handle_llm_behavior_eval_cleanup(one_week_ago: datetime.datetime) -> None:
    """
    Archive the non-archived LLM behavior eval runs selected for deletion.

    Each live run is passed on its own to `_should_delete_resource` with
    `one_week_ago` as the expiry date. Archive failures are logged as warnings
    and do not stop the loop.

    Args:
        one_week_ago: The expiry date handed to `_should_delete_resource`
    """
    attempted_run_ids: set[str] = set()
    archived_run_ids: set[str] = set()
    for live_run in LlmBehaviorEval.list_runs(archived=False):
        if not _should_delete_resource(live_run.name, [live_run], one_week_ago):
            continue
        attempted_run_ids.add(live_run.run_id)
        try:
            LlmBehaviorEval.archive_by_id(live_run.run_id)
        except (HirundoError, requests.HTTPError) as exc:
            logger.warning(
                "Failed to archive LLM behavior eval run with ID %s: %s",
                live_run.run_id,
                exc,
            )
        else:
            archived_run_ids.add(live_run.run_id)
    logger.info(
        "Archived %s (%s) LLM behavior eval runs",
        archived_run_ids,
        len(archived_run_ids),
    )
    # The two sets only differ when at least one archive call failed.
    if attempted_run_ids != archived_run_ids:
        logger.warning(
            "Tried to archive %s LLM behavior eval runs, but only archived %s runs",
            attempted_run_ids,
            archived_run_ids,
        )
def test_llm_behavior_eval():
    """
    Create a test LLM and, when `FULL_TEST=true`, run a BBQ bias evaluation on it.

    The evaluation run is only launched and checked when the `FULL_TEST`
    environment variable equals `"true"`; otherwise the test stops after the
    LLM has been created and the run info has been built.
    """
    llm = LlmModel(
        model_name=f"TEST-LLM-BEHAVIOR-EVAL-Granite-4-micro-{unique_id}",
        model_source=HuggingFaceTransformersModel(
            model_name="ibm-granite/granite-4.0-micro",
        ),
    )
    llm_id = llm.create()
    # Check the ID before it is used to build the run info, so a failed create
    # is reported here rather than by whatever consumes the ID next.
    assert llm_id is not None
    run_info = EvalRunInfo(
        name=f"TEST-LLM-BEHAVIOR-EVAL-RUN-{unique_id}",
        model_id=llm_id,
        preset_type=PresetType.BBQ_BIAS,
        bias_type=BBQBiasType.ALL,
    )
    if os.getenv("FULL_TEST", "false") != "true":
        return
    run_id = LlmBehaviorEval.launch_eval_run(ModelOrRun.MODEL, run_info)
    assert run_id is not None
    results = LlmBehaviorEval.check_run_by_id(run_id)
    assert results is not None
    assert results.cached_zip_path is not None
    assert results.summary_brief is not None
    assert results.summary_full is not None
marker = "extra == 'transformers'", specifier = ">=1.12.0" }, @@ -680,6 +685,9 @@ requires-dist = [ ] provides-extras = ["dev", "docs", "pandas", "polars", "transformers"] +[package.metadata.requires-dev] +dev = [{ name = "hirundo", extras = ["dev", "docs", "pandas", "polars", "transformers"] }] + [[package]] name = "httpcore" version = "1.0.9"