From 1c091b48a6293bd3eebb54e0134a018f9c630a5b Mon Sep 17 00:00:00 2001 From: Ben Lewis Date: Tue, 13 Jan 2026 15:44:50 +0200 Subject: [PATCH 01/24] Add LLM eval metric models --- hirundo/__init__.py | 18 ++ hirundo/llm_behavior_eval.py | 393 +++++++++++++++++++++++++++++++++++ 2 files changed, 411 insertions(+) create mode 100644 hirundo/llm_behavior_eval.py diff --git a/hirundo/__init__.py b/hirundo/__init__.py index 96c51f2e..3028857f 100644 --- a/hirundo/__init__.py +++ b/hirundo/__init__.py @@ -23,6 +23,16 @@ KeylabsObjSegImages, KeylabsObjSegVideo, ) +from .llm_behavior_eval import ( + BiasType, + EvalRunInfo, + EvalRunRecord, + JudgeModel, + LlmBehaviorEval, + LlmBehaviorEvalError, + ModelOrRun, + PresetType, +) from .storage import ( StorageConfig, StorageGCP, @@ -42,8 +52,16 @@ "KeylabsObjDetVideo", "KeylabsObjSegImages", "KeylabsObjSegVideo", + "BiasType", "QADataset", + "EvalRunInfo", + "EvalRunRecord", + "JudgeModel", + "LlmBehaviorEval", + "LlmBehaviorEvalError", "ModalityType", + "ModelOrRun", + "PresetType", "RunArgs", "ClassificationRunArgs", "ObjectDetectionRunArgs", diff --git a/hirundo/llm_behavior_eval.py b/hirundo/llm_behavior_eval.py new file mode 100644 index 00000000..8df92d12 --- /dev/null +++ b/hirundo/llm_behavior_eval.py @@ -0,0 +1,393 @@ +import datetime +import json +import typing +from collections.abc import AsyncGenerator, Generator +from enum import Enum + +import httpx +from pydantic import BaseModel, ConfigDict + +from hirundo._env import API_HOST +from hirundo._headers import get_headers +from hirundo._http import raise_for_status_with_reason, requests +from hirundo._iter_sse_retrying import aiter_sse_retrying, iter_sse_retrying +from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT +from hirundo.logger import get_logger + +logger = get_logger(__name__) + + +class LlmBehaviorEvalError(Exception): + """ + Custom exception used to indicate errors in `hirundo` LLM behavior eval runs. 
+ """ + + pass + + +class ModelOrRun(str, Enum): + MODEL = "model" + RUN = "run" + + +class PresetType(str, Enum): + BBQ_BIAS = "BBQ_BIAS" + BBQ_UNBIAS = "BBQ_UNBIAS" + UNQOVER_BIAS = "UNQOVER_BIAS" + HALU_EVAL = "HALU_EVAL" + MED_HALLU = "MED_HALLU" + INJECTION_EVAL = "INJECTION_EVAL" + + +class BiasType(str, Enum): + ALL = "ALL" + RACE = "RACE" + NATIONALITY = "NATIONALITY" + GENDER = "GENDER" + PHYSICAL_APPEARANCE = "PHYSICAL_APPEARANCE" + RELIGION = "RELIGION" + AGE = "AGE" + + +class JudgeModel(BaseModel): + path_or_repo_id: str + token: typing.Optional[str] = None + batch_size: typing.Optional[int] = None + output_tokens: typing.Optional[int] = None + use_4bit: typing.Optional[bool] = None + + +class EvalRunInfo(BaseModel): + organization_id: typing.Optional[int] = None + name: typing.Optional[str] = None + model_id: typing.Optional[int] = None + source_run_id: typing.Optional[str] = None + file_path: typing.Optional[str] = None + preset_type: typing.Optional[PresetType] = None + bias_type: typing.Optional[BiasType] = None + judge_model: typing.Optional[JudgeModel] = None + + +class OutputLlm(BaseModel): + model_config = {"extra": "allow"} + + id: int + organization_id: int + creator_id: int + creator_name: str + created_at: datetime.datetime + updated_at: datetime.datetime + model_name: str + model_source: dict + + +class OutputUnlearningLlmRun(BaseModel): + model_config = {"extra": "allow"} + + id: int + name: str + run_id: str + status: str + created_at: datetime.datetime + + +class LlmEvalMetricRow(BaseModel): + model_config = ConfigDict(extra="allow") + + benchmark: str + metric: str + original: typing.Optional[typing.Union[float, str]] = None + post_unlearning: typing.Optional[typing.Union[float, str]] = None + reduction_percent: typing.Optional[float] = None + subset: typing.Optional[str] = None + + +class LlmEvalMetrics(BaseModel): + model_config = ConfigDict(extra="allow") + + rows: list[LlmEvalMetricRow] + + +class EvalRunRecord(BaseModel): + id: 
int + name: str + model_id: typing.Optional[int] + model: typing.Optional[OutputLlm] + source_run_id: typing.Optional[str] + source_run: typing.Optional[OutputUnlearningLlmRun] + file_path: typing.Optional[str] + preset_type: typing.Optional[PresetType] + bias_type: typing.Optional[BiasType] + judge_model: typing.Optional[JudgeModel] + run_id: str + mlflow_run_id: typing.Optional[str] + status: str + created_at: datetime.datetime + pre_process_progress: float + optimization_progress: float + post_process_progress: float + metrics: typing.Optional[LlmEvalMetrics] = None + responses_zip_url: typing.Optional[str] = None + + +class LlmBehaviorEval: + def __init__(self, run_id: typing.Optional[str] = None): + self.run_id = run_id + + @staticmethod + def _parse_eval_run_record(response_payload: dict) -> EvalRunRecord: + model_payload = response_payload.get("model") + source_run_payload = response_payload.get("source_run") + judge_model_payload = response_payload.get("judge_model") + metrics_payload = response_payload.get("metrics") + + model = ( + OutputLlm.model_validate(model_payload) + if isinstance(model_payload, dict) + else None + ) + source_run = ( + OutputUnlearningLlmRun.model_validate(source_run_payload) + if isinstance(source_run_payload, dict) + else None + ) + judge_model = ( + JudgeModel.model_validate(judge_model_payload) + if isinstance(judge_model_payload, dict) + else None + ) + if isinstance(metrics_payload, dict): + metrics = LlmEvalMetrics.model_validate(metrics_payload) + elif isinstance(metrics_payload, list): + metric_rows = [ + LlmEvalMetricRow.model_validate(metric_row) + for metric_row in metrics_payload + if isinstance(metric_row, dict) + ] + metrics = LlmEvalMetrics(rows=metric_rows) + else: + metrics = None + + return EvalRunRecord( + id=response_payload["id"], + name=response_payload["name"], + model_id=response_payload.get("model_id"), + model=model, + source_run_id=response_payload.get("source_run_id"), + source_run=source_run, + 
file_path=response_payload.get("file_path"), + preset_type=response_payload.get("preset_type"), + bias_type=response_payload.get("bias_type"), + judge_model=judge_model, + run_id=response_payload["run_id"], + mlflow_run_id=response_payload.get("mlflow_run_id"), + status=response_payload["status"], + created_at=response_payload["created_at"], + pre_process_progress=response_payload.get("pre_process_progress", 0.0), + optimization_progress=response_payload.get("optimization_progress", 0.0), + post_process_progress=response_payload.get("post_process_progress", 0.0), + metrics=metrics, + responses_zip_url=response_payload.get("responses_zip_url"), + ) + + @staticmethod + def launch_eval_run( + model_or_run: typing.Union[ModelOrRun, str], + run_info: EvalRunInfo, + ) -> str: + """ + Launch an LLM behavior evaluation run. + + Args: + model_or_run: Whether the evaluation is based on a model or a run. + run_info: The evaluation run parameters. + + Returns: + The ID of the created evaluation run. + """ + if isinstance(model_or_run, str): + model_or_run_value = ModelOrRun(model_or_run) + else: + model_or_run_value = model_or_run + + response = requests.post( + f"{API_HOST}/llm-behavior-eval/run/{model_or_run_value.value}", + json=run_info.model_dump(mode="json"), + headers=get_headers(), + timeout=MODIFY_TIMEOUT, + ) + raise_for_status_with_reason(response) + response_payload = response.json() + run_identifier = ( + response_payload.get("run_id") + or response_payload.get("hir_run_id") + or response_payload.get("id") + ) + if not run_identifier: + raise LlmBehaviorEvalError( + "Unable to determine the run ID from the response payload." + ) + return run_identifier + + @staticmethod + def cancel_by_id(run_id: str) -> None: + """ + Cancel a running evaluation. 
+ """ + response = requests.patch( + f"{API_HOST}/llm-behavior-eval/run/cancel/{run_id}", + headers=get_headers(), + timeout=MODIFY_TIMEOUT, + ) + raise_for_status_with_reason(response) + + def cancel(self) -> None: + if not self.run_id: + raise ValueError("No run has been started") + self.cancel_by_id(self.run_id) + + @staticmethod + def rename_by_id(run_id: str, new_name: str) -> None: + """ + Rename an evaluation run. + """ + response = requests.patch( + f"{API_HOST}/llm-behavior-eval/run/rename/{run_id}", + json={"new_name": new_name}, + headers=get_headers(), + timeout=MODIFY_TIMEOUT, + ) + raise_for_status_with_reason(response) + + def rename(self, new_name: str) -> None: + if not self.run_id: + raise ValueError("No run has been started") + self.rename_by_id(self.run_id, new_name) + + @staticmethod + def archive_by_id(run_id: str) -> None: + """ + Archive an evaluation run. + """ + response = requests.patch( + f"{API_HOST}/llm-behavior-eval/run/archive/{run_id}", + headers=get_headers(), + timeout=MODIFY_TIMEOUT, + ) + raise_for_status_with_reason(response) + + def archive(self) -> None: + if not self.run_id: + raise ValueError("No run has been started") + self.archive_by_id(self.run_id) + + @staticmethod + def restore_by_id(run_id: str) -> None: + """ + Restore an archived evaluation run. + """ + response = requests.patch( + f"{API_HOST}/llm-behavior-eval/run/restore/{run_id}", + headers=get_headers(), + timeout=MODIFY_TIMEOUT, + ) + raise_for_status_with_reason(response) + + def restore(self) -> None: + if not self.run_id: + raise ValueError("No run has been started") + self.restore_by_id(self.run_id) + + @staticmethod + def get_run_info_by_id(run_id: str) -> EvalRunRecord: + """ + Retrieve the metadata for an evaluation run. 
+ """ + response = requests.get( + f"{API_HOST}/llm-behavior-eval/run/info/{run_id}", + headers=get_headers(), + timeout=READ_TIMEOUT, + ) + raise_for_status_with_reason(response) + response_payload = response.json() + return LlmBehaviorEval._parse_eval_run_record(response_payload) + + @staticmethod + def list_runs( + organization_id: typing.Optional[int] = None, + archived: bool = False, + ) -> list[EvalRunRecord]: + """ + List evaluation runs. + """ + response = requests.get( + f"{API_HOST}/llm-behavior-eval/run/list", + params={ + "eval_organization_id": organization_id, + "archived": archived, + }, + headers=get_headers(), + timeout=READ_TIMEOUT, + ) + raise_for_status_with_reason(response) + response_payload = response.json() + return [ + LlmBehaviorEval._parse_eval_run_record(run_record) + for run_record in response_payload + ] + + @staticmethod + def stream_results_by_id(run_id: str) -> Generator[dict, None, None]: + """ + Stream evaluation results for a run. + """ + with httpx.Client(timeout=httpx.Timeout(None, connect=5.0)) as client: + for sse_event in iter_sse_retrying( + client, + "GET", + f"{API_HOST}/llm-behavior-eval/run/{run_id}", + headers=get_headers(), + ): + if sse_event.event == "ping": + continue + try: + yield json.loads(sse_event.data) + except json.JSONDecodeError: + logger.debug("Non-JSON SSE payload received: %s", sse_event.data) + yield {"data": sse_event.data} + + def stream_results(self) -> Generator[dict, None, None]: + if not self.run_id: + raise ValueError("No run has been started") + yield from self.stream_results_by_id(self.run_id) + + @staticmethod + async def astream_results_by_id(run_id: str) -> AsyncGenerator[dict, None]: + """ + Async stream evaluation results for a run. 
+ """ + async with httpx.AsyncClient( + timeout=httpx.Timeout(None, connect=5.0) + ) as client: + async_iterator = await aiter_sse_retrying( + client, + "GET", + f"{API_HOST}/llm-behavior-eval/run/{run_id}", + headers=get_headers(), + ) + async for sse_event in async_iterator: + if sse_event.event == "ping": + continue + try: + yield json.loads(sse_event.data) + except json.JSONDecodeError: + logger.debug( + "Non-JSON SSE payload received: %s", sse_event.data + ) + yield {"data": sse_event.data} + + async def astream_results(self) -> AsyncGenerator[dict, None]: + if not self.run_id: + raise ValueError("No run has been started") + async for payload in self.astream_results_by_id(self.run_id): + yield payload From 1a047bb3c566b333fbccb9f9acb43c083979ea97 Mon Sep 17 00:00:00 2001 From: Ben Lewis Date: Tue, 13 Jan 2026 19:36:19 +0200 Subject: [PATCH 02/24] Format llm_behavior_eval --- hirundo/llm_behavior_eval.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/hirundo/llm_behavior_eval.py b/hirundo/llm_behavior_eval.py index 8df92d12..5bdadba1 100644 --- a/hirundo/llm_behavior_eval.py +++ b/hirundo/llm_behavior_eval.py @@ -381,9 +381,7 @@ async def astream_results_by_id(run_id: str) -> AsyncGenerator[dict, None]: try: yield json.loads(sse_event.data) except json.JSONDecodeError: - logger.debug( - "Non-JSON SSE payload received: %s", sse_event.data - ) + logger.debug("Non-JSON SSE payload received: %s", sse_event.data) yield {"data": sse_event.data} async def astream_results(self) -> AsyncGenerator[dict, None]: From 06d040cbc93c742a289ccdd7ddea97e91f64a0f2 Mon Sep 17 00:00:00 2001 From: Ben Lewis Date: Tue, 13 Jan 2026 23:43:06 +0200 Subject: [PATCH 03/24] Fix optional type hints in llm behavior eval --- hirundo/llm_behavior_eval.py | 61 ++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/hirundo/llm_behavior_eval.py b/hirundo/llm_behavior_eval.py index 5bdadba1..0bd08650 100644 --- 
a/hirundo/llm_behavior_eval.py +++ b/hirundo/llm_behavior_eval.py @@ -1,6 +1,5 @@ import datetime import json -import typing from collections.abc import AsyncGenerator, Generator from enum import Enum @@ -51,21 +50,21 @@ class BiasType(str, Enum): class JudgeModel(BaseModel): path_or_repo_id: str - token: typing.Optional[str] = None - batch_size: typing.Optional[int] = None - output_tokens: typing.Optional[int] = None - use_4bit: typing.Optional[bool] = None + token: str | None = None + batch_size: int | None = None + output_tokens: int | None = None + use_4bit: bool | None = None class EvalRunInfo(BaseModel): - organization_id: typing.Optional[int] = None - name: typing.Optional[str] = None - model_id: typing.Optional[int] = None - source_run_id: typing.Optional[str] = None - file_path: typing.Optional[str] = None - preset_type: typing.Optional[PresetType] = None - bias_type: typing.Optional[BiasType] = None - judge_model: typing.Optional[JudgeModel] = None + organization_id: int | None = None + name: str | None = None + model_id: int | None = None + source_run_id: str | None = None + file_path: str | None = None + preset_type: PresetType | None = None + bias_type: BiasType | None = None + judge_model: JudgeModel | None = None class OutputLlm(BaseModel): @@ -96,10 +95,10 @@ class LlmEvalMetricRow(BaseModel): benchmark: str metric: str - original: typing.Optional[typing.Union[float, str]] = None - post_unlearning: typing.Optional[typing.Union[float, str]] = None - reduction_percent: typing.Optional[float] = None - subset: typing.Optional[str] = None + original: float | str | None = None + post_unlearning: float | str | None = None + reduction_percent: float | None = None + subset: str | None = None class LlmEvalMetrics(BaseModel): @@ -111,27 +110,27 @@ class LlmEvalMetrics(BaseModel): class EvalRunRecord(BaseModel): id: int name: str - model_id: typing.Optional[int] - model: typing.Optional[OutputLlm] - source_run_id: typing.Optional[str] - source_run: 
typing.Optional[OutputUnlearningLlmRun] - file_path: typing.Optional[str] - preset_type: typing.Optional[PresetType] - bias_type: typing.Optional[BiasType] - judge_model: typing.Optional[JudgeModel] + model_id: int | None + model: OutputLlm | None + source_run_id: str | None + source_run: OutputUnlearningLlmRun | None + file_path: str | None + preset_type: PresetType | None + bias_type: BiasType | None + judge_model: JudgeModel | None run_id: str - mlflow_run_id: typing.Optional[str] + mlflow_run_id: str | None status: str created_at: datetime.datetime pre_process_progress: float optimization_progress: float post_process_progress: float - metrics: typing.Optional[LlmEvalMetrics] = None - responses_zip_url: typing.Optional[str] = None + metrics: LlmEvalMetrics | None = None + responses_zip_url: str | None = None class LlmBehaviorEval: - def __init__(self, run_id: typing.Optional[str] = None): + def __init__(self, run_id: str | None = None): self.run_id = run_id @staticmethod @@ -192,7 +191,7 @@ def _parse_eval_run_record(response_payload: dict) -> EvalRunRecord: @staticmethod def launch_eval_run( - model_or_run: typing.Union[ModelOrRun, str], + model_or_run: ModelOrRun | str, run_info: EvalRunInfo, ) -> str: """ @@ -314,7 +313,7 @@ def get_run_info_by_id(run_id: str) -> EvalRunRecord: @staticmethod def list_runs( - organization_id: typing.Optional[int] = None, + organization_id: int | None = None, archived: bool = False, ) -> list[EvalRunRecord]: """ From 415f7e39e00a444a6ed76759bca57d9c5ed0a253 Mon Sep 17 00:00:00 2001 From: Ben Lewis Date: Wed, 28 Jan 2026 21:49:11 +0200 Subject: [PATCH 04/24] Basic first implementation of matching naming for `check_run` instead of `stream_results` and adding `tqdm`, unzipping and loading of CSVs Essentially matching the data QA behavior --- hirundo/__init__.py | 30 ++- hirundo/_hirundo_error.py | 6 + hirundo/_llm_pipeline.py | 4 +- hirundo/_llm_sources.py | 43 ++++ hirundo/_run_checking.py | 35 ++- hirundo/_sse_event_data.py | 24 
++ hirundo/dataset_qa.py | 7 +- hirundo/llm_behavior_eval.py | 225 ++++++++++++++---- hirundo/llm_behavior_eval_results.py | 27 +++ hirundo/llm_bias_type.py | 11 + hirundo/unlearning_llm.py | 51 +--- hirundo/unzip.py | 84 ++++++- .../llm_behavior_eval_test.py | 40 ++++ 13 files changed, 460 insertions(+), 127 deletions(-) create mode 100644 hirundo/_hirundo_error.py create mode 100644 hirundo/_llm_sources.py create mode 100644 hirundo/_sse_event_data.py create mode 100644 hirundo/llm_behavior_eval_results.py create mode 100644 hirundo/llm_bias_type.py create mode 100644 tests/llm-behavior-eval/llm_behavior_eval_test.py diff --git a/hirundo/__init__.py b/hirundo/__init__.py index 83d1cca1..2cb0ca16 100644 --- a/hirundo/__init__.py +++ b/hirundo/__init__.py @@ -1,3 +1,12 @@ +from ._hirundo_error import HirundoError +from ._llm_sources import ( + HuggingFaceTransformersModel, + HuggingFaceTransformersModelOutput, + LlmSources, + LlmSourcesOutput, + LocalTransformersModel, + ModelSourceType, +) from .dataset_enum import ( DatasetMetadataType, LabelingType, @@ -5,7 +14,7 @@ ) from .dataset_qa import ( ClassificationRunArgs, - HirundoError, + HirundoDatasetQaError, ModalityType, ObjectDetectionRunArgs, QADataset, @@ -24,15 +33,16 @@ KeylabsObjSegVideo, ) from .llm_behavior_eval import ( - BiasType, EvalRunInfo, EvalRunRecord, + HirundoLlmBehaviorEvalError, JudgeModel, LlmBehaviorEval, - LlmBehaviorEvalError, ModelOrRun, PresetType, ) +from .llm_behavior_eval_results import LlmBehaviorEvalResults +from .llm_bias_type import BiasType from .storage import ( StorageConfig, StorageGCP, @@ -42,20 +52,18 @@ ) from .unlearning_llm import ( BiasRunInfo, - BiasType, - HuggingFaceTransformersModel, LlmModel, - LlmSources, LlmUnlearningRun, - LocalTransformersModel, ) from .unzip import load_df, load_from_zip __all__ = [ "COCO", "YOLO", - "HirundoError", "HirundoCSV", + "HirundoError", + "HirundoDatasetQaError", + "HirundoLlmBehaviorEvalError", "KeylabsAuth", "KeylabsObjDetImages", 
"KeylabsObjDetVideo", @@ -67,7 +75,7 @@ "EvalRunRecord", "JudgeModel", "LlmBehaviorEval", - "LlmBehaviorEvalError", + "LlmBehaviorEvalResults", "ModalityType", "ModelOrRun", "PresetType", @@ -87,14 +95,16 @@ "StorageConfig", "DatasetQAResults", "BiasRunInfo", - "BiasType", "HuggingFaceTransformersModel", + "HuggingFaceTransformersModelOutput", "LlmModel", "LlmSources", + "LlmSourcesOutput", "LlmUnlearningRun", "LocalTransformersModel", "load_df", "load_from_zip", + "ModelSourceType", ] __version__ = "0.1.22" diff --git a/hirundo/_hirundo_error.py b/hirundo/_hirundo_error.py new file mode 100644 index 00000000..5f798aff --- /dev/null +++ b/hirundo/_hirundo_error.py @@ -0,0 +1,6 @@ +class HirundoError(Exception): + """ + Custom exception used to indicate errors in `hirundo` dataset QA runs + """ + + pass diff --git a/hirundo/_llm_pipeline.py b/hirundo/_llm_pipeline.py index 5224e732..a78c6b39 100644 --- a/hirundo/_llm_pipeline.py +++ b/hirundo/_llm_pipeline.py @@ -48,9 +48,11 @@ def get_hf_pipeline_for_run_given_model( from transformers.models.auto.tokenization_auto import AutoTokenizer from transformers.pipelines import pipeline - from hirundo.unlearning_llm import ( + from hirundo._llm_sources import ( HuggingFaceTransformersModel, HuggingFaceTransformersModelOutput, + ) + from hirundo.unlearning_llm import ( LlmUnlearningRun, ) diff --git a/hirundo/_llm_sources.py b/hirundo/_llm_sources.py new file mode 100644 index 00000000..03e1a112 --- /dev/null +++ b/hirundo/_llm_sources.py @@ -0,0 +1,43 @@ +from enum import Enum +from typing import Literal + +from pydantic import BaseModel, ConfigDict + + +class ModelSourceType(str, Enum): + HUGGINGFACE_TRANSFORMERS = "huggingface_transformers" + LOCAL_TRANSFORMERS = "local_transformers" + + +class HuggingFaceTransformersModel(BaseModel): + model_config = ConfigDict(protected_namespaces=("model_validate", "model_dump")) + + type: Literal[ModelSourceType.HUGGINGFACE_TRANSFORMERS] = ( + ModelSourceType.HUGGINGFACE_TRANSFORMERS 
+ ) + revision: str | None = None + code_revision: str | None = None + model_name: str + token: str | None = None + + +class HuggingFaceTransformersModelOutput(BaseModel): + model_config = ConfigDict(protected_namespaces=("model_validate", "model_dump")) + + type: Literal[ModelSourceType.HUGGINGFACE_TRANSFORMERS] = ( + ModelSourceType.HUGGINGFACE_TRANSFORMERS + ) + model_name: str + + +class LocalTransformersModel(BaseModel): + type: Literal[ModelSourceType.LOCAL_TRANSFORMERS] = ( + ModelSourceType.LOCAL_TRANSFORMERS + ) + revision: None = None + code_revision: None = None + local_path: str + + +LlmSources = HuggingFaceTransformersModel | LocalTransformersModel +LlmSourcesOutput = HuggingFaceTransformersModelOutput | LocalTransformersModel diff --git a/hirundo/_run_checking.py b/hirundo/_run_checking.py index 4866454b..9bf68622 100644 --- a/hirundo/_run_checking.py +++ b/hirundo/_run_checking.py @@ -6,6 +6,7 @@ from tqdm import tqdm from hirundo._iter_sse_retrying import aiter_sse_retrying, iter_sse_retrying +from hirundo._sse_event_data import SseRunEventData from hirundo.logger import get_logger _logger = get_logger(__name__) @@ -62,7 +63,9 @@ def build_status_text_map( } -def get_state(payload: dict, status_keys: tuple[str, ...]) -> str | None: +def get_state( + payload: dict | SseRunEventData, status_keys: tuple[str, ...] +) -> str | None: """ Return the first non-null state value from a payload using a list of keys. @@ -74,7 +77,7 @@ def get_state(payload: dict, status_keys: tuple[str, ...]) -> str | None: The first non-null state value, or None if none are present. 
""" for key in status_keys: - value = payload.get(key) + value = payload.get(key) if isinstance(payload, dict) else getattr(payload, key) if value is not None: return value return None @@ -222,7 +225,7 @@ async def aiter_run_events( def update_progress_from_result( - iteration: dict, + iteration: dict | SseRunEventData, progress: tqdm, *, uploading_text: str, @@ -240,13 +243,15 @@ def update_progress_from_result( Returns: True if a progress update occurred, False otherwise. """ - if ( - iteration.get("result") - and isinstance(iteration["result"], dict) - and iteration["result"].get("result") - and isinstance(iteration["result"]["result"], str) - ): - result_info = iteration["result"]["result"].split(":") + result_outer = ( + iteration.get("result") if isinstance(iteration, dict) else iteration.result + ) + result_inner = ( + result_outer.get("result") if isinstance(result_outer, dict) else result_outer + ) + + if result_inner: + result_info = result_inner.split(":") if len(result_info) > 1: stage = result_info[0] current_progress_percentage = float( @@ -268,7 +273,7 @@ def update_progress_from_result( def handle_run_failure( - iteration: dict, *, error_cls: type[Exception], run_label: str + iteration: dict | SseRunEventData, *, error_cls: type[Exception], run_label: str ) -> None: """ Raise a run-specific failure exception based on the iteration payload. @@ -278,6 +283,10 @@ def handle_run_failure( error_cls: Exception type to raise. run_label: Human-readable label for the run type. 
""" - if iteration.get("result"): - raise error_cls(f"{run_label} run failed with error: {iteration['result']}") + if ( + result := iteration.get("result") + if isinstance(iteration, dict) + else iteration.result + ): + raise error_cls(f"{run_label} run failed with error: {result}") raise error_cls(f"{run_label} run failed with an unknown error") diff --git a/hirundo/_sse_event_data.py b/hirundo/_sse_event_data.py new file mode 100644 index 00000000..00155f6f --- /dev/null +++ b/hirundo/_sse_event_data.py @@ -0,0 +1,24 @@ +from typing import Any + +from pydantic import BaseModel, TypeAdapter + +from hirundo._run_checking import RunStatus + + +class SseRunEventData(BaseModel): + id: str + state: RunStatus + result: str | dict | None + + +_SSE_RUN_EVENT_ADAPTER = TypeAdapter(SseRunEventData) + + +def _parse_sse_payload(payload: Any) -> SseRunEventData: + if isinstance(payload, dict): + if "data" in payload: + data = payload["data"] + if isinstance(data, dict): + return _SSE_RUN_EVENT_ADAPTER.validate_python(data) + + raise ValueError(f"Invalid SSE payload: {payload}") diff --git a/hirundo/dataset_qa.py b/hirundo/dataset_qa.py index 44f0bca9..99e211f6 100644 --- a/hirundo/dataset_qa.py +++ b/hirundo/dataset_qa.py @@ -11,6 +11,7 @@ from hirundo._constraints import validate_labeling_info, validate_url from hirundo._env import API_HOST from hirundo._headers import get_headers +from hirundo._hirundo_error import HirundoError from hirundo._http import raise_for_status_with_reason, requests from hirundo._run_checking import ( STATUS_TO_PROGRESS_MAP, @@ -34,11 +35,7 @@ logger = get_logger(__name__) -class HirundoError(Exception): - """ - Custom exception used to indicate errors in `hirundo` dataset QA runs - """ - +class HirundoDatasetQaError(HirundoError): pass diff --git a/hirundo/llm_behavior_eval.py b/hirundo/llm_behavior_eval.py index 0bd08650..72c27ba6 100644 --- a/hirundo/llm_behavior_eval.py +++ b/hirundo/llm_behavior_eval.py @@ -1,26 +1,42 @@ import datetime -import 
json +import typing from collections.abc import AsyncGenerator, Generator from enum import Enum +from typing import overload import httpx from pydantic import BaseModel, ConfigDict +from tqdm import tqdm +from tqdm.contrib.logging import logging_redirect_tqdm from hirundo._env import API_HOST from hirundo._headers import get_headers +from hirundo._hirundo_error import HirundoError from hirundo._http import raise_for_status_with_reason, requests from hirundo._iter_sse_retrying import aiter_sse_retrying, iter_sse_retrying +from hirundo._llm_sources import HuggingFaceTransformersModelOutput, LlmSourcesOutput +from hirundo._run_checking import ( + STATUS_TO_PROGRESS_MAP, + RunStatus, + build_status_text_map, + get_state, + handle_run_failure, + update_progress_from_result, +) +from hirundo._sse_event_data import SseRunEventData, _parse_sse_payload from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT +from hirundo.llm_behavior_eval_results import LlmBehaviorEvalResults +from hirundo.llm_bias_type import BiasType from hirundo.logger import get_logger +from hirundo.unzip import download_and_extract_llm_behavior_eval_zip logger = get_logger(__name__) -class LlmBehaviorEvalError(Exception): - """ - Custom exception used to indicate errors in `hirundo` LLM behavior eval runs. 
- """ +STATUS_TO_TEXT_MAP = build_status_text_map("LLM behavior eval") + +class HirundoLlmBehaviorEvalError(HirundoError): pass @@ -38,16 +54,6 @@ class PresetType(str, Enum): INJECTION_EVAL = "INJECTION_EVAL" -class BiasType(str, Enum): - ALL = "ALL" - RACE = "RACE" - NATIONALITY = "NATIONALITY" - GENDER = "GENDER" - PHYSICAL_APPEARANCE = "PHYSICAL_APPEARANCE" - RELIGION = "RELIGION" - AGE = "AGE" - - class JudgeModel(BaseModel): path_or_repo_id: str token: str | None = None @@ -77,7 +83,7 @@ class OutputLlm(BaseModel): created_at: datetime.datetime updated_at: datetime.datetime model_name: str - model_source: dict + model_source: LlmSourcesOutput class OutputUnlearningLlmRun(BaseModel): @@ -86,6 +92,7 @@ class OutputUnlearningLlmRun(BaseModel): id: int name: str run_id: str + model: OutputLlm | None = None status: str created_at: datetime.datetime @@ -223,7 +230,7 @@ def launch_eval_run( or response_payload.get("id") ) if not run_identifier: - raise LlmBehaviorEvalError( + raise HirundoLlmBehaviorEvalError( "Unable to determine the run ID from the response payload." 
) return run_identifier @@ -242,7 +249,7 @@ def cancel_by_id(run_id: str) -> None: def cancel(self) -> None: if not self.run_id: - raise ValueError("No run has been started") + raise HirundoLlmBehaviorEvalError("No run has been started") self.cancel_by_id(self.run_id) @staticmethod @@ -260,7 +267,7 @@ def rename_by_id(run_id: str, new_name: str) -> None: def rename(self, new_name: str) -> None: if not self.run_id: - raise ValueError("No run has been started") + raise HirundoLlmBehaviorEvalError("No run has been started") self.rename_by_id(self.run_id, new_name) @staticmethod @@ -277,7 +284,7 @@ def archive_by_id(run_id: str) -> None: def archive(self) -> None: if not self.run_id: - raise ValueError("No run has been started") + raise HirundoLlmBehaviorEvalError("No run has been started") self.archive_by_id(self.run_id) @staticmethod @@ -294,7 +301,7 @@ def restore_by_id(run_id: str) -> None: def restore(self) -> None: if not self.run_id: - raise ValueError("No run has been started") + raise HirundoLlmBehaviorEvalError("No run has been started") self.restore_by_id(self.run_id) @staticmethod @@ -336,10 +343,24 @@ def list_runs( ] @staticmethod - def stream_results_by_id(run_id: str) -> Generator[dict, None, None]: - """ - Stream evaluation results for a run. 
- """ + def _resolve_model_name(run_info: EvalRunRecord) -> str | None: + if run_info.model and isinstance( + run_info.model.model_source, HuggingFaceTransformersModelOutput + ): + return run_info.model.model_source.model_name + if ( + run_info.source_run + and run_info.source_run.model + and isinstance( + run_info.source_run.model.model_source, + HuggingFaceTransformersModelOutput, + ) + ): + return run_info.source_run.model.model_source.model_name + return None + + @staticmethod + def _check_run_by_id(run_id: str) -> Generator[SseRunEventData, None, None]: with httpx.Client(timeout=httpx.Timeout(None, connect=5.0)) as client: for sse_event in iter_sse_retrying( client, @@ -349,22 +370,133 @@ def stream_results_by_id(run_id: str) -> Generator[dict, None, None]: ): if sse_event.event == "ping": continue - try: - yield json.loads(sse_event.data) - except json.JSONDecodeError: - logger.debug("Non-JSON SSE payload received: %s", sse_event.data) - yield {"data": sse_event.data} + yield _parse_sse_payload(sse_event.data) + + @staticmethod + @overload + def check_run_by_id( + run_id: str, stop_on_manual_approval: typing.Literal[True] + ) -> LlmBehaviorEvalResults | None: ... + + @staticmethod + @overload + def check_run_by_id( + run_id: str, stop_on_manual_approval: typing.Literal[False] = False + ) -> LlmBehaviorEvalResults: ... + + @staticmethod + @overload + def check_run_by_id( + run_id: str, stop_on_manual_approval: bool + ) -> LlmBehaviorEvalResults | None: ... 
+ + @staticmethod + def check_run_by_id( + run_id: str, stop_on_manual_approval: bool = False + ) -> LlmBehaviorEvalResults | None: + """ + Check the status of a run given its ID + + Args: + run_id: The `run_id` produced by a `launch_eval_run` call + stop_on_manual_approval: If True, the function will return `None` if the run is awaiting manual approval - def stream_results(self) -> Generator[dict, None, None]: + Returns: + An LlmBehaviorEvalResults object with the results of the evaluation run + + Raises: + HirundoLlmBehaviorEvalError: If the maximum number of retries is reached or if the run fails + """ + logger.debug("Checking run with ID: %s", run_id) + with logging_redirect_tqdm(): + progress_bar = tqdm(total=100.0) + for iteration in LlmBehaviorEval._check_run_by_id(run_id): + state = get_state(iteration, ("state",)) + if state in STATUS_TO_PROGRESS_MAP: + progress_bar.set_description(STATUS_TO_TEXT_MAP[state]) + progress_bar.n = STATUS_TO_PROGRESS_MAP[state] + logger.debug("Setting progress to %s", progress_bar.n) + progress_bar.refresh() + if state in [ + RunStatus.FAILURE.value, + RunStatus.REJECTED.value, + RunStatus.REVOKED.value, + ]: + logger.error( + "State is failure, rejected, or revoked: %s", + state, + ) + handle_run_failure( + iteration, + error_cls=HirundoLlmBehaviorEvalError, + run_label="LLM behavior eval", + ) + elif state == RunStatus.SUCCESS.value: + progress_bar.close() + zip_temporary_url = iteration.result + if not zip_temporary_url or not isinstance( + zip_temporary_url, str + ): + raise HirundoLlmBehaviorEvalError( + "LLM behavior eval run completed without a results URL." 
+ ) + run_info = LlmBehaviorEval.get_run_info_by_id(run_id) + model_name = LlmBehaviorEval._resolve_model_name(run_info) + return download_and_extract_llm_behavior_eval_zip( + run_id, + zip_temporary_url, + model_name, + ) + elif ( + state == RunStatus.AWAITING_MANUAL_APPROVAL.value + and stop_on_manual_approval + ): + progress_bar.close() + return None + elif state is None: + update_progress_from_result( + iteration, + progress_bar, + uploading_text="LLM behavior eval run completed. Uploading results", + log=logger, + ) + raise HirundoLlmBehaviorEvalError( + "LLM behavior eval run failed with an unknown error in check_run_by_id" + ) + + @overload + def check_run( + self, stop_on_manual_approval: typing.Literal[True] + ) -> LlmBehaviorEvalResults | None: ... + + @overload + def check_run( + self, stop_on_manual_approval: typing.Literal[False] = False + ) -> LlmBehaviorEvalResults: ... + + def check_run( + self, stop_on_manual_approval: bool = False + ) -> LlmBehaviorEvalResults | None: + """ + Check the status of the current active instance's run. + + Returns: + An LlmBehaviorEvalResults object with the results of the evaluation run + """ if not self.run_id: - raise ValueError("No run has been started") - yield from self.stream_results_by_id(self.run_id) + raise HirundoLlmBehaviorEvalError("No run has been started") + return self.check_run_by_id(self.run_id, stop_on_manual_approval) @staticmethod - async def astream_results_by_id(run_id: str) -> AsyncGenerator[dict, None]: + async def acheck_run_by_id(run_id: str) -> AsyncGenerator[SseRunEventData, None]: """ - Async stream evaluation results for a run. + Async version of :func:`check_run_by_id` + + Check the status of a run given its ID. + + This generator will produce values to show progress of the run. 
""" + logger.debug("Checking run with ID: %s", run_id) async with httpx.AsyncClient( timeout=httpx.Timeout(None, connect=5.0) ) as client: @@ -377,14 +509,19 @@ async def astream_results_by_id(run_id: str) -> AsyncGenerator[dict, None]: async for sse_event in async_iterator: if sse_event.event == "ping": continue - try: - yield json.loads(sse_event.data) - except json.JSONDecodeError: - logger.debug("Non-JSON SSE payload received: %s", sse_event.data) - yield {"data": sse_event.data} + yield _parse_sse_payload(sse_event.data) - async def astream_results(self) -> AsyncGenerator[dict, None]: + async def acheck_run(self) -> AsyncGenerator[SseRunEventData, None]: + """ + Async version of :func:`check_run` + + Check the status of the current active instance's run. + + This generator will produce values to show progress of the run. + + Note: This function does not handle errors nor show progress. It is expected that you do that. + """ if not self.run_id: - raise ValueError("No run has been started") - async for payload in self.astream_results_by_id(self.run_id): - yield payload + raise HirundoLlmBehaviorEvalError("No run has been started") + async for iteration in self.acheck_run_by_id(self.run_id): + yield iteration diff --git a/hirundo/llm_behavior_eval_results.py b/hirundo/llm_behavior_eval_results.py new file mode 100644 index 00000000..f13e9e0b --- /dev/null +++ b/hirundo/llm_behavior_eval_results.py @@ -0,0 +1,27 @@ +import typing +from pathlib import Path + +from pydantic import BaseModel + +T = typing.TypeVar("T") + + +class LlmBehaviorEvalResults(BaseModel, typing.Generic[T]): + model_config = {"arbitrary_types_allowed": True} + + cached_zip_path: Path + """ + The path to the cached zip file of the results + """ + model_name: str | None = None + """ + The model name used to locate results in the zip file + """ + summary_brief: T + """ + A polars/pandas DataFrame containing the summary_brief CSV + """ + summary_full: T + """ + A polars/pandas DataFrame containing 
the summary_full CSV + """ diff --git a/hirundo/llm_bias_type.py b/hirundo/llm_bias_type.py new file mode 100644 index 00000000..bd45501d --- /dev/null +++ b/hirundo/llm_bias_type.py @@ -0,0 +1,11 @@ +from enum import Enum + + +class BiasType(str, Enum): + ALL = "ALL" + RACE = "RACE" + NATIONALITY = "NATIONALITY" + GENDER = "GENDER" + PHYSICAL_APPEARANCE = "PHYSICAL_APPEARANCE" + RELIGION = "RELIGION" + AGE = "AGE" diff --git a/hirundo/unlearning_llm.py b/hirundo/unlearning_llm.py index b36ea14f..19a58e04 100644 --- a/hirundo/unlearning_llm.py +++ b/hirundo/unlearning_llm.py @@ -12,6 +12,7 @@ from hirundo._headers import get_headers from hirundo._http import raise_for_status_with_reason, requests from hirundo._llm_pipeline import get_hf_pipeline_for_run_given_model +from hirundo._llm_sources import LlmSources, LlmSourcesOutput from hirundo._run_checking import ( STATUS_TO_PROGRESS_MAP, RunStatus, @@ -24,6 +25,7 @@ ) from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT from hirundo.dataset_qa import HirundoError +from hirundo.llm_bias_type import BiasType from hirundo.logger import get_logger if TYPE_CHECKING: @@ -34,45 +36,6 @@ logger = get_logger(__name__) -class ModelSourceType(str, Enum): - HUGGINGFACE_TRANSFORMERS = "huggingface_transformers" - LOCAL_TRANSFORMERS = "local_transformers" - - -class HuggingFaceTransformersModel(BaseModel): - model_config = ConfigDict(protected_namespaces=("model_validate", "model_dump")) - - type: Literal[ModelSourceType.HUGGINGFACE_TRANSFORMERS] = ( - ModelSourceType.HUGGINGFACE_TRANSFORMERS - ) - revision: str | None = None - code_revision: str | None = None - model_name: str - token: str | None = None - - -class HuggingFaceTransformersModelOutput(BaseModel): - model_config = ConfigDict(protected_namespaces=("model_validate", "model_dump")) - - type: Literal[ModelSourceType.HUGGINGFACE_TRANSFORMERS] = ( - ModelSourceType.HUGGINGFACE_TRANSFORMERS - ) - model_name: str - - -class LocalTransformersModel(BaseModel): - type: 
Literal[ModelSourceType.LOCAL_TRANSFORMERS] = ( - ModelSourceType.LOCAL_TRANSFORMERS - ) - revision: None = None - code_revision: None = None - local_path: str - - -LlmSources = HuggingFaceTransformersModel | LocalTransformersModel -LlmSourcesOutput = HuggingFaceTransformersModelOutput | LocalTransformersModel - - class LlmModel(BaseModel): model_config = ConfigDict(protected_namespaces=("model_validate", "model_dump")) @@ -235,16 +198,6 @@ class UnlearningLlmAdvancedOptions(BaseModel): max_tokens_for_model: dict[DatasetType, int] | int | None = None -class BiasType(str, Enum): - ALL = "ALL" - RACE = "RACE" - NATIONALITY = "NATIONALITY" - GENDER = "GENDER" - PHYSICAL_APPEARANCE = "PHYSICAL_APPEARANCE" - RELIGION = "RELIGION" - AGE = "AGE" - - class UtilityType(str, Enum): DEFAULT = "DEFAULT" CUSTOM = "CUSTOM" diff --git a/hirundo/unzip.py b/hirundo/unzip.py index 113f3be7..f0cf3e22 100644 --- a/hirundo/unzip.py +++ b/hirundo/unzip.py @@ -23,6 +23,7 @@ DataFrameType, DatasetQAResults, ) +from hirundo.llm_behavior_eval_results import LlmBehaviorEvalResults from hirundo.logger import get_logger ZIP_FILE_CHUNK_SIZE = 50 * 1024 * 1024 # 50 MB @@ -150,11 +151,11 @@ def download_and_extract_zip( headers=headers, timeout=DOWNLOAD_READ_TIMEOUT, stream=True, - ) as r: - r.raise_for_status() - with open(zip_file_path, "wb") as f: - for chunk in r.iter_content(chunk_size=ZIP_FILE_CHUNK_SIZE): - f.write(chunk) + ) as response: + response.raise_for_status() + with open(zip_file_path, "wb") as output_file: + for chunk in response.iter_content(chunk_size=ZIP_FILE_CHUNK_SIZE): + output_file.write(chunk) logger.info( "Successfully downloaded the result zip file for run ID %s to %s", run_id, @@ -224,6 +225,79 @@ def download_and_extract_zip( ) +def download_and_extract_llm_behavior_eval_zip( + run_id: str, + zip_url: str, + model_name: str | None = None, +) -> LlmBehaviorEvalResults[DataFrameType]: + """ + Download and extract the LLM behavior evaluation results zip file. 
+ + Args: + run_id: The ID of the LLM behavior eval run. + zip_url: The URL of the zip file to download. + model_name (optional): The full model name to resolve the folder within the zip. + + Returns: + The LLM behavior eval results object. + """ + cache_dir = Path.home() / ".hirundo" / "cache" + cache_dir.mkdir(parents=True, exist_ok=True) + zip_file_path = cache_dir / f"{run_id}.zip" + + headers = None + if Url(zip_url).scheme == "file": + zip_url = f"{API_HOST}/llm-behavior-eval/run/local-download" + zip_url.replace( + "file://", "" + ) + headers = _get_auth_headers() + with requests.get( + zip_url, + headers=headers, + timeout=DOWNLOAD_READ_TIMEOUT, + stream=True, + ) as r: + r.raise_for_status() + with open(zip_file_path, "wb") as f: + for chunk in r.iter_content(chunk_size=ZIP_FILE_CHUNK_SIZE): + f.write(chunk) + logger.info( + "Successfully downloaded the LLM behavior eval result zip file for run ID %s to %s", + run_id, + zip_file_path, + ) + + if model_name: + model_folder = model_name.split("/")[-1] + summary_brief_name = f"{model_folder}/summary_brief.csv" + summary_full_name = f"{model_folder}/summary_full.csv" + + with zipfile.ZipFile(zip_file_path, "r") as zip_file: + filenames = [file.filename for file in zip_file.filelist] + if summary_brief_name not in filenames: + raise ValueError( + f"Missing {summary_brief_name} in LLM behavior eval zip for run {run_id}" + ) + if summary_full_name not in filenames: + raise ValueError( + f"Missing {summary_full_name} in LLM behavior eval zip for run {run_id}" + ) + with zip_file.open(summary_brief_name) as summary_brief_file: + summary_brief_df = load_df(summary_brief_file) + with zip_file.open(summary_full_name) as summary_full_file: + summary_full_df = load_df(summary_full_file) + else: + summary_brief_df = None + summary_full_df = None + + return LlmBehaviorEvalResults[DataFrameType]( + cached_zip_path=zip_file_path, + model_name=model_name, + summary_brief=summary_brief_df, + summary_full=summary_full_df, + ) 
+ + def load_from_zip( zip_path: Path, file_name: str ) -> "pd.DataFrame | pl.DataFrame | None": diff --git a/tests/llm-behavior-eval/llm_behavior_eval_test.py b/tests/llm-behavior-eval/llm_behavior_eval_test.py new file mode 100644 index 00000000..a7de894a --- /dev/null +++ b/tests/llm-behavior-eval/llm_behavior_eval_test.py @@ -0,0 +1,40 @@ +import logging +import os + +from hirundo import ( + BiasType, + EvalRunInfo, + HuggingFaceTransformersModel, + LlmBehaviorEval, + LlmModel, + ModelOrRun, + PresetType, +) +from tests.testing_utils import get_unique_id + +logger = logging.getLogger(__name__) + +unique_id = get_unique_id() + + +def test_llm_behavior_eval(): + llm = LlmModel( + model_name=f"TEST-LLM-BEHAVIOR-EVAL-Granite-4-micro-{unique_id}", + model_source=HuggingFaceTransformersModel( + model_name="ibm-granite/granite-4.0-micro", + ), + ) + llm_id = llm.create() + run_info = EvalRunInfo( + name=f"TEST-LLM-BEHAVIOR-EVAL-RUN-{unique_id}", + model_id=llm_id, + preset_type=PresetType.BBQ_BIAS, + bias_type=BiasType.ALL, + ) + assert llm_id is not None + if os.getenv("FULL_TEST", "false") == "true": + run_id = LlmBehaviorEval.launch_eval_run(ModelOrRun.MODEL, run_info) + assert run_id is not None + results = LlmBehaviorEval.check_run_by_id(run_id) + assert results is not None + assert results.cached_zip_path is not None From b9027d8020c7ff32d939ed6a370a83b1bae6e57e Mon Sep 17 00:00:00 2001 From: Ben Lewis Date: Wed, 28 Jan 2026 21:50:34 +0200 Subject: [PATCH 05/24] Add AGENTS.md and new `dependency_groups` entry of `dev` for development --- .envrc | 2 +- .pre-commit-config.yaml | 2 +- AGENTS.md | 39 +++++++++++++++++++++++++++++++++++++++ pyproject.toml | 3 +++ uv.lock | 8 ++++++++ 5 files changed, 52 insertions(+), 2 deletions(-) create mode 100644 AGENTS.md diff --git a/.envrc b/.envrc index b2d2268b..550b4cae 100644 --- a/.envrc +++ b/.envrc @@ -1,2 +1,2 @@ watch_file uv.lock -uv sync --all-extras && source .venv/bin/activate +uv sync --group dev && source 
.venv/bin/activate diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 25c82ebb..314b9f2a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -75,4 +75,4 @@ repos: hooks: - id: uv-lock - id: uv-sync - args: ["--extra", "dev", "--extra", "docs", "--extra", "pandas", "--extra", "polars", "--extra", "transformers"] + args: ["--group", "dev"] diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 00000000..c2cd2baa --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,39 @@ +# Repository Guidelines + +## Project Structure & Module Organization + +- `hirundo/` holds the SDK source (CLI entry point is `hirundo.cli:app`). +- `tests/` contains pytest-based test coverage. +- `docs/` and `source/` contain Sphinx documentation assets. +- `notebooks/` and `on_prem_test_notebook.ipynb` provide example workflows. +- `requirements/` stores compiled dependency sets (for dev, docs, pandas, polars, transformers). + +## Build, Test, and Development Commands + +- `uv sync --group dev`: fast dependency sync with extras. +- `ruff check` / `ruff format`: lint and auto-format (run before PRs). +- `pytest`: run the test suite. +- `python -m build`: build the package artifacts. +- `pre-commit install`: enable git hooks (optional, but recommended). + +## Coding Style & Naming Conventions + +- Python 3.10+ codebase, 4-space indentation, line length 88 (Ruff defaults). +- Follow Ruff linting rules (`pyproject.toml`), with tests allowing `assert` usage. +- Prefer descriptive names; avoid short, cryptic identifiers in new code. + +## Testing Guidelines + +- Frameworks: `pytest` and `pytest-asyncio`. +- Place tests in `tests/`; name files `test_*.py`. +- Run locally with `pytest` before opening a PR (CI runs lint + integration tests). + +## Commit & Pull Request Guidelines + +- Recent commit history favors `SDK-: ` (e.g., `SDK-78: Migrate to basedpyright`). +- Include issue/PR references when available (e.g., `(#190)`). 
+- PRs should describe changes clearly and confirm `ruff check` and `ruff format` passed. + +## Security & Configuration Tips + +- Supported Python versions: CPython 3.10–3.13. diff --git a/pyproject.toml b/pyproject.toml index 0af15c33..544e0a03 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -102,6 +102,9 @@ pandas = ["pandas>=2.2.3"] polars = ["polars>=1.0.0"] transformers = ["transformers>=4.57.3", "peft>=0.18.1", "accelerate>=1.12.0"] +[dependency-groups] +dev = ["hirundo[dev,docs,pandas,polars,transformers]"] + [tool.bumpver] current_version = "0.1.3b1" version_pattern = "MAJOR.MINOR.PATCH[PYTAGNUM]" diff --git a/uv.lock b/uv.lock index 82686b7a..bd1e3f97 100644 --- a/uv.lock +++ b/uv.lock @@ -619,6 +619,11 @@ transformers = [ { name = "transformers" }, ] +[package.dev-dependencies] +dev = [ + { name = "hirundo", extra = ["dev", "docs", "pandas", "polars", "transformers"] }, +] + [package.metadata] requires-dist = [ { name = "accelerate", marker = "extra == 'transformers'", specifier = ">=1.12.0" }, @@ -680,6 +685,9 @@ requires-dist = [ ] provides-extras = ["dev", "docs", "pandas", "polars", "transformers"] +[package.metadata.requires-dev] +dev = [{ name = "hirundo", extras = ["dev", "docs", "pandas", "polars", "transformers"] }] + [[package]] name = "httpcore" version = "1.0.9" From d141b9d1fc174068c5af0b93f4e9c33834485b98 Mon Sep 17 00:00:00 2001 From: Ben Lewis Date: Wed, 28 Jan 2026 21:54:23 +0200 Subject: [PATCH 06/24] Fix `RunStatus` circular dependency --- hirundo/__init__.py | 2 ++ hirundo/_run_checking.py | 13 +------------ hirundo/_run_status.py | 12 ++++++++++++ hirundo/_sse_event_data.py | 2 +- hirundo/dataset_qa.py | 2 +- hirundo/llm_behavior_eval.py | 2 +- hirundo/unlearning_llm.py | 2 +- scripts/cleanup_test_artifacts.py | 3 ++- tests/dataset_qa_shared.py | 2 +- 9 files changed, 22 insertions(+), 18 deletions(-) create mode 100644 hirundo/_run_status.py diff --git a/hirundo/__init__.py b/hirundo/__init__.py index 2cb0ca16..7a7b0981 100644 
--- a/hirundo/__init__.py +++ b/hirundo/__init__.py @@ -7,6 +7,7 @@ LocalTransformersModel, ModelSourceType, ) +from ._run_status import RunStatus from .dataset_enum import ( DatasetMetadataType, LabelingType, @@ -105,6 +106,7 @@ "load_df", "load_from_zip", "ModelSourceType", + "RunStatus", ] __version__ = "0.1.22" diff --git a/hirundo/_run_checking.py b/hirundo/_run_checking.py index 9bf68622..b36f2232 100644 --- a/hirundo/_run_checking.py +++ b/hirundo/_run_checking.py @@ -1,11 +1,11 @@ import json from collections.abc import AsyncGenerator, Generator -from enum import Enum import httpx from tqdm import tqdm from hirundo._iter_sse_retrying import aiter_sse_retrying, iter_sse_retrying +from hirundo._run_status import RunStatus from hirundo._sse_event_data import SseRunEventData from hirundo.logger import get_logger @@ -14,17 +14,6 @@ DEFAULT_MAX_RETRIES = 200 -class RunStatus(Enum): - PENDING = "PENDING" - STARTED = "STARTED" - SUCCESS = "SUCCESS" - FAILURE = "FAILURE" - AWAITING_MANUAL_APPROVAL = "AWAITING MANUAL APPROVAL" - REVOKED = "REVOKED" - REJECTED = "REJECTED" - RETRY = "RETRY" - - STATUS_TO_PROGRESS_MAP = { RunStatus.STARTED.value: 0.0, RunStatus.PENDING.value: 0.0, diff --git a/hirundo/_run_status.py b/hirundo/_run_status.py new file mode 100644 index 00000000..1cc78f53 --- /dev/null +++ b/hirundo/_run_status.py @@ -0,0 +1,12 @@ +from enum import Enum + + +class RunStatus(str, Enum): + PENDING = "PENDING" + STARTED = "STARTED" + SUCCESS = "SUCCESS" + FAILURE = "FAILURE" + AWAITING_MANUAL_APPROVAL = "AWAITING MANUAL APPROVAL" + REVOKED = "REVOKED" + REJECTED = "REJECTED" + RETRY = "RETRY" diff --git a/hirundo/_sse_event_data.py b/hirundo/_sse_event_data.py index 00155f6f..877051db 100644 --- a/hirundo/_sse_event_data.py +++ b/hirundo/_sse_event_data.py @@ -2,7 +2,7 @@ from pydantic import BaseModel, TypeAdapter -from hirundo._run_checking import RunStatus +from hirundo._run_status import RunStatus class SseRunEventData(BaseModel): diff --git 
a/hirundo/dataset_qa.py b/hirundo/dataset_qa.py index 99e211f6..8428b69e 100644 --- a/hirundo/dataset_qa.py +++ b/hirundo/dataset_qa.py @@ -15,7 +15,6 @@ from hirundo._http import raise_for_status_with_reason, requests from hirundo._run_checking import ( STATUS_TO_PROGRESS_MAP, - RunStatus, aiter_run_events, build_status_text_map, get_state, @@ -23,6 +22,7 @@ iter_run_events, update_progress_from_result, ) +from hirundo._run_status import RunStatus from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT from hirundo._urls import HirundoUrl from hirundo.dataset_enum import DatasetMetadataType, LabelingType diff --git a/hirundo/llm_behavior_eval.py b/hirundo/llm_behavior_eval.py index 72c27ba6..979443f6 100644 --- a/hirundo/llm_behavior_eval.py +++ b/hirundo/llm_behavior_eval.py @@ -17,12 +17,12 @@ from hirundo._llm_sources import HuggingFaceTransformersModelOutput, LlmSourcesOutput from hirundo._run_checking import ( STATUS_TO_PROGRESS_MAP, - RunStatus, build_status_text_map, get_state, handle_run_failure, update_progress_from_result, ) +from hirundo._run_status import RunStatus from hirundo._sse_event_data import SseRunEventData, _parse_sse_payload from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT from hirundo.llm_behavior_eval_results import LlmBehaviorEvalResults diff --git a/hirundo/unlearning_llm.py b/hirundo/unlearning_llm.py index 19a58e04..a56bd7e4 100644 --- a/hirundo/unlearning_llm.py +++ b/hirundo/unlearning_llm.py @@ -15,7 +15,6 @@ from hirundo._llm_sources import LlmSources, LlmSourcesOutput from hirundo._run_checking import ( STATUS_TO_PROGRESS_MAP, - RunStatus, aiter_run_events, build_status_text_map, get_state, @@ -23,6 +22,7 @@ iter_run_events, update_progress_from_result, ) +from hirundo._run_status import RunStatus from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT from hirundo.dataset_qa import HirundoError from hirundo.llm_bias_type import BiasType diff --git a/scripts/cleanup_test_artifacts.py 
b/scripts/cleanup_test_artifacts.py index 1307075e..e6357583 100644 --- a/scripts/cleanup_test_artifacts.py +++ b/scripts/cleanup_test_artifacts.py @@ -3,7 +3,8 @@ import requests from hirundo import GitRepo, QADataset, StorageConfig -from hirundo.dataset_qa import DataQARunOut, HirundoError, QADatasetOut, RunStatus +from hirundo._run_status import RunStatus +from hirundo.dataset_qa import DataQARunOut, HirundoError, QADatasetOut from hirundo.logger import get_logger from hirundo.storage import ResponseStorageConfig from hirundo.unlearning_llm import ( diff --git a/tests/dataset_qa_shared.py b/tests/dataset_qa_shared.py index 126789fb..014b3957 100644 --- a/tests/dataset_qa_shared.py +++ b/tests/dataset_qa_shared.py @@ -9,7 +9,7 @@ RunArgs, StorageConfig, ) -from hirundo.dataset_qa import RunStatus +from hirundo._run_status import RunStatus from hirundo.logger import get_logger logger = get_logger(__name__) From c62aad36e65d485482207037f57c9e281ae05106 Mon Sep 17 00:00:00 2001 From: Ben Lewis Date: Wed, 28 Jan 2026 21:56:34 +0200 Subject: [PATCH 07/24] Drop unnecessary `TypeAdapter` and add error log for invalid SSE payload to make it more debuggable --- hirundo/_sse_event_data.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/hirundo/_sse_event_data.py b/hirundo/_sse_event_data.py index 877051db..48675392 100644 --- a/hirundo/_sse_event_data.py +++ b/hirundo/_sse_event_data.py @@ -1,8 +1,11 @@ from typing import Any -from pydantic import BaseModel, TypeAdapter +from pydantic import BaseModel from hirundo._run_status import RunStatus +from hirundo.logger import get_logger + +logger = get_logger(__name__) class SseRunEventData(BaseModel): @@ -11,14 +14,12 @@ class SseRunEventData(BaseModel): result: str | dict | None -_SSE_RUN_EVENT_ADAPTER = TypeAdapter(SseRunEventData) - - def _parse_sse_payload(payload: Any) -> SseRunEventData: if isinstance(payload, dict): if "data" in payload: data = payload["data"] if isinstance(data, dict): - 
return _SSE_RUN_EVENT_ADAPTER.validate_python(data) + return SseRunEventData.model_validate(data) + logger.error("Invalid SSE payload: %s", payload) raise ValueError(f"Invalid SSE payload: {payload}") From d42c9454192f736309d23b3d6869bbe8fa399e19 Mon Sep 17 00:00:00 2001 From: Ben Lewis Date: Wed, 28 Jan 2026 22:48:14 +0200 Subject: [PATCH 08/24] Update `AGENTS.md` to use context7 and not use 1-3 character variable names --- AGENTS.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/AGENTS.md b/AGENTS.md index c2cd2baa..e1d12ee7 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,5 +1,11 @@ # Repository Guidelines +## Instructions + +- Always use context7 when I need code generation, setup or configuration steps, or + library/API documentation. This means you should automatically use the Context7 MCP + tools to resolve library id and get library docs without me having to explicitly ask. + ## Project Structure & Module Organization - `hirundo/` holds the SDK source (CLI entry point is `hirundo.cli:app`). @@ -21,6 +27,8 @@ - Python 3.10+ codebase, 4-space indentation, line length 88 (Ruff defaults). - Follow Ruff linting rules (`pyproject.toml`), with tests allowing `assert` usage. - Prefer descriptive names; avoid short, cryptic identifiers in new code. +- Avoid 1-3 character variable names in new or refactored code. Use descriptive names + even in small scopes. 
## Testing Guidelines From 808aed79158cd6509b5f2735dbb37ba5f47dda6b Mon Sep 17 00:00:00 2001 From: Ben Lewis Date: Wed, 28 Jan 2026 22:52:53 +0200 Subject: [PATCH 09/24] Add assertion for `summary_brief` and `summary_full` to LLM behavior eval test --- tests/llm-behavior-eval/llm_behavior_eval_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/llm-behavior-eval/llm_behavior_eval_test.py b/tests/llm-behavior-eval/llm_behavior_eval_test.py index a7de894a..d4a8359c 100644 --- a/tests/llm-behavior-eval/llm_behavior_eval_test.py +++ b/tests/llm-behavior-eval/llm_behavior_eval_test.py @@ -38,3 +38,5 @@ def test_llm_behavior_eval(): results = LlmBehaviorEval.check_run_by_id(run_id) assert results is not None assert results.cached_zip_path is not None + assert results.summary_brief is not None + assert results.summary_full is not None From c872f890b7096449b6ecc72174581cad80724ad6 Mon Sep 17 00:00:00 2001 From: Ben Lewis Date: Wed, 28 Jan 2026 22:53:12 +0200 Subject: [PATCH 10/24] Fix SSE payload parsing --- hirundo/_sse_event_data.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/hirundo/_sse_event_data.py b/hirundo/_sse_event_data.py index 48675392..34b704b8 100644 --- a/hirundo/_sse_event_data.py +++ b/hirundo/_sse_event_data.py @@ -1,7 +1,6 @@ -from typing import Any - -from pydantic import BaseModel +from pydantic import BaseModel, ValidationError +from hirundo import HirundoError from hirundo._run_status import RunStatus from hirundo.logger import get_logger @@ -14,12 +13,13 @@ class SseRunEventData(BaseModel): result: str | dict | None -def _parse_sse_payload(payload: Any) -> SseRunEventData: - if isinstance(payload, dict): - if "data" in payload: - data = payload["data"] - if isinstance(data, dict): - return SseRunEventData.model_validate(data) +class SseRunEventDataPayload(BaseModel): + data: SseRunEventData + - logger.error("Invalid SSE payload: %s", payload) - raise ValueError(f"Invalid SSE payload: 
{payload}") +def _parse_sse_payload(payload: str) -> SseRunEventData: + try: + return SseRunEventDataPayload.model_validate_json(payload).data + except ValidationError as e: + logger.error("Invalid SSE payload: %s: %s", payload, exc_info=e) + raise HirundoError(f"Invalid SSE payload: {payload}") from e From be354c44a427d9a822344fb03f03c00181828533 Mon Sep 17 00:00:00 2001 From: Ben Lewis Date: Wed, 28 Jan 2026 23:36:30 +0200 Subject: [PATCH 11/24] Try to fix `unzip` for LLM behavior eval results --- hirundo/unzip.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hirundo/unzip.py b/hirundo/unzip.py index f0cf3e22..8fdad079 100644 --- a/hirundo/unzip.py +++ b/hirundo/unzip.py @@ -269,8 +269,8 @@ def download_and_extract_llm_behavior_eval_zip( if model_name: model_folder = model_name.split("/")[-1] - summary_brief_name = f"{model_folder}/summary_brief.csv" - summary_full_name = f"{model_folder}/summary_full.csv" + summary_brief_name = f"responses/{model_folder}/summary_brief.csv" + summary_full_name = f"responses/{model_folder}/summary_full.csv" with zipfile.ZipFile(zip_file_path, "r") as zip_file: filenames = [file.filename for file in zip_file.filelist] From 5391f9637b6f325294ed9714ef2528e3daf10ac5 Mon Sep 17 00:00:00 2001 From: Ben Lewis Date: Tue, 3 Feb 2026 13:57:38 +0200 Subject: [PATCH 12/24] Apply Greptile suggestions from code review These look good to me :) Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> Signed-off-by: Ben Lewis --- hirundo/_hirundo_error.py | 2 +- hirundo/_run_checking.py | 2 +- hirundo/_sse_event_data.py | 2 +- hirundo/unzip.py | 10 +++++----- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/hirundo/_hirundo_error.py b/hirundo/_hirundo_error.py index 5f798aff..9b67978a 100644 --- a/hirundo/_hirundo_error.py +++ b/hirundo/_hirundo_error.py @@ -1,6 +1,6 @@ class HirundoError(Exception): """ - Custom exception used to indicate errors in `hirundo` dataset QA runs + 
Base exception class for Hirundo SDK errors """ pass diff --git a/hirundo/_run_checking.py b/hirundo/_run_checking.py index b36f2232..10dcce1b 100644 --- a/hirundo/_run_checking.py +++ b/hirundo/_run_checking.py @@ -66,7 +66,7 @@ def get_state( The first non-null state value, or None if none are present. """ for key in status_keys: - value = payload.get(key) if isinstance(payload, dict) else getattr(payload, key) + value = payload.get(key) if isinstance(payload, dict) else getattr(payload, key, None) if value is not None: return value return None diff --git a/hirundo/_sse_event_data.py b/hirundo/_sse_event_data.py index 34b704b8..940d8675 100644 --- a/hirundo/_sse_event_data.py +++ b/hirundo/_sse_event_data.py @@ -21,5 +21,5 @@ def _parse_sse_payload(payload: str) -> SseRunEventData: try: return SseRunEventDataPayload.model_validate_json(payload).data except ValidationError as e: - logger.error("Invalid SSE payload: %s: %s", payload, exc_info=e) + logger.error("Invalid SSE payload: %s", payload, exc_info=True) raise HirundoError(f"Invalid SSE payload: {payload}") from e diff --git a/hirundo/unzip.py b/hirundo/unzip.py index 8fdad079..3c2f0880 100644 --- a/hirundo/unzip.py +++ b/hirundo/unzip.py @@ -256,11 +256,11 @@ def download_and_extract_llm_behavior_eval_zip( headers=headers, timeout=DOWNLOAD_READ_TIMEOUT, stream=True, - ) as r: - r.raise_for_status() - with open(zip_file_path, "wb") as f: - for chunk in r.iter_content(chunk_size=ZIP_FILE_CHUNK_SIZE): - f.write(chunk) + ) as response: + response.raise_for_status() + with open(zip_file_path, "wb") as output_file: + for chunk in response.iter_content(chunk_size=ZIP_FILE_CHUNK_SIZE): + output_file.write(chunk) logger.info( "Successfully downloaded the LLM behavior eval result zip file for run ID %s to %s", run_id, From 7612832d40166eb51c2cd786d1968f3b18a447ef Mon Sep 17 00:00:00 2001 From: Ben Lewis Date: Tue, 3 Feb 2026 14:33:58 +0200 Subject: [PATCH 13/24] SDK-79: Guard SSE progress and retry eval stream --- 
hirundo/_run_checking.py | 29 ++++++++++++++++++++++++++- hirundo/llm_behavior_eval.py | 39 ++++++++++++++++++++++++++---------- 2 files changed, 56 insertions(+), 12 deletions(-) diff --git a/hirundo/_run_checking.py b/hirundo/_run_checking.py index 10dcce1b..a2b123c8 100644 --- a/hirundo/_run_checking.py +++ b/hirundo/_run_checking.py @@ -239,7 +239,7 @@ def update_progress_from_result( result_outer.get("result") if isinstance(result_outer, dict) else result_outer ) - if result_inner: + if isinstance(result_inner, str): result_info = result_inner.split(":") if len(result_info) > 1: stage = result_info[0] @@ -258,6 +258,33 @@ def update_progress_from_result( log.debug("Setting progress to %s", progress.n) progress.refresh() return True + if isinstance(result_inner, dict): + stage = ( + result_inner.get("stage") + or result_inner.get("state") + or result_inner.get("status") + ) + progress_value = result_inner.get("progress") + if progress_value is None: + progress_value = result_inner.get("percentage") + if progress_value is None: + progress_value = result_inner.get("percent") + if isinstance(progress_value, str): + progress_value = progress_value.strip().removesuffix("%") + if isinstance(progress_value, (int, float, str)): + try: + current_progress_percentage = float(progress_value) + except (TypeError, ValueError): + current_progress_percentage = progress.n + else: + desc = uploading_text if current_progress_percentage == 100.0 else stage + if desc: + progress.set_description(desc) + progress.n = current_progress_percentage + log.debug("Setting progress to %s", progress.n) + progress.refresh() + return True + log.debug("Skipping non-string progress result payload: %s", result_inner) return False diff --git a/hirundo/llm_behavior_eval.py b/hirundo/llm_behavior_eval.py index 979443f6..0bf29d94 100644 --- a/hirundo/llm_behavior_eval.py +++ b/hirundo/llm_behavior_eval.py @@ -16,6 +16,7 @@ from hirundo._iter_sse_retrying import aiter_sse_retrying, iter_sse_retrying 
from hirundo._llm_sources import HuggingFaceTransformersModelOutput, LlmSourcesOutput from hirundo._run_checking import ( + DEFAULT_MAX_RETRIES, STATUS_TO_PROGRESS_MAP, build_status_text_map, get_state, @@ -360,17 +361,33 @@ def _resolve_model_name(run_info: EvalRunRecord) -> str | None: return None @staticmethod - def _check_run_by_id(run_id: str) -> Generator[SseRunEventData, None, None]: - with httpx.Client(timeout=httpx.Timeout(None, connect=5.0)) as client: - for sse_event in iter_sse_retrying( - client, - "GET", - f"{API_HOST}/llm-behavior-eval/run/{run_id}", - headers=get_headers(), - ): - if sse_event.event == "ping": - continue - yield _parse_sse_payload(sse_event.data) + def _check_run_by_id( + run_id: str, *, max_retries: int = DEFAULT_MAX_RETRIES + ) -> Generator[SseRunEventData, None, None]: + retry_count = 0 + while True: + if retry_count > max_retries: + raise HirundoLlmBehaviorEvalError("Max retries reached") + last_payload = None + with httpx.Client(timeout=httpx.Timeout(None, connect=5.0)) as client: + for sse_event in iter_sse_retrying( + client, + "GET", + f"{API_HOST}/llm-behavior-eval/run/{run_id}", + headers=get_headers(), + ): + if sse_event.event == "ping": + continue + payload = _parse_sse_payload(sse_event.data) + last_payload = payload + yield payload + last_state = ( + get_state(last_payload, ("state",)) if last_payload else None + ) + if last_payload is None or last_state == RunStatus.PENDING.value: + retry_count += 1 + continue + return @staticmethod @overload From bc1d309db088450c5a9e6cd557efa84cebc9fdf9 Mon Sep 17 00:00:00 2001 From: Ben Lewis Date: Tue, 3 Feb 2026 14:34:05 +0200 Subject: [PATCH 14/24] SDK-79: Skip non-string SSE progress --- hirundo/_run_checking.py | 33 ++++++--------------------------- 1 file changed, 6 insertions(+), 27 deletions(-) diff --git a/hirundo/_run_checking.py b/hirundo/_run_checking.py index a2b123c8..8d493055 100644 --- a/hirundo/_run_checking.py +++ b/hirundo/_run_checking.py @@ -66,7 +66,11 @@ def 
get_state( The first non-null state value, or None if none are present. """ for key in status_keys: - value = payload.get(key) if isinstance(payload, dict) else getattr(payload, key, None) + value = ( + payload.get(key) + if isinstance(payload, dict) + else getattr(payload, key, None) + ) if value is not None: return value return None @@ -258,32 +262,7 @@ def update_progress_from_result( log.debug("Setting progress to %s", progress.n) progress.refresh() return True - if isinstance(result_inner, dict): - stage = ( - result_inner.get("stage") - or result_inner.get("state") - or result_inner.get("status") - ) - progress_value = result_inner.get("progress") - if progress_value is None: - progress_value = result_inner.get("percentage") - if progress_value is None: - progress_value = result_inner.get("percent") - if isinstance(progress_value, str): - progress_value = progress_value.strip().removesuffix("%") - if isinstance(progress_value, (int, float, str)): - try: - current_progress_percentage = float(progress_value) - except (TypeError, ValueError): - current_progress_percentage = progress.n - else: - desc = uploading_text if current_progress_percentage == 100.0 else stage - if desc: - progress.set_description(desc) - progress.n = current_progress_percentage - log.debug("Setting progress to %s", progress.n) - progress.refresh() - return True + if result_inner is not None: log.debug("Skipping non-string progress result payload: %s", result_inner) return False From 4374468c7967df62bb7e7eae290dd774119dd25e Mon Sep 17 00:00:00 2001 From: Ben Lewis Date: Tue, 3 Feb 2026 17:21:26 +0200 Subject: [PATCH 15/24] SDK-79: Apply ruff format to llm behavior eval --- hirundo/llm_behavior_eval.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/hirundo/llm_behavior_eval.py b/hirundo/llm_behavior_eval.py index 0bf29d94..23971d9e 100644 --- a/hirundo/llm_behavior_eval.py +++ b/hirundo/llm_behavior_eval.py @@ -381,9 +381,7 @@ def _check_run_by_id( payload = 
_parse_sse_payload(sse_event.data) last_payload = payload yield payload - last_state = ( - get_state(last_payload, ("state",)) if last_payload else None - ) + last_state = get_state(last_payload, ("state",)) if last_payload else None if last_payload is None or last_state == RunStatus.PENDING.value: retry_count += 1 continue From 6bc31260fade12f3c88c1513d501ab7376311d0e Mon Sep 17 00:00:00 2001 From: Ben Lewis Date: Wed, 4 Feb 2026 16:29:51 +0200 Subject: [PATCH 16/24] Update `README.md` and documentation (docs) --- README.md | 40 +++++++++++++++++++++- docs/hirundo.llm_behavior_eval.rst | 10 ++++++ docs/hirundo.llm_behavior_eval_results.rst | 10 ++++++ docs/hirundo.llm_bias_type.rst | 10 ++++++ docs/hirundo.rst | 3 ++ docs/index.rst | 12 +++++++ docs/llm_behavior_eval_example.py | 32 +++++++++++++++++ 7 files changed, 116 insertions(+), 1 deletion(-) create mode 100644 docs/hirundo.llm_behavior_eval.rst create mode 100644 docs/hirundo.llm_behavior_eval_results.rst create mode 100644 docs/hirundo.llm_bias_type.rst create mode 100644 docs/llm_behavior_eval_example.py diff --git a/README.md b/README.md index aec8d5af..adb54aca 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ The Hirundo Python SDK lets you: - Launch and monitor LLM behavior unlearning runs. +- Run LLM behavior evaluations for bias, hallucination, and prompt injection. - Run dataset QA for ML datasets (classification, object detection, and more). - Fetch QA results as `pandas` or `polars` DataFrames. 
@@ -22,7 +23,7 @@ pip install hirundo Optional extras: - LLM behavior unlearning (Transformers + PEFT): `pip install hirundo[transformers]` -- Dataset QA results as DataFrames: `pip install hirundo[pandas]` or `pip install hirundo[polars]` +- Dataset QA or LLM behavior eval results as DataFrames: `pip install hirundo[pandas]` or `pip install hirundo[polars]` If you want to install from source, clone this repository and run: @@ -70,6 +71,43 @@ result = LlmUnlearningRun.check_run(run_id) new_adapter = llm.get_hf_pipeline_for_run(run_id) ``` +## Quickstart: LLM behavior eval + +If you want results as DataFrames, install `hirundo[pandas]` or `hirundo[polars]`. + +```python +from hirundo import ( + BiasType, + EvalRunInfo, + HuggingFaceTransformersModel, + LlmBehaviorEval, + LlmModel, + ModelOrRun, + PresetType, +) + +llm = LlmModel( + model_name="Nemotron-Flash-1B", + model_source=HuggingFaceTransformersModel( + model_name="nvidia/Nemotron-Flash-1B", + ), +) +llm_id = llm.create() + +run_id = LlmBehaviorEval.launch_eval_run( + ModelOrRun.MODEL, + EvalRunInfo( + name="Nemotron BBQ bias eval", + model_id=llm_id, + preset_type=PresetType.BBQ_BIAS, + bias_type=BiasType.ALL, + ), +) + +results = LlmBehaviorEval.check_run_by_id(run_id) +print(results.summary_brief) +``` + ## Quickstart: Dataset QA ### Classification diff --git a/docs/hirundo.llm_behavior_eval.rst b/docs/hirundo.llm_behavior_eval.rst new file mode 100644 index 00000000..500b4c4f --- /dev/null +++ b/docs/hirundo.llm_behavior_eval.rst @@ -0,0 +1,10 @@ +.. meta:: + :http-equiv=Content-Security-Policy: default-src 'self', frame-ancestors 'none' + +hirundo.llm_behavior_eval module +================================ + +.. 
automodule:: hirundo.llm_behavior_eval + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/hirundo.llm_behavior_eval_results.rst b/docs/hirundo.llm_behavior_eval_results.rst new file mode 100644 index 00000000..61a30892 --- /dev/null +++ b/docs/hirundo.llm_behavior_eval_results.rst @@ -0,0 +1,10 @@ +.. meta:: + :http-equiv=Content-Security-Policy: default-src 'self', frame-ancestors 'none' + +hirundo.llm_behavior_eval_results module +======================================== + +.. automodule:: hirundo.llm_behavior_eval_results + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/hirundo.llm_bias_type.rst b/docs/hirundo.llm_bias_type.rst new file mode 100644 index 00000000..54a39208 --- /dev/null +++ b/docs/hirundo.llm_bias_type.rst @@ -0,0 +1,10 @@ +.. meta:: + :http-equiv=Content-Security-Policy: default-src 'self', frame-ancestors 'none' + +hirundo.llm_bias_type module +============================= + +.. automodule:: hirundo.llm_bias_type + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/hirundo.rst b/docs/hirundo.rst index abb3ad42..214fa564 100644 --- a/docs/hirundo.rst +++ b/docs/hirundo.rst @@ -17,6 +17,9 @@ Submodules hirundo.git hirundo.labeling hirundo.logger + hirundo.llm_behavior_eval + hirundo.llm_behavior_eval_results + hirundo.llm_bias_type hirundo.storage hirundo.unlearning_llm hirundo.unzip diff --git a/docs/index.rst b/docs/index.rst index 887f7bd7..9fa8877b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -13,6 +13,7 @@ Welcome to the ``hirundo`` client library documentation. This SDK connects to th Hirundo platform and provides APIs for: - LLM behavior unlearning runs (reducing bias, prompt injections and other unwanted behaviors). +- LLM behavior eval runs (measuring bias, hallucination, prompt injection, and more). - Dataset QA for machine learning datasets. Getting started @@ -45,6 +46,17 @@ Example: .. 
literalinclude:: llm_unlearning_example.py :language: python +LLM behavior eval +----------------- + +Run standardized evaluations over an LLM or an unlearning run to quantify +behavior changes (bias, hallucination, prompt injections, and more). + +Example: + +.. literalinclude:: llm_behavior_eval_example.py + :language: python + Dataset QA ---------- diff --git a/docs/llm_behavior_eval_example.py b/docs/llm_behavior_eval_example.py new file mode 100644 index 00000000..0cce0338 --- /dev/null +++ b/docs/llm_behavior_eval_example.py @@ -0,0 +1,32 @@ +"""Examples for docs/index.rst literalinclude blocks.""" + +from hirundo import ( + BiasType, + EvalRunInfo, + HuggingFaceTransformersModel, + LlmBehaviorEval, + LlmModel, + ModelOrRun, + PresetType, +) + +llm = LlmModel( + model_name="Nemotron-Flash-1B", + model_source=HuggingFaceTransformersModel( + model_name="nvidia/Nemotron-Flash-1B", + ), +) +llm_id = llm.create() + +run_id = LlmBehaviorEval.launch_eval_run( + ModelOrRun.MODEL, + EvalRunInfo( + name="Nemotron BBQ bias eval", + model_id=llm_id, + preset_type=PresetType.BBQ_BIAS, + bias_type=BiasType.ALL, + ), +) + +results = LlmBehaviorEval.check_run_by_id(run_id) +print(results.summary_brief) From 263f19fae980ccbc45990054fb539c8747b88c4f Mon Sep 17 00:00:00 2001 From: Ben Lewis Date: Wed, 4 Feb 2026 16:33:25 +0200 Subject: [PATCH 17/24] Drop Python code from `README.md` --- README.md | 154 +----------------- ...y => dataset_qa_classification_example.py} | 0 docs/dataset_qa_object_detection_example.py | 41 +++++ docs/index.rst | 9 +- 4 files changed, 52 insertions(+), 152 deletions(-) rename docs/{dataset_qa_example.py => dataset_qa_classification_example.py} (100%) create mode 100644 docs/dataset_qa_object_detection_example.py diff --git a/README.md b/README.md index adb54aca..ff27cf46 100644 --- a/README.md +++ b/README.md @@ -41,157 +41,11 @@ hirundo setup This writes `API_KEY` (and optionally `API_HOST`) to `.env` in the current directory or `~/.hirundo.conf`. 
-## Quickstart: LLM behavior unlearning - -Make sure you have the `transformers` extra installed (`pip install hirundo[transformers]`). - -```python -from hirundo import ( - BiasRunInfo, - BiasType, - HuggingFaceTransformersModel, - LlmModel, - LlmUnlearningRun, -) - -llm = LlmModel( - model_name="Nemotron-Flash-1B", - model_source=HuggingFaceTransformersModel( - model_name="nvidia/Nemotron-Flash-1B", - ), -) -llm_id = llm.create() - -run_id = LlmUnlearningRun.launch( - llm_id, - BiasRunInfo(bias_type=BiasType.ALL), -) - -result = LlmUnlearningRun.check_run(run_id) -new_adapter = llm.get_hf_pipeline_for_run(run_id) -``` - -## Quickstart: LLM behavior eval - -If you want results as DataFrames, install `hirundo[pandas]` or `hirundo[polars]`. - -```python -from hirundo import ( - BiasType, - EvalRunInfo, - HuggingFaceTransformersModel, - LlmBehaviorEval, - LlmModel, - ModelOrRun, - PresetType, -) - -llm = LlmModel( - model_name="Nemotron-Flash-1B", - model_source=HuggingFaceTransformersModel( - model_name="nvidia/Nemotron-Flash-1B", - ), -) -llm_id = llm.create() - -run_id = LlmBehaviorEval.launch_eval_run( - ModelOrRun.MODEL, - EvalRunInfo( - name="Nemotron BBQ bias eval", - model_id=llm_id, - preset_type=PresetType.BBQ_BIAS, - bias_type=BiasType.ALL, - ), -) - -results = LlmBehaviorEval.check_run_by_id(run_id) -print(results.summary_brief) -``` +## Quickstart examples -## Quickstart: Dataset QA - -### Classification - -```python -import json -import os - -from hirundo import ( - HirundoCSV, - LabelingType, - QADataset, - StorageConfig, - StorageGCP, - StorageTypes, -) - -gcp_bucket = StorageGCP( - bucket_name="cifar100bucket", - project="Hirundo-global", - credentials_json=json.loads(os.environ["GCP_CREDENTIALS"]), -) - -test_dataset = QADataset( - name="TEST-GCP cifar 100 classification dataset", - labeling_type=LabelingType.SINGLE_LABEL_CLASSIFICATION, - storage_config=StorageConfig( - name="cifar100bucket", - type=StorageTypes.GCP, - gcp=gcp_bucket, - ), - 
data_root_url=gcp_bucket.get_url(path="/pytorch-cifar/data"), - labeling_info=HirundoCSV( - csv_url=gcp_bucket.get_url(path="/pytorch-cifar/data/cifar100.csv"), - ), - classes=cifar100_classes, -) - -test_dataset.run_qa() -results = test_dataset.check_run() -print(results) -``` - -### Object detection - -```python -from hirundo import ( - GitRepo, - HirundoCSV, - LabelingType, - QADataset, - StorageConfig, - StorageGit, - StorageTypes, -) - -git_storage = StorageGit( - repo=GitRepo( - name="BDD-100k-validation-dataset", - repository_url="https://huggingface.co/datasets/hirundo-io/bdd100k-validation-only", - ), - branch="main", -) - -test_dataset = QADataset( - name="TEST-HuggingFace-BDD-100k-validation-OD-validation-dataset", - labeling_type=LabelingType.OBJECT_DETECTION, - storage_config=StorageConfig( - name="BDD-100k-validation-dataset", - type=StorageTypes.GIT, - git=git_storage, - ), - data_root_url=git_storage.get_url(path="/BDD100K Val from Hirundo.zip/bdd100k"), - labeling_info=HirundoCSV( - csv_url=git_storage.get_url( - path="/BDD100K Val from Hirundo.zip/bdd100k/bdd100k.csv" - ), - ), -) - -test_dataset.run_qa() -results = test_dataset.check_run() -print(results) -``` +The full quickstart examples now live in the Sphinx docs so they can be linted, +formatted, and type-checked as real Python files. See the examples embedded in +`docs/index.rst`, which are sourced from `docs/*.py` files. 
## Supported dataset storage diff --git a/docs/dataset_qa_example.py b/docs/dataset_qa_classification_example.py similarity index 100% rename from docs/dataset_qa_example.py rename to docs/dataset_qa_classification_example.py diff --git a/docs/dataset_qa_object_detection_example.py b/docs/dataset_qa_object_detection_example.py new file mode 100644 index 00000000..7dc464dc --- /dev/null +++ b/docs/dataset_qa_object_detection_example.py @@ -0,0 +1,41 @@ +"""Examples for docs/index.rst literalinclude blocks.""" + +from hirundo import ( + GitRepo, + HirundoCSV, + LabelingType, + QADataset, + StorageConfig, + StorageGit, + StorageTypes, +) + +git_storage = StorageGit( + repo=GitRepo( + name="BDD-100k-validation-dataset", + repository_url=( + "https://huggingface.co/datasets/hirundo-io/bdd100k-validation-only" + ), + ), + branch="main", +) + +test_dataset = QADataset( + name="TEST-HuggingFace-BDD-100k-validation-OD-validation-dataset", + labeling_type=LabelingType.OBJECT_DETECTION, + storage_config=StorageConfig( + name="BDD-100k-validation-dataset", + type=StorageTypes.GIT, + git=git_storage, + ), + data_root_url=git_storage.get_url(path="/BDD100K Val from Hirundo.zip/bdd100k"), + labeling_info=HirundoCSV( + csv_url=git_storage.get_url( + path="/BDD100K Val from Hirundo.zip/bdd100k/bdd100k.csv" + ), + ), +) + +test_dataset.run_qa() +results = test_dataset.check_run() +print(results) diff --git a/docs/index.rst b/docs/index.rst index 9fa8877b..0841e123 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -75,9 +75,14 @@ Supported storage backends include: - Google Cloud Storage (GCS) - Git repositories with LFS (GitHub, Hugging Face) -Example: +Classification example: + +.. literalinclude:: dataset_qa_classification_example.py + :language: python + +Object detection example: -.. literalinclude:: dataset_qa_example.py +.. 
literalinclude:: dataset_qa_object_detection_example.py :language: python API reference From 6a96100ba220a935bf00ec5fb7183e73b6f8c25c Mon Sep 17 00:00:00 2001 From: Ben Lewis Date: Wed, 4 Feb 2026 16:34:24 +0200 Subject: [PATCH 18/24] Fix circular import Thank you Cursor (bugbot) --- hirundo/_sse_event_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hirundo/_sse_event_data.py b/hirundo/_sse_event_data.py index 940d8675..131d1c45 100644 --- a/hirundo/_sse_event_data.py +++ b/hirundo/_sse_event_data.py @@ -1,6 +1,6 @@ from pydantic import BaseModel, ValidationError -from hirundo import HirundoError +from hirundo._hirundo_error import HirundoError from hirundo._run_status import RunStatus from hirundo.logger import get_logger From 97b24318c8bfa3a8844bda2e9356530bd45b2586 Mon Sep 17 00:00:00 2001 From: Ben Lewis Date: Wed, 4 Feb 2026 16:41:30 +0200 Subject: [PATCH 19/24] Rename `BiasType` to `BBQBiasType` --- docs/llm_behavior_eval_example.py | 4 ++-- docs/llm_unlearning_example.py | 4 ++-- hirundo/__init__.py | 4 ++-- hirundo/llm_behavior_eval.py | 6 +++--- hirundo/llm_bias_type.py | 2 +- hirundo/unlearning_llm.py | 6 +++--- tests/llm-behavior-eval/llm_behavior_eval_test.py | 4 ++-- tests/unlearning-llm/unlearn_llm_behavior_test.py | 4 ++-- 8 files changed, 17 insertions(+), 17 deletions(-) diff --git a/docs/llm_behavior_eval_example.py b/docs/llm_behavior_eval_example.py index 0cce0338..7db7ee98 100644 --- a/docs/llm_behavior_eval_example.py +++ b/docs/llm_behavior_eval_example.py @@ -1,7 +1,7 @@ """Examples for docs/index.rst literalinclude blocks.""" from hirundo import ( - BiasType, + BBQBiasType, EvalRunInfo, HuggingFaceTransformersModel, LlmBehaviorEval, @@ -24,7 +24,7 @@ name="Nemotron BBQ bias eval", model_id=llm_id, preset_type=PresetType.BBQ_BIAS, - bias_type=BiasType.ALL, + bias_type=BBQBiasType.ALL, ), ) diff --git a/docs/llm_unlearning_example.py b/docs/llm_unlearning_example.py index 2d9c545c..813a4f72 100644 --- 
a/docs/llm_unlearning_example.py +++ b/docs/llm_unlearning_example.py @@ -1,8 +1,8 @@ """Examples for docs/index.rst literalinclude blocks.""" from hirundo import ( + BBQBiasType, BiasRunInfo, - BiasType, HuggingFaceTransformersModel, LlmModel, LlmUnlearningRun, @@ -17,7 +17,7 @@ llm_id = llm.create() run_id = LlmUnlearningRun.launch( llm_id, - BiasRunInfo(bias_type=BiasType.ALL), + BiasRunInfo(bias_type=BBQBiasType.ALL), ) result = LlmUnlearningRun.check_run(run_id) new_adapter = llm.get_hf_pipeline_for_run(run_id) diff --git a/hirundo/__init__.py b/hirundo/__init__.py index 88a77577..c6be677c 100644 --- a/hirundo/__init__.py +++ b/hirundo/__init__.py @@ -43,7 +43,7 @@ PresetType, ) from .llm_behavior_eval_results import LlmBehaviorEvalResults -from .llm_bias_type import BiasType +from .llm_bias_type import BBQBiasType from .storage import ( StorageConfig, StorageGCP, @@ -70,7 +70,7 @@ "KeylabsObjDetVideo", "KeylabsObjSegImages", "KeylabsObjSegVideo", - "BiasType", + "BBQBiasType", "QADataset", "EvalRunInfo", "EvalRunRecord", diff --git a/hirundo/llm_behavior_eval.py b/hirundo/llm_behavior_eval.py index 23971d9e..ca130873 100644 --- a/hirundo/llm_behavior_eval.py +++ b/hirundo/llm_behavior_eval.py @@ -27,7 +27,7 @@ from hirundo._sse_event_data import SseRunEventData, _parse_sse_payload from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT from hirundo.llm_behavior_eval_results import LlmBehaviorEvalResults -from hirundo.llm_bias_type import BiasType +from hirundo.llm_bias_type import BBQBiasType from hirundo.logger import get_logger from hirundo.unzip import download_and_extract_llm_behavior_eval_zip @@ -70,7 +70,7 @@ class EvalRunInfo(BaseModel): source_run_id: str | None = None file_path: str | None = None preset_type: PresetType | None = None - bias_type: BiasType | None = None + bias_type: BBQBiasType | None = None judge_model: JudgeModel | None = None @@ -124,7 +124,7 @@ class EvalRunRecord(BaseModel): source_run: OutputUnlearningLlmRun | None file_path: 
str | None preset_type: PresetType | None - bias_type: BiasType | None + bias_type: BBQBiasType | None judge_model: JudgeModel | None run_id: str mlflow_run_id: str | None diff --git a/hirundo/llm_bias_type.py b/hirundo/llm_bias_type.py index bd45501d..5b0b72cc 100644 --- a/hirundo/llm_bias_type.py +++ b/hirundo/llm_bias_type.py @@ -1,7 +1,7 @@ from enum import Enum -class BiasType(str, Enum): +class BBQBiasType(str, Enum): ALL = "ALL" RACE = "RACE" NATIONALITY = "NATIONALITY" diff --git a/hirundo/unlearning_llm.py b/hirundo/unlearning_llm.py index a56bd7e4..00375dc1 100644 --- a/hirundo/unlearning_llm.py +++ b/hirundo/unlearning_llm.py @@ -25,7 +25,7 @@ from hirundo._run_status import RunStatus from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT from hirundo.dataset_qa import HirundoError -from hirundo.llm_bias_type import BiasType +from hirundo.llm_bias_type import BBQBiasType from hirundo.logger import get_logger if TYPE_CHECKING: @@ -227,7 +227,7 @@ class CustomUtility(BaseModel): class BiasBehavior(BaseModel): type: Literal["BIAS"] = "BIAS" - bias_type: BiasType + bias_type: BBQBiasType class HallucinationType(str, Enum): @@ -270,7 +270,7 @@ class LlmRunInfo(BaseModel): class BiasRunInfo(BaseModel): - bias_type: BiasType + bias_type: BBQBiasType organization_id: int | None = None name: str | None = None target_utilities: list[TargetUtility] | None = None diff --git a/tests/llm-behavior-eval/llm_behavior_eval_test.py b/tests/llm-behavior-eval/llm_behavior_eval_test.py index d4a8359c..090d1a82 100644 --- a/tests/llm-behavior-eval/llm_behavior_eval_test.py +++ b/tests/llm-behavior-eval/llm_behavior_eval_test.py @@ -2,7 +2,7 @@ import os from hirundo import ( - BiasType, + BBQBiasType, EvalRunInfo, HuggingFaceTransformersModel, LlmBehaviorEval, @@ -29,7 +29,7 @@ def test_llm_behavior_eval(): name=f"TEST-LLM-BEHAVIOR-EVAL-RUN-{unique_id}", model_id=llm_id, preset_type=PresetType.BBQ_BIAS, - bias_type=BiasType.ALL, + bias_type=BBQBiasType.ALL, ) assert llm_id 
is not None if os.getenv("FULL_TEST", "false") == "true": diff --git a/tests/unlearning-llm/unlearn_llm_behavior_test.py b/tests/unlearning-llm/unlearn_llm_behavior_test.py index 13144936..3506c982 100644 --- a/tests/unlearning-llm/unlearn_llm_behavior_test.py +++ b/tests/unlearning-llm/unlearn_llm_behavior_test.py @@ -2,8 +2,8 @@ import os from hirundo import ( + BBQBiasType, BiasRunInfo, - BiasType, HuggingFaceTransformersModel, LlmModel, LlmUnlearningRun, @@ -25,7 +25,7 @@ def test_unlearn_llm_behavior(): ) llm_id = llm.create() run_info = BiasRunInfo( - bias_type=BiasType.ALL, + bias_type=BBQBiasType.ALL, ) assert llm_id is not None if os.getenv("FULL_TEST", "false") == "true": From ba0392a45646089b1241cc34850258bfd86d0d90 Mon Sep 17 00:00:00 2001 From: Ben Lewis Date: Wed, 4 Feb 2026 16:47:46 +0200 Subject: [PATCH 20/24] Fix `progress_bar` not being closed if there is an error with the run --- hirundo/llm_behavior_eval.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hirundo/llm_behavior_eval.py b/hirundo/llm_behavior_eval.py index ca130873..e2563805 100644 --- a/hirundo/llm_behavior_eval.py +++ b/hirundo/llm_behavior_eval.py @@ -441,6 +441,7 @@ def check_run_by_id( "State is failure, rejected, or revoked: %s", state, ) + progress_bar.close() handle_run_failure( iteration, error_cls=HirundoLlmBehaviorEvalError, From beb20a6fd1b49152132544c5ef8bc55a23a87a05 Mon Sep 17 00:00:00 2001 From: Ben Lewis Date: Wed, 4 Feb 2026 16:48:40 +0200 Subject: [PATCH 21/24] Add `deleted_at` to `EvalRunRecord` --- hirundo/llm_behavior_eval.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hirundo/llm_behavior_eval.py b/hirundo/llm_behavior_eval.py index e2563805..662cdcae 100644 --- a/hirundo/llm_behavior_eval.py +++ b/hirundo/llm_behavior_eval.py @@ -136,6 +136,8 @@ class EvalRunRecord(BaseModel): metrics: LlmEvalMetrics | None = None responses_zip_url: str | None = None + deleted_at: datetime.datetime | None = None + class LlmBehaviorEval: def __init__(self, run_id: 
str | None = None): From 56b538f362018b32680b1db30167f9070a79ffad Mon Sep 17 00:00:00 2001 From: Ben Lewis Date: Wed, 4 Feb 2026 16:55:29 +0200 Subject: [PATCH 22/24] Add cleanup for LLM behavior eval runs --- scripts/cleanup_test_artifacts.py | 35 +++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/scripts/cleanup_test_artifacts.py b/scripts/cleanup_test_artifacts.py index e6357583..6d65b6d3 100644 --- a/scripts/cleanup_test_artifacts.py +++ b/scripts/cleanup_test_artifacts.py @@ -2,9 +2,10 @@ from datetime import timedelta, timezone import requests -from hirundo import GitRepo, QADataset, StorageConfig +from hirundo import GitRepo, LlmBehaviorEval, QADataset, StorageConfig from hirundo._run_status import RunStatus from hirundo.dataset_qa import DataQARunOut, HirundoError, QADatasetOut +from hirundo.llm_behavior_eval import EvalRunRecord from hirundo.logger import get_logger from hirundo.storage import ResponseStorageConfig from hirundo.unlearning_llm import ( @@ -70,7 +71,7 @@ def _delete_llm( def _should_delete_resource( resource_name: str, - runs: list[DataQARunOut | OutputUnlearningLlmRun], + runs: list[DataQARunOut | OutputUnlearningLlmRun | EvalRunRecord], expiry_date: datetime.datetime, ) -> bool: """ @@ -291,12 +292,42 @@ def _handle_llm_cleanup(one_week_ago: datetime.datetime): ) +def _handle_llm_behavior_eval_cleanup(one_week_ago: datetime.datetime) -> None: + archived_runs = set[str]() + trying_to_archive_runs = set[str]() + live_runs = LlmBehaviorEval.list_runs(archived=False) + for run in live_runs: + if _should_delete_resource(run.name, [run], one_week_ago): + trying_to_archive_runs.add(run.run_id) + try: + LlmBehaviorEval.archive_by_id(run.run_id) + archived_runs.add(run.run_id) + except (HirundoError, requests.HTTPError) as exc: + logger.warning( + "Failed to archive LLM behavior eval run with ID %s: %s", + run.run_id, + exc, + ) + logger.info( + "Archived %s (%s) LLM behavior eval runs", + archived_runs, + 
len(archived_runs), + ) + if trying_to_archive_runs != archived_runs: + logger.warning( + "Tried to archive %s LLM behavior eval runs, but only archived %s runs", + trying_to_archive_runs, + archived_runs, + ) + + def main() -> None: now = datetime.datetime.now(timezone.utc) one_week_ago = now - timedelta(days=7) _handle_datasets_cleanup(one_week_ago) _handle_llm_cleanup(one_week_ago) + _handle_llm_behavior_eval_cleanup(one_week_ago) _cleanup_storage_configs(one_week_ago) From 6c641ec5ac7ca4f57c1072027295a1dce51eac27 Mon Sep 17 00:00:00 2001 From: Ben Lewis Date: Wed, 4 Feb 2026 17:35:43 +0200 Subject: [PATCH 23/24] Add `UnqoverBiasType` as per @mishana 's PR comment --- hirundo/__init__.py | 3 ++- hirundo/llm_behavior_eval.py | 6 +++--- hirundo/llm_bias_type.py | 8 ++++++++ 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/hirundo/__init__.py b/hirundo/__init__.py index c6be677c..c47c2fbc 100644 --- a/hirundo/__init__.py +++ b/hirundo/__init__.py @@ -43,7 +43,7 @@ PresetType, ) from .llm_behavior_eval_results import LlmBehaviorEvalResults -from .llm_bias_type import BBQBiasType +from .llm_bias_type import BBQBiasType, UnqoverBiasType from .storage import ( StorageConfig, StorageGCP, @@ -71,6 +71,7 @@ "KeylabsObjSegImages", "KeylabsObjSegVideo", "BBQBiasType", + "UnqoverBiasType", "QADataset", "EvalRunInfo", "EvalRunRecord", diff --git a/hirundo/llm_behavior_eval.py b/hirundo/llm_behavior_eval.py index 662cdcae..561204f2 100644 --- a/hirundo/llm_behavior_eval.py +++ b/hirundo/llm_behavior_eval.py @@ -27,7 +27,7 @@ from hirundo._sse_event_data import SseRunEventData, _parse_sse_payload from hirundo._timeouts import MODIFY_TIMEOUT, READ_TIMEOUT from hirundo.llm_behavior_eval_results import LlmBehaviorEvalResults -from hirundo.llm_bias_type import BBQBiasType +from hirundo.llm_bias_type import BBQBiasType, UnqoverBiasType from hirundo.logger import get_logger from hirundo.unzip import download_and_extract_llm_behavior_eval_zip @@ -70,7 +70,7 @@ class 
EvalRunInfo(BaseModel): source_run_id: str | None = None file_path: str | None = None preset_type: PresetType | None = None - bias_type: BBQBiasType | None = None + bias_type: BBQBiasType | UnqoverBiasType | None = None judge_model: JudgeModel | None = None @@ -124,7 +124,7 @@ class EvalRunRecord(BaseModel): source_run: OutputUnlearningLlmRun | None file_path: str | None preset_type: PresetType | None - bias_type: BBQBiasType | None + bias_type: BBQBiasType | UnqoverBiasType | None judge_model: JudgeModel | None run_id: str mlflow_run_id: str | None diff --git a/hirundo/llm_bias_type.py b/hirundo/llm_bias_type.py index 5b0b72cc..e5fc94dd 100644 --- a/hirundo/llm_bias_type.py +++ b/hirundo/llm_bias_type.py @@ -9,3 +9,11 @@ class BBQBiasType(str, Enum): PHYSICAL_APPEARANCE = "PHYSICAL_APPEARANCE" RELIGION = "RELIGION" AGE = "AGE" + + +class UnqoverBiasType(str, Enum): + ALL = "ALL" + RACE = "RACE" + NATIONALITY = "NATIONALITY" + GENDER = "GENDER" + RELIGION = "RELIGION" From c1eba15515c12b8a8c0101f0fc4db920283aeb69 Mon Sep 17 00:00:00 2001 From: Ben Lewis Date: Wed, 4 Feb 2026 17:38:00 +0200 Subject: [PATCH 24/24] Fix Cursor's bugbot's comment --- hirundo/_sse_event_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hirundo/_sse_event_data.py b/hirundo/_sse_event_data.py index 131d1c45..429c942b 100644 --- a/hirundo/_sse_event_data.py +++ b/hirundo/_sse_event_data.py @@ -9,7 +9,7 @@ class SseRunEventData(BaseModel): id: str - state: RunStatus + state: RunStatus | None result: str | dict | None