diff --git a/src/eva/models/config.py b/src/eva/models/config.py index a9cd5e31..6f98a029 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -12,6 +12,8 @@ ``RunConfig(_env_file=".env", _cli_parse_args=True)``. """ +import copy +import logging from datetime import UTC, datetime from pathlib import Path from typing import Annotated, Any, ClassVar, Literal @@ -34,9 +36,12 @@ from eva.models.provenance import RunProvenance +logger = logging.getLogger(__name__) -def current_date_and_time(): - return f"{datetime.now(UTC):%Y-%m-%d_%H-%M-%S.%f}" + +def _param_alias(params: dict[str, Any]) -> str: + """Return the display alias from a params dict.""" + return params.get("alias") or params["model"] class PipelineConfig(BaseModel): @@ -73,6 +78,15 @@ class PipelineConfig(BaseModel): ), ) + @property + def pipeline_parts(self) -> dict[str, str]: + """Component names for this pipeline.""" + return { + "stt": _param_alias(self.stt_params) or self.stt, + "llm": self.llm, + "tts": _param_alias(self.tts_params) or self.tts, + } + @model_validator(mode="before") @classmethod def _migrate_legacy_fields(cls, data: Any) -> Any: @@ -107,6 +121,11 @@ class SpeechToSpeechConfig(BaseModel): "Set via EVA_MODEL__TURN_STRATEGY=external." ), ) + + @property + def pipeline_parts(self) -> dict[str, str]: + """Component names for this pipeline.""" + return {"s2s": _param_alias(self.s2s_params) or self.s2s} class AudioLLMConfig(BaseModel): @@ -129,6 +148,14 @@ class AudioLLMConfig(BaseModel): tts: str = Field(description="TTS model", examples=["cartesia", "elevenlabs"]) tts_params: dict[str, Any] = Field({}, description="Additional TTS model parameters (JSON)") + @property + def pipeline_parts(self) -> dict[str, str]: + """Component names for this pipeline.""" + return { + "audio_llm": _param_alias(self.audio_llm_params) or self.audio_llm, + "tts": _param_alias(self.tts_params) or self.tts, + } + _PIPELINE_FIELDS = { "llm", @@ -280,6 +307,21 @@ class RunConfig(BaseSettings): "EVA_METRICS_TO_RUN": "EVA_METRICS", } + # Providers that manage their own model/key resolution (e.g. WebSocket-based) + _SKIP_PARAMS_VALIDATION: ClassVar[set[str]] = {"nvidia"} + + # Maps *_params field names to their provider field for env override logic + _PARAMS_TO_PROVIDER: ClassVar[dict[str, str]] = { + "stt_params": "stt", + "tts_params": "tts", + "s2s_params": "s2s", + "audio_llm_params": "audio_llm", + } + # Keys always read from the live environment (not persisted across runs) + _ENV_OVERRIDE_KEYS: ClassVar[set[str]] = {"url", "urls"} + # Substrings that identify secret keys (redacted in logs and config.json) + _SECRET_KEY_PATTERNS: ClassVar[set[str]] = {"key", "credentials", "secret"} + class ModelDeployment(DeploymentTypedDict): """DeploymentTypedDict that preserves extra keys in litellm_params.""" @@ -294,7 +336,7 @@ class ModelDeployment(DeploymentTypedDict): # Run identifier run_id: str = Field( - default_factory=current_date_and_time, + "timestamp and model name(s)", # Overwritten by _set_default_run_id() description="Run identifier, auto-generated if not provided", ) @@ -464,8 +506,12 @@ def _check_companion_services(self) -> "RunConfig": self._validate_service_params("S2S", self.model.s2s, ["api_key"], self.model.s2s_params) return self - # Providers that manage their own model/key resolution (e.g. WebSocket-based) - _SKIP_PARAMS_VALIDATION: ClassVar[set[str]] = {"nvidia"} + @model_validator(mode="after") + def _set_default_run_id(self) -> "RunConfig": + if "run_id" not in self.model_fields_set: + suffix = "_".join(v for v in self.model.pipeline_parts.values() if v) + self.run_id = f"{datetime.now(UTC):%Y-%m-%d_%H-%M-%S.%f}_{suffix}" + return self @classmethod def _validate_service_params( @@ -503,20 +549,131 @@ def _expand_metrics_all(cls, v: list[str] | None) -> list[str] | None: return [m for m in get_global_registry().list_metrics() if m not in cls._VALIDATION_METRIC_NAMES] return v + @classmethod + def _is_secret_key(cls, key: str) -> bool: + """Return True if *key* matches any pattern in _SECRET_KEY_PATTERNS.""" + return any(pattern in key for pattern in cls._SECRET_KEY_PATTERNS) + + @classmethod + def _redact_dict(cls, params: dict) -> dict: + """Return a copy of *params* with secret values replaced by ``***``.""" + return {k: "***" if cls._is_secret_key(k) else v for k, v in params.items()} + @field_serializer("model_list") @classmethod def _redact_model_list(cls, deployments: list[ModelDeployment]) -> list[dict]: """Redact secret values in litellm_params when serializing.""" redacted = [] for deployment in deployments: + deployment = copy.deepcopy(deployment) if "litellm_params" in deployment: - params = deployment["litellm_params"] - for key in params: - if "key" in key or "credentials" in key: - params[key] = "***" + deployment["litellm_params"] = cls._redact_dict(deployment["litellm_params"]) redacted.append(deployment) return redacted + @field_serializer("model") + @classmethod + def _redact_model_params(cls, model: ModelConfigUnion) -> dict: + """Redact secret values in STT/TTS/S2S/AudioLLM params when serializing.""" + data = model.model_dump(mode="json") + for field_name, value in data.items(): + if field_name.endswith("_params") and isinstance(value, dict): + data[field_name] = cls._redact_dict(value) + return data + + def apply_env_overrides(self, live: "RunConfig") -> None: + """Apply environment-dependent values from *live* config onto this (saved) config. + + Restores redacted secrets (``***``) and overrides dynamic fields (``url``, + ``urls``) in ``model.*_params`` and ``model_list[].litellm_params``. + + Raises: + ValueError: If provider or alias differs for a service with redacted secrets. + """ + # ── model.*_params (STT / TTS / S2S / AudioLLM) ── + for params_field, provider_field in self._PARAMS_TO_PROVIDER.items(): + saved = getattr(self.model, params_field, None) + source = getattr(live.model, params_field, None) + if not isinstance(saved, dict) or not isinstance(source, dict): + continue + + has_redacted = any(v == "***" for v in saved.values()) + has_env_overrides = any(k in saved or k in source for k in self._ENV_OVERRIDE_KEYS) + if not has_redacted and not has_env_overrides: + continue + + if has_redacted: + saved_alias = saved.get("alias") + live_alias = source.get("alias") + if saved_alias and live_alias and saved_alias != live_alias: + raise ValueError( + f"Cannot restore secrets: saved {params_field}[alias]={saved_alias!r} " + f"but current environment has {params_field}[alias]={live_alias!r}" + ) + + saved_provider = getattr(self.model, provider_field, None) + live_provider = getattr(live.model, provider_field, None) + if saved_provider != live_provider: + logger.warning( + f"Provider mismatch for {params_field}: saved {saved_provider!r}, " + f"current environment has {live_provider!r}" + ) + + saved_model = saved.get("model") + live_model = source.get("model") + if saved_model and live_model and saved_model != live_model: + logger.warning( + f"Model mismatch for {params_field}: saved {saved_model!r}, " + f"current environment has {live_model!r}" + ) + + for key, value in saved.items(): + if value == "***" and key in source: + saved[key] = source[key] + + # Always use url/urls from the live environment + for key in self._ENV_OVERRIDE_KEYS: + if key in source: + saved_val = saved.get(key) + if saved_val and saved_val != source[key]: + logger.warning( + f"{params_field}[{key}] differs: saved {saved_val!r}, " + f"using {source[key]!r} from current environment" + ) + saved[key] = source[key] + + # ── model_list[].litellm_params (LLM deployments) ── + live_by_name = {d["model_name"]: d for d in live.model_list if "model_name" in d} + for deployment in self.model_list: + name = deployment.get("model_name") + if not name: + continue + saved_params = deployment.get("litellm_params", {}) + has_redacted = any(v == "***" for v in saved_params.values()) + if not has_redacted: + continue + if name not in live_by_name: + raise ValueError( + f"Cannot restore secrets: deployment {name!r} not found in " + f"current EVA_MODEL_LIST (available: {list(live_by_name)})" + ) + live_params = live_by_name[name].get("litellm_params", {}) + for key, value in saved_params.items(): + if value == "***" and key in live_params: + saved_params[key] = live_params[key] + + # ── Log resolved configuration ── + for params_field, provider_field in self._PARAMS_TO_PROVIDER.items(): + params = getattr(self.model, params_field, None) + provider = getattr(self.model, provider_field, None) + if isinstance(params, dict) and params: + logger.info(f"Resolved {provider_field} ({provider}): {self._redact_dict(params)}") + + for deployment in self.model_list: + name = deployment.get("model_name", "?") + params = deployment.get("litellm_params", {}) + logger.info(f"Resolved deployment {name}: {self._redact_dict(params)}") + @classmethod def from_yaml(cls, path: Path | str) -> "RunConfig": """Load configuration from YAML file.""" diff --git a/src/eva/orchestrator/runner.py b/src/eva/orchestrator/runner.py index f92d98af..ac5a45f3 100644 --- a/src/eva/orchestrator/runner.py +++ b/src/eva/orchestrator/runner.py @@ -138,7 +138,10 @@ async def run(self, records: list[EvaluationRecord]) -> RunResult: } config_path = self.output_dir / "config.json" - config_path.write_text(self.config.model_dump_json(indent=2)) + config_data = self.config.model_dump(mode="json") + pipeline_parts = self.config.model.pipeline_parts + config_data["pipeline_parts"] = pipeline_parts + config_path.write_text(json.dumps(config_data, indent=2)) # Build output_id list for tracking (supports pass@k) num_trials = self.config.num_trials diff --git a/src/eva/run_benchmark.py b/src/eva/run_benchmark.py index 92d32b01..49096448 100644 --- a/src/eva/run_benchmark.py +++ b/src/eva/run_benchmark.py @@ -42,6 +42,9 @@ async def run_benchmark(config: RunConfig) -> int: logger.error(str(e)) return 1 + # Apply env-dependent values (secrets, urls) from live env onto saved config + runner.config.apply_env_overrides(config) + # Apply CLI overrides runner.config.max_rerun_attempts = config.max_rerun_attempts runner.config.force_rerun_metrics = config.force_rerun_metrics diff --git a/tests/unit/models/test_config_models.py b/tests/unit/models/test_config_models.py index f107cd80..99424e22 100644 --- a/tests/unit/models/test_config_models.py +++ b/tests/unit/models/test_config_models.py @@ -2,7 +2,6 @@ import json import os -from datetime import datetime from pathlib import Path from unittest.mock import MagicMock, patch @@ -85,7 +84,8 @@ def test_create_minimal_config(self): assert config.dataset_path == Path("data/airline_dataset.jsonl") assert config.tool_mocks_path == Path("data/airline_scenarios") - assert datetime.strptime(config.run_id, "%Y-%m-%d_%H-%M-%S.%f") + # run_id = timestamp + model suffix (e.g. "2024-01-15_14-30-45.123456_nova-2_gpt-5.2_sonic") + assert config.run_id.endswith("nova-2_gpt-5.2_sonic") assert config.max_concurrent_conversations == 1 assert config.conversation_timeout_seconds == 360 @@ -164,13 +164,169 @@ def test_indentation_in_model_list(self, tmp_path: Path, vars_location: str, ind assert config.model_list == MODEL_LIST def test_secrets_redacted(self): - """Secrets are redacted in model_list.""" + """Secrets are redacted in model_list and STT/TTS params.""" config = _config(env_vars=_BASE_ENV) dumped = config.model_dump(mode="json") assert dumped["model_list"][0]["litellm_params"]["api_key"] == "***" assert dumped["model_list"][1]["litellm_params"]["vertex_credentials"] == "***" assert dumped["model_list"][2]["litellm_params"]["aws_access_key_id"] == "***" assert dumped["model_list"][2]["litellm_params"]["aws_secret_access_key"] == "***" + # STT/TTS params api_key must also be redacted + assert dumped["model"]["stt_params"]["api_key"] == "***" + assert dumped["model"]["tts_params"]["api_key"] == "***" + # Non-secret fields preserved + assert dumped["model"]["stt_params"]["model"] == "nova-2" + assert dumped["model"]["tts_params"]["model"] == "sonic" + + def test_secrets_redaction_does_not_mutate_live_config(self): + """Serializing must not corrupt the in-memory config objects.""" + config = _config(env_vars=_BASE_ENV) + config.model_dump(mode="json") + # model_list keys must still hold real values + assert config.model_list[0]["litellm_params"]["api_key"] == "must_be_redacted" + assert config.model_list[1]["litellm_params"]["vertex_credentials"] == "must_be_redacted" + # STT/TTS params must still hold real values + assert config.model.stt_params["api_key"] == "test_key" + assert config.model.tts_params["api_key"] == "test_key" + + def test_apply_env_overrides(self): + """Redacted secrets are restored from a live config for both model and model_list.""" + config = _config(env_vars=_BASE_ENV) + dumped_json = config.model_dump_json() + loaded = RunConfig.model_validate_json(dumped_json) + + # Everything is redacted after round-trip + assert loaded.model.stt_params["api_key"] == "***" + assert loaded.model.tts_params["api_key"] == "***" + assert loaded.model_list[0]["litellm_params"]["api_key"] == "***" + assert loaded.model_list[1]["litellm_params"]["vertex_credentials"] == "***" + assert loaded.model_list[2]["litellm_params"]["aws_access_key_id"] == "***" + + loaded.apply_env_overrides(config) + + # STT/TTS params restored + assert loaded.model.stt_params["api_key"] == "test_key" + assert loaded.model.tts_params["api_key"] == "test_key" + assert loaded.model.stt_params["model"] == "nova-2" + # model_list restored + assert loaded.model_list[0]["litellm_params"]["api_key"] == "must_be_redacted" + assert loaded.model_list[1]["litellm_params"]["vertex_credentials"] == "must_be_redacted" + assert loaded.model_list[2]["litellm_params"]["aws_access_key_id"] == "must_be_redacted" + assert loaded.model_list[2]["litellm_params"]["aws_secret_access_key"] == "must_be_redacted" + + def test_apply_env_overrides_provider_mismatch(self): + """Restoring secrets fails if the STT/TTS provider changed.""" + config = _config(env_vars=_BASE_ENV) + dumped_json = config.model_dump_json() + loaded = RunConfig.model_validate_json(dumped_json) + + live = _config( + env_vars=_BASE_ENV + | { + "EVA_MODEL__STT": "openai_whisper", + "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "whisper-1"}), + } + ) + with pytest.raises(ValueError, match=r"saved stt='deepgram'.*current environment has stt='openai_whisper'"): + loaded.apply_env_overrides(live) + + def test_apply_env_overrides_alias_mismatch(self): + """Restoring secrets fails if the alias changed.""" + config = _config( + env_vars=_BASE_ENV + | { + "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "nova-2", "alias": "stt-v1"}), + } + ) + dumped_json = config.model_dump_json() + loaded = RunConfig.model_validate_json(dumped_json) + + live = _config( + env_vars=_BASE_ENV + | { + "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "nova-2", "alias": "stt-v2"}), + } + ) + with pytest.raises( + ValueError, + match=r"saved stt_params\[alias\]='stt-v1'.*current environment has stt_params\[alias\]='stt-v2'", + ): + loaded.apply_env_overrides(live) + + def test_apply_env_overrides_model_mismatch_warns(self, caplog): + """Restoring secrets warns (but succeeds) if the STT/TTS model changed.""" + config = _config(env_vars=_BASE_ENV) + dumped_json = config.model_dump_json() + loaded = RunConfig.model_validate_json(dumped_json) + + live = _config(env_vars=_BASE_ENV | {"EVA_MODEL__TTS_PARAMS": json.dumps({"api_key": "k", "model": "sonic-2"})}) + with caplog.at_level("WARNING", logger="eva.models.config"): + loaded.apply_env_overrides(live) + assert "sonic" in caplog.text + assert "sonic-2" in caplog.text + assert loaded.model.tts_params["api_key"] == "k" + + def test_apply_env_overrides_url_from_env(self, caplog): + """Url is always taken from the live env, with a warning if it differs.""" + saved_env = _BASE_ENV | { + "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "nova-2", "url": "wss://old-host/stt"}), + } + config = _config(env_vars=saved_env) + dumped_json = config.model_dump_json() + loaded = RunConfig.model_validate_json(dumped_json) + + # Live env has a different url + live_env = _BASE_ENV | { + "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "nova-2", "url": "wss://new-host/stt"}), + } + live = _config(env_vars=live_env) + + with caplog.at_level("WARNING", logger="eva.models.config"): + loaded.apply_env_overrides(live) + + assert loaded.model.stt_params["url"] == "wss://new-host/stt" + assert "wss://old-host/stt" in caplog.text + assert "wss://new-host/stt" in caplog.text + + def test_apply_env_overrides_url_added_from_env(self): + """Url from live env is added even if the saved config didn't have one.""" + config = _config(env_vars=_BASE_ENV) + dumped_json = config.model_dump_json() + loaded = RunConfig.model_validate_json(dumped_json) + + live_env = _BASE_ENV | { + "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "nova-2", "url": "wss://new-host/stt"}), + } + live = _config(env_vars=live_env) + loaded.apply_env_overrides(live) + + assert loaded.model.stt_params["url"] == "wss://new-host/stt" + + def test_apply_env_overrides_llm_deployment_mismatch(self): + """Restoring secrets fails if a saved LLM deployment is missing from the live model_list.""" + config = _config(env_vars=_BASE_ENV) + dumped_json = config.model_dump_json() + loaded = RunConfig.model_validate_json(dumped_json) + + # Live config has a different model_list (only one deployment, different name) + different_model_list = [ + { + "model_name": "gpt-4o", + "litellm_params": {"model": "openai/gpt-4o", "api_key": "real_key"}, + } + ] + live = _config( + env_vars={ + "EVA_MODEL_LIST": json.dumps(different_model_list), + "EVA_MODEL__LLM": "gpt-4o", + "EVA_MODEL__STT": "deepgram", + "EVA_MODEL__TTS": "cartesia", + "EVA_MODEL__STT_PARAMS": json.dumps({"api_key": "k", "model": "nova-2"}), + "EVA_MODEL__TTS_PARAMS": json.dumps({"api_key": "k", "model": "sonic"}), + } + ) + with pytest.raises(ValueError, match=r"deployment 'gpt-5.2' not found in current EVA_MODEL_LIST"): + loaded.apply_env_overrides(live) @pytest.mark.parametrize( "environ, expected_exception, expected_message",