From 5a5e03dd1ea06d2aa6a0aa5483c703ada8dc7b20 Mon Sep 17 00:00:00 2001 From: Chibukach Date: Tue, 12 May 2026 21:25:22 +0000 Subject: [PATCH 1/6] average accuracy scores across multiple seeds --- every_eval_ever/cli.py | 56 +++++++- every_eval_ever/merge_seeds.py | 228 +++++++++++++++++++++++++++++++++ 2 files changed, 280 insertions(+), 4 deletions(-) create mode 100644 every_eval_ever/merge_seeds.py diff --git a/every_eval_ever/cli.py b/every_eval_ever/cli.py index c215c89cd..e6feda532 100644 --- a/every_eval_ever/cli.py +++ b/every_eval_ever/cli.py @@ -231,6 +231,35 @@ def _cmd_convert_alpaca_eval(args: argparse.Namespace) -> int: return 0 +def _cmd_average_scores(args: argparse.Namespace, current_time: str) -> int: + from collections import defaultdict + from every_eval_ever.eval_types import EvaluationLog + + benchmark_dir = defaultdict(list) + output_dir = Path(args.output_dir) + for p in output_dir.rglob("*"): + if not p.is_file(): + continue + if p.stat().st_mtime >= current_time: + benchmark_dir[p.parts[1]].append(p) + + from every_eval_ever.merge_seeds import merge_seed_runs + for benchmark, result_files in benchmark_dir.items(): + print(benchmark) + logs = [] + for result_file in result_files: + print(result_file) + log = EvaluationLog.model_validate_json(result_file.read_text(encoding='utf-8')) + logs.append(log) + + all_logs = merge_seed_runs(logs) + + for log in all_logs: + eval_uuid = str(uuid.uuid4()) + print(f"Average log: {_write_log(log, output_dir, eval_uuid=eval_uuid)}") + + return 0 + def build_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser( prog='every_eval_ever', @@ -354,6 +383,14 @@ def build_parser() -> argparse.ArgumentParser: help='Evaluation library version recorded in eval_library.version.', ) + source_parser.add_argument( + '--average_scores', + '--average-scores', + action='store_true', + default=False, + help='Compute the average scores of the evaluation logs for a given benchmark.', + ) + if source == 
'alpaca_eval': source_parser.add_argument( '--version', @@ -392,6 +429,9 @@ def main(argv: list[str] | None = None) -> int: parser = build_parser() args = parser.parse_args(argv) + from datetime import datetime, timezone + current_time = datetime.now().timestamp() + if args.command == 'validate': from every_eval_ever.validate import main as validate_main @@ -414,13 +454,21 @@ def main(argv: list[str] | None = None) -> int: if args.command == 'convert': if args.source == 'lm_eval': - return _cmd_convert_lm_eval(args) + _cmd_convert_lm_eval(args) if args.source == 'inspect': - return _cmd_convert_inspect(args) + _cmd_convert_inspect(args) if args.source == 'helm': - return _cmd_convert_helm(args) + _cmd_convert_helm(args) if args.source == 'alpaca_eval': - return _cmd_convert_alpaca_eval(args) + _cmd_convert_alpaca_eval(args) + + if args.average_scores: + _cmd_average_scores(args, current_time) + + return 0 + + + parser.print_help() return 1 diff --git a/every_eval_ever/merge_seeds.py b/every_eval_ever/merge_seeds.py new file mode 100644 index 000000000..f1c9fb4d6 --- /dev/null +++ b/every_eval_ever/merge_seeds.py @@ -0,0 +1,228 @@ +"""Utilities for merging multiple seed runs into averaged evaluation logs.""" + +from __future__ import annotations + +import json +import statistics +from collections import defaultdict +from typing import Dict, List, Optional + +from every_eval_ever.eval_types import ( + EvaluationLog, + EvaluationResult, + ScoreDetails, + StandardError, + Uncertainty, +) + + +def _extract_seed_from_log(log: EvaluationLog) -> Optional[int]: + """Extract seed value from generation_config.additional_details.""" + + for result in log.evaluation_results: + if result.generation_config.additional_details: + gen_details = result.generation_config.additional_details + for key in ['seed', 'random_seed']: + if key in gen_details: + try: + seed_str = gen_details[key] + # Handle JSON-encoded values + if isinstance(seed_str, str): + if seed_str.isdigit(): + return 
int(seed_str) + # Try parsing JSON + try: + seed_val = json.loads(seed_str) + return int(seed_val) + except (json.JSONDecodeError, ValueError, TypeError): + pass + return int(seed_str) + except (ValueError, TypeError): + pass + + return None + +def _make_grouping_key(log: EvaluationLog, result: EvaluationResult) -> tuple: + """Create a grouping key for matching results across seeds.""" + return ( + log.model_info.id, + result.evaluation_name, + result.metric_config.evaluation_description, + result.source_data.dataset_name if result.source_data else None, + ) + + +def merge_seed_runs( + logs: List[EvaluationLog], + num_input_paths: Optional[int] = None, +) -> List[EvaluationLog]: + """Merge multiple seed runs into averaged evaluation logs. + + Args: + logs: List of EvaluationLog objects from different seed runs (often one + log per benchmark *task* per results file). + num_input_paths: Number of distinct input result files (seeds). When each + file defines many tasks, this must be ``len(paths)``, not ``len(logs)``. + If omitted, defaults to ``len(logs)`` (only correct when each input + file yields exactly one EvaluationLog). + + Returns: + List containing a single merged EvaluationLog with averaged scores + + Raises: + ValueError: If logs have incompatible configurations, or no metric groups + could be merged (each key must appear in exactly ``expected_group_size`` + logs). 
+ """ + if not logs: + return [] + + if len(logs) == 1: + return logs + + expected_group_size = ( + num_input_paths if num_input_paths is not None else len(logs) + ) + + # Validate that all logs are for the same model + model_ids = {log.model_info.id for log in logs} + if len(model_ids) > 1: + raise ValueError( + f"Cannot merge logs from different models: {model_ids}" + ) + + # Group evaluation results across all logs by (model, task, metric) + grouped_results: Dict[ + tuple, List[tuple[EvaluationResult, int | None, EvaluationLog]] + ] = defaultdict(list) + + for log in logs: + seed = _extract_seed_from_log(log) + print(seed) + for result in log.evaluation_results: + key = _make_grouping_key(log, result) + grouped_results[key].append((result, seed, log)) + + # Compute averaged results + merged_results = [] + + for _key, result_group in grouped_results.items(): + if len(result_group) != expected_group_size: + # Not all seeds have this metric - skip or warn + continue + + # Extract scores and seeds + scores = [r.score_details.score for r, _, _ in result_group] + seed_values = [s for _, s, _ in result_group if s is not None] + evaluation_timestamps = [int(float(str(r.evaluation_timestamp))) for r, _, log in result_group] + + # Compute statistics + mean_score = statistics.mean(scores) + if len(scores) > 1: + std_dev = statistics.stdev(scores) + std_error = std_dev / (len(scores) ** 0.5) + else: + std_error = 0.0 + + # Use the first result as template + template_result, _, _ = result_group[0] + + # Build uncertainty with seed-based standard error + uncertainty = Uncertainty( + standard_error=StandardError( + value=std_error, + method='across_seeds', + ), + num_samples=len(scores), + ) + + # Create merged result with additional seed details + additional_seed_details = { + 'seed_scores': json.dumps(scores), + 'seed_values': json.dumps(seed_values) if seed_values else json.dumps([]), + 'evaluation_timestamps': json.dumps(evaluation_timestamps), + } + + # Preserve original 
details if any, and add seed info + score_details_dict = dict(template_result.score_details.details or {}) + score_details_dict.update(additional_seed_details) + + merged_result = EvaluationResult( + evaluation_name=template_result.evaluation_name, + source_data=template_result.source_data, + metric_config=template_result.metric_config, + score_details=ScoreDetails( + score=mean_score, + uncertainty=uncertainty, + details=score_details_dict, + ), + generation_config=template_result.generation_config, + ) + merged_results.append(merged_result) + + if not merged_results: + raise ValueError( + 'No evaluation results could be averaged across logs: each ' + '(model_id, evaluation_name, metric evaluation_description, ' + f'dataset_name) group must appear in exactly {expected_group_size} ' + 'logs. Different tasks per file usually do not merge this way.' + ) + + # Create merged log using first log as template + template_log = logs[0] + + # Compute time range + start_times = [] + end_times = [] + + for log in logs: + if log.evaluation_timestamp: + try: + start_times.append(int(float(log.evaluation_timestamp))) + except (ValueError, TypeError): + pass + # Try to extract end time from additional_details if available + if ( + log.model_info.additional_details + and 'end_time' in log.model_info.additional_details + ): + try: + end_times.append( + int(log.model_info.additional_details['end_time']) + ) + except (ValueError, TypeError): + pass + + # Use earliest start time as evaluation_timestamp + eval_timestamp = ( + str(min(start_times)) if start_times else template_log.evaluation_timestamp + ) + + # Add time range to model_info additional_details + model_additional = dict(template_log.model_info.additional_details or {}) + if start_times and end_times: + model_additional['evaluation_time_range'] = json.dumps({ + 'start_time': min(start_times), + 'end_time': max(end_times), + }) + model_additional['num_seeds_merged'] = str(expected_group_size) + + # Create merged model_info + 
merged_model_info = template_log.model_info.model_copy( + update={'additional_details': model_additional} + ) + + # Create merged evaluation log + merged_log = EvaluationLog( + schema_version=template_log.schema_version, + evaluation_id=template_log.evaluation_id, + retrieved_timestamp=template_log.retrieved_timestamp, + evaluation_timestamp=eval_timestamp, + source_metadata=template_log.source_metadata, + eval_library=template_log.eval_library, + model_info=merged_model_info, + evaluation_results=merged_results, + detailed_evaluation_results=None, # Don't merge instance-level results + ) + + return [merged_log] From 686af7c51eab0e8be57c37f45dd8a0816be39f35 Mon Sep 17 00:00:00 2001 From: Chibukach Date: Wed, 13 May 2026 12:41:03 +0000 Subject: [PATCH 2/6] average across tasks with subtasks --- every_eval_ever/cli.py | 15 +++++++++++++-- every_eval_ever/converters/lm_eval/adapter.py | 8 ++++++-- every_eval_ever/eval_types.py | 5 +++++ 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/every_eval_ever/cli.py b/every_eval_ever/cli.py index e6feda532..9deeaf21e 100644 --- a/every_eval_ever/cli.py +++ b/every_eval_ever/cli.py @@ -237,6 +237,15 @@ def _cmd_average_scores(args: argparse.Namespace, current_time: str) -> int: benchmark_dir = defaultdict(list) output_dir = Path(args.output_dir) + # list files created in the output directory after script invocation + files = [] + for p in output_dir.rglob("*"): + if not p.is_file(): + continue + if p.stat().st_mtime >= current_time: + files.append(p) + files.sort(key=lambda x: x.stat().st_mtime) + for p in files: for p in output_dir.rglob("*"): if not p.is_file(): continue @@ -245,13 +254,15 @@ def _cmd_average_scores(args: argparse.Namespace, current_time: str) -> int: from every_eval_ever.merge_seeds import merge_seed_runs for benchmark, result_files in benchmark_dir.items(): - print(benchmark) logs = [] for result_file in result_files: print(result_file) log =
EvaluationLog.model_validate_json(result_file.read_text(encoding='utf-8')) - logs.append(log) + if log.additional_details.get('group_subtasks', None) != benchmark : + # skip subtasks + continue + logs.append(log) all_logs = merge_seed_runs(logs) for log in all_logs: diff --git a/every_eval_ever/converters/lm_eval/adapter.py b/every_eval_ever/converters/lm_eval/adapter.py index 2dc253cc5..764d55cd8 100644 --- a/every_eval_ever/converters/lm_eval/adapter.py +++ b/every_eval_ever/converters/lm_eval/adapter.py @@ -179,7 +179,7 @@ def _build_generation_config( self, task_config: Dict[str, Any] ) -> Optional[GenerationConfig]: """Build generation config from task config.""" - gen_kwargs = task_config.get('generation_kwargs', {}) + gen_kwargs = task_config.get('generation_kwargs', {}) or task_config.get('gen_kwargs', {}) if not gen_kwargs: return None @@ -207,7 +207,7 @@ def _build_evaluation_results( ) -> List[EvaluationResult]: """Build EvaluationResult list for a single task.""" task_results = raw_data['results'][task_name] - task_config = raw_data.get('configs', {}).get(task_name, {}) + task_config = raw_data.get('configs', {}).get(task_name, {}) or raw_data.get('config', {}) higher_is_better = raw_data.get('higher_is_better', {}).get( task_name, {} ) @@ -348,6 +348,9 @@ def _transform_single( 'task_name': task_name, } + additional_details = {} + if raw_data.get('group_subtasks', None): + additional_details['group_subtasks'] = next(iter(raw_data['group_subtasks'])) or "" return EvaluationLog( schema_version=SCHEMA_VERSION, evaluation_id=evaluation_id, @@ -357,6 +360,7 @@ def _transform_single( eval_library=eval_library, model_info=model_info, evaluation_results=evaluation_results, + additional_details=additional_details, ) def transform_from_file( diff --git a/every_eval_ever/eval_types.py b/every_eval_ever/eval_types.py index 40035403b..9a8f854bb 100644 --- a/every_eval_ever/eval_types.py +++ b/every_eval_ever/eval_types.py @@ -518,3 +518,8 @@ class 
EvaluationLog(BaseModel): None, description='Reference to the evaluation results for all individual samples in the evaluation', ) + + additional_details: dict[str, str] | None = Field( + None, + description='Additional parameters (key-value pairs, all values must be strings)', + ) From 0e2af0f8291a7588da1dacc3ba44d8f0ce024bcf Mon Sep 17 00:00:00 2001 From: Chibukach Date: Wed, 13 May 2026 13:13:04 +0000 Subject: [PATCH 3/6] remove duplicate line --- every_eval_ever/cli.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/every_eval_ever/cli.py b/every_eval_ever/cli.py index 9deeaf21e..82502eb41 100644 --- a/every_eval_ever/cli.py +++ b/every_eval_ever/cli.py @@ -238,14 +238,7 @@ def _cmd_average_scores(args: argparse.Namespace, current_time: str) -> int: benchmark_dir = defaultdict(list) output_dir = Path(args.output_dir) # list files created in the output directory after script invocation - files = [] - for p in output_dir.rglob("*"): - if not p.is_file(): - continue - if p.stat().st_mtime >= current_time: - files.append(p) - files.sort(key=lambda x: x.stat().st_mtime) - for p in files: + for p in output_dir.rglob("*"): if not p.is_file(): continue From 492f8e261067f5c6cb9cb410a7958ea464694e34 Mon Sep 17 00:00:00 2001 From: Chibukach Date: Thu, 14 May 2026 14:53:39 +0000 Subject: [PATCH 4/6] clean up merge and use original adapter --- every_eval_ever/converters/lm_eval/adapter.py | 8 +--- every_eval_ever/eval_types.py | 5 -- every_eval_ever/merge_seeds.py | 46 ++++++++++--------- 3 files changed, 27 insertions(+), 32 deletions(-) diff --git a/every_eval_ever/converters/lm_eval/adapter.py b/every_eval_ever/converters/lm_eval/adapter.py index 764d55cd8..2dc253cc5 100644 --- a/every_eval_ever/converters/lm_eval/adapter.py +++ b/every_eval_ever/converters/lm_eval/adapter.py @@ -179,7 +179,7 @@ def _build_generation_config( self, task_config: Dict[str, Any] ) -> Optional[GenerationConfig]: """Build generation config from task config.""" - 
gen_kwargs = task_config.get('generation_kwargs', {}) or task_config.get('gen_kwargs', {}) + gen_kwargs = task_config.get('generation_kwargs', {}) if not gen_kwargs: return None @@ -207,7 +207,7 @@ def _build_evaluation_results( ) -> List[EvaluationResult]: """Build EvaluationResult list for a single task.""" task_results = raw_data['results'][task_name] - task_config = raw_data.get('configs', {}).get(task_name, {}) or raw_data.get('config', {}) + task_config = raw_data.get('configs', {}).get(task_name, {}) higher_is_better = raw_data.get('higher_is_better', {}).get( task_name, {} ) @@ -348,9 +348,6 @@ def _transform_single( 'task_name': task_name, } - additional_details = {} - if raw_data.get('group_subtasks', None): - additional_details['group_subtasks'] = next(iter(raw_data['group_subtasks'])) or "" return EvaluationLog( schema_version=SCHEMA_VERSION, evaluation_id=evaluation_id, @@ -360,7 +357,6 @@ def _transform_single( eval_library=eval_library, model_info=model_info, evaluation_results=evaluation_results, - additional_details=additional_details, ) def transform_from_file( diff --git a/every_eval_ever/eval_types.py b/every_eval_ever/eval_types.py index 9a8f854bb..40035403b 100644 --- a/every_eval_ever/eval_types.py +++ b/every_eval_ever/eval_types.py @@ -518,8 +518,3 @@ class EvaluationLog(BaseModel): None, description='Reference to the evaluation results for all individual samples in the evaluation', ) - - additional_details: dict[str, str] | None = Field( - None, - description='Additional parameters (key-value pairs, all values must be strings)', - ) diff --git a/every_eval_ever/merge_seeds.py b/every_eval_ever/merge_seeds.py index f1c9fb4d6..509936f23 100644 --- a/every_eval_ever/merge_seeds.py +++ b/every_eval_ever/merge_seeds.py @@ -20,25 +20,28 @@ def _extract_seed_from_log(log: EvaluationLog) -> Optional[int]: """Extract seed value from generation_config.additional_details.""" for result in log.evaluation_results: - if 
result.generation_config.additional_details: - gen_details = result.generation_config.additional_details - for key in ['seed', 'random_seed']: - if key in gen_details: - try: - seed_str = gen_details[key] - # Handle JSON-encoded values - if isinstance(seed_str, str): - if seed_str.isdigit(): - return int(seed_str) - # Try parsing JSON - try: - seed_val = json.loads(seed_str) - return int(seed_val) - except (json.JSONDecodeError, ValueError, TypeError): - pass - return int(seed_str) - except (ValueError, TypeError): - pass + try: + if result.generation_config.additional_details: + gen_details = result.generation_config.additional_details + for key in ['seed', 'random_seed']: + if key in gen_details: + try: + seed_str = gen_details[key] + # Handle JSON-encoded values + if isinstance(seed_str, str): + if seed_str.isdigit(): + return int(seed_str) + # Try parsing JSON + try: + seed_val = json.loads(seed_str) + return int(seed_val) + except (json.JSONDecodeError, ValueError, TypeError): + pass + return int(seed_str) + except (ValueError, TypeError): + pass + except: + pass return None @@ -98,7 +101,6 @@ def merge_seed_runs( for log in logs: seed = _extract_seed_from_log(log) - print(seed) for result in log.evaluation_results: key = _make_grouping_key(log, result) grouped_results[key].append((result, seed, log)) @@ -139,10 +141,12 @@ def merge_seed_runs( # Create merged result with additional seed details additional_seed_details = { 'seed_scores': json.dumps(scores), - 'seed_values': json.dumps(seed_values) if seed_values else json.dumps([]), 'evaluation_timestamps': json.dumps(evaluation_timestamps), } + if seed_values: + additional_seed_details['seed_values'] = json.dumps(seed_values) + # Preserve original details if any, and add seed info score_details_dict = dict(template_result.score_details.details or {}) score_details_dict.update(additional_seed_details) From 0aa15d5830a597972f90ae746906bd1a5cdd52d3 Mon Sep 17 00:00:00 2001 From: Chibukach Date: Thu, 14 May 2026 
15:01:31 +0000 Subject: [PATCH 5/6] add updated cli --- every_eval_ever/cli.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/every_eval_ever/cli.py b/every_eval_ever/cli.py index 82502eb41..aba6943e9 100644 --- a/every_eval_ever/cli.py +++ b/every_eval_ever/cli.py @@ -251,16 +251,12 @@ def _cmd_average_scores(args: argparse.Namespace, current_time: str) -> int: for result_file in result_files: print(result_file) log = EvaluationLog.model_validate_json(result_file.read_text(encoding='utf-8')) - if log.additional_details.get('group_subtasks', None) != benchmark : - # skip subtasks - continue - logs.append(log) all_logs = merge_seed_runs(logs) - for log in all_logs: - eval_uuid = str(uuid.uuid4()) - print(f"Average log: {_write_log(log, output_dir, eval_uuid=eval_uuid)}") + for log in all_logs: + eval_uuid = str(uuid.uuid4()) + print(f"Average {benchmark}: {_write_log(log, output_dir, eval_uuid=eval_uuid)}") return 0 @@ -274,6 +270,7 @@ def build_parser() -> argparse.ArgumentParser: epilog=( 'Examples:\n' ' every_eval_ever convert lm_eval --log_path results.json --output_dir data\n' + ' every_eval_ever convert lm_eval --log_path results.json --output_dir data\n --average-scores' ' every_eval_ever convert inspect --log_path inspect_log.json --output_dir data\n' ' every_eval_ever convert helm --log_path helm_run_dir --output_dir data' ), From 75e726204b3d7bc875def9c35f33979d279356e2 Mon Sep 17 00:00:00 2001 From: Chibukach Date: Thu, 14 May 2026 19:06:35 +0000 Subject: [PATCH 6/6] add tests for merge seeds --- tests/test_merge_seeds.py | 254 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 254 insertions(+) create mode 100644 tests/test_merge_seeds.py diff --git a/tests/test_merge_seeds.py b/tests/test_merge_seeds.py new file mode 100644 index 000000000..eeba5b52a --- /dev/null +++ b/tests/test_merge_seeds.py @@ -0,0 +1,254 @@ +"""Tests for every_eval_ever.merge_seeds.""" + +import json +import statistics + +import pytest + +from 
every_eval_ever.eval_types import ( + EvalLibrary, + EvaluationLog, + EvaluationResult, + EvaluatorRelationship, + GenerationConfig, + MetricConfig, + ModelInfo, + ScoreDetails, + ScoreType, + SourceDataPrivate, + SourceMetadata, +) +from every_eval_ever.merge_seeds import ( + _extract_seed_from_log, + _make_grouping_key, + merge_seed_runs, +) + + +def _source_data(name: str = 'bench_ds') -> SourceDataPrivate: + return SourceDataPrivate(dataset_name=name, source_type='other') + + +def _metric(description: str = 'exact_match') -> MetricConfig: + return MetricConfig( + evaluation_description=description, + lower_is_better=False, + score_type=ScoreType.continuous, + min_score=0.0, + max_score=1.0, + ) + + +def _result( + score: float, + *, + eval_name: str = 'gsm8k', + metric_desc: str = 'exact_match', + dataset_name: str = 'bench_ds', + evaluation_timestamp: str = '1700000000', + seed_key: str = 'seed', + seed_value: str = '42', + score_details_details: dict | None = None, +) -> EvaluationResult: + gen_details = {seed_key: seed_value} + return EvaluationResult( + evaluation_name=eval_name, + source_data=_source_data(dataset_name), + evaluation_timestamp=evaluation_timestamp, + metric_config=_metric(metric_desc), + score_details=ScoreDetails( + score=score, + details=score_details_details, + ), + generation_config=GenerationConfig( + generation_args=None, + additional_details=gen_details, + ), + ) + + +def _log( + *results: EvaluationResult, + model_id: str = 'RedHatAI/Qwen3-4B-Thinking-2507', + evaluation_id: str = 'run/1', + retrieved_timestamp: str = '1700000001', + evaluation_timestamp: str | None = '1700000100', + model_end_time: str | None = None, +) -> EvaluationLog: + add = {'end_time': model_end_time} if model_end_time else None + return EvaluationLog( + schema_version='0.2.0', + evaluation_id=evaluation_id, + retrieved_timestamp=retrieved_timestamp, + evaluation_timestamp=evaluation_timestamp, + source_metadata=SourceMetadata( + source_name='test', + 
source_type='evaluation_run', + source_organization_name='Org', + evaluator_relationship=EvaluatorRelationship.first_party, + ), + eval_library=EvalLibrary(name='lm_eval', version='0.4'), + model_info=ModelInfo( + name=model_id, + id=model_id, + additional_details=add, + ), + evaluation_results=list(results), + ) + + +def test_merge_empty_returns_empty(): + assert merge_seed_runs([]) == [] + + +def test_merge_single_log_returns_same_list(): + r = _result(0.7, seed_value='1') + log = _log(r) + logs_in = [log] + out = merge_seed_runs(logs_in) + assert out is logs_in + assert out[0] is log + + +def test_extract_seed_digit_string(): + log = _log(_result(0.0, seed_value='3344')) + assert _extract_seed_from_log(log) == 3344 + + +def test_extract_seed_random_seed_key(): + res = _result( + 0.0, + seed_key='random_seed', + seed_value='99', + ) + assert _extract_seed_from_log(_log(res)) == 99 + + +def test_extract_seed_json_encoded_string(): + res_digit = _result(0.0, seed_value='3344') + assert _extract_seed_from_log(_log(res_digit)) == 3344 + + res_json = _result(0.0, seed_value='"3344"') + assert _extract_seed_from_log(_log(res_json)) == 3344 + + +def test_extract_seed_returns_none_without_generation_config(): + res = EvaluationResult( + evaluation_name='t', + source_data=_source_data(), + metric_config=_metric(), + score_details=ScoreDetails(score=0.5), + generation_config=None, + ) + assert _extract_seed_from_log(_log(res)) is None + + +def test_make_grouping_key(): + log = _log(_result(0.1, eval_name='a', metric_desc='m', dataset_name='d1')) + key = _make_grouping_key(log, log.evaluation_results[0]) + assert key == (log.model_info.id, 'a', 'm', 'd1') + + +def test_merge_two_seeds_averages_score_and_stderr(): + r1 = _result( + 0.4, + evaluation_timestamp='1700000000', + seed_value='1', + ) + r2 = _result( + 0.6, + evaluation_timestamp='1700000002', + seed_value='2', + ) + merged = merge_seed_runs([_log(r1), _log(r2)], num_input_paths=2) + assert len(merged) == 1 + mlog 
= merged[0] + assert len(mlog.evaluation_results) == 1 + mr = mlog.evaluation_results[0] + assert mr.score_details.score == pytest.approx(0.5) + std_dev = statistics.stdev([0.4, 0.6]) + expected_se = std_dev / (2**0.5) + assert mr.score_details.uncertainty.standard_error.value == pytest.approx( + expected_se + ) + assert mr.score_details.uncertainty.standard_error.method == 'across_seeds' + assert mr.score_details.uncertainty.num_samples == 2 + + d = mr.score_details.details + assert json.loads(d['seed_scores']) == [0.4, 0.6] + assert json.loads(d['seed_values']) == [1, 2] + assert json.loads(d['evaluation_timestamps']) == [1700000000, 1700000002] + + +def test_merge_identical_scores_zero_standard_error(): + r1 = _result(0.5, evaluation_timestamp='1', seed_value='1') + r2 = _result(0.5, evaluation_timestamp='2', seed_value='2') + merged = merge_seed_runs([_log(r1), _log(r2)]) + se = merged[0].evaluation_results[0].score_details.uncertainty.standard_error + assert se.value == 0.0 + + +def test_merge_different_models_raises(): + r1 = _result(0.1) + r2 = _result(0.2) + with pytest.raises(ValueError, match='different models'): + merge_seed_runs( + [ + _log(r1, model_id='a/m1'), + _log(r2, model_id='a/m2'), + ] + ) + + +def test_merge_no_complete_groups_raises(): + """Each group appears once but two inputs expected → nothing merged.""" + r1 = _result(0.1, eval_name='only_a') + r2 = _result(0.2, eval_name='only_b') + with pytest.raises(ValueError, match='No evaluation results could be averaged'): + merge_seed_runs([_log(r1), _log(r2)], num_input_paths=2) + + +def test_merge_expected_size_larger_than_group_raises(): + r1 = _result(0.4, evaluation_timestamp='1') + r2 = _result(0.6, evaluation_timestamp='2') + with pytest.raises(ValueError, match='No evaluation results could be averaged'): + merge_seed_runs([_log(r1), _log(r2)], num_input_paths=3) + + +def test_merge_model_info_num_seeds_and_time_range(): + r1 = _result(0.0, evaluation_timestamp='100', seed_value='1') + r2 
= _result(0.0, evaluation_timestamp='200', seed_value='2') + merged = merge_seed_runs( + [ + _log(r1, model_end_time='500', evaluation_timestamp='100'), + _log(r2, model_end_time='600', evaluation_timestamp='200'), + ], + num_input_paths=2, + ) + mi = merged[0].model_info + assert mi.additional_details['num_seeds_merged'] == '2' + assert merged[0].evaluation_timestamp == '100' + time_range = json.loads(mi.additional_details['evaluation_time_range']) + assert time_range == {'start_time': 100, 'end_time': 600} + + +def test_merge_preserves_prior_score_details_strings(): + r1 = _result( + 0.1, + evaluation_timestamp='1', + score_details_details={'keep': 'yes'}, + ) + r2 = _result(0.3, evaluation_timestamp='2') + merged = merge_seed_runs([_log(r1), _log(r2)]) + d = merged[0].evaluation_results[0].score_details.details + assert d['keep'] == 'yes' + assert 'seed_scores' in d + + +def test_merge_detailed_evaluation_results_cleared(): + r1 = _result(0.1, evaluation_timestamp='1') + r2 = _result(0.3, evaluation_timestamp='2') + log1 = _log(r1) + # EvaluationLog may not have detailed_evaluation_results in constructor default None + merged = merge_seed_runs([log1, _log(r2)]) + assert merged[0].detailed_evaluation_results is None