From 09288d4aeeb6eb747d316dc00468eaad048b3c49 Mon Sep 17 00:00:00 2001 From: Chibukach Date: Mon, 11 May 2026 14:44:29 +0000 Subject: [PATCH 1/2] add swebench adapter --- every_eval_ever/cli.py | 63 ++++- every_eval_ever/converters/common/adapter.py | 1 + .../converters/swebench/__init__.py | 5 + .../converters/swebench/adapter.py | 256 ++++++++++++++++++ every_eval_ever/eval_types.py | 12 + 5 files changed, 336 insertions(+), 1 deletion(-) create mode 100644 every_eval_ever/converters/swebench/__init__.py create mode 100644 every_eval_ever/converters/swebench/adapter.py diff --git a/every_eval_ever/cli.py b/every_eval_ever/cli.py index c215c89cd..143fd3c49 100644 --- a/every_eval_ever/cli.py +++ b/every_eval_ever/cli.py @@ -58,6 +58,37 @@ def _write_log( return out_file +def _cmd_convert_swebench(args: argparse.Namespace) -> int: + from every_eval_ever.converters.swebench.adapter import SWEBenchAdapter + + adapter = SWEBenchAdapter() + metadata = _common_metadata(args) + metadata['model_id'] = args.model_id + if args.benchmark_name: + metadata['benchmark_name'] = args.benchmark_name + if args.hf_repo: + metadata['hf_repo'] = args.hf_repo + + log_path = Path(args.log_path) + metadata['parent_eval_output_dir'] = str( + log_path.parent if log_path.is_file() else log_path + ) + + if log_path.is_file(): + logs = adapter.transform_from_file(log_path, metadata) + elif log_path.is_dir(): + logs = adapter.transform_from_directory(log_path, metadata) + else: + raise FileNotFoundError(f'Path is not a file or directory: {log_path}') + + output_dir = Path(args.output_dir) + for log in logs: + print(_write_log(log, output_dir)) + + print(f'Converted {len(logs)} evaluation log(s).') + return 0 + + def _cmd_convert_lm_eval(args: argparse.Namespace) -> int: from every_eval_ever.converters.lm_eval.adapter import LMEvalAdapter from every_eval_ever.converters.lm_eval.instance_level_adapter import ( @@ -298,7 +329,7 @@ def build_parser() -> argparse.ArgumentParser: dest='source', 
required=True ) - for source in ['lm_eval', 'inspect', 'helm', 'alpaca_eval']: + for source in ['lm_eval', 'inspect', 'helm', 'alpaca_eval', 'swebench']: source_parser = convert_subparsers.add_parser( source, help=f'Convert {source} logs', @@ -385,6 +416,34 @@ def build_parser() -> argparse.ArgumentParser: help='Inference engine version to record in model_info.inference_engine.version.', ) + if source == 'swebench': + source_parser.add_argument( + '--model_id', + '--model-id', + required=True, + help=( + 'Model identifier (e.g. org/model). Required because SWE-bench ' + 'evaluation.json summaries do not include model metadata.' + ), + ) + source_parser.add_argument( + '--benchmark_name', + '--benchmark-name', + default=None, + help=( + 'Benchmark label for dataset_name and metrics (default: SWE-bench).' + ), + ) + source_parser.add_argument( + '--hf_repo', + '--hf-repo', + default=None, + help=( + 'Hugging Face dataset repo id for source_data (default: ' + 'princeton-nlp/SWE-bench).' + ), + ) + return parser @@ -421,6 +480,8 @@ def main(argv: list[str] | None = None) -> int: return _cmd_convert_helm(args) if args.source == 'alpaca_eval': return _cmd_convert_alpaca_eval(args) + if args.source == 'swebench': + return _cmd_convert_swebench(args) parser.print_help() return 1 diff --git a/every_eval_ever/converters/common/adapter.py b/every_eval_ever/converters/common/adapter.py index 4598ee6de..98bba42bc 100644 --- a/every_eval_ever/converters/common/adapter.py +++ b/every_eval_ever/converters/common/adapter.py @@ -31,6 +31,7 @@ class SupportedLibrary(Enum): LM_EVAL = 'lm-evaluation-harness' INSPECT_AI = 'inspect-ai' HELM = 'helm' + SWE_BENCH = 'swebench' CUSTOM = 'custom' diff --git a/every_eval_ever/converters/swebench/__init__.py b/every_eval_ever/converters/swebench/__init__.py new file mode 100644 index 000000000..df2e5b93c --- /dev/null +++ b/every_eval_ever/converters/swebench/__init__.py @@ -0,0 +1,5 @@ +"""SWE-bench aggregate evaluation summary 
(evaluation.json) converter.""" + +from .adapter import SWEBenchAdapter + +__all__ = ['SWEBenchAdapter'] diff --git a/every_eval_ever/converters/swebench/adapter.py b/every_eval_ever/converters/swebench/adapter.py new file mode 100644 index 000000000..d4f16a509 --- /dev/null +++ b/every_eval_ever/converters/swebench/adapter.py @@ -0,0 +1,256 @@ +"""Adapter for SWE-bench-style aggregate summaries (e.g. evaluation.json).""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any, Dict, List, Union + +from every_eval_ever.converters import SCHEMA_VERSION +from every_eval_ever.converters.common.adapter import ( + AdapterMetadata, + BaseEvaluationAdapter, + SupportedLibrary, +) +from every_eval_ever.converters.common.error import AdapterError, TransformationError +from every_eval_ever.converters.common.utils import get_current_unix_timestamp +from every_eval_ever.eval_types import ( + AgenticEvalConfig, + AvailableTool, + EvalLibrary, + EvaluationLog, + EvaluationResult, + EvaluatorRelationship, + GenerationArgs, + GenerationConfig, + MetricConfig, + ModelInfo, + ScoreDetails, + ScoreType, + SourceDataHf, + SourceMetadata, + SourceType, +) +from every_eval_ever.helpers import get_developer, get_model_id + + +class SWEBenchAdapter(BaseEvaluationAdapter): + """Converts SWE-bench harness aggregate JSON summaries to every_eval_ever format.""" + + DEFAULT_BENCHMARK_NAME = 'SWE-bench' + DEFAULT_HF_REPO = 'princeton-nlp/SWE-bench' + + @property + def metadata(self) -> AdapterMetadata: + return AdapterMetadata( + name='swebench-adapter', + version='0.1.0', + supported_library_versions=['2.*'], + description=( + 'Converts SWE-bench aggregate evaluation.json summaries ' + 'to every_eval_ever format' + ), + ) + + @property + def supported_library(self) -> SupportedLibrary: + return SupportedLibrary.SWE_BENCH + + def _extract_model_info( + self, + raw_data: Dict[str, Any], + metadata_args: Dict[str, Any], + ) -> ModelInfo: + model_id_input = 
(metadata_args.get('model_id') or '').strip() + if not model_id_input: + raise TransformationError( + 'model_id is required in metadata (SWE-bench summary JSON has no model field)' + ) + + developer = get_developer(model_id_input) + model_id = get_model_id(model_id_input, developer) + + additional: Dict[str, str] = {} + if raw_data.get('schema_version') is not None: + additional['swe_summary_schema_version'] = str( + raw_data['schema_version'] + ) + fn = metadata_args.get('input_filename') + if fn: + additional['source_summary_file'] = str(fn) + + return ModelInfo( + name=model_id_input, + id=model_id, + developer=developer if developer != 'unknown' else None, + additional_details=additional if additional else None, + ) + + def _build_evaluation_results( + self, + raw_data: Dict[str, Any], + metadata_args: Dict[str, Any], + ) -> List[EvaluationResult]: + + total = raw_data.get("total_instances",0) + + resolved = raw_data.get("resolved_instances",0) + submitted = raw_data.get("submitted_instances",0) + completed = raw_data.get("completed_instances",0) + unresolved = raw_data.get("unresolved_instances",0) + empty_patch = raw_data.get("empty_patch_instances",0) + errors = raw_data.get("error_instances",0) + + score = (resolved / total) if total else 0.0 + + benchmark_name = ( + metadata_args.get('benchmark_name') or self.DEFAULT_BENCHMARK_NAME + ) + hf_repo = metadata_args.get('hf_repo') or self.DEFAULT_HF_REPO + + details: Dict[str, str] = { + 'total_instances': str(total), + 'resolved_instances': str(resolved), + 'submitted_instances': str(submitted), + 'completed_instances': str(completed), + 'unresolved_instances': str(unresolved), + 'empty_patch_instances': str(empty_patch), + 'error_instances': str(errors), + } + + completed_ids = raw_data.get("completed_ids", []) + submitted_ids = raw_data.get("submitted_ids", []) + resolved_ids = raw_data.get("resolved_ids", []) + + source_data = SourceDataHf( + dataset_name=benchmark_name, + source_type='hf_dataset', + 
hf_repo=hf_repo, + hf_split='test', + samples_number=total, + ) + + return [ + EvaluationResult( + evaluation_name=benchmark_name, + source_data=source_data, + metric_config=MetricConfig( + evaluation_description=( + 'Resolve rate: fraction of benchmark instances where the ' + 'submitted patch passes verification (0.0–1.0)' + ), + lower_is_better=False, + score_type=ScoreType.continuous, + min_score=0.0, + max_score=1.0, + ), + score_details=ScoreDetails( + score=score, + details=details, + completed_ids=completed_ids, + submitted_ids=submitted_ids, + resolved_ids=resolved_ids, + ), + generation_config=GenerationConfig( + generation_args=GenerationArgs( + agentic_eval_config=AgenticEvalConfig( + available_tools=[AvailableTool(name='bash')], + ), + ), + ), + ) + ] + + def _transform_single( + self, raw_data: Any, metadata_args: Dict[str, Any] + ) -> EvaluationLog: + if not isinstance(raw_data, dict): + raise TransformationError( + f'Expected a JSON object, got {type(raw_data).__name__}' + ) + + # Minimal validation: harness summaries always include these keys + if 'total_instances' not in raw_data or 'resolved_instances' not in raw_data: + raise TransformationError( + 'SWE-bench summary must include total_instances and resolved_instances' + ) + + model_info = self._extract_model_info(raw_data, metadata_args) + evaluation_results = self._build_evaluation_results( + raw_data, metadata_args + ) + + retrieved_timestamp = get_current_unix_timestamp() + + benchmark_name = ( + metadata_args.get('benchmark_name') or self.DEFAULT_BENCHMARK_NAME + ) + + bench_slug = benchmark_name.replace('/', '_') + model_slug = model_info.id.replace('/', '_') + input_slug = str(metadata_args.get('input_filename') or 'evaluation') + + evaluation_id = ( + f'{bench_slug}/{model_slug}/{input_slug}/{retrieved_timestamp}' + ) + + evaluator_rel_str = metadata_args.get( + 'evaluator_relationship', 'third_party' + ) + evaluator_relationship = EvaluatorRelationship(evaluator_rel_str) + + eval_library 
= EvalLibrary( + name=metadata_args.get('eval_library_name', 'swebench'), + version=metadata_args.get('eval_library_version', 'unknown'), + ) + + source_metadata = SourceMetadata( + source_name='SWE-bench', + source_type=SourceType.evaluation_run, + source_organization_name=metadata_args.get( + 'source_organization_name', '' + ), + source_organization_url=metadata_args.get( + 'source_organization_url' + ), + source_organization_logo_url=metadata_args.get( + 'source_organization_logo_url' + ), + evaluator_relationship=evaluator_relationship, + ) + + return EvaluationLog( + schema_version=SCHEMA_VERSION, + evaluation_id=evaluation_id, + retrieved_timestamp=retrieved_timestamp, + evaluation_timestamp=None, + source_metadata=source_metadata, + eval_library=eval_library, + model_info=model_info, + evaluation_results=evaluation_results, + ) + + def transform_from_file( + self, file_path: Union[str, Path], metadata_args: Dict[str, Any] + ) -> List[EvaluationLog]: + file_path = Path(file_path) + raw_data = self._load_file(file_path) + merged_meta: Dict[str, Any] = { + **metadata_args, + 'input_filename': file_path.name, + 'parent_eval_output_dir': str(file_path.parent), + } + return [self._transform_single(raw_data, merged_meta)] + + def transform_from_directory( + self, dir_path: Union[str, Path], metadata_args: Dict[str, Any] + ) -> List[EvaluationLog]: + """Transform all json results files in a directory.""" + dir_path = Path(dir_path) + results_files = sorted(dir_path.glob('**/*.json')) + + all_logs = [] + for results_file in results_files: + logs = self.transform_from_file(results_file, metadata_args) + all_logs.extend(logs) + + return all_logs \ No newline at end of file diff --git a/every_eval_ever/eval_types.py b/every_eval_ever/eval_types.py index 40035403b..ef8dddba4 100644 --- a/every_eval_ever/eval_types.py +++ b/every_eval_ever/eval_types.py @@ -142,6 +142,18 @@ class ScoreDetails(BaseModel): None, description='Quantification of uncertainty around the reported 
score', ) + completed_ids: list[str] | None = Field( + None, + description='Instance ids that finished processing (e.g. SWE-bench evaluation.json)', + ) + submitted_ids: list[str] | None = Field( + None, + description='Instance ids that had a submission (e.g. SWE-bench evaluation.json)', + ) + resolved_ids: list[str] | None = Field( + None, + description='Instance ids that passed verification (e.g. SWE-bench evaluation.json)', + ) class AvailableTool(BaseModel): From ee0e03c1223ce70c8d31e67254c5d01c2485fb6b Mon Sep 17 00:00:00 2001 From: Chibukach Date: Thu, 14 May 2026 18:52:31 +0000 Subject: [PATCH 2/2] add tests for swebench adapter --- tests/data/swebench/evaluation.json | 875 ++++++++++++++++++++++++++++ tests/test_swebench_adapter.py | 248 ++++++++ 2 files changed, 1123 insertions(+) create mode 100644 tests/data/swebench/evaluation.json create mode 100644 tests/test_swebench_adapter.py diff --git a/tests/data/swebench/evaluation.json b/tests/data/swebench/evaluation.json new file mode 100644 index 000000000..71d512df2 --- /dev/null +++ b/tests/data/swebench/evaluation.json @@ -0,0 +1,875 @@ +{ + "total_instances": 300, + "submitted_instances": 300, + "completed_instances": 252, + "resolved_instances": 156, + "unresolved_instances": 96, + "empty_patch_instances": 4, + "error_instances": 44, + "completed_ids": [ + "astropy__astropy-12907", + "astropy__astropy-14365", + "astropy__astropy-14995", + "astropy__astropy-6938", + "astropy__astropy-7746", + "django__django-10914", + "django__django-10924", + "django__django-11001", + "django__django-11019", + "django__django-11039", + "django__django-11049", + "django__django-11099", + "django__django-11133", + "django__django-11179", + "django__django-11283", + "django__django-11583", + "django__django-11620", + "django__django-11742", + "django__django-11797", + "django__django-11815", + "django__django-11848", + "django__django-11905", + "django__django-11910", + "django__django-11999", + "django__django-12113", + 
"django__django-12284", + "django__django-12308", + "django__django-12470", + "django__django-12497", + "django__django-12589", + "django__django-12747", + "django__django-12856", + "django__django-12908", + "django__django-12915", + "django__django-12983", + "django__django-13028", + "django__django-13220", + "django__django-13230", + "django__django-13265", + "django__django-13321", + "django__django-13401", + "django__django-13551", + "django__django-13590", + "django__django-13658", + "django__django-13660", + "django__django-13710", + "django__django-13757", + "django__django-13768", + "django__django-13925", + "django__django-13933", + "django__django-14016", + "django__django-14017", + "django__django-14155", + "django__django-14382", + "django__django-14534", + "django__django-14580", + "django__django-14608", + "django__django-14667", + "django__django-14672", + "django__django-14730", + "django__django-14752", + "django__django-14787", + "django__django-14855", + "django__django-14915", + "django__django-14999", + "django__django-15061", + "django__django-15202", + "django__django-15213", + "django__django-15320", + "django__django-15388", + "django__django-15400", + "django__django-15695", + "django__django-15738", + "django__django-15781", + "django__django-15789", + "django__django-15790", + "django__django-15819", + "django__django-15851", + "django__django-16041", + "django__django-16046", + "django__django-16139", + "django__django-16255", + "django__django-16379", + "django__django-16595", + "django__django-16816", + "django__django-16820", + "django__django-16873", + "django__django-16910", + "django__django-17051", + "django__django-17087", + "matplotlib__matplotlib-18869", + "matplotlib__matplotlib-22711", + "matplotlib__matplotlib-23299", + "matplotlib__matplotlib-23314", + "matplotlib__matplotlib-23476", + "matplotlib__matplotlib-23562", + "matplotlib__matplotlib-23913", + "matplotlib__matplotlib-23964", + "matplotlib__matplotlib-23987", + 
"matplotlib__matplotlib-24149", + "matplotlib__matplotlib-24265", + "matplotlib__matplotlib-24334", + "matplotlib__matplotlib-25079", + "matplotlib__matplotlib-25311", + "matplotlib__matplotlib-25332", + "matplotlib__matplotlib-25433", + "matplotlib__matplotlib-25442", + "matplotlib__matplotlib-25498", + "matplotlib__matplotlib-26011", + "matplotlib__matplotlib-26020", + "mwaskom__seaborn-2848", + "mwaskom__seaborn-3010", + "mwaskom__seaborn-3190", + "mwaskom__seaborn-3407", + "pallets__flask-4045", + "pallets__flask-4992", + "pallets__flask-5063", + "psf__requests-1963", + "psf__requests-2148", + "psf__requests-2317", + "psf__requests-2674", + "psf__requests-3362", + "psf__requests-863", + "pydata__xarray-3364", + "pydata__xarray-4094", + "pydata__xarray-4248", + "pydata__xarray-4493", + "pylint-dev__pylint-5859", + "pylint-dev__pylint-6506", + "pylint-dev__pylint-7228", + "pylint-dev__pylint-7993", + "pytest-dev__pytest-11143", + "pytest-dev__pytest-11148", + "pytest-dev__pytest-5103", + "pytest-dev__pytest-5221", + "pytest-dev__pytest-5227", + "pytest-dev__pytest-5413", + "pytest-dev__pytest-5495", + "pytest-dev__pytest-5692", + "pytest-dev__pytest-6116", + "pytest-dev__pytest-7168", + "pytest-dev__pytest-7220", + "pytest-dev__pytest-7373", + "pytest-dev__pytest-7432", + "pytest-dev__pytest-7490", + "pytest-dev__pytest-8365", + "pytest-dev__pytest-8906", + "pytest-dev__pytest-9359", + "scikit-learn__scikit-learn-10297", + "scikit-learn__scikit-learn-10949", + "scikit-learn__scikit-learn-11040", + "scikit-learn__scikit-learn-11281", + "scikit-learn__scikit-learn-12471", + "scikit-learn__scikit-learn-13142", + "scikit-learn__scikit-learn-13241", + "scikit-learn__scikit-learn-13439", + "scikit-learn__scikit-learn-13497", + "scikit-learn__scikit-learn-13584", + "scikit-learn__scikit-learn-13779", + "scikit-learn__scikit-learn-14087", + "scikit-learn__scikit-learn-14092", + "scikit-learn__scikit-learn-14894", + "scikit-learn__scikit-learn-14983", + 
"scikit-learn__scikit-learn-15512", + "scikit-learn__scikit-learn-15535", + "scikit-learn__scikit-learn-25500", + "scikit-learn__scikit-learn-25570", + "scikit-learn__scikit-learn-25638", + "scikit-learn__scikit-learn-25747", + "sphinx-doc__sphinx-10325", + "sphinx-doc__sphinx-10451", + "sphinx-doc__sphinx-11445", + "sphinx-doc__sphinx-7738", + "sphinx-doc__sphinx-7975", + "sphinx-doc__sphinx-8273", + "sphinx-doc__sphinx-8435", + "sphinx-doc__sphinx-8474", + "sphinx-doc__sphinx-8506", + "sphinx-doc__sphinx-8595", + "sphinx-doc__sphinx-8627", + "sphinx-doc__sphinx-8713", + "sphinx-doc__sphinx-8721", + "sphinx-doc__sphinx-8801", + "sympy__sympy-11400", + "sympy__sympy-11870", + "sympy__sympy-11897", + "sympy__sympy-12171", + "sympy__sympy-12236", + "sympy__sympy-12419", + "sympy__sympy-12454", + "sympy__sympy-12481", + "sympy__sympy-13146", + "sympy__sympy-13177", + "sympy__sympy-13437", + "sympy__sympy-13471", + "sympy__sympy-13480", + "sympy__sympy-13647", + "sympy__sympy-13773", + "sympy__sympy-13895", + "sympy__sympy-13915", + "sympy__sympy-13971", + "sympy__sympy-14024", + "sympy__sympy-14317", + "sympy__sympy-14396", + "sympy__sympy-14774", + "sympy__sympy-14817", + "sympy__sympy-15011", + "sympy__sympy-15308", + "sympy__sympy-15345", + "sympy__sympy-15346", + "sympy__sympy-15609", + "sympy__sympy-15678", + "sympy__sympy-16106", + "sympy__sympy-16281", + "sympy__sympy-16503", + "sympy__sympy-16792", + "sympy__sympy-17022", + "sympy__sympy-17139", + "sympy__sympy-17630", + "sympy__sympy-17655", + "sympy__sympy-18189", + "sympy__sympy-18199", + "sympy__sympy-18532", + "sympy__sympy-18621", + "sympy__sympy-18698", + "sympy__sympy-18835", + "sympy__sympy-19007", + "sympy__sympy-19254", + "sympy__sympy-19487", + "sympy__sympy-20049", + "sympy__sympy-20154", + "sympy__sympy-20212", + "sympy__sympy-20322", + "sympy__sympy-20442", + "sympy__sympy-20590", + "sympy__sympy-20639", + "sympy__sympy-21055", + "sympy__sympy-21171", + "sympy__sympy-21612", + 
"sympy__sympy-21614", + "sympy__sympy-21627", + "sympy__sympy-21847", + "sympy__sympy-22005", + "sympy__sympy-22714", + "sympy__sympy-22840", + "sympy__sympy-23191", + "sympy__sympy-23262", + "sympy__sympy-24066", + "sympy__sympy-24102", + "sympy__sympy-24152", + "sympy__sympy-24213", + "sympy__sympy-24909" + ], + "incomplete_ids": [], + "empty_patch_ids": [ + "django__django-15252", + "django__django-16400", + "sympy__sympy-14308", + "sympy__sympy-16988" + ], + "submitted_ids": [ + "astropy__astropy-12907", + "astropy__astropy-14182", + "astropy__astropy-14365", + "astropy__astropy-14995", + "astropy__astropy-6938", + "astropy__astropy-7746", + "django__django-10914", + "django__django-10924", + "django__django-11001", + "django__django-11019", + "django__django-11039", + "django__django-11049", + "django__django-11099", + "django__django-11133", + "django__django-11179", + "django__django-11283", + "django__django-11422", + "django__django-11564", + "django__django-11583", + "django__django-11620", + "django__django-11630", + "django__django-11742", + "django__django-11797", + "django__django-11815", + "django__django-11848", + "django__django-11905", + "django__django-11910", + "django__django-11964", + "django__django-11999", + "django__django-12113", + "django__django-12125", + "django__django-12184", + "django__django-12284", + "django__django-12286", + "django__django-12308", + "django__django-12453", + "django__django-12470", + "django__django-12497", + "django__django-12589", + "django__django-12700", + "django__django-12708", + "django__django-12747", + "django__django-12856", + "django__django-12908", + "django__django-12915", + "django__django-12983", + "django__django-13028", + "django__django-13033", + "django__django-13158", + "django__django-13220", + "django__django-13230", + "django__django-13265", + "django__django-13315", + "django__django-13321", + "django__django-13401", + "django__django-13447", + "django__django-13448", + 
"django__django-13551", + "django__django-13590", + "django__django-13658", + "django__django-13660", + "django__django-13710", + "django__django-13757", + "django__django-13768", + "django__django-13925", + "django__django-13933", + "django__django-13964", + "django__django-14016", + "django__django-14017", + "django__django-14155", + "django__django-14238", + "django__django-14382", + "django__django-14411", + "django__django-14534", + "django__django-14580", + "django__django-14608", + "django__django-14667", + "django__django-14672", + "django__django-14730", + "django__django-14752", + "django__django-14787", + "django__django-14855", + "django__django-14915", + "django__django-14997", + "django__django-14999", + "django__django-15061", + "django__django-15202", + "django__django-15213", + "django__django-15252", + "django__django-15320", + "django__django-15347", + "django__django-15388", + "django__django-15400", + "django__django-15498", + "django__django-15695", + "django__django-15738", + "django__django-15781", + "django__django-15789", + "django__django-15790", + "django__django-15814", + "django__django-15819", + "django__django-15851", + "django__django-15902", + "django__django-15996", + "django__django-16041", + "django__django-16046", + "django__django-16139", + "django__django-16229", + "django__django-16255", + "django__django-16379", + "django__django-16400", + "django__django-16408", + "django__django-16527", + "django__django-16595", + "django__django-16816", + "django__django-16820", + "django__django-16873", + "django__django-16910", + "django__django-17051", + "django__django-17087", + "matplotlib__matplotlib-18869", + "matplotlib__matplotlib-22711", + "matplotlib__matplotlib-22835", + "matplotlib__matplotlib-23299", + "matplotlib__matplotlib-23314", + "matplotlib__matplotlib-23476", + "matplotlib__matplotlib-23562", + "matplotlib__matplotlib-23563", + "matplotlib__matplotlib-23913", + "matplotlib__matplotlib-23964", + 
"matplotlib__matplotlib-23987", + "matplotlib__matplotlib-24149", + "matplotlib__matplotlib-24265", + "matplotlib__matplotlib-24334", + "matplotlib__matplotlib-24970", + "matplotlib__matplotlib-25079", + "matplotlib__matplotlib-25311", + "matplotlib__matplotlib-25332", + "matplotlib__matplotlib-25433", + "matplotlib__matplotlib-25442", + "matplotlib__matplotlib-25498", + "matplotlib__matplotlib-26011", + "matplotlib__matplotlib-26020", + "mwaskom__seaborn-2848", + "mwaskom__seaborn-3010", + "mwaskom__seaborn-3190", + "mwaskom__seaborn-3407", + "pallets__flask-4045", + "pallets__flask-4992", + "pallets__flask-5063", + "psf__requests-1963", + "psf__requests-2148", + "psf__requests-2317", + "psf__requests-2674", + "psf__requests-3362", + "psf__requests-863", + "pydata__xarray-3364", + "pydata__xarray-4094", + "pydata__xarray-4248", + "pydata__xarray-4493", + "pydata__xarray-5131", + "pylint-dev__pylint-5859", + "pylint-dev__pylint-6506", + "pylint-dev__pylint-7080", + "pylint-dev__pylint-7114", + "pylint-dev__pylint-7228", + "pylint-dev__pylint-7993", + "pytest-dev__pytest-11143", + "pytest-dev__pytest-11148", + "pytest-dev__pytest-5103", + "pytest-dev__pytest-5221", + "pytest-dev__pytest-5227", + "pytest-dev__pytest-5413", + "pytest-dev__pytest-5495", + "pytest-dev__pytest-5692", + "pytest-dev__pytest-6116", + "pytest-dev__pytest-7168", + "pytest-dev__pytest-7220", + "pytest-dev__pytest-7373", + "pytest-dev__pytest-7432", + "pytest-dev__pytest-7490", + "pytest-dev__pytest-8365", + "pytest-dev__pytest-8906", + "pytest-dev__pytest-9359", + "scikit-learn__scikit-learn-10297", + "scikit-learn__scikit-learn-10508", + "scikit-learn__scikit-learn-10949", + "scikit-learn__scikit-learn-11040", + "scikit-learn__scikit-learn-11281", + "scikit-learn__scikit-learn-12471", + "scikit-learn__scikit-learn-13142", + "scikit-learn__scikit-learn-13241", + "scikit-learn__scikit-learn-13439", + "scikit-learn__scikit-learn-13496", + "scikit-learn__scikit-learn-13497", + 
"scikit-learn__scikit-learn-13584", + "scikit-learn__scikit-learn-13779", + "scikit-learn__scikit-learn-14087", + "scikit-learn__scikit-learn-14092", + "scikit-learn__scikit-learn-14894", + "scikit-learn__scikit-learn-14983", + "scikit-learn__scikit-learn-15512", + "scikit-learn__scikit-learn-15535", + "scikit-learn__scikit-learn-25500", + "scikit-learn__scikit-learn-25570", + "scikit-learn__scikit-learn-25638", + "scikit-learn__scikit-learn-25747", + "sphinx-doc__sphinx-10325", + "sphinx-doc__sphinx-10451", + "sphinx-doc__sphinx-11445", + "sphinx-doc__sphinx-7686", + "sphinx-doc__sphinx-7738", + "sphinx-doc__sphinx-7975", + "sphinx-doc__sphinx-8273", + "sphinx-doc__sphinx-8282", + "sphinx-doc__sphinx-8435", + "sphinx-doc__sphinx-8474", + "sphinx-doc__sphinx-8506", + "sphinx-doc__sphinx-8595", + "sphinx-doc__sphinx-8627", + "sphinx-doc__sphinx-8713", + "sphinx-doc__sphinx-8721", + "sphinx-doc__sphinx-8801", + "sympy__sympy-11400", + "sympy__sympy-11870", + "sympy__sympy-11897", + "sympy__sympy-12171", + "sympy__sympy-12236", + "sympy__sympy-12419", + "sympy__sympy-12454", + "sympy__sympy-12481", + "sympy__sympy-13031", + "sympy__sympy-13043", + "sympy__sympy-13146", + "sympy__sympy-13177", + "sympy__sympy-13437", + "sympy__sympy-13471", + "sympy__sympy-13480", + "sympy__sympy-13647", + "sympy__sympy-13773", + "sympy__sympy-13895", + "sympy__sympy-13915", + "sympy__sympy-13971", + "sympy__sympy-14024", + "sympy__sympy-14308", + "sympy__sympy-14317", + "sympy__sympy-14396", + "sympy__sympy-14774", + "sympy__sympy-14817", + "sympy__sympy-15011", + "sympy__sympy-15308", + "sympy__sympy-15345", + "sympy__sympy-15346", + "sympy__sympy-15609", + "sympy__sympy-15678", + "sympy__sympy-16106", + "sympy__sympy-16281", + "sympy__sympy-16503", + "sympy__sympy-16792", + "sympy__sympy-16988", + "sympy__sympy-17022", + "sympy__sympy-17139", + "sympy__sympy-17630", + "sympy__sympy-17655", + "sympy__sympy-18057", + "sympy__sympy-18087", + "sympy__sympy-18189", + 
"sympy__sympy-18199", + "sympy__sympy-18532", + "sympy__sympy-18621", + "sympy__sympy-18698", + "sympy__sympy-18835", + "sympy__sympy-19007", + "sympy__sympy-19254", + "sympy__sympy-19487", + "sympy__sympy-20049", + "sympy__sympy-20154", + "sympy__sympy-20212", + "sympy__sympy-20322", + "sympy__sympy-20442", + "sympy__sympy-20590", + "sympy__sympy-20639", + "sympy__sympy-21055", + "sympy__sympy-21171", + "sympy__sympy-21379", + "sympy__sympy-21612", + "sympy__sympy-21614", + "sympy__sympy-21627", + "sympy__sympy-21847", + "sympy__sympy-22005", + "sympy__sympy-22714", + "sympy__sympy-22840", + "sympy__sympy-23117", + "sympy__sympy-23191", + "sympy__sympy-23262", + "sympy__sympy-24066", + "sympy__sympy-24102", + "sympy__sympy-24152", + "sympy__sympy-24213", + "sympy__sympy-24909" + ], + "resolved_ids": [ + "astropy__astropy-12907", + "astropy__astropy-14995", + "astropy__astropy-6938", + "django__django-10914", + "django__django-10924", + "django__django-11001", + "django__django-11039", + "django__django-11049", + "django__django-11099", + "django__django-11133", + "django__django-11179", + "django__django-11283", + "django__django-11583", + "django__django-11620", + "django__django-11797", + "django__django-11910", + "django__django-11999", + "django__django-12284", + "django__django-12470", + "django__django-12856", + "django__django-12908", + "django__django-12915", + "django__django-12983", + "django__django-13028", + "django__django-13230", + "django__django-13401", + "django__django-13590", + "django__django-13658", + "django__django-13710", + "django__django-13757", + "django__django-13925", + "django__django-13933", + "django__django-14016", + "django__django-14017", + "django__django-14382", + "django__django-14580", + "django__django-14608", + "django__django-14672", + "django__django-14752", + "django__django-14787", + "django__django-14855", + "django__django-14915", + "django__django-14999", + "django__django-15061", + "django__django-15320", + 
"django__django-15388", + "django__django-15400", + "django__django-15781", + "django__django-15789", + "django__django-15790", + "django__django-15851", + "django__django-16041", + "django__django-16046", + "django__django-16139", + "django__django-16255", + "django__django-16379", + "django__django-16595", + "django__django-16820", + "django__django-16873", + "django__django-16910", + "django__django-17051", + "django__django-17087", + "matplotlib__matplotlib-18869", + "matplotlib__matplotlib-23314", + "matplotlib__matplotlib-23562", + "matplotlib__matplotlib-23913", + "matplotlib__matplotlib-23964", + "matplotlib__matplotlib-23987", + "matplotlib__matplotlib-24149", + "matplotlib__matplotlib-24265", + "matplotlib__matplotlib-25311", + "matplotlib__matplotlib-25332", + "matplotlib__matplotlib-25433", + "matplotlib__matplotlib-25442", + "matplotlib__matplotlib-26011", + "matplotlib__matplotlib-26020", + "mwaskom__seaborn-2848", + "mwaskom__seaborn-3010", + "mwaskom__seaborn-3190", + "mwaskom__seaborn-3407", + "psf__requests-1963", + "psf__requests-2148", + "psf__requests-2317", + "psf__requests-2674", + "psf__requests-3362", + "psf__requests-863", + "pydata__xarray-4094", + "pylint-dev__pylint-5859", + "pylint-dev__pylint-7993", + "pytest-dev__pytest-11143", + "pytest-dev__pytest-11148", + "pytest-dev__pytest-5227", + "pytest-dev__pytest-7168", + "pytest-dev__pytest-7373", + "pytest-dev__pytest-7432", + "pytest-dev__pytest-7490", + "scikit-learn__scikit-learn-10297", + "scikit-learn__scikit-learn-11281", + "scikit-learn__scikit-learn-12471", + "scikit-learn__scikit-learn-13142", + "scikit-learn__scikit-learn-13241", + "scikit-learn__scikit-learn-13439", + "scikit-learn__scikit-learn-13497", + "scikit-learn__scikit-learn-13584", + "scikit-learn__scikit-learn-13779", + "scikit-learn__scikit-learn-14092", + "scikit-learn__scikit-learn-14894", + "scikit-learn__scikit-learn-14983", + "scikit-learn__scikit-learn-15512", + "scikit-learn__scikit-learn-15535", + 
"scikit-learn__scikit-learn-25500", + "sphinx-doc__sphinx-10325", + "sphinx-doc__sphinx-10451", + "sphinx-doc__sphinx-11445", + "sphinx-doc__sphinx-7975", + "sphinx-doc__sphinx-8435", + "sphinx-doc__sphinx-8627", + "sphinx-doc__sphinx-8713", + "sphinx-doc__sphinx-8721", + "sphinx-doc__sphinx-8801", + "sympy__sympy-12419", + "sympy__sympy-12454", + "sympy__sympy-13471", + "sympy__sympy-13480", + "sympy__sympy-13647", + "sympy__sympy-14396", + "sympy__sympy-14817", + "sympy__sympy-15011", + "sympy__sympy-15345", + "sympy__sympy-15346", + "sympy__sympy-15609", + "sympy__sympy-15678", + "sympy__sympy-16106", + "sympy__sympy-16792", + "sympy__sympy-17139", + "sympy__sympy-17655", + "sympy__sympy-18189", + "sympy__sympy-18532", + "sympy__sympy-18621", + "sympy__sympy-18698", + "sympy__sympy-18835", + "sympy__sympy-20154", + "sympy__sympy-20212", + "sympy__sympy-20442", + "sympy__sympy-20590", + "sympy__sympy-21055", + "sympy__sympy-21612", + "sympy__sympy-21614", + "sympy__sympy-21627", + "sympy__sympy-21847", + "sympy__sympy-22005", + "sympy__sympy-22714", + "sympy__sympy-23262", + "sympy__sympy-24066", + "sympy__sympy-24152", + "sympy__sympy-24213" + ], + "unresolved_ids": [ + "astropy__astropy-14365", + "astropy__astropy-7746", + "django__django-11019", + "django__django-11742", + "django__django-11815", + "django__django-11848", + "django__django-11905", + "django__django-12113", + "django__django-12308", + "django__django-12497", + "django__django-12589", + "django__django-12747", + "django__django-13220", + "django__django-13265", + "django__django-13321", + "django__django-13551", + "django__django-13660", + "django__django-13768", + "django__django-14155", + "django__django-14534", + "django__django-14667", + "django__django-14730", + "django__django-15202", + "django__django-15213", + "django__django-15695", + "django__django-15738", + "django__django-15819", + "django__django-16816", + "matplotlib__matplotlib-22711", + "matplotlib__matplotlib-23299", + 
"matplotlib__matplotlib-23476", + "matplotlib__matplotlib-24334", + "matplotlib__matplotlib-25079", + "matplotlib__matplotlib-25498", + "pallets__flask-4045", + "pallets__flask-4992", + "pallets__flask-5063", + "pydata__xarray-3364", + "pydata__xarray-4248", + "pydata__xarray-4493", + "pylint-dev__pylint-6506", + "pylint-dev__pylint-7228", + "pytest-dev__pytest-5103", + "pytest-dev__pytest-5221", + "pytest-dev__pytest-5413", + "pytest-dev__pytest-5495", + "pytest-dev__pytest-5692", + "pytest-dev__pytest-6116", + "pytest-dev__pytest-7220", + "pytest-dev__pytest-8365", + "pytest-dev__pytest-8906", + "pytest-dev__pytest-9359", + "scikit-learn__scikit-learn-10949", + "scikit-learn__scikit-learn-11040", + "scikit-learn__scikit-learn-14087", + "scikit-learn__scikit-learn-25570", + "scikit-learn__scikit-learn-25638", + "scikit-learn__scikit-learn-25747", + "sphinx-doc__sphinx-7738", + "sphinx-doc__sphinx-8273", + "sphinx-doc__sphinx-8474", + "sphinx-doc__sphinx-8506", + "sphinx-doc__sphinx-8595", + "sympy__sympy-11400", + "sympy__sympy-11870", + "sympy__sympy-11897", + "sympy__sympy-12171", + "sympy__sympy-12236", + "sympy__sympy-12481", + "sympy__sympy-13146", + "sympy__sympy-13177", + "sympy__sympy-13437", + "sympy__sympy-13773", + "sympy__sympy-13895", + "sympy__sympy-13915", + "sympy__sympy-13971", + "sympy__sympy-14024", + "sympy__sympy-14317", + "sympy__sympy-14774", + "sympy__sympy-15308", + "sympy__sympy-16281", + "sympy__sympy-16503", + "sympy__sympy-17022", + "sympy__sympy-17630", + "sympy__sympy-18199", + "sympy__sympy-19007", + "sympy__sympy-19254", + "sympy__sympy-19487", + "sympy__sympy-20049", + "sympy__sympy-20322", + "sympy__sympy-20639", + "sympy__sympy-21171", + "sympy__sympy-22840", + "sympy__sympy-23191", + "sympy__sympy-24102", + "sympy__sympy-24909" + ], + "error_ids": [ + "astropy__astropy-14182", + "django__django-11422", + "django__django-11564", + "django__django-11630", + "django__django-11964", + "django__django-12125", + 
"django__django-12184", + "django__django-12286", + "django__django-12453", + "django__django-12700", + "django__django-12708", + "django__django-13033", + "django__django-13158", + "django__django-13315", + "django__django-13447", + "django__django-13448", + "django__django-13964", + "django__django-14238", + "django__django-14411", + "django__django-14997", + "django__django-15347", + "django__django-15498", + "django__django-15814", + "django__django-15902", + "django__django-15996", + "django__django-16229", + "django__django-16408", + "django__django-16527", + "matplotlib__matplotlib-22835", + "matplotlib__matplotlib-23563", + "matplotlib__matplotlib-24970", + "pydata__xarray-5131", + "pylint-dev__pylint-7080", + "pylint-dev__pylint-7114", + "scikit-learn__scikit-learn-10508", + "scikit-learn__scikit-learn-13496", + "sphinx-doc__sphinx-7686", + "sphinx-doc__sphinx-8282", + "sympy__sympy-13031", + "sympy__sympy-13043", + "sympy__sympy-18057", + "sympy__sympy-18087", + "sympy__sympy-21379", + "sympy__sympy-23117" + ], + "schema_version": 2 +} diff --git a/tests/test_swebench_adapter.py b/tests/test_swebench_adapter.py new file mode 100644 index 000000000..ac432dbe5 --- /dev/null +++ b/tests/test_swebench_adapter.py @@ -0,0 +1,248 @@ +import shutil +import tempfile +from pathlib import Path + +import pytest + +from every_eval_ever.converters.common.error import TransformationError +from every_eval_ever.converters.swebench.adapter import SWEBenchAdapter +from every_eval_ever.eval_types import ( + EvaluationLog, + EvaluatorRelationship, + SourceDataHf, +) + +DATA_DIR = Path('tests/data/swebench') +EVALUATION_FILE = DATA_DIR / 'evaluation.json' + + +def _make_metadata_args(**overrides): + args = { + 'model_id': 'Qwen/Qwen3-Coder-Next', + 'source_organization_name': 'TestOrg', + 'evaluator_relationship': EvaluatorRelationship.third_party, + } + args.update(overrides) + return args + + +# ── Adapter: transform_from_file ───────────────────────────────────────── + + 
def test_transform_from_file_returns_single_log():
    """Converting the fixture summary yields exactly one ``EvaluationLog``."""
    converted = SWEBenchAdapter().transform_from_file(
        EVALUATION_FILE, _make_metadata_args()
    )
    assert len(converted) == 1
    assert isinstance(converted[0], EvaluationLog)


def test_transform_from_file_model_info():
    """Check the model fields and the summary-provenance details."""
    converter = SWEBenchAdapter()
    model_info = converter.transform_from_file(
        EVALUATION_FILE, _make_metadata_args()
    )[0].model_info

    assert model_info.name == 'Qwen/Qwen3-Coder-Next'
    assert model_info.id == 'Qwen/Qwen3-Coder-Next'
    assert model_info.developer == 'Qwen'

    extra = model_info.additional_details
    assert extra is not None
    assert extra['swe_summary_schema_version'] == '2'
    assert extra['source_summary_file'] == 'evaluation.json'


def test_transform_from_file_source_metadata():
    """Check source name, source type and the caller-supplied org fields."""
    converter = SWEBenchAdapter()
    first_log = converter.transform_from_file(
        EVALUATION_FILE, _make_metadata_args()
    )[0]
    source_meta = first_log.source_metadata

    assert source_meta.source_name == 'SWE-bench'
    assert source_meta.source_type.value == 'evaluation_run'
    assert source_meta.source_organization_name == 'TestOrg'
    assert (
        source_meta.evaluator_relationship
        == EvaluatorRelationship.third_party
    )


def test_transform_from_file_eval_library_defaults():
    """Without a version override the library is ``swebench``/``unknown``."""
    converted = SWEBenchAdapter().transform_from_file(
        EVALUATION_FILE, _make_metadata_args()
    )
    library = converted[0].eval_library
    assert library.name == 'swebench'
    assert library.version == 'unknown'


def test_transform_from_file_evaluation_timestamp_none():
    """The converted fixture log carries no evaluation timestamp."""
    converted = SWEBenchAdapter().transform_from_file(
        EVALUATION_FILE, _make_metadata_args()
    )
    first_log = converted[0]
    assert first_log.evaluation_timestamp is None


def test_transform_from_file_evaluation_id_shape():
    """The first three ``/``-separated id segments have the expected values."""
    converted = SWEBenchAdapter().transform_from_file(
        EVALUATION_FILE, _make_metadata_args()
    )
    segments = converted[0].evaluation_id.split('/')

    # Only the leading three segments are pinned; anything after is free.
    leading = ('SWE-bench', 'Qwen_Qwen3-Coder-Next', 'evaluation.json')
    for position, expected in enumerate(leading):
        assert segments[position] == expected


def test_transform_from_file_resolve_rate_and_details():
    """The fixture scores 156/300 and exposes its counts as strings."""
    converter = SWEBenchAdapter()
    outcome = converter.transform_from_file(
        EVALUATION_FILE, _make_metadata_args()
    )[0].evaluation_results[0]

    assert outcome.evaluation_name == 'SWE-bench'
    assert outcome.score_details.score == pytest.approx(156 / 300)

    counts = outcome.score_details.details
    assert counts is not None
    expected_counts = {
        'total_instances': '300',
        'resolved_instances': '156',
        'submitted_instances': '300',
        'completed_instances': '252',
        'unresolved_instances': '96',
        'empty_patch_instances': '4',
        'error_instances': '44',
    }
    for key, expected in expected_counts.items():
        assert counts[key] == expected


def test_transform_from_file_instance_id_lists():
    """Id lists are present, have the fixture's sizes and first entries."""
    converter = SWEBenchAdapter()
    score = converter.transform_from_file(
        EVALUATION_FILE, _make_metadata_args()
    )[0].evaluation_results[0].score_details

    expected_sizes = (
        ('completed_ids', 252),
        ('submitted_ids', 300),
        ('resolved_ids', 156),
    )
    for attribute, size in expected_sizes:
        ids = getattr(score, attribute)
        assert ids is not None
        assert len(ids) == size

    assert score.completed_ids[0] == 'astropy__astropy-12907'
    assert score.resolved_ids[0] == 'astropy__astropy-12907'


def test_transform_from_file_source_data_hf_defaults():
    """Without overrides the source data is the default HF dataset entry."""
    converted = SWEBenchAdapter().transform_from_file(
        EVALUATION_FILE, _make_metadata_args()
    )
    dataset = converted[0].evaluation_results[0].source_data

    assert isinstance(dataset, SourceDataHf)
    assert dataset.dataset_name == 'SWE-bench'
    assert dataset.hf_repo == 'princeton-nlp/SWE-bench'
    assert dataset.hf_split == 'test'
    assert dataset.samples_number == 300


def test_transform_from_file_metric_config():
    """The metric is a higher-is-better resolve rate bounded to [0, 1]."""
    converted = SWEBenchAdapter().transform_from_file(
        EVALUATION_FILE, _make_metadata_args()
    )
    metric = converted[0].evaluation_results[0].metric_config

    assert 'Resolve rate' in metric.evaluation_description
    assert metric.lower_is_better is False
    assert metric.min_score == 0.0
    assert metric.max_score == 1.0
def test_transform_from_file_generation_config_agentic():
    """The generation config lists ``bash`` as its single available tool."""
    converted = SWEBenchAdapter().transform_from_file(
        EVALUATION_FILE, _make_metadata_args()
    )
    generation = converted[0].evaluation_results[0].generation_config

    assert generation is not None
    assert generation.generation_args.agentic_eval_config is not None
    available = generation.generation_args.agentic_eval_config.available_tools
    assert available is not None
    assert len(available) == 1
    assert available[0].name == 'bash'


# ── Adapter: transform_from_directory ────────────────────────────────────


def test_transform_from_directory():
    """Converting the fixture directory yields exactly one log."""
    converted = SWEBenchAdapter().transform_from_directory(
        DATA_DIR, _make_metadata_args()
    )
    assert len(converted) == 1


def test_transform_from_directory_nested_file():
    """A summary copied into a sub-directory is still found and converted."""
    converter = SWEBenchAdapter()
    with tempfile.TemporaryDirectory() as scratch:
        run_dir = Path(scratch) / 'run_a'
        run_dir.mkdir()
        shutil.copy(EVALUATION_FILE, run_dir / EVALUATION_FILE.name)

        # The directory is passed as the plain string yielded by tempfile.
        converted = converter.transform_from_directory(
            scratch, _make_metadata_args()
        )
        assert len(converted) == 1
        details = converted[0].model_info.additional_details
        assert details['source_summary_file'] == 'evaluation.json'


# ── Adapter: metadata overrides ──────────────────────────────────────────


def test_benchmark_name_and_hf_repo_override():
    """``benchmark_name`` and ``hf_repo`` metadata replace the defaults."""
    overrides = _make_metadata_args(
        benchmark_name='SWE-bench-Lite',
        hf_repo='princeton-nlp/SWE-bench_Lite',
    )
    converted = SWEBenchAdapter().transform_from_file(
        EVALUATION_FILE, overrides
    )
    outcome = converted[0].evaluation_results[0]

    assert outcome.evaluation_name == 'SWE-bench-Lite'
    assert isinstance(outcome.source_data, SourceDataHf)
    assert outcome.source_data.dataset_name == 'SWE-bench-Lite'
    assert outcome.source_data.hf_repo == 'princeton-nlp/SWE-bench_Lite'


def test_eval_library_version_override():
    """``eval_library_version`` metadata is recorded as the library version."""
    overrides = _make_metadata_args(eval_library_version='2.0.0')
    converted = SWEBenchAdapter().transform_from_file(
        EVALUATION_FILE, overrides
    )
    assert converted[0].eval_library.version == '2.0.0'


# ── Adapter: validation errors ───────────────────────────────────────────


def test_transform_requires_model_id():
    """Metadata lacking ``model_id`` raises ``TransformationError``."""
    metadata_without_model = {
        'source_organization_name': 'TestOrg',
        'evaluator_relationship': EvaluatorRelationship.third_party,
    }
    converter = SWEBenchAdapter()
    with pytest.raises(TransformationError, match='model_id is required'):
        converter.transform_from_file(EVALUATION_FILE, metadata_without_model)


def test_transform_requires_total_and_resolved():
    """A summary without ``total_instances`` raises ``TransformationError``."""
    converter = SWEBenchAdapter()
    with tempfile.TemporaryDirectory() as scratch:
        summary_path = Path(scratch, 'bad.json')
        summary_path.write_text('{"resolved_instances": 1}', encoding='utf-8')
        with pytest.raises(TransformationError, match='total_instances'):
            converter.transform_from_file(summary_path, _make_metadata_args())


def test_transform_single_rejects_non_object():
    """Passing a list to ``_transform_single`` raises ``TransformationError``."""
    converter = SWEBenchAdapter()
    with pytest.raises(TransformationError, match='JSON object'):
        converter._transform_single([], _make_metadata_args())