From 261c7399ca6ca961860c8a5572b8ce197ce908c0 Mon Sep 17 00:00:00 2001 From: Chibukach Date: Mon, 11 May 2026 11:59:42 +0000 Subject: [PATCH] add support for ifeval metrics --- every_eval_ever/converters/lm_eval/adapter.py | 9 +++++++-- every_eval_ever/converters/lm_eval/utils.py | 4 ++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/every_eval_ever/converters/lm_eval/adapter.py b/every_eval_ever/converters/lm_eval/adapter.py index 2dc253cc5..771e6f780 100644 --- a/every_eval_ever/converters/lm_eval/adapter.py +++ b/every_eval_ever/converters/lm_eval/adapter.py @@ -246,6 +246,7 @@ def _build_evaluation_results( is_higher_better = higher_is_better.get(metric_name, True) bounds = KNOWN_METRIC_BOUNDS.get(metric_name) + min_score = bounds[0] if bounds else None max_score = bounds[1] if bounds else None @@ -267,11 +268,15 @@ def _build_evaluation_results( or task_results.get('samples') or task_results.get('sample_len') ) - if stderr_val is not None or num_samples: + # Only use stderr_val if it's a valid number (not 'N/A' or other strings) + valid_stderr = ( + isinstance(stderr_val, (int, float)) and stderr_val is not None + ) + if valid_stderr or num_samples: uncertainty = Uncertainty( standard_error=( StandardError(value=stderr_val, method='bootstrap') - if stderr_val is not None + if valid_stderr else None ), num_samples=num_samples, diff --git a/every_eval_ever/converters/lm_eval/utils.py b/every_eval_ever/converters/lm_eval/utils.py index 92fad5d27..93202cfc1 100644 --- a/every_eval_ever/converters/lm_eval/utils.py +++ b/every_eval_ever/converters/lm_eval/utils.py @@ -73,4 +73,8 @@ def find_samples_file(output_dir: Path, task_name: str) -> Optional[Path]: 'rougeLsum': (0.0, 1.0), 'ter': (0.0, None), 'brier_score': (0.0, 1.0), + 'prompt_level_strict_acc': (0.0, 1.0), + 'inst_level_strict_acc': (0.0, 1.0), + 'prompt_level_loose_acc': (0.0, 1.0), + 'inst_level_loose_acc': (0.0, 1.0), }