Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 61 additions & 5 deletions tests/models/model_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,14 +265,14 @@ def _mode_specific_baseline_value(self, attr_name: str):
mode_suffix = "FAST" if self._is_fast_model_test_mode() else "SLOW"
preferred = f"{attr_name}_{mode_suffix}"
if hasattr(self, preferred):
return getattr(self, preferred)
return self._resolve_metric_baseline_value(getattr(self, preferred))

if self._is_fast_model_test_mode():
fallback = f"{attr_name}_SLOW"
if hasattr(self, fallback):
return getattr(self, fallback)
return self._resolve_metric_baseline_value(getattr(self, fallback))

return getattr(self, attr_name, None)
return self._resolve_metric_baseline_value(getattr(self, attr_name, None))

def _legacy_metric_ceil_pct(self) -> float:
if self._is_fast_model_test_mode():
Expand Down Expand Up @@ -492,12 +492,12 @@ def _normalize_metric_spec(self, spec):
if isinstance(spec, dict):
if "value" not in spec:
raise ValueError("Baseline metric dictionaries must include a `value` key.")
value = spec["value"]
value = self._resolve_metric_baseline_value(spec["value"])
floor_pct = spec.get("floor_pct", spec.get("max_delta_floor_percent", default_floor))
ceil_pct = spec.get("ceil_pct", spec.get("max_delta_ceil_percent", default_ceil))
metric_key = spec.get("metric_key")
else:
value = spec
value = self._resolve_metric_baseline_value(spec)
floor_pct = default_floor
ceil_pct = default_ceil
metric_key = None
Expand All @@ -516,6 +516,62 @@ def _normalize_metric_spec(self, spec):
"metric_key": metric_key,
}

@staticmethod
def _detect_cuda0_name():
try:
if not torch.cuda.is_available():
return None
return str(torch.cuda.get_device_name(0))
except Exception:
return None

@classmethod
def _detect_gpu_profile(cls):
    """Map the visible CUDA device name onto a known baseline profile.

    Returns "A100" or "RTX4090" when the device-0 name contains the
    matching marker, otherwise None (no device or unrecognized model).
    """
    device_name = cls._detect_cuda0_name()
    if not device_name:
        return None

    lowered = device_name.lower()
    for marker, profile in (("a100", "A100"), ("4090", "RTX4090")):
        if marker in lowered:
            return profile
    return None

def _resolve_metric_baseline_value(self, value):
    """Resolve a baseline value that may be keyed by GPU profile.

    Scalar values pass through unchanged. Mapping values are looked up
    by the detected GPU profile (keys normalized case/separator
    insensitively), falling back to the "A100" entry when the detected
    profile has no entry. Raises ValueError when nothing resolves.
    """
    if not isinstance(value, Mapping):
        return value

    by_profile = {}
    for key, baseline in value.items():
        by_profile[self._normalize_gpu_profile_key(key)] = baseline

    detected = self._detect_gpu_profile()
    lookup_key = self._normalize_gpu_profile_key(detected)

    if lookup_key is not None and lookup_key in by_profile:
        return by_profile[lookup_key]

    # A100 numbers are treated as the canonical baseline when the
    # running GPU is unknown or has no dedicated entry.
    if "a100" in by_profile:
        return by_profile["a100"]

    known = ", ".join(sorted(by_profile.keys()))
    raise ValueError(
        "Unable to resolve GPU-specific baseline value. "
        f"Detected profile={detected!r}, available profiles={known}."
    )

@staticmethod
def _normalize_gpu_profile_key(profile):
if profile is None:
return None
normalized = str(profile).strip().lower().replace("-", "").replace("_", "").replace(" ", "")
if normalized == "a100":
return "a100"
if normalized in {"rtx4090", "4090"}:
return "rtx4090"
return normalized

def get_eval_tasks(self):
self._task_chat_template = {}
self._task_evalution_suite_kwargs = {}
Expand Down
4 changes: 2 additions & 2 deletions tests/models/test_apertus.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ class TestApertus(ModelTest):
EVAL_TASKS_SLOW = {
"arc_challenge": {
"chat_template": True,
"acc": {"value": 0.5145, "floor_pct": 0.2},
"acc_norm": {"value": 0.5256, "floor_pct": 0.2},
"acc": {"value": {"A100": 0.5136, "RTX4090": 0.5136}, "floor_pct": 0.20},
"acc_norm": {"value": {"A100": 0.5085, "RTX4090": 0.5059}, "floor_pct": 0.20},
},
}
EVAL_TASKS_FAST = ModelTest.derive_fast_eval_tasks(EVAL_TASKS_SLOW)
Expand Down
4 changes: 2 additions & 2 deletions tests/models/test_baichuan.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ class TestBaiChuan(ModelTest):
NATIVE_ARC_CHALLENGE_ACC_NORM = 0.4317
NATIVE_ARC_CHALLENGE_ACC_SLOW = NATIVE_ARC_CHALLENGE_ACC
NATIVE_ARC_CHALLENGE_ACC_NORM_SLOW = NATIVE_ARC_CHALLENGE_ACC_NORM
NATIVE_ARC_CHALLENGE_ACC_FAST = 0.3762
NATIVE_ARC_CHALLENGE_ACC_NORM_FAST = 0.3882
NATIVE_ARC_CHALLENGE_ACC_FAST = {"A100": 0.3771, "RTX4090": 0.3890}
NATIVE_ARC_CHALLENGE_ACC_NORM_FAST = {"A100": 0.3890, "RTX4090": 0.4001}
MODEL_MAX_LEN = 4096
TRUST_REMOTE_CODE = True
USE_FLASH_ATTN = False
Expand Down
4 changes: 2 additions & 2 deletions tests/models/test_bloom.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ class TestBloom(ModelTest):
NATIVE_ARC_CHALLENGE_ACC_NORM = 0.2440
NATIVE_ARC_CHALLENGE_ACC_SLOW = NATIVE_ARC_CHALLENGE_ACC
NATIVE_ARC_CHALLENGE_ACC_NORM_SLOW = NATIVE_ARC_CHALLENGE_ACC_NORM
NATIVE_ARC_CHALLENGE_ACC_FAST = 0.22440273037542663
NATIVE_ARC_CHALLENGE_ACC_NORM_FAST = 0.23720136518771331
NATIVE_ARC_CHALLENGE_ACC_FAST = {"A100": 0.2201, "RTX4090": 0.2124}
NATIVE_ARC_CHALLENGE_ACC_NORM_FAST = {"A100": 0.2397, "RTX4090": 0.2380}
TORCH_DTYPE = torch.float16
USE_FLASH_ATTN = False

Expand Down
4 changes: 2 additions & 2 deletions tests/models/test_cohere.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ class TestCohere(ModelTest):
NATIVE_MODEL_ID = "/monster/data/model/aya-expanse-8b" # "CohereForAI/aya-expanse-8b"
EVAL_TASKS_SLOW = {
"arc_challenge": {
"acc": {"value": 0.5401, "floor_pct": 0.20},
"acc_norm": {"value": 0.5640, "floor_pct": 0.20},
"acc": {"value": {"A100": 0.5546, "RTX4090": 0.5520}, "floor_pct": 0.20},
"acc_norm": {"value": {"A100": 0.5802, "RTX4090": 0.5802}, "floor_pct": 0.20},
},
}
EVAL_TASKS_FAST = ModelTest.derive_fast_eval_tasks(EVAL_TASKS_SLOW)
Expand Down
47 changes: 45 additions & 2 deletions tests/test_model_test_helpers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import torch
from models import model_test as model_test_module
from models.model_test import ModelTest
import model_test as model_test_module
from model_test import ModelTest


class FakeBatchEncoding(dict):
Expand Down Expand Up @@ -142,3 +142,46 @@ def _broken_load_dataset(*args, **kwargs):
dataset = ModelTest.load_dataset(rows=5)

assert list(dataset) == [{"text": "x"}, {"text": "y"}]


def test_detect_gpu_profile_from_cuda0_name(monkeypatch):
    """Device-0 names should be classified into the known GPU profiles."""
    monkeypatch.setattr(torch.cuda, "is_available", lambda: True)

    for device_name, expected_profile in (
        ("NVIDIA A100-SXM4-80GB", "A100"),
        ("NVIDIA GeForce RTX 4090", "RTX4090"),
    ):
        monkeypatch.setattr(
            torch.cuda, "get_device_name", lambda _idx, _name=device_name: _name
        )
        assert ModelTest._detect_gpu_profile() == expected_profile


def test_resolve_metric_baseline_value_uses_gpu_profile(monkeypatch):
    """GPU-keyed baselines resolve by detected profile; scalars pass through."""
    helper = ModelTest(methodName="runTest")
    gpu_mapping = {"A100": 0.6, "RTX4090": 0.5}

    # Exact profile match selects the matching entry.
    monkeypatch.setattr(ModelTest, "_detect_gpu_profile", classmethod(lambda cls: "A100"))
    assert helper._resolve_metric_baseline_value(gpu_mapping) == 0.6

    # Unrecognized profiles fall back to the A100 entry.
    monkeypatch.setattr(ModelTest, "_detect_gpu_profile", classmethod(lambda cls: "unknown"))
    assert helper._resolve_metric_baseline_value(gpu_mapping) == 0.6

    # Scalar baselines are returned unchanged regardless of detected profile.
    for detected in ("A100", "RTX 4090", "unknown"):
        monkeypatch.setattr(
            ModelTest,
            "_detect_gpu_profile",
            classmethod(lambda cls, _p=detected: _p),
        )
        assert helper._resolve_metric_baseline_value(0.6) == 0.6


def test_mode_specific_baseline_value_supports_gpu_mapping(monkeypatch):
    """FAST-mode baselines given as GPU mappings resolve per detected profile."""

    class _GpuMappedHarness(ModelTest):
        NATIVE_ARC_CHALLENGE_ACC_FAST = {"A100": 0.55, "RTX4090": 0.53}

    harness = _GpuMappedHarness(methodName="runTest")
    monkeypatch.setattr(_GpuMappedHarness, "_is_fast_model_test_mode", lambda self: True)
    monkeypatch.setattr(
        _GpuMappedHarness, "_detect_gpu_profile", classmethod(lambda cls: "RTX4090")
    )

    assert harness._mode_specific_baseline_value("NATIVE_ARC_CHALLENGE_ACC") == 0.53
Loading