diff --git a/docs/README_bench.md b/docs/README_bench.md new file mode 100644 index 000000000..66a27fd97 --- /dev/null +++ b/docs/README_bench.md @@ -0,0 +1,136 @@ +# iris.bench - Unified Benchmarking Harness + +A standardized benchmarking infrastructure for Iris using a decorator-based approach. + +## Quick Start + +```python +from iris.bench import benchmark + +@benchmark(name="my_kernel", warmup=5, iters=50) +def run_benchmark(shmem, size=1024): + # shmem is automatically created by the decorator + + @setup + def allocate(): + buffer = shmem.zeros(size, size) + return buffer + + @measure + def kernel_launch(buffer): + my_kernel[grid](buffer) + +result = run_benchmark(size=2048) +result.print_summary() +``` + +## Key Features + +- ✅ **Automatic iris instance creation** - The decorator creates and manages the iris instance +- ✅ **Code annotation** - Use @setup, @preamble, and @measure to organize your code +- ✅ **Rich statistics** - mean, median, p50, p99, min, max automatically computed +- ✅ **Automatic barrier synchronization** - Built-in multi-GPU support +- ✅ **JSON export** - Structured results for CI/CD integration +- ✅ **Utility functions** - `torch_dtype_from_str`, `compute_bandwidth_gbps` + +## Code Annotations + +The benchmarking decorator uses three function annotations: + +### @setup +Runs **once** before any timing starts. Use for: +- Tensor allocation +- Initial data setup +- One-time configuration + +Returns values are passed to @preamble and @measure functions. + +### @preamble +Runs **before each timed iteration**. Use for: +- Resetting output buffers +- Clearing flags/state +- Per-iteration setup + +Receives the values returned by @setup. + +### @measure +The code that gets **actually timed**. Use for: +- Kernel launches +- The operation you want to benchmark + +Receives the values returned by @setup. 
+ +## Full Example + +```python +from iris.bench import benchmark + +@benchmark(name="gemm", warmup=5, iters=50, heap_size=1<<33) +def run_gemm(shmem, m=8192, n=4608, k=36864): + + @setup + def allocate_matrices(): + # Runs once - allocate tensors + A = shmem.randn(m, k, dtype=torch.float16) + B = shmem.randn(k, n, dtype=torch.float16) + C = shmem.zeros(m, n, dtype=torch.float16) + return A, B, C + + @preamble + def reset_output(A, B, C): + # Runs before each iteration - clear output + C.zero_() + + @measure + def compute(A, B, C): + # This gets timed - run kernel + gemm_kernel[grid](A, B, C, m, n, k) + +result = run_gemm(m=8192, n=4608, k=36864) +result.print_summary() +open("results.json", "w").write(result.to_json())  # Export to JSON (to_json returns a string) +``` + +## Documentation + +- 📖 [Full API Documentation](bench_harness.md) +- 📖 [Migration Guide](bench_migration_example.md) +- 💻 [Complete Examples](../examples/benchmark/bench_harness_example.py) + +## Testing + +```bash +# Run basic tests (no GPU required) +python3 tests/unittests/test_bench_basic.py + +# Run full test suite (requires GPU) +pytest tests/unittests/test_bench.py +``` + +## API Overview + +### @benchmark decorator +Main decorator for benchmarking with automatic iris instance management. + +**Parameters:** +- `name` - Benchmark name +- `warmup` - Number of warmup iterations (default: 25) +- `iters` - Number of timing iterations (default: 100) +- `heap_size` - Iris heap size (default: 1<<33) +- `auto_print` - Auto-print results (default: False) + +### BenchmarkResult +Stores benchmark results with automatic statistics. + +**Methods:** +- `print_summary()` - Human-readable output +- `to_dict()` - Convert to dictionary +- `to_json()` - Convert to JSON string + +### Utilities +- `torch_dtype_from_str(dtype_str)` - Convert string to torch.dtype +- `compute_bandwidth_gbps(bytes, time_ms)` - Calculate bandwidth + +## License + +MIT License - Copyright (c) 2025-2026 Advanced Micro Devices, Inc. 
diff --git a/docs/bench_harness.md b/docs/bench_harness.md new file mode 100644 index 000000000..3c7a3f87f --- /dev/null +++ b/docs/bench_harness.md @@ -0,0 +1,244 @@ +# Benchmarking Harness (iris.bench) + +The `iris.bench` module provides a unified, decorator-based infrastructure for benchmarking Iris operations. + +## Overview + +The benchmarking harness eliminates code duplication by providing: + +- **Automatic iris instance management**: The decorator creates and manages the iris instance +- **Code organization**: Use @setup, @preamble, @measure annotations +- **Automatic statistics**: mean, median, p50, p99, min, max +- **Barrier synchronization**: Built-in multi-GPU support +- **Structured output**: JSON export for CI/CD + +## Quick Start + +```python +from iris.bench import benchmark + +@benchmark(name="my_kernel", warmup=5, iters=50) +def run_benchmark(shmem, size=1024): + # shmem is automatically created by the decorator + + @setup + def allocate(): + buffer = shmem.zeros(size, size) + return buffer + + @measure + def kernel_launch(buffer): + my_kernel[grid](buffer) + +result = run_benchmark(size=2048) +result.print_summary() +``` + +## API Reference + +### @benchmark Decorator + +Main decorator for benchmarking with automatic iris instance management. + +```python +@benchmark( + name: str, + warmup: int = 25, + iters: int = 100, + heap_size: int = 1 << 33, + auto_print: bool = False, +) +``` + +**Parameters:** +- `name` - Benchmark name +- `warmup` - Number of warmup iterations (default: 25) +- `iters` - Number of timing iterations (default: 100) +- `heap_size` - Iris symmetric heap size (default: 1<<33) +- `auto_print` - Automatically print results (default: False) + +**Returns:** BenchmarkResult + +### Code Annotations + +Within your benchmark function, use these decorators to organize code: + +#### @setup +Runs **once** before any timing starts. 
+ +**Use for:** +- Tensor allocation +- Initial data setup +- One-time configuration + +**Returns:** Values passed to @preamble and @measure + +#### @preamble +Runs **before each timed iteration**. + +**Use for:** +- Resetting output buffers +- Clearing flags/state +- Per-iteration setup + +**Parameters:** Receives values from @setup + +#### @measure (Required) +The code that gets **timed**. + +**Use for:** +- Kernel launches +- The operation you want to benchmark + +**Parameters:** Receives values from @setup + +### BenchmarkResult + +Dataclass storing benchmark results. + +**Attributes:** +- `name: str` - Benchmark name +- `mean_ms: float` - Mean time in milliseconds +- `median_ms: float` - Median time +- `p50_ms: float` - 50th percentile +- `p99_ms: float` - 99th percentile +- `min_ms: float` - Minimum time +- `max_ms: float` - Maximum time +- `n_warmup: int` - Number of warmup iterations +- `n_repeat: int` - Number of timing iterations +- `params: Dict` - Benchmark parameters +- `raw_times: List[float]` - Raw timing measurements + +**Methods:** +- `to_dict(include_raw_times=False)` - Convert to dictionary +- `to_json(include_raw_times=False, indent=2)` - Convert to JSON +- `print_summary()` - Print formatted summary + +### Utility Functions + +#### torch_dtype_from_str + +```python +dtype = torch_dtype_from_str("fp16") # -> torch.float16 +``` + +Supported: `"int8"`, `"fp16"`, `"bf16"`, `"fp32"` + +#### compute_bandwidth_gbps + +```python +bandwidth = compute_bandwidth_gbps(total_bytes, time_ms) +``` + +Computes bandwidth in GiB/s. 
+ +## Examples + +### Example 1: Simple Benchmark + +```python +from iris.bench import benchmark + +@benchmark(name="vector_add", warmup=5, iters=50) +def bench_add(shmem, size=1024): + + @setup + def allocate(): + a = shmem.randn(size) + b = shmem.randn(size) + c = shmem.zeros(size) + return a, b, c + + @measure + def compute(a, b, c): + c.copy_(a + b) + +result = bench_add(size=1024) +result.print_summary() +``` + +### Example 2: With Preamble + +```python +@benchmark(name="gemm", warmup=5, iters=50, heap_size=1<<33) +def bench_gemm(shmem, m=8192, n=4608, k=36864): + + @setup + def allocate(): + A = shmem.randn(m, k, dtype=torch.float16) + B = shmem.randn(k, n, dtype=torch.float16) + C = shmem.zeros(m, n, dtype=torch.float16) + return A, B, C + + @preamble + def reset(A, B, C): + C.zero_() + + @measure + def compute(A, B, C): + gemm_kernel[grid](A, B, C, m, n, k) + +result = bench_gemm() +``` + +### Example 3: Bandwidth Calculation + +```python +from iris.bench import benchmark, compute_bandwidth_gbps + +@benchmark(name="copy", warmup=5, iters=50) +def bench_copy(shmem, size=1024*1024*256): + + @setup + def allocate(): + src = shmem.randn(size, dtype=torch.float16) + dst = shmem.zeros(size, dtype=torch.float16) + return src, dst + + @measure + def copy(src, dst): + dst.copy_(src) + +result = bench_copy() + +# Compute bandwidth +element_size = 2 # float16 +total_bytes = size * element_size +bandwidth = compute_bandwidth_gbps(total_bytes, result.mean_ms) +print(f"Bandwidth: {bandwidth:.2f} GiB/s") +``` + +### Example 4: JSON Export + +```python +result = bench_gemm(m=8192, n=4608, k=36864) + +# Export to JSON +with open("results.json", "w") as f: + f.write(result.to_json(include_raw_times=True)) + +# Or use to_dict for custom processing +data = result.to_dict() +print(f"Mean: {data['mean_ms']:.2f} ms") +``` + +## Integration + +The harness uses `iris.do_bench` internally for timing, ensuring consistency with existing code. 
The @benchmark decorator: +- Creates the iris instance +- Manages barrier synchronization automatically +- Handles warmup and iteration loops +- Computes statistics automatically + +## Notes + +- The `shmem` parameter is automatically injected by the decorator +- `@setup`, `@preamble`, and `@measure` are injected at runtime +- At least one `@measure` decorated function is required +- `@setup` and `@preamble` are optional + +## See Also + +- [Quick Start Guide](README_bench.md) +- [Migration Examples](bench_migration_example.md) +- [Working Examples](../examples/benchmark/bench_harness_example.py) diff --git a/docs/bench_migration_example.md b/docs/bench_migration_example.md new file mode 100644 index 000000000..f392e9bba --- /dev/null +++ b/docs/bench_migration_example.md @@ -0,0 +1,234 @@ +# Benchmark Harness Migration Guide + +This guide shows how to migrate existing Iris benchmarks to use the new `iris.bench` decorator. + +## Key Changes + +The new harness: +1. **Decorator-only** - Uses @benchmark decorator exclusively +2. **Automatic iris instance** - Creates and passes `shmem` to your function +3. **Code annotations** - @setup, @preamble, @measure organize your code + +## Before: Original Pattern + +Original benchmarks had ~100 lines of duplicated boilerplate: + +```python +import argparse +import iris +import torch + +def torch_dtype_from_str(datatype: str) -> torch.dtype: + """Duplicated in many files""" + dtype_map = { + "int8": torch.int8, + "fp16": torch.float16, + "bf16": torch.bfloat16, + "fp32": torch.float32, + } + return dtype_map.get(datatype, torch.float16) + +def parse_args(): + """Duplicated argument parsing""" + parser = argparse.ArgumentParser() + parser.add_argument("-t", "--datatype", default="fp16") + parser.add_argument("-w", "--num_warmup", type=int, default=1) + parser.add_argument("-n", "--num_experiments", type=int, default=10) + # ... 
more arguments + return vars(parser.parse_args()) + +def bench_load(shmem, source_buffer, result_buffer, dtype, + num_experiments=10, num_warmup=1): + """Manual timing and statistics""" + cur_rank = shmem.get_rank() + n_elements = source_buffer.numel() + grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),) + + def run_kernel(): + if cur_rank == 0: + load_kernel[grid](source_buffer, result_buffer, n_elements) + + # Manual warmup + for _ in range(num_warmup): + run_kernel() + shmem.barrier() + + # Manual timing + triton_ms = iris.do_bench(run_kernel, shmem.barrier, + n_repeat=num_experiments, + n_warmup=0) # Already warmed up + + # Manual bandwidth calculation + element_size_bytes = torch.tensor([], dtype=dtype).element_size() + total_bytes = n_elements * element_size_bytes + bandwidth_gbps = total_bytes / (triton_ms * 1e-3) / 2**30 + + print(f"Time: {triton_ms:.4f} ms") + print(f"Bandwidth: {bandwidth_gbps:.4f} GiB/s") + + return bandwidth_gbps + +# Main +args = parse_args() +shmem = iris.iris(args["heap_size"]) +dtype = torch_dtype_from_str(args["datatype"]) +source_buffer = shmem.ones(args["buffer_size"], dtype=dtype) +result_buffer = shmem.zeros_like(source_buffer) + +bandwidth = bench_load(shmem, source_buffer, result_buffer, dtype, + num_experiments=args["num_experiments"], + num_warmup=args["num_warmup"]) +``` + +**Issues:** +- ~100+ lines of boilerplate +- Duplicated utility functions across 10+ files +- No standardized statistics (p50, p99) +- Manual warmup and timing +- No JSON export + +## After: Using iris.bench + +Clean, focused code: + +```python +import torch +from iris.bench import benchmark, torch_dtype_from_str, compute_bandwidth_gbps + +@benchmark(name="load_operation", warmup=5, iters=50, heap_size=1<<33) +def bench_load(shmem, buffer_size=1<<32, dtype_str="fp16"): + """Clean benchmark using iris.bench""" + # shmem is automatically created by the decorator + + dtype = torch_dtype_from_str(dtype_str) + + @setup + def 
 allocate_buffers(): + # Runs once before timing + source_buffer = shmem.ones(buffer_size, dtype=dtype) + result_buffer = shmem.zeros(buffer_size, dtype=dtype) + return source_buffer, result_buffer + + @preamble + def reset_output(source_buffer, result_buffer): + # Runs before each timed iteration + result_buffer.zero_() + + @measure + def run_kernel(source_buffer, result_buffer): + # This gets timed + n_elements = source_buffer.numel() + grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),) + load_kernel[grid](source_buffer, result_buffer, n_elements) + +# Run benchmark +result = bench_load(buffer_size=1<<32, dtype_str="fp16") + +# Automatic statistics available +result.print_summary() # Shows mean, p50, p99, etc. + +# Compute bandwidth using helper +element_size = torch.tensor([], dtype=torch_dtype_from_str("fp16")).element_size() +bandwidth = compute_bandwidth_gbps((1 << 32) * element_size, result.mean_ms) +print(f"Bandwidth: {bandwidth:.2f} GiB/s") + +# Export to JSON (to_json returns a string) +open("results.json", "w").write(result.to_json()) +``` + +**Benefits:** +- ~50% less code (50 lines vs 100 lines) +- No duplicated utility functions +- Automatic statistics (mean, median, p50, p99) +- No manual warmup/timing logic +- JSON export included +- Cleaner code organization with @setup/@preamble/@measure + +## Code Size Comparison + +| Component | Before (lines) | After (lines) | Reduction | +|-----------|----------------|---------------|-----------| +| Utility functions | 15 | 1 (import) | 93% | +| Argument parsing | 25 | 0 (use params) | 100% | +| iris setup | 5 | 0 (automatic) | 100% | +| Warmup/timing | 15 | 0 (automatic) | 100% | +| Statistics | 5 | 0 (automatic) | 100% | +| Result output | 10 | 1 (print_summary) | 90% | +| **Total** | **~100** | **~50** | **~50%** | + +## Migration Steps + +1. **Replace manual setup with @benchmark decorator** + - Remove manual `iris.iris()` creation + - Add `shmem` as first parameter + - Add @benchmark decorator with config + +2. 
**Organize code with annotations** + - Move tensor allocation to @setup + - Move per-iteration setup to @preamble + - Mark kernel launch with @measure + +3. **Remove boilerplate** + - Delete duplicated utility functions (use `iris.bench.torch_dtype_from_str`) + - Remove manual warmup loops + - Remove manual timing code + - Remove manual statistics computation + +4. **Use structured output** + - Replace manual printing with `result.print_summary()` + - Use `result.to_json()` for CI integration + +## Parameter Sweeps + +### Before +```python +for size in [1024, 2048, 4096]: + for dtype_str in ["fp16", "fp32"]: + result = bench_func(size, dtype_str) + # Manual result tracking + results.append({"size": size, "dtype": dtype_str, "time": result}) +``` + +### After +```python +results = [] +for size in [1024, 2048, 4096]: + for dtype_str in ["fp16", "fp32"]: + result = bench_func(size=size, dtype_str=dtype_str) + results.append(result.to_dict()) + +# Export all results +import json +with open("sweep_results.json", "w") as f: + json.dump(results, f, indent=2) +``` + +## Best Practices + +1. **Use @setup for expensive one-time operations** + - Tensor allocation + - Data initialization + - Configuration setup + +2. **Use @preamble for state reset** + - Zeroing output buffers + - Resetting flags + - Clearing caches + +3. **Keep @measure focused** + - Only the kernel launch + - The operation being benchmarked + - No setup or teardown code + +4. **Leverage automatic features** + - Let decorator handle iris instance creation + - Use automatic barrier synchronization + - Trust automatic statistics computation + +## Examples + +See `examples/benchmark/bench_harness_example.py` for complete working examples. + +## License + +MIT License - Copyright (c) 2025-2026 Advanced Micro Devices, Inc. 
diff --git a/examples/benchmark/bench_harness_example.py b/examples/benchmark/bench_harness_example.py new file mode 100644 index 000000000..ba5ee7e39 --- /dev/null +++ b/examples/benchmark/bench_harness_example.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: MIT +# Copyright (c) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. + +""" +Example demonstrating the unified benchmarking harness (iris.bench). + +This example shows how to use the @benchmark decorator with @setup, @preamble, +and @measure annotations. The decorator automatically creates an iris instance +and passes it to your function. + +Note: setup, preamble, and measure are injected by the @benchmark decorator +at runtime and are not imported. This is intentional. +""" + +# ruff: noqa: F821 + +import torch +from iris.bench import benchmark, torch_dtype_from_str, compute_bandwidth_gbps + + +# Example 1: Simple benchmark with setup and measure +@benchmark(name="simple_operation", warmup=2, iters=5, auto_print=True) +def benchmark_simple(shmem, size=1024): + """Simple benchmark using decorator with setup and measure.""" + + @setup + def allocate_tensors(): + # Runs once before timing starts + tensor = shmem.zeros(size, size, dtype=torch.float32) + return tensor + + @measure + def run_operation(tensor): + # This is what gets timed + result = tensor + 1.0 + + +# Example 2: Benchmark with preamble for resetting state +@benchmark(name="with_preamble", warmup=2, iters=5) +def benchmark_with_preamble(shmem, size=2048): + """Benchmark demonstrating preamble usage.""" + + @setup + def allocate(): + tensor = shmem.ones(size, size, dtype=torch.float16) + output = shmem.zeros(size, size, dtype=torch.float16) + return tensor, output + + @preamble + def reset_output(tensor, output): + # Runs before each timed iteration + output.zero_() + + @measure + def compute(tensor, output): + # This gets timed + output.copy_(tensor * 2.0) + + +# Example 3: Bandwidth calculation 
+@benchmark(name="bandwidth_test", warmup=2, iters=5) +def benchmark_bandwidth(shmem, size=1024 * 1024 * 256, dtype_str="fp16"): + """Benchmark with bandwidth calculation.""" + dtype = torch_dtype_from_str(dtype_str) + element_size = torch.tensor([], dtype=dtype).element_size() + + @setup + def allocate(): + tensor = shmem.zeros(size, dtype=dtype) + result = shmem.zeros(size, dtype=dtype) + return tensor, result + + @measure + def copy_data(tensor, result): + result.copy_(tensor) + + +if __name__ == "__main__": + if not torch.cuda.is_available(): + print("CUDA is not available. This example requires a CUDA-enabled GPU.") + exit(1) + + print("=" * 70) + print("Iris Benchmarking Harness Examples (Decorator-Only)") + print("=" * 70) + + print("\n### Example 1: Simple operation ###") + result1 = benchmark_simple(size=1024) + # Note: auto_print=True so summary is printed automatically + + print("\n### Example 2: With preamble ###") + result2 = benchmark_with_preamble(size=2048) + result2.print_summary() + + print("\n### Example 3: Bandwidth test ###") + result3 = benchmark_bandwidth(size=1024 * 1024 * 256, dtype_str="fp16") + + # Compute bandwidth + dtype = torch_dtype_from_str("fp16") + element_size = torch.tensor([], dtype=dtype).element_size() + total_bytes = 1024 * 1024 * 256 * element_size + bandwidth = compute_bandwidth_gbps(total_bytes, result3.mean_ms) + + print(f"\nBandwidth: {bandwidth:.2f} GiB/s") + print(f"Size: {total_bytes / 2**30:.2f} GiB") + + print("\n" + "=" * 70) + print("All examples completed successfully!") + print("=" * 70) diff --git a/iris/__init__.py b/iris/__init__.py index 476158d15..400fc7ff9 100644 --- a/iris/__init__.py +++ b/iris/__init__.py @@ -67,6 +67,9 @@ do_bench, ) +# Import benchmarking utilities +from . import bench + from . 
import hip # Import experimental features (optional, for users who want experimental APIs) @@ -106,6 +109,7 @@ "atomic_min", "atomic_max", "do_bench", + "bench", # Benchmarking utilities "hip", "experimental", # Experimental features including iris_gluon "ops", # Fused GEMM+CCL operations diff --git a/iris/bench.py b/iris/bench.py new file mode 100644 index 000000000..8fe74fe90 --- /dev/null +++ b/iris/bench.py @@ -0,0 +1,390 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. + +""" +Unified benchmarking harness for Iris. + +This module provides a decorator-based infrastructure for benchmarking operations: +- Automatic iris instance creation and management +- Warmup and iteration handling +- Timing and synchronization +- Statistics computation (mean, p50, p99) +- Structured result output (JSON or dict) + +The harness automatically constructs the iris instance and passes it to your +benchmark function, allowing you to annotate different parts of your code: +- @setup: Runs once before any timing (e.g., tensor allocation) +- @preamble: Runs before each iteration (e.g., resetting flags) +- @measure: The code to actually benchmark (e.g., kernel launch) + +Example usage: + + from iris.bench import benchmark + + @benchmark(name="gemm_kernel", warmup=5, iters=50, heap_size=1<<33) + def run_benchmark(shmem, m=8192, n=4608, k=36864): + # shmem is automatically created by the decorator + + @setup + def allocate_tensors(): + # Runs once before timing starts + A = shmem.randn(m, k, dtype=torch.float16) + B = shmem.randn(k, n, dtype=torch.float16) + C = shmem.zeros(m, n, dtype=torch.float16) + return A, B, C + + @preamble + def reset_output(A, B, C): + # Runs before each timed iteration; receives all values returned by @setup + C.zero_() + + @measure + def run_kernel(A, B, C): + # This is what gets timed + gemm_kernel[grid](A, B, C, m, n, k) + + result = run_benchmark(m=8192, n=4608, k=36864) + result.print_summary() +""" + +import json +from dataclasses import 
dataclass, field, asdict +from typing import Any, Callable, Dict, List +import functools +import torch + + +def _compute_percentile(values: List[float], percentile: float) -> float: + """Compute percentile from a list of values.""" + if not values: + return 0.0 + sorted_values = sorted(values) + k = (len(sorted_values) - 1) * (percentile / 100.0) + f = int(k) + c = f + 1 if f + 1 < len(sorted_values) else f + if f == c: + return sorted_values[int(k)] + d0 = sorted_values[f] * (c - k) + d1 = sorted_values[c] * (k - f) + return d0 + d1 + + +@dataclass +class BenchmarkResult: + """ + Stores results from a benchmark run. + + Attributes: + name: Name of the benchmark + mean_ms: Mean time in milliseconds + median_ms: Median time in milliseconds + p50_ms: 50th percentile time in milliseconds + p99_ms: 99th percentile time in milliseconds + min_ms: Minimum time in milliseconds + max_ms: Maximum time in milliseconds + n_warmup: Number of warmup iterations + n_repeat: Number of timing iterations + params: Additional parameters passed to the benchmark + metadata: Additional metadata + raw_times: Raw timing measurements in milliseconds + """ + + name: str + mean_ms: float + median_ms: float + p50_ms: float + p99_ms: float + min_ms: float + max_ms: float + n_warmup: int + n_repeat: int + params: Dict[str, Any] = field(default_factory=dict) + metadata: Dict[str, Any] = field(default_factory=dict) + raw_times: List[float] = field(default_factory=list) + + def to_dict(self, include_raw_times: bool = False) -> Dict[str, Any]: + """ + Convert result to dictionary. + + Args: + include_raw_times: Whether to include raw timing measurements + + Returns: + Dictionary representation of the result + """ + result = asdict(self) + if not include_raw_times: + result.pop("raw_times", None) + return result + + def to_json(self, include_raw_times: bool = False, indent: int = 2) -> str: + """ + Convert result to JSON string. 
+ + Args: + include_raw_times: Whether to include raw timing measurements + indent: JSON indentation level + + Returns: + JSON string representation of the result + """ + return json.dumps(self.to_dict(include_raw_times=include_raw_times), indent=indent) + + def print_summary(self): + """Print a human-readable summary of the benchmark result.""" + print(f"\n{'=' * 60}") + print(f"Benchmark: {self.name}") + if self.params: + print(f"Parameters: {self.params}") + print(f"{'-' * 60}") + print(f"Mean: {self.mean_ms:10.4f} ms") + print(f"Median: {self.median_ms:10.4f} ms") + print(f"P50: {self.p50_ms:10.4f} ms") + print(f"P99: {self.p99_ms:10.4f} ms") + print(f"Min: {self.min_ms:10.4f} ms") + print(f"Max: {self.max_ms:10.4f} ms") + print(f"{'-' * 60}") + print(f"Warmup iterations: {self.n_warmup}") + print(f"Timing iterations: {self.n_repeat}") + if self.metadata: + print(f"Metadata: {self.metadata}") + print(f"{'=' * 60}\n") + + +class _BenchmarkContext: + """Internal context for collecting setup, preamble, and measure functions.""" + + def __init__(self): + self.setup_fn = None + self.preamble_fn = None + self.measure_fn = None + + def setup(self, fn): + """Mark a function as setup code (runs once before timing).""" + self.setup_fn = fn + return fn + + def preamble(self, fn): + """Mark a function as preamble code (runs before each timed iteration).""" + self.preamble_fn = fn + return fn + + def measure(self, fn): + """Mark a function as the code to measure (gets timed).""" + self.measure_fn = fn + return fn + + +def benchmark( + name: str, + warmup: int = 25, + iters: int = 100, + heap_size: int = 1 << 33, + auto_print: bool = False, +): + """ + Decorator for benchmarking functions with automatic iris instance management. + + The decorator creates an iris instance and passes it to your benchmark function. + Within your function, use @setup, @preamble, and @measure decorators to annotate + different parts of your benchmark code. 
+ + Args: + name: Name of the benchmark + warmup: Number of warmup iterations + iters: Number of timing iterations + heap_size: Size of iris symmetric heap + auto_print: Whether to automatically print results + + Returns: + Decorated function that returns BenchmarkResult + + Example: + @benchmark(name="my_kernel", warmup=5, iters=50) + def run(shmem, size=1024): + @setup + def allocate(): + buffer = shmem.zeros(size, size) + return buffer + + @measure + def kernel_launch(buffer): + my_kernel[grid](buffer) + + result = run(size=2048) + result.print_summary() + """ + + def decorator(func: Callable) -> Callable: + @functools.wraps(func) + def wrapper(*args, **kwargs): + # Import iris here to avoid circular dependencies + from . import iris as iris_module + + # Create iris instance + shmem = iris_module.iris(heap_size) + + # Create benchmark context for collecting annotated functions + ctx = _BenchmarkContext() + + # Make decorators available in the function scope + import builtins + + original_setup = getattr(builtins, "setup", None) + original_preamble = getattr(builtins, "preamble", None) + original_measure = getattr(builtins, "measure", None) + + try: + # Inject decorators into builtins temporarily + builtins.setup = ctx.setup + builtins.preamble = ctx.preamble + builtins.measure = ctx.measure + + # Call user function to collect setup/preamble/measure functions + func(shmem, *args, **kwargs) + + finally: + # Restore original builtins + if original_setup is not None: + builtins.setup = original_setup + elif hasattr(builtins, "setup"): + delattr(builtins, "setup") + + if original_preamble is not None: + builtins.preamble = original_preamble + elif hasattr(builtins, "preamble"): + delattr(builtins, "preamble") + + if original_measure is not None: + builtins.measure = original_measure + elif hasattr(builtins, "measure"): + delattr(builtins, "measure") + + # Validate that measure function was provided + if ctx.measure_fn is None: + raise ValueError(f"Benchmark '{name}' 
must have a @measure decorated function") + + # Run setup once if provided + setup_results = () + if ctx.setup_fn is not None: + result = ctx.setup_fn() + # Convert to tuple for consistent handling + if result is None: + setup_results = () + elif isinstance(result, tuple): + setup_results = result + else: + setup_results = (result,) + + # Define preamble_fn for do_bench + def preamble_fn(): + if ctx.preamble_fn is not None: + ctx.preamble_fn(*setup_results) + + # Define measure_fn for do_bench + def measure_fn(): + ctx.measure_fn(*setup_results) + + # Import do_bench at runtime + from .util import do_bench + + # Run benchmark with automatic barrier + raw_times = do_bench( + measure_fn, + barrier_fn=shmem.barrier, + preamble_fn=preamble_fn, + n_warmup=warmup, + n_repeat=iters, + return_mode="all", + ) + + # Compute statistics + mean_ms = sum(raw_times) / len(raw_times) if raw_times else 0.0 + median_ms = _compute_percentile(raw_times, 50) + p50_ms = median_ms # P50 is the same as median + p99_ms = _compute_percentile(raw_times, 99) + min_ms = min(raw_times) if raw_times else 0.0 + max_ms = max(raw_times) if raw_times else 0.0 + + # Extract function parameters for metadata + func_params = {**kwargs} + for i, arg in enumerate(args): + if i < len(func.__code__.co_varnames) - 1: # -1 to skip 'shmem' + param_name = func.__code__.co_varnames[i + 1] # +1 to skip 'shmem' + func_params[param_name] = arg + + result = BenchmarkResult( + name=name, + mean_ms=mean_ms, + median_ms=median_ms, + p50_ms=p50_ms, + p99_ms=p99_ms, + min_ms=min_ms, + max_ms=max_ms, + n_warmup=warmup, + n_repeat=iters, + params=func_params, + raw_times=raw_times, + ) + + if auto_print: + result.print_summary() + + return result + + return wrapper + + return decorator + + +# Utility functions for common patterns + + +def torch_dtype_from_str(datatype: str) -> torch.dtype: + """ + Convert string datatype to torch.dtype. 
+ + Args: + datatype: String representation of datatype + + Returns: + torch.dtype object + + Raises: + ValueError: If datatype is not recognized + """ + dtype_map = { + "int8": torch.int8, + "fp16": torch.float16, + "bf16": torch.bfloat16, + "fp32": torch.float32, + } + if datatype not in dtype_map: + raise ValueError(f"Unknown datatype: {datatype}. Expected one of {list(dtype_map.keys())}") + return dtype_map[datatype] + + +def compute_bandwidth_gbps( + total_bytes: int, + time_ms: float, +) -> float: + """ + Compute bandwidth in GiB/s. + + Args: + total_bytes: Total number of bytes transferred + time_ms: Time in milliseconds + + Returns: + Bandwidth in GiB/s + """ + time_sec = time_ms * 1e-3 + return total_bytes / time_sec / (2**30) + + +__all__ = [ + "BenchmarkResult", + "benchmark", + "torch_dtype_from_str", + "compute_bandwidth_gbps", +] diff --git a/tests/unittests/test_bench_basic.py b/tests/unittests/test_bench_basic.py new file mode 100644 index 000000000..e25d1ee61 --- /dev/null +++ b/tests/unittests/test_bench_basic.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: MIT +# Copyright (c) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. + +""" +Basic tests for iris.bench module that don't require GPU or iris runtime. +Tests the new decorator-only approach. 
+""" + +import json +import sys +from pathlib import Path + +# Import bench module directly without going through iris.__init__ +bench_path = Path(__file__).parent.parent.parent / "iris" / "bench.py" +import importlib.util + +spec = importlib.util.spec_from_file_location("bench", bench_path) +bench = importlib.util.module_from_spec(spec) +spec.loader.exec_module(bench) + +import torch + + +def test_benchmark_result_creation(): + """Test creating a BenchmarkResult object.""" + result = bench.BenchmarkResult( + name="test_benchmark", + mean_ms=10.5, + median_ms=10.2, + p50_ms=10.2, + p99_ms=15.3, + min_ms=8.1, + max_ms=16.2, + n_warmup=5, + n_repeat=50, + params={"size": 1024}, + metadata={"gpu": "MI300X"}, + raw_times=[10.1, 10.2, 10.3], + ) + + assert result.name == "test_benchmark" + assert result.mean_ms == 10.5 + assert result.median_ms == 10.2 + assert result.p50_ms == 10.2 + assert result.p99_ms == 15.3 + assert result.min_ms == 8.1 + assert result.max_ms == 16.2 + assert result.n_warmup == 5 + assert result.n_repeat == 50 + assert result.params == {"size": 1024} + assert result.metadata == {"gpu": "MI300X"} + assert result.raw_times == [10.1, 10.2, 10.3] + print("✓ test_benchmark_result_creation passed") + + +def test_benchmark_result_to_dict(): + """Test converting BenchmarkResult to dictionary.""" + result = bench.BenchmarkResult( + name="test", + mean_ms=10.0, + median_ms=10.0, + p50_ms=10.0, + p99_ms=12.0, + min_ms=9.0, + max_ms=13.0, + n_warmup=5, + n_repeat=10, + raw_times=[9.0, 10.0, 11.0, 12.0, 13.0], + ) + + # Without raw times + d = result.to_dict(include_raw_times=False) + assert "raw_times" not in d + assert d["name"] == "test" + assert d["mean_ms"] == 10.0 + + # With raw times + d = result.to_dict(include_raw_times=True) + assert "raw_times" in d + assert d["raw_times"] == [9.0, 10.0, 11.0, 12.0, 13.0] + print("✓ test_benchmark_result_to_dict passed") + + +def test_benchmark_result_to_json(): + """Test converting BenchmarkResult to JSON.""" + 
result = bench.BenchmarkResult( + name="test", + mean_ms=10.0, + median_ms=10.0, + p50_ms=10.0, + p99_ms=12.0, + min_ms=9.0, + max_ms=13.0, + n_warmup=5, + n_repeat=10, + ) + + json_str = result.to_json() + parsed = json.loads(json_str) + assert parsed["name"] == "test" + assert parsed["mean_ms"] == 10.0 + print("✓ test_benchmark_result_to_json passed") + + +def test_compute_percentile(): + """Test percentile computation.""" + values = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0] + + p50 = bench._compute_percentile(values, 50) + assert 5.0 <= p50 <= 6.0 + + p99 = bench._compute_percentile(values, 99) + assert p99 > 9.0 + + # Edge cases + assert bench._compute_percentile([], 50) == 0.0 + assert bench._compute_percentile([5.0], 50) == 5.0 + print("✓ test_compute_percentile passed") + + +def test_torch_dtype_from_str(): + """Test torch_dtype_from_str utility.""" + assert bench.torch_dtype_from_str("int8") == torch.int8 + assert bench.torch_dtype_from_str("fp16") == torch.float16 + assert bench.torch_dtype_from_str("bf16") == torch.bfloat16 + assert bench.torch_dtype_from_str("fp32") == torch.float32 + + try: + bench.torch_dtype_from_str("invalid") + assert False, "Should have raised ValueError" + except ValueError as e: + assert "Unknown datatype" in str(e) + print("✓ test_torch_dtype_from_str passed") + + +def test_compute_bandwidth_gbps(): + """Test bandwidth computation.""" + # 1 GiB in 1 second = 1 GiB/s + bandwidth = bench.compute_bandwidth_gbps(2**30, 1000) + assert abs(bandwidth - 1.0) < 0.001 + + # 2 GiB in 0.5 seconds = 4 GiB/s + bandwidth = bench.compute_bandwidth_gbps(2 * 2**30, 500) + assert abs(bandwidth - 4.0) < 0.001 + + # 512 MiB in 100ms = 5 GiB/s + bandwidth = bench.compute_bandwidth_gbps(512 * 2**20, 100) + assert abs(bandwidth - 5.0) < 0.01 + print("✓ test_compute_bandwidth_gbps passed") + + +if __name__ == "__main__": + test_benchmark_result_creation() + test_benchmark_result_to_dict() + test_benchmark_result_to_json() + 
test_compute_percentile() + test_torch_dtype_from_str() + test_compute_bandwidth_gbps() + print("\n✅ All tests passed!")