Commit 63bdeab (parent: 8ae59e7)

- increased size of numpy.dot test
- added xgboost tests for different tree methods
- added script to generate text result tables

4 files changed: 278 additions, 20 deletions

benchmarks/suites/numpy_suite.py (9 additions, 9 deletions)

@@ -6,13 +6,13 @@ def add_numpy_benchmarks(runner: pyperf.Runner) -> None:
     rng = np.random.default_rng(0)
 
     # 1D dot product (BLAS-backed on many builds)
-    a1 = rng.standard_normal(2048, dtype=np.float64)
-    b1 = rng.standard_normal(2048, dtype=np.float64)
+    a1 = rng.standard_normal(2**22, dtype=np.float64)
+    b1 = rng.standard_normal(2**22, dtype=np.float64)
 
     def dot_1d_f64() -> float:
         return float(np.dot(a1, b1))
 
-    runner.bench_func("numpy.dot[2048,f64]", dot_1d_f64, inner_loops=50)
+    runner.bench_func("numpy.dot[2^22,f64]", dot_1d_f64, inner_loops=10)
 
     # Matrix multiply (BLAS-heavy)
     a2 = rng.standard_normal((512, 512), dtype=np.float32)
@@ -26,13 +26,13 @@ def matmul_512_f32() -> float:
     runner.bench_func("numpy.matmul[512x512,f32]", matmul_512_f32, inner_loops=5)
 
     # FFT (PocketFFT inside NumPy; not BLAS)
-    a3 = rng.standard_normal(2**18, dtype=np.float64)
+    a3 = rng.standard_normal(2**17, dtype=np.float64)
 
-    def rfft_2p18_f64() -> float:
+    def rfft_2p17_f64() -> float:
         y = np.fft.rfft(a3)
         return float(y.real[0])
 
-    runner.bench_func("numpy.fft.rfft[2^18,f64]", rfft_2p18_f64, inner_loops=3)
+    runner.bench_func("numpy.fft.rfft[2^17,f64]", rfft_2p17_f64, inner_loops=3)
 
     # Elementwise ufunc
     a4 = rng.standard_normal(2**20, dtype=np.float32)
@@ -45,11 +45,11 @@ def exp_2p20_f32() -> float:
     runner.bench_func("numpy.exp[2^20,f32]", exp_2p20_f32, inner_loops=10)
 
     # Reduction
-    a5 = rng.standard_normal(2**20, dtype=np.float64)
+    a5 = rng.standard_normal(2**23, dtype=np.float64)
 
-    def sum_2p20_f64() -> float:
+    def sum_2p23_f64() -> float:
         return float(a5.sum())
 
-    runner.bench_func("numpy.sum[2^20,f64]", sum_2p20_f64, inner_loops=10)
+    runner.bench_func("numpy.sum[2^23,f64]", sum_2p23_f64, inner_loops=10)
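
For context on how these suite modules are consumed, a minimal driver sketch (the module path and `add_numpy_benchmarks` come from this commit; the driver wiring itself is assumed, not shown here):

    import pyperf

    from benchmarks.suites.numpy_suite import add_numpy_benchmarks

    # pyperf spawns worker processes, calibrates loop counts, and writes the
    # JSON results that table.py (added below) later compares.
    runner = pyperf.Runner()
    add_numpy_benchmarks(runner)

pyperf divides each timed sample by `inner_loops`, so dropping the dot benchmark from `inner_loops=50` to `10` keeps per-sample wall time sensible now that the arrays grow from 2048 to 2**22 elements.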

benchmarks/suites/xgboost_suite.py (48 additions, 7 deletions)

@@ -6,13 +6,18 @@
 
 def _xgboost_nthread() -> int:
     """
-    XGBoost has multiple thread controls; for the benchmarks we pin `nthread`
-    so arch comparisons don't get dominated by different default parallelism.
+    XGBoost has multiple thread controls; for the benchmarks we use CPU count
+    as the default to leverage multi-threaded performance, but allow override
+    via XGBOOST_NTHREAD env var for reproducibility.
     """
-    try:
-        n = int(os.environ.get("XGBOOST_NTHREAD", "1"))
-    except ValueError:
-        n = 1
+    if "XGBOOST_NTHREAD" in os.environ:
+        try:
+            n = int(os.environ["XGBOOST_NTHREAD"])
+        except ValueError:
+            n = 1
+    else:
+        # Default to CPU count for better multi-threaded performance
+        n = os.cpu_count() or 1
     return max(1, n)
 
 
@@ -37,13 +42,18 @@ def add_xgboost_training_benchmarks(runner: pyperf.Runner) -> None:
     Notes:
     - We pre-build the `DMatrix` outside the timed region so we primarily
       measure the training kernel.
-    - We pin `nthread` (default via XGBOOST_NTHREAD=1) for comparability.
+    - We use CPU count as default `nthread` to leverage multi-threaded performance,
+      but this can be overridden via XGBOOST_NTHREAD env var for reproducibility.
+    - We set OMP_NUM_THREADS to match nthread to ensure OpenMP respects the thread count.
     """
     # If this import fails (missing wheel, missing OpenMP runtime, etc.),
     # we want a hard error when the xgboost suite is explicitly requested.
     import xgboost as xgb
 
     nthread = _xgboost_nthread()
+    # Ensure OpenMP respects the thread count (XGBoost uses OpenMP internally)
+    if "OMP_NUM_THREADS" not in os.environ:
+        os.environ["OMP_NUM_THREADS"] = str(nthread)
 
     # Moderate size: heavy enough to be above timer noise, small enough to run
     # under pyperf without making the suite painfully slow.
@@ -76,6 +86,34 @@ def train_hist() -> float:
         inner_loops=1,
     )
 
+    # Exact method: slower but more accurate, different parallelization pattern
+    params_exact = params.copy()
+    params_exact["tree_method"] = "exact"
+
+    def train_exact() -> float:
+        booster = xgb.train(params_exact, dtrain, num_boost_round=num_boost_round)
+        return float(booster.attributes().get("best_score", "0") or 0.0)
+
+    runner.bench_func(
+        f"xgboost.train_exact[{x_train.shape[0]}x{x_train.shape[1]},rounds={num_boost_round},nt={nthread}]",
+        train_exact,
+        inner_loops=1,
+    )
+
+    # Approx method: middle ground between exact and hist
+    params_approx = params.copy()
+    params_approx["tree_method"] = "approx"
+
+    def train_approx() -> float:
+        booster = xgb.train(params_approx, dtrain, num_boost_round=num_boost_round)
+        return float(booster.attributes().get("best_score", "0") or 0.0)
+
+    runner.bench_func(
+        f"xgboost.train_approx[{x_train.shape[0]}x{x_train.shape[1]},rounds={num_boost_round},nt={nthread}]",
+        train_approx,
+        inner_loops=1,
+    )
+
 
 def add_xgboost_inference_benchmarks(runner: pyperf.Runner) -> None:
     """
@@ -86,6 +124,9 @@ def add_xgboost_inference_benchmarks(runner: pyperf.Runner) -> None:
     import xgboost as xgb
 
     nthread = _xgboost_nthread()
+    # Ensure OpenMP respects the thread count (XGBoost uses OpenMP internally)
+    if "OMP_NUM_THREADS" not in os.environ:
+        os.environ["OMP_NUM_THREADS"] = str(nthread)
 
     # Train once (not timed), then benchmark predict.
     x_train, y_train = _make_binary_classification(n_samples=50_000, n_features=64, seed=0)
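
The new thread default is easy to sanity-check (a sketch; assumes the repo root is on `sys.path`):

    import os

    from benchmarks.suites.xgboost_suite import _xgboost_nthread

    os.environ["XGBOOST_NTHREAD"] = "1"     # pin for reproducible arch comparisons
    assert _xgboost_nthread() == 1          # explicit env var wins over the default

    os.environ["XGBOOST_NTHREAD"] = "junk"  # unparsable values fall back to 1
    assert _xgboost_nthread() == 1

    del os.environ["XGBOOST_NTHREAD"]
    print(_xgboost_nthread())               # defaults to os.cpu_count() or 1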

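For intuition about the two new training benchmarks, here is a self-contained toy comparison of the three `tree_method` settings (sizes, params, and the timing loop are illustrative only, not the suite's configuration):

    import time

    import numpy as np
    import xgboost as xgb

    rng = np.random.default_rng(0)
    x = rng.standard_normal((20_000, 32)).astype(np.float32)
    y = (x[:, 0] > 0).astype(np.int8)
    dtrain = xgb.DMatrix(x, label=y)  # built outside the timed region, as in the suite

    for tree_method in ("hist", "exact", "approx"):
        params = {"objective": "binary:logistic", "tree_method": tree_method, "nthread": 1}
        t0 = time.perf_counter()
        xgb.train(params, dtrain, num_boost_round=20)
        print(f"{tree_method}: {time.perf_counter() - t0:.3f}s")

`hist` bins features into histograms once, `exact` enumerates every split candidate, and `approx` re-sketches quantiles as it goes, so the three stress different memory-access and parallelization patterns.
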
benchmarks/table.py (new file, 217 additions)

import argparse
from dataclasses import dataclass
from pathlib import Path
from typing import Any

import pyperf


@dataclass(frozen=True)
class BenchmarkStats:
    name: str
    min: float
    median: float
    max: float


def extract_stats(bench: Any) -> BenchmarkStats:
    """Extract min, median, and max from a pyperf Benchmark object."""
    import statistics

    # Try to use pyperf's built-in methods if available
    try:
        min_val = float(bench.min())
        median_val = float(bench.median())
        max_val = float(bench.max())
    except (AttributeError, TypeError):
        # Fallback: extract all values from runs and compute statistics
        values = []
        for run in bench.get_runs():
            # Skip calibration runs (they don't have 'values', only 'warmups')
            if hasattr(run, "values") and run.values:
                values.extend(run.values)

        if not values:
            # If no values found, try to use mean as fallback
            min_val = median_val = max_val = float(bench.mean())
        else:
            min_val = float(min(values))
            median_val = float(statistics.median(values))
            max_val = float(max(values))

    return BenchmarkStats(
        name=bench.get_name(),
        min=min_val,
        median=median_val,
        max=max_val,
    )


def load_benchmark_stats(path: Path) -> dict[str, BenchmarkStats]:
    """Load a pyperf JSON file and extract statistics for each benchmark."""
    suite = pyperf.BenchmarkSuite.load(str(path))
    stats: dict[str, BenchmarkStats] = {}
    for bench in suite:
        stats[bench.get_name()] = extract_stats(bench)
    return stats


def format_time(seconds: float) -> str:
    """Format time in seconds to a readable string."""
    if seconds < 1e-6:
        return f"{seconds * 1e9:.3f} ns"
    elif seconds < 1e-3:
        return f"{seconds * 1e6:.3f} μs"
    elif seconds < 1.0:
        return f"{seconds * 1e3:.3f} ms"
    else:
        return f"{seconds:.3f} s"


def get_time_unit(seconds: float) -> str:
    """Get the appropriate unit for a time value in seconds."""
    if seconds < 1e-6:
        return "ns"
    elif seconds < 1e-3:
        return "μs"
    elif seconds < 1.0:
        return "ms"
    else:
        return "s"


def print_table(
    a_stats: dict[str, BenchmarkStats],
    b_stats: dict[str, BenchmarkStats],
    a_label: str,
    b_label: str,
    csv: bool = False,
) -> None:
    """Print a formatted table comparing two benchmark result sets."""
    # Find common benchmarks
    common_names = sorted(set(a_stats.keys()) & set(b_stats.keys()))
    if not common_names:
        print("ERROR: No common benchmark names found between the two files.")
        return

    if csv:
        # CSV output
        print("Benchmark Name,"
              f"{a_label}_min (s),{a_label}_median (s),{a_label}_max (s),"
              f"{b_label}_min (s),{b_label}_median (s),{b_label}_max (s),"
              f"B/A_ratio")
        for name in common_names:
            a = a_stats[name]
            b = b_stats[name]
            ratio = b.median / a.median if a.median > 0 else float("inf")
            print(f'"{name}",{a.min:.9e},{a.median:.9e},{a.max:.9e},'
                  f'{b.min:.9e},{b.median:.9e},{b.max:.9e},'
                  f'{ratio:.6e}')
        return

    # Determine a common unit for all values (pick based on median values)
    # Use the most common unit across all median values
    all_medians = [a_stats[name].median for name in common_names] + \
                  [b_stats[name].median for name in common_names]
    sample_median = sorted(all_medians)[len(all_medians) // 2]  # Use median of medians
    common_unit = get_time_unit(sample_median)

    # Convert seconds to the common unit
    def to_common_unit(seconds: float) -> float:
        if common_unit == "ns":
            return seconds * 1e9
        elif common_unit == "μs":
            return seconds * 1e6
        elif common_unit == "ms":
            return seconds * 1e3
        else:
            return seconds

    # Calculate maximum width for median values across ALL benchmarks
    # Using numbers without units - convert to common unit first
    all_med_values = [f"{to_common_unit(a_stats[name].median):.3f}" for name in common_names] + \
                     [f"{to_common_unit(b_stats[name].median):.3f}" for name in common_names]

    med_width = max(len(v) for v in all_med_values)

    # Table header - make benchmark name column narrower
    max_name_len = max(len(name) for name in common_names)
    name_col_width = max(25, max_name_len + 1)  # Reduced from 50 to 25

    # Calculate stats column width (just median value width)
    stats_col_width = med_width

    # Ratio column width - narrow, just enough for values like "2.10x" or "0.50x slower"
    ratio_header = f"{a_label} faster by"
    ratio_col_width = 10  # Fixed at 10 chars max

    total_width = name_col_width + stats_col_width + stats_col_width + ratio_col_width + 9  # 9 for separators

    # Build header line to measure its actual length
    header_line = (
        f"{'Benchmark Name':<{name_col_width}} | "
        f"{'A':>{stats_col_width}} | "
        f"{'B':>{stats_col_width}} | "
        f"{ratio_header:>{ratio_col_width}}"
    )
    separator_width = len(header_line)

    print(f"\nBenchmark Comparison (median {common_unit}): {a_label} vs {b_label}")
    print("=" * separator_width)

    # Simple column headers: just "A" and "B"
    print(header_line)
    print("-" * separator_width)

    # Table rows with median values only (numbers only, no units)
    for name in common_names:
        a = a_stats[name]
        b = b_stats[name]

        # Format median value with right alignment (no units)
        # Convert to common unit
        a_med_str = f"{to_common_unit(a.median):.3f}".rjust(stats_col_width)
        b_med_str = f"{to_common_unit(b.median):.3f}".rjust(stats_col_width)

        # Calculate ratio: B/A shows how many times faster A is than B
        # > 1 means A is faster, < 1 means A is slower (B is faster)
        ratio = b.median / a.median if a.median > 0 else float("inf")
        if ratio == float("inf"):
            ratio_str = "inf"
        elif ratio >= 1.0:
            # A is faster: "A is 2.1x faster than B"
            ratio_str = f"{ratio:.2f}x"
        else:
            # A is slower: show as "B is faster" by inverting
            ratio_inv = a.median / b.median
            ratio_str = f"{ratio_inv:.2f}x slower"

        print(f"{name:<{name_col_width}} | {a_med_str} | {b_med_str} | {ratio_str:>{ratio_col_width}}")

    print("=" * separator_width)


def main(argv: list[str] | None = None) -> int:
    p = argparse.ArgumentParser(
        description="Compare two pyperf JSON files and print a statistics table"
    )
    p.add_argument("a", type=Path, help="First pyperf JSON (e.g. arm64.json)")
    p.add_argument("b", type=Path, help="Second pyperf JSON (e.g. x86_64.json)")
    p.add_argument("--a-label", default=None, help="Label for first file (default: filename stem)")
    p.add_argument("--b-label", default=None, help="Label for second file (default: filename stem)")
    p.add_argument("--csv", action="store_true", help="Output as CSV instead of formatted table")
    args = p.parse_args(argv)

    a_stats = load_benchmark_stats(args.a)
    b_stats = load_benchmark_stats(args.b)

    a_label = args.a_label or args.a.stem
    b_label = args.b_label or args.b.stem

    print_table(a_stats, b_stats, a_label, b_label, csv=args.csv)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
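
Typical invocations (the file names are examples; any two pyperf JSON files with overlapping benchmark names work):

    python benchmarks/table.py results/arm64.json results/x86_64.json --a-label arm64 --b-label x86_64
    python benchmarks/table.py results/arm64.json results/x86_64.json --csv > comparison.csv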

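The B/A ratio convention in `print_table` is easiest to see with hypothetical numbers:

    a_median, b_median = 2.0e-3, 5.0e-3  # A takes 2 ms, B takes 5 ms per call
    ratio = b_median / a_median          # 2.5 -> printed as "2.50x" (A is faster)

    a_median, b_median = 5.0e-3, 2.0e-3  # A takes 5 ms, B takes 2 ms per call
    ratio = b_median / a_median          # 0.4 -> inverted and shown as "2.50x slower"
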
bin/run_both_arch.sh (4 additions, 4 deletions)

@@ -101,8 +101,8 @@ warn_if_thread_env_set() {
 warn_if_thread_env_set
 
 echo "==> Creating venvs (if needed)"
-uv venv -p "$ARM_PY" "$ARM_VENV" >/dev/null
-uv venv -p "$X86_PY" "$X86_VENV" >/dev/null
+uv venv --allow-existing -p "$ARM_PY" "$ARM_VENV" >/dev/null
+uv venv --allow-existing -p "$X86_PY" "$X86_VENV" >/dev/null
 
 ensure_arch() {
   local venv="$1"
@@ -176,8 +176,8 @@ run_in_env() {
   fi
 
   # Confirm the output metadata matches the expected arch.
-  python -c "import json; m=json.load(open('$out','r'))['metadata']; print('wrote', '$out', 'machine=', m.get('platform.machine'), 'python_executable=', m.get('python_executable'))"
-  if [[ "$(python -c "import json; print(json.load(open('$out','r'))['metadata'].get('platform.machine',''))")" != "$label" ]]; then
+  python -c "import json; m=json.load(open(\"$out\", \"r\"))[\"metadata\"]; print(\"wrote\", \"$out\", \"machine=\", m.get(\"platform.machine\"), \"python_executable=\", m.get(\"python_executable\"))"
+  if [[ "$(python -c "import json; print(json.load(open(\"$out\", \"r\"))[\"metadata\"].get(\"platform.machine\", \"\"))")" != "$label" ]]; then
     echo "ERROR: $out recorded platform.machine != $label"
     exit 1
   fi
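
The patched one-liners are dense; an expanded Python equivalent of the check they perform (a readability sketch, not part of the commit):

    import json
    import sys

    out, label = sys.argv[1], sys.argv[2]  # e.g. a results JSON path and "arm64"
    meta = json.load(open(out))["metadata"]
    print("wrote", out, "machine=", meta.get("platform.machine"),
          "python_executable=", meta.get("python_executable"))
    if meta.get("platform.machine", "") != label:
        raise SystemExit(f"ERROR: {out} recorded platform.machine != {label}")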
