Skip to content

Commit f52b88c

Browse files
committed
Add benchmark report generation
1 parent 795045f commit f52b88c

11 files changed

Lines changed: 410 additions & 6 deletions

File tree

README.md

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -77,19 +77,25 @@ npm run dev -- validate ./examples/hello-world-bundle
7777
npm run dev -- scan ./examples
7878
```
7979

80-
6. Generate starter inputs:
80+
6. Generate a benchmark report:
81+
82+
```bash
83+
npm run dev -- report ./examples --out ./dist/benchmark-report.md
84+
```
85+
86+
7. Generate starter inputs:
8187

8288
```bash
8389
npm run dev -- init --out ./starter
8490
```
8591

86-
7. Pack from the generated config:
92+
8. Pack from the generated config:
8793

8894
```bash
8995
npm run dev -- pack --config ./starter/taskbundle.config.json
9096
```
9197

92-
8. Archive the result:
98+
9. Archive the result:
9399

94100
```bash
95101
npm run dev -- archive ./starter/bundle-output --out ./starter/bundle-output.tar.gz
@@ -201,6 +207,13 @@ Scan a directory for bundle folders:
201207
npm run dev -- scan ./examples
202208
```
203209

210+
### `taskbundle report`
211+
Generate a benchmark-style ranking and optional Markdown report:
212+
213+
```bash
214+
npm run dev -- report ./examples --out ./dist/benchmark-report.md
215+
```
216+
204217
## Example Bundles
205218

206219
The repository includes two real examples:
@@ -209,6 +222,8 @@ The repository includes two real examples:
209222

210223
They represent the same task captured from different tool/model combinations so `compare` has something meaningful to show.
211224

225+
You can also point `taskbundle report` at the same directory to generate a small benchmark-style leaderboard.
226+
212227
## Bundle Format At A Glance
213228

214229
- `bundle.json`: top-level metadata and artifact pointers

README.zh-CN.md

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -77,19 +77,25 @@ npm run dev -- validate ./examples/hello-world-bundle
7777
npm run dev -- scan ./examples
7878
```
7979

80-
6. 生成 starter 输入目录:
80+
6. 生成 benchmark 风格报告:
81+
82+
```bash
83+
npm run dev -- report ./examples --out ./dist/benchmark-report.md
84+
```
85+
86+
7. 生成 starter 输入目录:
8187

8288
```bash
8389
npm run dev -- init --out ./starter
8490
```
8591

86-
7. 直接从配置文件打包:
92+
8. 直接从配置文件打包:
8793

8894
```bash
8995
npm run dev -- pack --config ./starter/taskbundle.config.json
9096
```
9197

92-
8. 把 bundle 归档成 `.tar.gz`
98+
9. 把 bundle 归档成 `.tar.gz`
9399

94100
```bash
95101
npm run dev -- archive ./starter/bundle-output --out ./starter/bundle-output.tar.gz
@@ -201,6 +207,13 @@ npm run dev -- validate ./examples/hello-world-bundle
201207
npm run dev -- scan ./examples
202208
```
203209

210+
### `taskbundle report`
211+
生成 benchmark 风格的排行榜和可选 Markdown 报告:
212+
213+
```bash
214+
npm run dev -- report ./examples --out ./dist/benchmark-report.md
215+
```
216+
204217
## 示例 Bundle
205218

206219
仓库里现在有两个示例:
@@ -209,6 +222,8 @@ npm run dev -- scan ./examples
209222

210223
它们表达的是同一个任务,但来自不同的工具 / 模型组合,所以 `compare` 命令有真实可看的结果。
211224

225+
你也可以直接把这个目录交给 `taskbundle report`,生成一份小型 benchmark 排行榜。
226+
212227
## Bundle 格式一眼看懂
213228

214229
- `bundle.json`:顶层元数据和 artifact 指针

ROADMAP.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ Task Bundle started as a small CLI MVP. This roadmap turns it into a practical f
1414
- Done: `validate` and `scan` commands for replay checks and bundle collections
1515
- Done: artifact hashes and sizes in `bundle.json`
1616
- Done: benchmark-style outcome fields in bundle metadata
17+
- Done: benchmark report generation with ranking, leaderboard, and Markdown export
1718
- Done: CLI smoke tests and GitHub Actions CI
1819
- Done: Chinese and English documentation
1920

ROADMAP.zh-CN.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ Task Bundle 目前已经从一个小型 CLI MVP,走到了“可实际使用的
1414
- 已完成:`validate` 和 `scan` 命令,用于 replay 校验和 bundle 集合扫描
1515
- 已完成:artifact 哈希和大小写入 `bundle.json`
1616
- 已完成:bundle metadata 中的 benchmark / judge 结果字段
17+
- 已完成:benchmark report 生成,支持排名、leaderboard 和 Markdown 导出
1718
- 已完成:CLI smoke tests 和 GitHub Actions CI
1819
- 已完成:中英文文档
1920

src/cli/commands/report.ts

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import path from "node:path";
2+
import { Command } from "commander";
3+
import { generateBenchmarkReport, renderBenchmarkReportMarkdown } from "../../core/report";
4+
import { writeTextFile } from "../../utils/fs";
5+
import { printKeyValue } from "../../utils/output";
6+
7+
/**
 * Formats an optional score for console output.
 *
 * The value is rounded to four decimal places and trailing zeros are dropped
 * (for example `0.5` rather than `0.5000`). A missing score renders as `"n/a"`.
 */
function formatScore(value: number | undefined): string {
  if (value === undefined) {
    return "n/a";
  }
  return Number(value.toFixed(4)).toString();
}

/**
 * Registers the `report` subcommand on the given commander program.
 *
 * The command builds a benchmark report for `<rootDir>`, optionally writes a
 * Markdown rendering of it to `--out <file>`, and then prints either the raw
 * report as JSON (`--json`) or a text summary with ranking and leaderboard
 * sections.
 *
 * @param program - The commander program the subcommand is attached to.
 */
export function registerReportCommand(program: Command): void {
  const reportCommand = program
    .command("report")
    .description("Generate a benchmark-style report for a directory of bundles.")
    .option("--json", "Print machine-readable JSON instead of text")
    .option("--out <file>", "Write a Markdown report to a file")
    .argument("<rootDir>", "Directory that contains bundle folders");

  reportCommand.action(async (rootDir: string, options: { json?: boolean; out?: string }) => {
    const absoluteRoot = path.resolve(rootDir);
    const report = await generateBenchmarkReport(absoluteRoot);

    // The Markdown file is written before any console output, in both JSON and text mode.
    let markdownPath: string | undefined;
    if (options.out) {
      const markdown = renderBenchmarkReportMarkdown(report);
      markdownPath = path.resolve(options.out);
      await writeTextFile(markdownPath, markdown);
    }

    // JSON mode prints only the serialized report so stdout stays machine-readable.
    if (options.json) {
      console.log(JSON.stringify(report, null, 2));
      return;
    }

    console.log("Task Bundle Benchmark Report");
    console.log("----------------------------");
    printKeyValue("Root", report.rootDir);
    printKeyValue("Bundles", String(report.bundleCount));
    printKeyValue("Scored bundles", String(report.scoredBundleCount));
    printKeyValue("Average score", formatScore(report.averageScore));

    console.log("");
    console.log("Ranking");
    for (const ranked of report.ranking) {
      const toolName = ranked.tool ?? "unknown";
      const modelName = ranked.model ?? "unknown";
      const statusText = ranked.status ?? "unknown";
      const scoreText = formatScore(ranked.score);
      console.log(`${ranked.rank}. ${ranked.title} | ${toolName} / ${modelName} | ${statusText} | score ${scoreText}`);
    }

    console.log("");
    console.log("Leaderboard");
    for (const row of report.leaderboard) {
      const toolName = row.tool ?? "unknown";
      const modelName = row.model ?? "unknown";
      const averageText = formatScore(row.averageScore);
      const bestText = formatScore(row.bestScore);
      console.log(`- ${toolName} / ${modelName} | runs ${row.runs} | avg ${averageText} | best ${bestText}`);
    }

    // Tell the user where the Markdown report was written.
    if (markdownPath !== undefined) {
      console.log("");
      console.log(`Markdown report: ${markdownPath}`);
    }
  });
}

src/cli/index.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import { registerCompareCommand } from "./commands/compare";
55
import { registerInitCommand } from "./commands/init";
66
import { registerInspectCommand } from "./commands/inspect";
77
import { registerPackCommand } from "./commands/pack";
8+
import { registerReportCommand } from "./commands/report";
89
import { registerScanCommand } from "./commands/scan";
910
import { registerValidateCommand } from "./commands/validate";
1011

@@ -23,6 +24,7 @@ async function main(): Promise<void> {
2324
registerArchiveCommands(program);
2425
registerValidateCommand(program);
2526
registerScanCommand(program);
27+
registerReportCommand(program);
2628

2729
await program.parseAsync(process.argv);
2830
}

src/core/bundle.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,7 @@ export async function inspectBundle(bundleDir: string): Promise<BundleInspection
246246
const artifacts = await detectArtifacts(resolvedBundleDir);
247247

248248
return {
249+
bundleDir: resolvedBundleDir,
249250
title: bundle.metadata.title,
250251
schemaVersion: bundle.metadata.schemaVersion,
251252
createdAt: bundle.metadata.createdAt,

0 commit comments

Comments
 (0)