diff --git a/README.md b/README.md index 4bccd89b4..f5e45874b 100644 --- a/README.md +++ b/README.md @@ -21,16 +21,19 @@ Unlike prior SQL-only or single-database benchmarks, DAB stresses agents under * ## 🏆 Leaderboard -| Rank | Model | Pass@1 | Date | -| ---- | -------------- | ------------------- | ------- | -| 1 | PromptQL (Gemini-3.1-Pro) (5 trials/query) | 0.543 | 2026-03-018 | -| 2 | PromptQL (Claude-Opus-4.6) (5 trials/query) | 0.508 | 2026-03-02 | -| 3 | Claude-Opus-4.6 (5 trials/query) | 0.4376 | 2026-03-18 | -| 4 | Gemini-3-Pro | 0.38 | 2026-03-02 | -| 5 | GPT-5-mini | 0.30 | 2026-03-02 | -| 6 | GPT-5.2 | 0.25 | 2026-03-02 | -| 7 | Kimi-K2 | 0.23 | 2026-03-02 | -| 8 | Gemini-2.5-Flash | 0.09 | 2026-03-02 | +| Rank | Model | Pass@1 | Date | Submission | +| ---- | -------------- | ------------------- | ------- | ---------- | +| 1 | Pi Coding Agent (Claude-Opus-4.6) (5 trials/query) | 0.5603 | 2026-04-21 | [#31](https://github.com/ucbepic/DataAgentBench/pull/31) | +| 2 | PromptQL (Gemini-3.1-Pro) (5 trials/query) | 0.543 | 2026-03-18 | [#24](https://github.com/ucbepic/DataAgentBench/pull/24) | +| 3 | PromptQL (Claude-Opus-4.6) (5 trials/query) | 0.508 | 2026-03-02 | [#23](https://github.com/ucbepic/DataAgentBench/pull/23) | +| 4 | Oracle Forge — Tenacious Intelligence (Claude-Sonnet-4.6) (5–7 trials/query) | 0.4554 | 2026-04-21 | [#32](https://github.com/ucbepic/DataAgentBench/pull/32) | +| 5 | Claude-Opus-4.6 (5 trials/query) | 0.4376 | 2026-03-18 | [#22](https://github.com/ucbepic/DataAgentBench/pull/22) | +| 6 | Gemini-3-Pro | 0.38 | 2026-03-02 | — | +| 7 | GPT-5-mini | 0.30 | 2026-03-02 | — | +| 8 | GPT-5.2 | 0.25 | 2026-03-02 | — | +| 9 | Kimi-K2 | 0.23 | 2026-03-02 | — | +| 10 | Oracle Forge — Team Cohere (Gemini-2.0-Flash) (5 trials/query) | 0.128 | 2026-04-21 | [#38](https://github.com/ucbepic/DataAgentBench/pull/38) | +| 11 | Gemini-2.5-Flash | 0.09 | 2026-03-02 | — | ### How to Submit to the Leaderboard diff --git a/docs/app.js b/docs/app.js index 901021ec8..3fb9ca5b0 100644 --- a/docs/app.js +++ b/docs/app.js @@ -172,6 +172,18 @@ function renderOverallLeaderboard(rows) { const trialsTd = createElement("td", "num", String(row.trials)); const passTd = createElement("td", "num", formatPercentFromScore(row.passAt1)); const dateTd = createElement("td", "", row.date); + const prTd = document.createElement("td"); + if (row.prUrl) { + const prMatch = row.prUrl.match(/\/pull\/(\d+)/); + const prLabel = prMatch ? `PR #${prMatch[1]}` : "PR"; + const prLink = createElement("a", "", prLabel); + prLink.href = row.prUrl; + prLink.target = "_blank"; + prLink.rel = "noopener noreferrer"; + prTd.appendChild(prLink); + } else { + prTd.textContent = "—"; + } tr.appendChild(rankTd); tr.appendChild(agentTd); @@ -179,6 +191,7 @@ function renderOverallLeaderboard(rows) { tr.appendChild(trialsTd); tr.appendChild(passTd); tr.appendChild(dateTd); + tr.appendChild(prTd); tbody.appendChild(tr); }); } diff --git a/docs/data/leaderboards.json b/docs/data/leaderboards.json index 3de92cda7..e34e1620e 100644 --- a/docs/data/leaderboards.json +++ b/docs/data/leaderboards.json @@ -1,5 +1,5 @@ { - "updatedAt": "2026-03-21", + "updatedAt": "2026-04-21", "sources": [ "README leaderboard table", "PromptQL stratified table shared by maintainers", @@ -8,33 +8,56 @@ "overallLeaderboard": [ { "rank": 1, + "agent": "Pi Coding Agent + Claude Opus 4.6", + "trials": 5, + "passAt1": 0.5603, + "team": "Pi Coding Agent", + "teamUrl": "https://github.com/mariozechner/pi-coding-agent", + "prUrl": "https://github.com/ucbepic/DataAgentBench/pull/31", + "date": "2026-04-21" + }, + { + "rank": 2, "agent": "PromptQL + Gemini 3.1 Pro", "trials": 5, "passAt1": 0.543, "team": "Hasura PromptQL", "teamUrl": "https://promptql.io/", + "prUrl": "https://github.com/ucbepic/DataAgentBench/pull/24", "date": "2026-03-18" }, { - "rank": 2, + "rank": 3, "agent": "PromptQL + Claude Opus 4.6", "trials": 5, "passAt1": 0.508, "team": "Hasura PromptQL", "teamUrl": "https://promptql.io/", + "prUrl": "https://github.com/ucbepic/DataAgentBench/pull/23", "date": "2026-03-02" }, { - "rank": 3, + "rank": 4, + "agent": "Oracle Forge (Tenacious Intelligence) + Claude Sonnet 4.6", + "trials": 5, + "passAt1": 0.4554, + "team": "Tenacious Intelligence Corp", + "teamUrl": "https://github.com/ucbepic/DataAgentBench/pull/32", + "prUrl": "https://github.com/ucbepic/DataAgentBench/pull/32", + "date": "2026-04-21" + }, + { + "rank": 5, "agent": "Claude Opus 4.6 ReAct", "trials": 5, "passAt1": 0.4376, "team": "EPIC Data Lab", "teamUrl": "https://epic.berkeley.edu/", + "prUrl": "https://github.com/ucbepic/DataAgentBench/pull/22", "date": "2026-03-18" }, { - "rank": 4, + "rank": 6, "agent": "Gemini-3-Pro ReAct", "trials": 50, "passAt1": 0.38, @@ -43,7 +66,7 @@ "date": "2026-03-02" }, { - "rank": 5, + "rank": 7, "agent": "GPT-5-mini ReAct", "trials": 50, "passAt1": 0.3, @@ -52,7 +75,7 @@ "date": "2026-03-02" }, { - "rank": 6, + "rank": 8, "agent": "GPT-5.2 ReAct", "trials": 50, "passAt1": 0.25, @@ -61,7 +84,7 @@ "date": "2026-03-02" }, { - "rank": 7, + "rank": 9, "agent": "Kimi-K2 ReAct", "trials": 50, "passAt1": 0.23, @@ -70,7 +93,17 @@ "date": "2026-03-02" }, { - "rank": 8, + "rank": 10, + "agent": "Oracle Forge (Team Cohere) + Gemini 2.0 Flash", + "trials": 5, + "passAt1": 0.128, + "team": "Team Cohere", + "teamUrl": "https://github.com/trp1-cohere-team/data-analytics-agent", + "prUrl": "https://github.com/ucbepic/DataAgentBench/pull/38", + "date": "2026-04-21" + }, + { + "rank": 11, "agent": "Gemini-2.5-Flash ReAct", "trials": 50, "passAt1": 0.09, diff --git a/docs/index.html b/docs/index.html index d5b991c8f..8437331f5 100644 --- a/docs/index.html +++ b/docs/index.html @@ -128,6 +128,7 @@

Leaderboard

n Pass@1 Date + Submission