Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions docs/assets/index-BD-3HzhN.js → docs/assets/index-BhcfnKmx.js

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
<link rel="preconnect" href="https://fonts.googleapis.com" />
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;800&family=JetBrains+Mono:wght@400;500&display=swap" rel="stylesheet" />
<script type="module" crossorigin src="/eva/assets/index-BD-3HzhN.js"></script>
<script type="module" crossorigin src="/eva/assets/index-BhcfnKmx.js"></script>
<link rel="stylesheet" crossorigin href="/eva/assets/index-DNsPq0CK.css">
</head>
<body>
Expand Down
107 changes: 82 additions & 25 deletions website/src/components/leaderboard/MetricHeatmap.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -11,34 +11,91 @@ export interface AggregateColumn {
getValue: (s: SystemScore) => number;
}

// Flat dark-theme palette of 16 component colors. Consumers index into it in
// order, so the ordering below is part of the contract.
const componentPaletteDark = [
  '#F59E0B', // amber
  '#38BDF8', // sky
  '#34D399', // emerald
  '#A78BFA', // purple
  '#F87171', // red
  '#22D3EE', // cyan
  '#FB923C', // orange
  '#818CF8', // indigo
  '#F472B6', // pink
  '#4ADE80', // green
  '#FACC15', // yellow
  '#2DD4BF', // teal
  '#C084FC', // violet
  '#FB7185', // rose
  '#67E8F9', // light-cyan
  '#A3E635', // lime
];
/**
 * Dark-theme component colors, one palette per pipeline category.
 *
 * Each category has its own list so that components compared against each
 * other (e.g. two STT models) draw from colors that are distinct within that
 * list. Colors may repeat ACROSS categories (amber and red appear in both
 * `stt` and `llm`), and `getComponentColorMap` wraps around with modulo, so a
 * category with more components than palette entries will reuse colors.
 *
 * Keys are restricted to the three known categories so that a mistyped key is
 * a compile error instead of a silent `undefined`.
 */
const categoryPalettesDark: Record<'stt' | 'llm' | 'tts', string[]> = {
  stt: [
    '#F59E0B', // amber
    '#38BDF8', // sky blue
    '#34D399', // emerald
    '#F87171', // red
    '#A78BFA', // purple
    '#FACC15', // yellow
  ],
  llm: [
    '#22D3EE', // cyan
    '#FB923C', // orange
    '#818CF8', // indigo
    '#4ADE80', // green
    '#F59E0B', // amber
    '#F472B6', // pink
    '#94A3B8', // slate
    '#A3E635', // lime
    '#E879F9', // fuchsia
    '#F87171', // red
  ],
  tts: [
    '#A3E635', // lime
    '#FB7185', // rose
    '#67E8F9', // light cyan
    '#C084FC', // violet
    '#FDBA74', // peach
    '#2DD4BF', // teal
  ],
};

// Flat light-theme palette of 16 component colors, listed in the same hue
// order as the dark palette so a component keeps its hue across themes.
const componentPaletteLight = [
  '#B45309', // amber
  '#0369A1', // sky
  '#047857', // emerald
  '#6D28D9', // purple
  '#B91C1C', // red
  '#0E7490', // cyan
  '#C2410C', // orange
  '#4338CA', // indigo
  '#BE185D', // pink
  '#15803D', // green
  '#A16207', // yellow
  '#0D9488', // teal
  '#7C3AED', // violet
  '#E11D48', // rose
  '#0891B2', // light-cyan
  '#65A30D', // lime
];
/**
 * Light-theme component colors, one palette per pipeline category.
 *
 * Entries follow the same hue order as the dark-theme palettes, using darker
 * shades. Colors are distinct within a category but may repeat across
 * categories (amber and red appear in both `stt` and `llm`).
 *
 * Keys are restricted to the three known categories so that a mistyped key is
 * a compile error instead of a silent `undefined`.
 */
const categoryPalettesLight: Record<'stt' | 'llm' | 'tts', string[]> = {
  stt: [
    '#B45309', // amber
    '#0369A1', // sky blue
    '#047857', // emerald
    '#B91C1C', // red
    '#6D28D9', // purple
    '#A16207', // yellow
  ],
  llm: [
    '#0E7490', // cyan
    '#C2410C', // orange
    '#4338CA', // indigo
    '#15803D', // green
    '#B45309', // amber
    '#BE185D', // pink
    '#475569', // slate
    '#65A30D', // lime
    '#A21CAF', // fuchsia
    '#B91C1C', // red
  ],
  tts: [
    '#65A30D', // lime
    '#E11D48', // rose
    '#0891B2', // light cyan
    '#7C3AED', // violet
    '#EA580C', // peach
    '#0D9488', // teal
  ],
};

/**
 * Builds a lookup from component name to display color for every STT, LLM and
 * TTS component referenced by `systems`.
 *
 * Each category draws from its own palette, in first-seen order, wrapping
 * around when a category has more distinct names than palette entries.
 * The returned map is keyed by name only: if the same name occurs in more
 * than one category, the later category (stt, then llm, then tts) wins.
 *
 * @param systems - Systems whose `stt`, `llm` and `tts` names need a color.
 * @param isDark - `true` selects `categoryPalettesDark`, `false` selects
 *   `categoryPalettesLight`.
 * @returns Map from component name to hex color string.
 */
function getComponentColorMap(systems: SystemScore[], isDark: boolean): Map<string, string> {
  const palettes = isDark ? categoryPalettesDark : categoryPalettesLight;

  // A Set iterates in insertion order, so it both de-duplicates names and
  // preserves the first-seen order that determines which color each one gets.
  const sttNames = new Set<string>();
  const llmNames = new Set<string>();
  const ttsNames = new Set<string>();
  for (const s of systems) {
    // '-' is excluded for stt/tts (presumably a "no such component"
    // placeholder — confirm against the leaderboard data). llm is always kept.
    if (s.stt !== '-') sttNames.add(s.stt);
    llmNames.add(s.llm);
    if (s.tts !== '-') ttsNames.add(s.tts);
  }

  const map = new Map<string, string>();
  const assign = (names: Iterable<string>, pal: readonly string[] | undefined): void => {
    // An empty or missing palette would yield `undefined` colors; skip instead.
    if (!pal || pal.length === 0) return;
    let i = 0;
    for (const name of names) {
      const color = pal[i % pal.length];
      if (color !== undefined) map.set(name, color);
      i++;
    }
  };
  assign(sttNames, palettes.stt);
  assign(llmNames, palettes.llm);
  assign(ttsNames, palettes.tts);
  return map;
}
}

Expand Down Expand Up @@ -99,8 +156,8 @@ export function MetricHeatmap({ title, description, metricKeys, metricLabels, da
const themeColors = useThemeColors();
const themeMode = useThemeMode();
const aggCols = aggregateColumns ?? [];
const palette = themeMode === 'light' ? componentPaletteLight : componentPaletteDark;
const componentColors = useMemo(() => getComponentColorMap(systems, palette), [systems, palette]);
const isDark = themeMode !== 'light';
const componentColors = useMemo(() => getComponentColorMap(systems, isDark), [systems, isDark]);

const [sortKey, setSortKey] = useState<string | null>(null);
const [sortDir, setSortDir] = useState<SortDir>('desc');
Expand Down
106 changes: 82 additions & 24 deletions website/src/components/leaderboard/TurnTakingAnalysis.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,69 @@ import { useThemeColors, useThemeMode } from '../../styles/theme';

// ─── Shared constants & utilities ────────────────────────────────────────────

// Flat dark-theme palette of 16 component colors. Consumers index into it in
// order, so the ordering below is part of the contract.
const componentPaletteDark = [
  '#F59E0B', // amber
  '#38BDF8', // sky
  '#34D399', // emerald
  '#A78BFA', // purple
  '#F87171', // red
  '#22D3EE', // cyan
  '#FB923C', // orange
  '#818CF8', // indigo
  '#F472B6', // pink
  '#4ADE80', // green
  '#FACC15', // yellow
  '#2DD4BF', // teal
  '#C084FC', // violet
  '#FB7185', // rose
  '#67E8F9', // light-cyan
  '#A3E635', // lime
];
// Flat light-theme palette of 16 component colors, listed in the same hue
// order as the dark palette so a component keeps its hue across themes.
const componentPaletteLight = [
  '#B45309', // amber
  '#0369A1', // sky
  '#047857', // emerald
  '#6D28D9', // purple
  '#B91C1C', // red
  '#0E7490', // cyan
  '#C2410C', // orange
  '#4338CA', // indigo
  '#BE185D', // pink
  '#15803D', // green
  '#A16207', // yellow
  '#0D9488', // teal
  '#7C3AED', // violet
  '#E11D48', // rose
  '#0891B2', // light-cyan
  '#65A30D', // lime
];
/**
 * Dark-theme component colors, one palette per pipeline category.
 *
 * Each category has its own list so that components compared against each
 * other (e.g. two STT models) draw from colors that are distinct within that
 * list. Colors may repeat ACROSS categories (amber and red appear in both
 * `stt` and `llm`), and `getComponentColorMap` wraps around with modulo, so a
 * category with more components than palette entries will reuse colors.
 *
 * Keys are restricted to the three known categories so that a mistyped key is
 * a compile error instead of a silent `undefined`.
 */
const categoryPalettesDark: Record<'stt' | 'llm' | 'tts', string[]> = {
  stt: [
    '#F59E0B', // amber
    '#38BDF8', // sky blue
    '#34D399', // emerald
    '#F87171', // red
    '#A78BFA', // purple
    '#FACC15', // yellow
  ],
  llm: [
    '#22D3EE', // cyan
    '#FB923C', // orange
    '#818CF8', // indigo
    '#4ADE80', // green
    '#F59E0B', // amber
    '#F472B6', // pink
    '#94A3B8', // slate
    '#A3E635', // lime
    '#E879F9', // fuchsia
    '#F87171', // red
  ],
  tts: [
    '#A3E635', // lime
    '#FB7185', // rose
    '#67E8F9', // light cyan
    '#C084FC', // violet
    '#FDBA74', // peach
    '#2DD4BF', // teal
  ],
};

/**
 * Light-theme component colors, one palette per pipeline category.
 *
 * Entries follow the same hue order as the dark-theme palettes, using darker
 * shades. Colors are distinct within a category but may repeat across
 * categories (amber and red appear in both `stt` and `llm`).
 *
 * Keys are restricted to the three known categories so that a mistyped key is
 * a compile error instead of a silent `undefined`.
 */
const categoryPalettesLight: Record<'stt' | 'llm' | 'tts', string[]> = {
  stt: [
    '#B45309', // amber
    '#0369A1', // sky blue
    '#047857', // emerald
    '#B91C1C', // red
    '#6D28D9', // purple
    '#A16207', // yellow
  ],
  llm: [
    '#0E7490', // cyan
    '#C2410C', // orange
    '#4338CA', // indigo
    '#15803D', // green
    '#B45309', // amber
    '#BE185D', // pink
    '#475569', // slate
    '#65A30D', // lime
    '#A21CAF', // fuchsia
    '#B91C1C', // red
  ],
  tts: [
    '#65A30D', // lime
    '#E11D48', // rose
    '#0891B2', // light cyan
    '#7C3AED', // violet
    '#EA580C', // peach
    '#0D9488', // teal
  ],
};

const distributionColors = {
dark: { onTime: '#34D399', late: '#F68EC4', early: '#F59E0B', indeterminate: '#64748B' },
Expand All @@ -32,19 +83,26 @@ const breakdownColors = {
light: { withToolCalls: '#0891B2', withoutToolCalls: '#7C3AED' },
};

/**
 * Builds a lookup from component name to display color for every STT, LLM and
 * TTS component referenced by `systems`.
 *
 * Each category draws from its own palette, in first-seen order, wrapping
 * around when a category has more distinct names than palette entries.
 * The returned map is keyed by name only: if the same name occurs in more
 * than one category, the later category (stt, then llm, then tts) wins.
 *
 * @param systems - Systems whose `stt`, `llm` and `tts` names need a color.
 * @param isDark - `true` selects `categoryPalettesDark`, `false` selects
 *   `categoryPalettesLight`.
 * @returns Map from component name to hex color string.
 */
function getComponentColorMap(systems: SystemScore[], isDark: boolean): Map<string, string> {
  const palettes = isDark ? categoryPalettesDark : categoryPalettesLight;

  // A Set iterates in insertion order, so it both de-duplicates names and
  // preserves the first-seen order that determines which color each one gets.
  const sttNames = new Set<string>();
  const llmNames = new Set<string>();
  const ttsNames = new Set<string>();
  for (const s of systems) {
    // '-' is excluded for stt/tts (presumably a "no such component"
    // placeholder — confirm against the leaderboard data). llm is always kept.
    if (s.stt !== '-') sttNames.add(s.stt);
    llmNames.add(s.llm);
    if (s.tts !== '-') ttsNames.add(s.tts);
  }

  const map = new Map<string, string>();
  const assign = (names: Iterable<string>, pal: readonly string[] | undefined): void => {
    // An empty or missing palette would yield `undefined` colors; skip instead.
    if (!pal || pal.length === 0) return;
    let i = 0;
    for (const name of names) {
      const color = pal[i % pal.length];
      if (color !== undefined) map.set(name, color);
      i++;
    }
  };
  assign(sttNames, palettes.stt);
  assign(llmNames, palettes.llm);
  assign(ttsNames, palettes.tts);
  return map;
}
}

Expand Down Expand Up @@ -790,8 +848,8 @@ interface TurnTakingAnalysisProps {

export function TurnTakingAnalysis({ systems }: TurnTakingAnalysisProps) {
const themeMode = useThemeMode();
const palette = themeMode === 'light' ? componentPaletteLight : componentPaletteDark;
const componentColors = useMemo(() => getComponentColorMap(systems, palette), [systems, palette]);
const isDark = themeMode !== 'light';
const componentColors = useMemo(() => getComponentColorMap(systems, isDark), [systems, isDark]);
const [isOpen, setIsOpen] = useState(false);
const [layout, setLayout] = useState<LayoutMode>('sideBySide');

Expand Down
30 changes: 30 additions & 0 deletions website/src/data/leaderboardData.ts
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,21 @@ export const ossSystems: SystemScore[] = [
experience: { pass_threshold: 0.4400, mean: 0.6024, pass_at_k: 0.7600, pass_k: 0.2356 },
},
},
{
id: 'cohere-transcribe-gpt-5-4-mini-kokoro',
name: 'cohere-transcribe + gpt-5.4-mini + kokoro',
shortName: 'gpt-5.4-mini (cohere-transcribe)',
stt: 'cohere-transcribe', llm: 'gpt-5.4-mini', tts: 'kokoro',
type: 'cascade',
evaA: 0.2600, evaX: 0.3733,
accuracyMetrics: { task_completion: 0.3933, agent_tts_fidelity: 0.9846, faithfulness: 0.3300 },
experienceMetrics: { turn_taking: 0.5097, conciseness: 0.8345, conversation_progression: 0.3433 },
diagnosticMetrics: { key_entity_transcription: 0.6150, response_speed: 4.9451 },
successRates: {
accuracy: { pass_threshold: 0.2600, mean: 0.5693, pass_at_k: 0.5200, pass_k: 0.1133 },
experience: { pass_threshold: 0.3733, mean: 0.5625, pass_at_k: 0.8000, pass_k: 0.1304 },
},
},
{
id: 'gpt-4o-mini-transcribe-gpt-5-mini-gpt-4o-mini-tts',
name: 'gpt-4o-mini-transcribe + gpt-5-mini + gpt-4o-mini-tts',
Expand Down Expand Up @@ -324,6 +339,21 @@ export const ossSystems: SystemScore[] = [
experience: { pass_threshold: 0.2467, mean: 0.5337, pass_at_k: 0.5000, pass_k: 0.0985 },
},
},
{
id: 'voxtral-mini-3b-gpt-5-4-mini-voxtral-4b-tts',
name: 'voxtral-mini-3b + gpt-5.4-mini + voxtral-4b-tts',
shortName: 'gpt-5.4-mini (voxtral-mini-3b)',
stt: 'voxtral-mini-3b', llm: 'gpt-5.4-mini', tts: 'voxtral-4b-tts',
type: 'cascade',
evaA: 0.2733, evaX: 0.0067,
accuracyMetrics: { task_completion: 0.4200, agent_tts_fidelity: 0.9831, faithfulness: 0.3733 },
experienceMetrics: { turn_taking: 0.0629, conciseness: 0.8469, conversation_progression: 0.3800 },
diagnosticMetrics: { key_entity_transcription: 0.6682, response_speed: 8.5170 },
successRates: {
accuracy: { pass_threshold: 0.2733, mean: 0.5921, pass_at_k: 0.4600, pass_k: 0.1548 },
experience: { pass_threshold: 0.0067, mean: 0.4299, pass_at_k: 0.0200, pass_k: 0.0007 },
},
},
{
id: 'whisper-large-v3-gpt-oss-20b-chatterbox-turbo',
name: 'whisper-large-v3 + gpt-oss-20b + chatterbox-turbo',
Expand Down
Loading