/**
 * Unit tests for the session quality scoring helpers exported by
 * ../lib/qualityScore.js: computeQualityScore, scoreToGrade, gradeColor
 * and formatScoreTooltip.
 */

// Asserts that a numeric value lies in the closed interval [0, 1].
const expectWithinUnitInterval = (value) => {
  expect(value).toBeGreaterThanOrEqual(0);
  expect(value).toBeLessThanOrEqual(1);
};

describe("qualityScore", () => {
  describe("computeQualityScore", () => {
    it("returns a score between 0 and 1", () => {
      const stats = {
        totalEvents: 100,
        totalTurns: 10,
        totalToolCalls: 50,
        errorCount: 2,
        uniqueToolCount: 8,
      };
      const { score } = computeQualityScore(stats, { efficiency: 0.7 });
      expectWithinUnitInterval(score);
    });

    it("gives A grade for a perfect session", () => {
      const stats = {
        totalEvents: 200,
        totalTurns: 10,
        totalToolCalls: 80,
        errorCount: 0,
        uniqueToolCount: 12,
      };
      const { grade, score } = computeQualityScore(stats, { efficiency: 0.95 });
      expect(grade).toBe("A");
      expect(score).toBeGreaterThanOrEqual(0.9);
    });

    it("gives D or F for high error rate", () => {
      const stats = {
        totalEvents: 10,
        totalTurns: 2,
        totalToolCalls: 5,
        errorCount: 5,
        uniqueToolCount: 2,
      };
      const { grade } = computeQualityScore(stats, { efficiency: 0.3 });
      expect(["D", "F"]).toContain(grade);
    });

    it("handles missing stats gracefully", () => {
      const outcome = computeQualityScore(null, null);
      expect(outcome).toBeDefined();
      expectWithinUnitInterval(outcome.score);
      expect(outcome.grade).toBeDefined();
      expect(outcome.components).toBeDefined();
    });

    it("handles undefined inputs without crashing", () => {
      // Every combination of absent / empty stats and autonomy metrics.
      const argumentPairs = [
        [undefined, undefined],
        [{}, {}],
        [{}, undefined],
        [undefined, {}],
      ];
      for (const [stats, metrics] of argumentPairs) {
        expect(() => {
          computeQualityScore(stats, metrics);
        }).not.toThrow();
      }
    });

    it("handles empty stats object", () => {
      const { score } = computeQualityScore({}, {});
      expectWithinUnitInterval(score);
    });

    it("returns all five component scores", () => {
      const stats = {
        totalEvents: 50,
        totalTurns: 5,
        totalToolCalls: 20,
        errorCount: 1,
        uniqueToolCount: 5,
      };
      const { components } = computeQualityScore(stats, { efficiency: 0.6 });
      const expectedKeys = ["errorRate", "autonomy", "toolDiversity", "completion", "efficiency"];
      for (const key of expectedKeys) {
        expect(components[key]).toBeDefined();
      }
    });
  });

  describe("scoreToGrade", () => {
    // Checks that every listed score maps to the given letter grade.
    const expectGrade = (scores, letter) => {
      for (const value of scores) {
        expect(scoreToGrade(value)).toBe(letter);
      }
    };

    it("returns A for score >= 0.9", () => {
      expectGrade([0.9, 1.0, 0.95], "A");
    });

    it("returns B for score >= 0.8", () => {
      expectGrade([0.8, 0.89], "B");
    });

    it("returns C for score >= 0.65", () => {
      expectGrade([0.65, 0.79], "C");
    });

    it("returns D for score >= 0.5", () => {
      expectGrade([0.5, 0.64], "D");
    });

    it("returns F for score < 0.5", () => {
      expectGrade([0.49, 0.0], "F");
    });
  });

  describe("gradeColor", () => {
    it("returns success color for A and B", () => {
      for (const letter of ["A", "B"]) {
        expect(gradeColor(letter)).toBe(theme.semantic.success);
      }
    });

    it("returns warning color for C", () => {
      expect(gradeColor("C")).toBe(theme.semantic.warning);
    });

    it("returns error color for D and F", () => {
      for (const letter of ["D", "F"]) {
        expect(gradeColor(letter)).toBe(theme.semantic.error);
      }
    });
  });

  describe("formatScoreTooltip", () => {
    it("formats tooltip with grade and percentages", () => {
      const stats = {
        totalEvents: 100,
        totalTurns: 5,
        totalToolCalls: 30,
        errorCount: 0,
        uniqueToolCount: 8,
      };
      const outcome = computeQualityScore(stats, { efficiency: 0.8 });
      const tooltip = formatScoreTooltip(outcome);
      const expectedFragments = [
        "Quality:",
        outcome.grade,
        "Errors:",
        "Autonomy:",
        "Tool diversity:",
        "Completion:",
        "Efficiency:",
      ];
      for (const fragment of expectedFragments) {
        expect(tooltip).toContain(fragment);
      }
    });

    it("returns empty string for null input", () => {
      for (const missing of [null, undefined]) {
        expect(formatScoreTooltip(missing)).toBe("");
      }
    });
  });

  describe("tool diversity scoring edge cases", () => {
    // Runs the scorer with a fixed autonomy efficiency and returns only
    // the tool-diversity component.
    const toolDiversityFor = (stats) =>
      computeQualityScore(stats, { efficiency: 0.8 }).components.toolDiversity;

    it("returns 0.5 for zero tool calls", () => {
      const diversity = toolDiversityFor({
        totalEvents: 10,
        totalTurns: 2,
        totalToolCalls: 0,
        errorCount: 0,
      });
      expect(diversity).toBe(0.5);
    });

    it("handles sessions with only 1 unique tool", () => {
      const diversity = toolDiversityFor({
        totalEvents: 100,
        totalTurns: 5,
        totalToolCalls: 100,
        errorCount: 0,
        uniqueToolCount: 1,
      });
      // ratio = 1/50 = 0.02 which is below 0.05, so score is 0.3
      expect(diversity).toBe(0.3);
    });

    it("handles sessions with many unique tools", () => {
      const diversity = toolDiversityFor({
        totalEvents: 10,
        totalTurns: 2,
        totalToolCalls: 5,
        errorCount: 0,
        uniqueToolCount: 5,
      });
      // ratio = 5/5 = 1.0 which is > 0.5, so score is 0.7
      expect(diversity).toBe(0.7);
    });
  });

  describe("score clamping", () => {
    it("never produces component scores outside 0-1", () => {
      const extremes = [
        { totalEvents: 1, totalTurns: 0, totalToolCalls: 0, errorCount: 100 },
        {
          totalEvents: 1000,
          totalTurns: 100,
          totalToolCalls: 5000,
          errorCount: 0,
          uniqueToolCount: 50,
        },
      ];
      for (const stats of extremes) {
        // An efficiency above 1 must be clamped as well.
        const { components } = computeQualityScore(stats, { efficiency: 1.5 });
        for (const key of Object.keys(components)) {
          expectWithinUnitInterval(components[key]);
        }
      }
    });
  });
});
{ label: "Autonomy", value: formatAutonomyEfficiency(autonomy.autonomyEfficiency) } : null, @@ -164,8 +167,12 @@ function SessionCard({ entry, onClick }) { overflow: "hidden", textOverflow: "ellipsis", whiteSpace: "nowrap", + display: "flex", + alignItems: "center", + gap: 6, }}> {title} + {quality && } {updatedLabel} diff --git a/src/components/InboxView.jsx b/src/components/InboxView.jsx index 1bbb236..9792580 100644 --- a/src/components/InboxView.jsx +++ b/src/components/InboxView.jsx @@ -19,6 +19,8 @@ import { import Icon from "./Icon.jsx"; import ToolbarButton from "./ui/ToolbarButton.jsx"; import ToolbarSelect from "./ui/ToolbarSelect.jsx"; +import { computeQualityScore, formatScoreTooltip } from "../lib/qualityScore.js"; +import QualityBadge from "./QualityBadge.jsx"; import usePersistentState from "../hooks/usePersistentState.js"; var SORT_OPTIONS = [ @@ -27,6 +29,7 @@ var SORT_OPTIONS = [ { id: "most-expensive", label: LANDING_SORT_LABELS["most-expensive"] }, { id: "highest-babysitting", label: "Most human response time" }, { id: "highest-idle", label: "Highest idle" }, + { id: "highest-quality", label: "Highest quality" }, { id: "most-recent", label: LANDING_SORT_LABELS["most-recent"] }, ]; @@ -43,6 +46,14 @@ function sortEntries(entries, sortMode) { }); } + if (sortMode === "highest-quality") { + return (entries || []).slice().sort(function (left, right) { + var leftScore = computeQualityScore(left.stats, left.autonomyMetrics).score; + var rightScore = computeQualityScore(right.stats, right.autonomyMetrics).score; + return rightScore - leftScore; + }); + } + return sortLandingEntries(entries, sortMode); } @@ -474,6 +485,7 @@ export default function InboxView({ entries, onOpenSession, onImport, onLoadSamp var canOpen = Boolean(entry.hasContent || entry.discoveredPath); var title = getLandingEntryDisplayTitle(entry); var secondaryText = getLandingEntrySecondaryText(entry, title); + var quality = computeQualityScore(entry.stats, entry.autonomyMetrics); 
return (
{title} +
/**
 * Session quality scoring for AGENTVIZ landing page.
 *
 * Computes a composite quality score from session stats
 * and autonomy metrics, inspired by waza readiness checks.
 *
 * Every component score is a number in [0, 1]; the composite score is their
 * weighted sum (see WEIGHTS), rounded to two decimals.
 *
 * External dependency: `theme` (read by gradeColor) is the module-level
 * binding imported from "./theme.js".
 */

// Relative contribution of each component to the composite score.
// The weights sum to 1, so the composite stays within [0, 1].
var WEIGHTS = {
  errorRate: 0.30,
  autonomy: 0.25,
  toolDiversity: 0.20,
  completion: 0.15,
  efficiency: 0.10,
};

// Tooltip rows as [label, component key], in display order.
var TOOLTIP_ROWS = [
  ["Errors", "errorRate"],
  ["Autonomy", "autonomy"],
  ["Tool diversity", "toolDiversity"],
  ["Completion", "completion"],
  ["Efficiency", "efficiency"],
];

/**
 * Coerces a raw stat to a finite number.
 *
 * @param {*} value - Raw value read from a stats/metrics object.
 * @param {*} fallback - Returned when value is null, undefined, NaN,
 *   +/-Infinity or otherwise not convertible to a finite number.
 * @returns {*} The finite number, or fallback.
 */
function toFiniteNumber(value, fallback) {
  // Checked separately because Number(null) is 0, which would mask "missing".
  if (value == null) return fallback;
  var parsed = Number(value);
  return Number.isFinite(parsed) ? parsed : fallback;
}

/**
 * Restricts value to the closed interval [min, max].
 * Callers pass finite numbers only (NaN would propagate through Math.min/max).
 */
function clamp(value, min, max) {
  return Math.max(min, Math.min(max, value));
}

// Formats a 0-1 fraction as a whole-number percentage string, e.g. "87%".
function toPercent(fraction) {
  return Math.round(fraction * 100) + "%";
}

/**
 * Error rate component: 0 errors = 1.0, losing a tenth of a point for every
 * 1% of events that are errors (so 10% errors or more scores 0).
 * Sessions without a usable event count score 1.0.
 */
function scoreErrorRate(stats) {
  var totalEvents = toFiniteNumber(stats && stats.totalEvents, 0);
  if (totalEvents <= 0) return 1.0;
  var errorCount = Math.max(0, toFiniteNumber(stats.errorCount, 0));
  return clamp(1.0 - (errorCount / totalEvents) * 10, 0, 1);
}

/**
 * Autonomy component: the `efficiency` metric clamped to [0, 1] (higher is
 * better). Missing or non-finite values score a neutral 0.5.
 *
 * NOTE(review): the dashboard card reads `autonomyEfficiency` from its
 * autonomy object; confirm that `autonomyMetrics` really exposes
 * `efficiency`, otherwise this component is always the neutral 0.5.
 */
function scoreAutonomy(autonomyMetrics) {
  var efficiency = toFiniteNumber(autonomyMetrics && autonomyMetrics.efficiency, null);
  if (efficiency === null) return 0.5;
  return clamp(efficiency, 0, 1);
}

/**
 * Tool diversity component: unique tools relative to tool calls, where a
 * moderate ratio is considered best. Sessions without tool calls score a
 * neutral 0.5.
 *
 * NOTE(review): a missing `uniqueToolCount` is treated as a single tool, so
 * sessions lacking that stat can land in the 0.3 bucket; confirm the stat is
 * always populated.
 */
function scoreToolDiversity(stats) {
  var totalToolCalls = toFiniteNumber(stats && stats.totalToolCalls, 0);
  if (totalToolCalls <= 0) return 0.5;
  var uniqueTools = toFiniteNumber(stats.uniqueToolCount, 0) || 1;
  // The denominator is capped at 50 calls, presumably so long sessions are
  // not penalised for reusing a bounded toolset.
  var ratio = uniqueTools / Math.min(totalToolCalls, 50);
  if (ratio > 0.5) return 0.7; // Too many unique tools relative to calls
  if (ratio >= 0.05) return 1.0; // Sweet spot is 0.05-0.5 (some variety but not random)
  return 0.3; // Very few tools used repeatedly
}

/**
 * Completion component (heuristic): sessions with at least 2 turns and no
 * recorded errors count as complete (1.0); at least 2 turns with errors
 * scores 0.7; shorter sessions score 0.3. Missing stats score a neutral 0.5.
 */
function scoreCompletion(stats) {
  if (!stats) return 0.5;
  if (toFiniteNumber(stats.totalTurns, 0) < 2) return 0.3;
  return toFiniteNumber(stats.errorCount, 0) === 0 ? 1.0 : 0.7;
}

/**
 * Efficiency component: average tool calls per turn. 2-15 is treated as
 * healthy (1.0); above that the score drops in steps, and fewer than 2 is a
 * low-tool session (0.6). Sessions without a usable turn count score 0.5.
 */
function scoreEfficiency(stats) {
  var totalTurns = toFiniteNumber(stats && stats.totalTurns, 0);
  if (totalTurns <= 0) return 0.5;
  var toolsPerTurn = toFiniteNumber(stats.totalToolCalls, 0) / totalTurns;
  if (toolsPerTurn > 30) return 0.4;
  if (toolsPerTurn > 15) return 0.7;
  if (toolsPerTurn >= 2) return 1.0;
  return 0.6; // Less than 2 is low-tool session
}

/**
 * Computes the composite quality score for a session.
 *
 * @param {object|null|undefined} stats - Session stats; reads totalEvents,
 *   totalTurns, totalToolCalls, errorCount and uniqueToolCount. Missing or
 *   non-numeric fields fall back to each component's default.
 * @param {object|null|undefined} autonomyMetrics - Reads `efficiency`.
 * @returns {{score: number, grade: string, components: object}} `score` is
 *   the weighted sum rounded to two decimals, `grade` is the letter for that
 *   same rounded score, `components` holds the five 0-1 component scores.
 */
export function computeQualityScore(stats, autonomyMetrics) {
  var components = {
    errorRate: scoreErrorRate(stats),
    autonomy: scoreAutonomy(autonomyMetrics),
    toolDiversity: scoreToolDiversity(stats),
    completion: scoreCompletion(stats),
    efficiency: scoreEfficiency(stats),
  };

  var weighted = 0;
  var keys = Object.keys(WEIGHTS);
  for (var i = 0; i < keys.length; i++) {
    weighted += components[keys[i]] * WEIGHTS[keys[i]];
  }

  // Grade the rounded value, not the raw sum: otherwise a raw 0.8975 is
  // reported as score 0.9 with grade "B", and floating-point drift just below
  // a threshold can demote a score that displays exactly on it.
  var score = Math.round(weighted * 100) / 100;

  return {
    score: score,
    grade: scoreToGrade(score),
    components: components,
  };
}

/**
 * Maps a 0-1 score to a letter grade.
 *
 * @param {number} score
 * @returns {string} "A" (>= 0.9), "B" (>= 0.8), "C" (>= 0.65), "D" (>= 0.5),
 *   otherwise "F" (which also covers NaN, since every comparison fails).
 */
export function scoreToGrade(score) {
  if (score >= 0.9) return "A";
  if (score >= 0.8) return "B";
  if (score >= 0.65) return "C";
  if (score >= 0.5) return "D";
  return "F";
}

/**
 * Picks the semantic theme color for a letter grade: success for A/B,
 * warning for C, error for anything else.
 */
export function gradeColor(grade) {
  if (grade === "A" || grade === "B") return theme.semantic.success;
  if (grade === "C") return theme.semantic.warning;
  return theme.semantic.error;
}

/**
 * Builds the multi-line tooltip text for a computeQualityScore() result.
 *
 * @param {object|null|undefined} result
 * @returns {string} "" when result is missing; otherwise a "Quality" line
 *   followed by one percentage line per component, joined with "\n". When
 *   `result.components` is absent only the "Quality" line is returned.
 */
export function formatScoreTooltip(result) {
  if (!result) return "";
  var lines = ["Quality: " + result.grade + " (" + toPercent(result.score) + ")"];
  var components = result.components;
  if (components) {
    TOOLTIP_ROWS.forEach(function (row) {
      lines.push(row[0] + ": " + toPercent(components[row[1]]));
    });
  }
  return lines.join("\n");
}