From 787c50fd7c3f2628ab072e61e94646d01e516212 Mon Sep 17 00:00:00 2001 From: matthiasschalk Date: Sun, 17 May 2026 06:48:40 +0200 Subject: [PATCH 1/6] fix(types,dashboard): exclude reasoning from token totals (ccusage parity) ccusage never adds reasoning into its total: for Codex the reasoning tokens are already counted inside output_tokens (OpenAI semantics), and ccusage's Codex loader uses the reported total_tokens (=== input+output). totalTokenCount() summed reasoning on top, double-counting every Codex event in the dashboard total, heatmap, topModel and every chart. Drop reasoning from totalTokenCount and every client-side token sum in dashboard.ts. Claude is unaffected (reasoning is always 0 there). The reasoning field stays on TokenCounts and is shown as a non-additive informational row. Adds types.test.ts and a Codex parity invariant in codex.test.ts (totalTokenCount == reported total_tokens for well-formed logs). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/dashboard.ts | 19 +++++++++++------ src/loaders/codex.test.ts | 12 +++++++++++ src/types.test.ts | 45 +++++++++++++++++++++++++++++++++++++++ src/types.ts | 9 +++++++- 4 files changed, 78 insertions(+), 7 deletions(-) create mode 100644 src/types.test.ts diff --git a/src/dashboard.ts b/src/dashboard.ts index d2c83af..1b8be10 100644 --- a/src/dashboard.ts +++ b/src/dashboard.ts @@ -410,7 +410,9 @@ function addTokens(a, b) { }; } function sumTokens(tokens) { - return tokens.input + tokens.output + tokens.cacheCreation + tokens.cacheRead + tokens.reasoning; + // reasoning is informational only — already inside output for Codex; never + // added to the total (mirrors server-side totalTokenCount / ccusage). 
+ return tokens.input + tokens.output + tokens.cacheCreation + tokens.cacheRead; } function unique(values) { return [...new Set(values)]; @@ -1042,7 +1044,7 @@ function escapeHtml(s) { function totalProjectTokens(p) { const t = p.tokens || {}; - return (t.input || 0) + (t.output || 0) + (t.cacheCreation || 0) + (t.cacheRead || 0) + (t.reasoning || 0); + return (t.input || 0) + (t.output || 0) + (t.cacheCreation || 0) + (t.cacheRead || 0); } function renderProjects(data) { @@ -1120,7 +1122,7 @@ function renderProjects(data) { tbody.appendChild(emptyTr); } else { for (const s of perSource) { - const tot = (s.tokens.input||0) + (s.tokens.output||0) + (s.tokens.cacheCreation||0) + (s.tokens.cacheRead||0) + (s.tokens.reasoning||0); + const tot = (s.tokens.input||0) + (s.tokens.output||0) + (s.tokens.cacheCreation||0) + (s.tokens.cacheRead||0); const chip = '' + @@ -1562,7 +1564,8 @@ function buildTokensPopup(data) { { label: 'Cache Read', val: tok.cacheRead, color: '#34d399' }, { label: 'Cache Write', val: tok.cacheCreation, color: '#a78bfa' } ]; - if (tok.reasoning > 0) segs.push({ label: 'Reasoning', val: tok.reasoning, color: '#fbbf24' }); + // reasoning is NOT a segment: for Codex it is already inside output, so the + // donut/total stay equal to ccusage. It is surfaced as an info row below. const dSegs = segs.map(s => ({ ...s, pct: s.val / total })); let html = ''; @@ -1583,6 +1586,10 @@ function buildTokensPopup(data) { + '' + ''; + if (tok.reasoning > 0) { + html += pRow('Reasoning', fmt(tok.reasoning), 'already counted inside Output'); + } + const peakDay = data.daily.reduce((b, d) => sumTokens(d.tokens) > sumTokens(b.tokens) ? 
d : b, data.daily[0]); if (peakDay) { html += pSection('Peak Usage'); @@ -1686,7 +1693,7 @@ function buildCostPerDayPopup(data) { function buildTopModelPopup(data) { const modelsWithTok = data.byModel.map(m => ({ m, - tok: m.tokens.input + m.tokens.output + m.tokens.cacheRead + m.tokens.cacheCreation + (m.tokens.reasoning || 0) + tok: m.tokens.input + m.tokens.output + m.tokens.cacheRead + m.tokens.cacheCreation })); modelsWithTok.sort((a, b) => b.tok - a.tok); const totalTok = Math.max(modelsWithTok.reduce((s, r) => s + r.tok, 0), 1); @@ -1768,7 +1775,7 @@ function buildSourceChartPopup(data) { const sourceTok = data.bySource.map(s => ({ source: s.source, tokens: s.tokens, - total: s.tokens.input + s.tokens.output + s.tokens.cacheRead + s.tokens.cacheCreation + (s.tokens.reasoning || 0) + total: s.tokens.input + s.tokens.output + s.tokens.cacheRead + s.tokens.cacheCreation })); const total = Math.max(sourceTok.reduce((a, b) => a + b.total, 0), 1); const dSegs = sourceTok.map(s => ({ diff --git a/src/loaders/codex.test.ts b/src/loaders/codex.test.ts index 6f5f4d9..1b94235 100644 --- a/src/loaders/codex.test.ts +++ b/src/loaders/codex.test.ts @@ -4,6 +4,7 @@ import { mkdtempSync, rmSync, mkdirSync, writeFileSync, utimesSync } from 'node: import { tmpdir } from 'node:os'; import path from 'node:path'; import { loadCodexEvents, loadCodexRateLimits, normalizeUsage, subtractUsage, type RawUsage } from './codex.js'; +import { totalTokenCount } from '../types.js'; function makeSession(dir: string, name: string, lines: string[], mtimeSec?: number): string { const file = path.join(dir, name); @@ -321,5 +322,16 @@ describe('loadCodexEvents', () => { cacheRead: 100, reasoning: 5, }); + + // ccusage parity invariant (the documented Fix #1 residual): ccusage's + // Codex total is the reported `total_tokens` (data-loader.ts:91, + // `total_tokens > 0 ? total_tokens : input + output`). We reconstruct it + // from the 5-field breakdown instead. 
Because OpenAI defines + // total_tokens === input_tokens + output_tokens and cached ⊆ input, our + // totalTokenCount (input+output+cacheCreation+cacheRead, reasoning + // excluded) must equal the reported total_tokens for well-formed logs. + // Turn 1 reported total = 1100; turn 2 reported delta total = 1310-1100. + assert.equal(totalTokenCount(events[0].tokens), 1100); + assert.equal(totalTokenCount(events[1].tokens), 1310 - 1100); }); }); diff --git a/src/types.test.ts b/src/types.test.ts new file mode 100644 index 0000000..5f61e3e --- /dev/null +++ b/src/types.test.ts @@ -0,0 +1,45 @@ +import { test, describe } from 'node:test'; +import assert from 'node:assert/strict'; +import { totalTokenCount, addTokens, emptyTokens, type TokenCounts } from './types.js'; + +describe('totalTokenCount — ccusage parity', () => { + test('sums input + output + cacheCreation + cacheRead', () => { + const t: TokenCounts = { + input: 1000, output: 500, cacheCreation: 2000, cacheRead: 300, reasoning: 0, + }; + // Matches ccusage getTotalTokens (_token-utils.ts): 1000+500+2000+300. + assert.equal(totalTokenCount(t), 3800); + }); + + test('EXCLUDES reasoning — it is informational only (already inside output for Codex)', () => { + const withReasoning: TokenCounts = { + input: 100, output: 50, cacheCreation: 0, cacheRead: 0, reasoning: 9999, + }; + // ccusage never adds reasoning into its total; for Codex it lives inside + // `output`. Adding it here would double-count. Total must ignore it. 
+ assert.equal(totalTokenCount(withReasoning), 150); + }); + + test('Claude-shaped tokens (reasoning always 0) are unaffected by the change', () => { + const claude: TokenCounts = { + input: 1234, output: 567, cacheCreation: 89, cacheRead: 4321, reasoning: 0, + }; + assert.equal(totalTokenCount(claude), 1234 + 567 + 89 + 4321); + }); + + test('addTokens still tracks reasoning so it stays available for display', () => { + const sum = addTokens( + { input: 1, output: 2, cacheCreation: 3, cacheRead: 4, reasoning: 5 }, + { input: 1, output: 2, cacheCreation: 3, cacheRead: 4, reasoning: 5 }, + ); + assert.equal(sum.reasoning, 10); + // ...but the total of the aggregate still excludes it. + assert.equal(totalTokenCount(sum), 2 + 4 + 6 + 8); + }); + + test('emptyTokens carries the reasoning field', () => { + assert.deepEqual(emptyTokens(), { + input: 0, output: 0, cacheCreation: 0, cacheRead: 0, reasoning: 0, + }); + }); +}); diff --git a/src/types.ts b/src/types.ts index 85b343f..d1aa23b 100644 --- a/src/types.ts +++ b/src/types.ts @@ -161,8 +161,15 @@ export function addTokens(a: TokenCounts, b: TokenCounts): TokenCounts { }; } +// `reasoning` is informational only and is deliberately NOT part of the total. +// For Codex/OpenAI the reasoning tokens are already counted inside `output` +// (OpenAI semantics), so adding them would double-count. ccusage never adds +// reasoning into its total either — apps/codex/src/data-loader.ts uses the +// reported `total_tokens` (=== input + output). Excluding it here leaves Claude +// totals unchanged (reasoning is always 0 there) and makes the Codex total +// match ccusage. The `reasoning` field stays on TokenCounts for display. export function totalTokenCount(t: TokenCounts): number { - return t.input + t.output + t.cacheCreation + t.cacheRead + t.reasoning; + return t.input + t.output + t.cacheCreation + t.cacheRead; } // Returns true iff `s` parses to a finite Date. 
Loaders use this to drop From ec8618378091b253d5fd38e61f35a55ed185a62d Mon Sep 17 00:00:00 2001 From: matthiasschalk Date: Sun, 17 May 2026 06:48:53 +0200 Subject: [PATCH 2/6] fix(pricing): apply LiteLLM >200k tiered pricing like ccusage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit calculateCost used flat per-token rates only. ccusage applies LiteLLM's *_above_200k_tokens long-context rates per token type, per event, at the 200k threshold (packages/internal/src/pricing.ts calculateTieredCost). Claude Code turns routinely carry cache_read > 200k, so we materially undercharged long sessions versus ccusage. Port calculateTieredCost faithfully and route all four token types through it. Models without the above_200k fields stay flat — identical to ccusage (it does not implement Gemini's 128k tier either). Extend ModelPricing + the Sonnet 4 FALLBACK entry. Adds boundary tests. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/pricing.test.ts | 47 ++++++++++++++++++++++++++++ src/pricing.ts | 76 +++++++++++++++++++++++++++++++++++++++------ 2 files changed, 114 insertions(+), 9 deletions(-) diff --git a/src/pricing.test.ts b/src/pricing.test.ts index d936886..56f518e 100644 --- a/src/pricing.test.ts +++ b/src/pricing.test.ts @@ -34,6 +34,19 @@ mock.method(globalThis, 'fetch', async () => input_cost_per_token: 10e-6, output_cost_per_token: 30e-6, }, + // Claude-shaped tiered model: tokens above 200k (per type, per event) + // are billed at the *_above_200k_tokens rate. Mirrors ccusage's + // calculateTieredCost (packages/internal/src/pricing.ts:284). 
+ 'tiered-model': { + input_cost_per_token: 3e-6, + output_cost_per_token: 15e-6, + cache_creation_input_token_cost: 3.75e-6, + cache_read_input_token_cost: 0.3e-6, + input_cost_per_token_above_200k_tokens: 6e-6, + output_cost_per_token_above_200k_tokens: 22.5e-6, + cache_creation_input_token_cost_above_200k_tokens: 7.5e-6, + cache_read_input_token_cost_above_200k_tokens: 0.6e-6, + }, }), { status: 200, headers: { 'Content-Type': 'application/json' } }, ), @@ -72,6 +85,40 @@ describe('calculateCost — cache pricing fallback (C11)', () => { }); }); +describe('calculateCost — tiered >200k pricing (ccusage parity)', () => { + test('flat pricing below the 200k threshold', async () => { + const cost = await calculateCost('tiered-model', counts({ + input: 100_000, output: 0, cacheCreation: 0, cacheRead: 0, + })); + // 100k * 3e-6 = 0.30 — no tier applied below threshold. + assert.equal(cost.toFixed(6), (0.3).toFixed(6)); + }); + + test('input above 200k splits at the threshold (ccusage example)', async () => { + const cost = await calculateCost('tiered-model', counts({ + input: 300_000, output: 0, cacheCreation: 0, cacheRead: 0, + })); + // ccusage docs example: 200k*3e-6 + 100k*6e-6 = 0.6 + 0.6 = 1.2 + assert.equal(cost.toFixed(6), (1.2).toFixed(6)); + }); + + test('each token type is tiered independently, per event', async () => { + const cost = await calculateCost('tiered-model', counts({ + input: 0, output: 0, cacheCreation: 0, cacheRead: 250_000, + })); + // cacheRead: 200k*0.3e-6 + 50k*0.6e-6 = 0.06 + 0.03 = 0.09 + assert.equal(cost.toFixed(6), (0.09).toFixed(6)); + }); + + test('models WITHOUT above_200k fields stay flat even past 200k (== ccusage)', async () => { + const cost = await calculateCost('gpt-4o', counts({ + input: 500_000, output: 0, cacheCreation: 0, cacheRead: 0, + })); + // No tiered field → flat: 500k * 5e-6 = 2.5 (ccusage falls back to base too). 
+ assert.equal(cost.toFixed(6), (2.5).toFixed(6)); + }); +}); + describe('findModelPricing — prefix lookups (C12)', () => { test('finds Gemini models via the gemini/ prefix', async () => { // Loader emits the bare model name; LiteLLM keys it with the prefix. diff --git a/src/pricing.ts b/src/pricing.ts index f82d595..d3bc1cc 100644 --- a/src/pricing.ts +++ b/src/pricing.ts @@ -8,6 +8,16 @@ type ModelPricing = { output_cost_per_token?: number; cache_creation_input_token_cost?: number; cache_read_input_token_cost?: number; + // Long-context tiered rates: tokens of a given type ABOVE 200k (per event) + // are priced at these higher rates. LiteLLM publishes them for Claude/ + // Anthropic models; ccusage applies exactly the 200k threshold per token + // type per entry (packages/internal/src/pricing.ts calculateTieredCost). + // Models without these fields fall back to flat pricing — identical to + // ccusage, which does the same (it does NOT implement Gemini's 128k tier). + input_cost_per_token_above_200k_tokens?: number; + output_cost_per_token_above_200k_tokens?: number; + cache_creation_input_token_cost_above_200k_tokens?: number; + cache_read_input_token_cost_above_200k_tokens?: number; }; const FALLBACK_PRICES: Record = { @@ -16,6 +26,12 @@ const FALLBACK_PRICES: Record = { output_cost_per_token: 15e-6, cache_read_input_token_cost: 0.3e-6, cache_creation_input_token_cost: 3.75e-6, + // Anthropic >200k long-context rates (offline fallback only; live + // LiteLLM carries these verbatim and they're used the same way). + input_cost_per_token_above_200k_tokens: 6e-6, + output_cost_per_token_above_200k_tokens: 22.5e-6, + cache_read_input_token_cost_above_200k_tokens: 0.6e-6, + cache_creation_input_token_cost_above_200k_tokens: 7.5e-6, }, 'claude-opus-4-20250514': { input_cost_per_token: 15e-6, @@ -106,22 +122,64 @@ function findModelPricing( return null; } +// Faithful port of ccusage's tiered-cost helper +// (packages/internal/src/pricing.ts:284). 
Tokens of a single token type, for a +// single event, above `threshold` are billed at `tieredPrice`; the rest at +// `basePrice`. When `tieredPrice` is absent the model is flat-priced — exactly +// ccusage's behaviour, so non-Claude models (and Claude pre-tier) match. +// The threshold is applied PER EVENT, before any daily/monthly aggregation, +// because enrichCosts calls this once per UnifiedTokenEvent. +function calculateTieredCost( + totalTokens: number | undefined, + basePrice: number | undefined, + tieredPrice: number | undefined, + threshold = 200_000, +): number { + if (totalTokens == null || totalTokens <= 0) return 0; + + if (totalTokens > threshold && tieredPrice != null) { + const tokensBelowThreshold = Math.min(totalTokens, threshold); + const tokensAboveThreshold = Math.max(0, totalTokens - threshold); + + let tieredCost = tokensAboveThreshold * tieredPrice; + if (basePrice != null) tieredCost += tokensBelowThreshold * basePrice; + return tieredCost; + } + + if (basePrice != null) return totalTokens * basePrice; + return 0; +} + export async function calculateCost(model: string, tokens: TokenCounts): Promise { const prices = await fetchPricing(); const pricing = findModelPricing(prices, model); if (!pricing) return 0; - const inputCost = tokens.input * (pricing.input_cost_per_token ?? 0); - const outputCost = tokens.output * (pricing.output_cost_per_token ?? 0); + const inputCost = calculateTieredCost( + tokens.input, + pricing.input_cost_per_token, + pricing.input_cost_per_token_above_200k_tokens, + ); + const outputCost = calculateTieredCost( + tokens.output, + pricing.output_cost_per_token, + pricing.output_cost_per_token_above_200k_tokens, + ); // Cache pricing is provider-specific (Anthropic charges 1.25× input for // writes / 0.1× for reads; OpenAI ~0.5× for reads). 
When LiteLLM doesn't - // publish explicit cache rates for a model, fall back to 0 — falling back - // to the input rate would inflate cache-read cost up to 10× (Anthropic) - // and silently misprice every Claude Code session. - const cacheCreateCost = - tokens.cacheCreation * (pricing.cache_creation_input_token_cost ?? 0); - const cacheReadCost = - tokens.cacheRead * (pricing.cache_read_input_token_cost ?? 0); + // publish explicit cache rates for a model, calculateTieredCost returns 0 — + // falling back to the input rate would inflate cache-read cost up to 10× + // (Anthropic) and silently misprice every Claude Code session. + const cacheCreateCost = calculateTieredCost( + tokens.cacheCreation, + pricing.cache_creation_input_token_cost, + pricing.cache_creation_input_token_cost_above_200k_tokens, + ); + const cacheReadCost = calculateTieredCost( + tokens.cacheRead, + pricing.cache_read_input_token_cost, + pricing.cache_read_input_token_cost_above_200k_tokens, + ); return inputCost + outputCost + cacheCreateCost + cacheReadCost; } From dadd1b49b7b4aa3679e5f5c6263a0165c030a6d1 Mon Sep 17 00:00:00 2001 From: matthiasschalk Date: Sun, 17 May 2026 06:51:07 +0200 Subject: [PATCH 3/6] fix(loaders/claude,store): match ccusage usage schema; finite-number guard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit parseLine coerced via Number(x ?? 0): a non-numeric input_tokens became NaN (?? only catches null/undefined), the 0/0 guard let NaN through, and addTokens propagated it — one bad line turned the whole dashboard total into NaN. ccusage's valibot usageDataSchema instead requires input_tokens/ output_tokens to be real numbers (entry dropped otherwise) and treats the cache fields as v.optional(v.number()) (absent -> 0, present -> must be a number). Add requiredTokenNumber/optionalTokenNumber mirroring that contract (no string coercion; present null/non-number rejects the entry; non-finite rejected as intentional hardening). 
Harden store.ts isTokenCounts to require finite numbers and use it in loadFile so a poisoned historical line (NaN -> null on disk) is dropped instead of re-poisoning the store. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/loaders/claude.test.ts | 126 +++++++++++++++++++++++++++++++++++++ src/loaders/claude.ts | 39 ++++++++++-- src/store.test.ts | 19 ++++++ src/store.ts | 20 ++++-- 4 files changed, 194 insertions(+), 10 deletions(-) create mode 100644 src/loaders/claude.test.ts diff --git a/src/loaders/claude.test.ts b/src/loaders/claude.test.ts new file mode 100644 index 0000000..b24e24f --- /dev/null +++ b/src/loaders/claude.test.ts @@ -0,0 +1,126 @@ +import { test, describe, before, after, beforeEach } from 'node:test'; +import assert from 'node:assert/strict'; +import { mkdtempSync, rmSync, mkdirSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; +import { loadClaudeEvents } from './claude.js'; + +// loadClaudeEvents reads CLAUDE_CONFIG_DIR/projects/**/*.jsonl. We isolate the +// loader cache to a throwaway dir and disable it so each case is deterministic. 
+let tmpRoot: string; +let projDir: string; +const ORIG_CFG = process.env.CLAUDE_CONFIG_DIR; +const ORIG_DATA = process.env.TOKENBBQ_DATA_DIR; +const ORIG_NOCACHE = process.env.TOKENBBQ_DISABLE_LOADER_CACHE; + +function writeSession(name: string, objs: unknown[]): void { + writeFileSync( + path.join(projDir, name), + objs.map((o) => JSON.stringify(o)).join('\n') + '\n', + 'utf-8', + ); +} + +const line = (over: Record = {}): Record => ({ + timestamp: '2026-05-01T10:00:00.000Z', + sessionId: 's1', + requestId: 'req-' + Math.random().toString(36).slice(2), + message: { + id: 'msg-' + Math.random().toString(36).slice(2), + model: 'claude-sonnet-4-20250514', + usage: { input_tokens: 100, output_tokens: 50 }, + }, + ...over, +}); + +describe('loadClaudeEvents — ccusage usageDataSchema parity', () => { + before(() => { + tmpRoot = mkdtempSync(path.join(tmpdir(), 'claude-test-')); + projDir = path.join(tmpRoot, 'projects', 'proj'); + mkdirSync(projDir, { recursive: true }); + process.env.CLAUDE_CONFIG_DIR = tmpRoot; + process.env.TOKENBBQ_DATA_DIR = path.join(tmpRoot, '.data'); + process.env.TOKENBBQ_DISABLE_LOADER_CACHE = '1'; + }); + + after(() => { + if (ORIG_CFG === undefined) delete process.env.CLAUDE_CONFIG_DIR; + else process.env.CLAUDE_CONFIG_DIR = ORIG_CFG; + if (ORIG_DATA === undefined) delete process.env.TOKENBBQ_DATA_DIR; + else process.env.TOKENBBQ_DATA_DIR = ORIG_DATA; + if (ORIG_NOCACHE === undefined) delete process.env.TOKENBBQ_DISABLE_LOADER_CACHE; + else process.env.TOKENBBQ_DISABLE_LOADER_CACHE = ORIG_NOCACHE; + rmSync(tmpRoot, { recursive: true, force: true }); + }); + + beforeEach(() => { + rmSync(projDir, { recursive: true, force: true }); + mkdirSync(projDir, { recursive: true }); + }); + + test('counts a well-formed entry and defaults absent cache fields to 0', async () => { + writeSession('a.jsonl', [line({ + message: { id: 'm1', model: 'claude-sonnet-4-20250514', usage: { input_tokens: 120, output_tokens: 30 } }, + requestId: 'r1', + })]); + 
const events = await loadClaudeEvents(); + assert.equal(events.length, 1); + assert.deepEqual(events[0].tokens, { + input: 120, output: 30, cacheCreation: 0, cacheRead: 0, reasoning: 0, + }); + }); + + test('keeps cache token fields when present', async () => { + writeSession('b.jsonl', [line({ + message: { + id: 'm2', model: 'claude-sonnet-4-20250514', + usage: { input_tokens: 10, output_tokens: 5, cache_creation_input_tokens: 7, cache_read_input_tokens: 900 }, + }, + requestId: 'r2', + })]); + const events = await loadClaudeEvents(); + assert.equal(events.length, 1); + assert.equal(events[0].tokens.cacheCreation, 7); + assert.equal(events[0].tokens.cacheRead, 900); + }); + + test('drops the entry when input_tokens is a string (== ccusage v.number())', async () => { + writeSession('c.jsonl', [line({ + message: { id: 'm3', model: 'x', usage: { input_tokens: '100', output_tokens: 50 } }, + requestId: 'r3', + })]); + assert.equal((await loadClaudeEvents()).length, 0); + }); + + test('drops the entry when input_tokens is missing (required field)', async () => { + writeSession('d.jsonl', [line({ + message: { id: 'm4', model: 'x', usage: { output_tokens: 50 } }, + requestId: 'r4', + })]); + assert.equal((await loadClaudeEvents()).length, 0); + }); + + test('drops the entry when a present cache field is not a number', async () => { + writeSession('e.jsonl', [line({ + message: { + id: 'm5', model: 'x', + usage: { input_tokens: 10, output_tokens: 5, cache_read_input_tokens: 'lots' }, + }, + requestId: 'r5', + })]); + assert.equal((await loadClaudeEvents()).length, 0); + }); + + test('drops the entry when a cache field is present but null (valibot v.optional rejects null)', async () => { + // valibot v.optional(v.number()) only excuses an ABSENT key; a present + // JSON null is not a number → ccusage drops the whole entry. 
+ writeSession('e2.jsonl', [line({ + message: { + id: 'm5b', model: 'x', + usage: { input_tokens: 10, output_tokens: 5, cache_creation_input_tokens: null }, + }, + requestId: 'r5b', + })]); + assert.equal((await loadClaudeEvents()).length, 0); + }); +}); diff --git a/src/loaders/claude.ts b/src/loaders/claude.ts index aa0404a..9d9ac31 100644 --- a/src/loaders/claude.ts +++ b/src/loaders/claude.ts @@ -29,6 +29,21 @@ function getClaudePaths(): string[] { return candidates.filter((p) => existsSync(path.join(p, 'projects'))); } +// valibot `v.number()` (required): present and a real number, else the entry +// is rejected. Returns null to signal "reject the whole event". +function requiredTokenNumber(x: unknown): number | null { + return typeof x === 'number' && Number.isFinite(x) ? x : null; +} + +// valibot `v.optional(v.number())`: only an ABSENT key (JS `undefined`) is +// allowed to be missing → default 0. A PRESENT value must be a real number; +// `null` (JSON null), strings, etc. are not numbers, so valibot fails the +// parse and ccusage drops the whole entry. Returning null signals "reject". +function optionalTokenNumber(x: unknown): number | null { + if (x === undefined) return 0; + return typeof x === 'number' && Number.isFinite(x) ? x : null; +} + function parseLine(raw: Record): UnifiedTokenEvent | null { if (!isValidTimestamp(raw.timestamp)) return null; @@ -39,8 +54,24 @@ function parseLine(raw: Record): UnifiedTokenEvent | null { if (!usage) return null; const model = String(message.model ?? 'unknown'); - const input = Number(usage.input_tokens ?? 0); - const output = Number(usage.output_tokens ?? 0); + + // Mirror ccusage's usageDataSchema (apps/ccusage/src/data-loader.ts:167): + // message.usage.input_tokens / output_tokens are required `v.number()`, + // the two cache fields are `v.optional(v.number())`. 
A required field that + // is absent or not a number makes ccusage drop the whole entry; an optional + // field absent defaults to 0 but, if present, must be a number. We coerce + // nothing (string "100" is rejected, just like valibot) and additionally + // reject non-finite numbers (Infinity from `1e999`) — intentional hardening + // over bare v.number(); such values can't occur in well-formed JSONL. + const input = requiredTokenNumber(usage.input_tokens); + if (input === null) return null; + const output = requiredTokenNumber(usage.output_tokens); + if (output === null) return null; + const cacheCreation = optionalTokenNumber(usage.cache_creation_input_tokens); + if (cacheCreation === null) return null; + const cacheRead = optionalTokenNumber(usage.cache_read_input_tokens); + if (cacheRead === null) return null; + if (input === 0 && output === 0) return null; return { @@ -51,8 +82,8 @@ function parseLine(raw: Record): UnifiedTokenEvent | null { tokens: { input, output, - cacheCreation: Number(usage.cache_creation_input_tokens ?? 0), - cacheRead: Number(usage.cache_read_input_tokens ?? 0), + cacheCreation, + cacheRead, reasoning: 0, }, costUSD: typeof raw.costUSD === 'number' ? raw.costUSD : 0, diff --git a/src/store.test.ts b/src/store.test.ts index 1bff687..d6f0997 100644 --- a/src/store.test.ts +++ b/src/store.test.ts @@ -101,6 +101,25 @@ describe('loadStore', () => { assert.equal(state.events.length, 1); }); + test('rejects a NaN-poisoned token line (serializes to null on disk)', () => { + const good = ev({ sessionId: 'ok' }); + const goodLine = JSON.stringify({ v: 1, ...good, eventHash: hashEvent(good) }) + '\n'; + // A pre-fix loader could let NaN into tokens.input; JSON.stringify turns + // NaN into null, so the persisted line carries `"input": null`. loadStore + // must drop it instead of summing null/NaN into every aggregate. 
+ const poisoned = ev({ sessionId: 'bad' }); + const poisonedLine = JSON.stringify({ + v: 1, ...poisoned, + tokens: { ...poisoned.tokens, input: null }, + eventHash: 'x', + }) + '\n'; + appendFileSync(legacyPath(), goodLine + poisonedLine); + + const state = loadStore(); + assert.equal(state.events.length, 1); + assert.equal(state.events[0].sessionId, 'ok'); + }); + test('ignores a poisoned cache written by the pre-fix version', () => { const eventsDir = path.join(tmp, 'events'); mkdirSync(eventsDir, { recursive: true }); diff --git a/src/store.ts b/src/store.ts index e7b655b..3932a08 100644 --- a/src/store.ts +++ b/src/store.ts @@ -133,15 +133,23 @@ function sameFileSet(a: StoreFileMeta[], b: StoreFileMeta[]): boolean { return true; } +function isFiniteNumber(x: unknown): x is number { + return typeof x === 'number' && Number.isFinite(x); +} + function isTokenCounts(v: unknown): v is UnifiedTokenEvent['tokens'] { if (!v || typeof v !== 'object') return false; const t = v as Record; + // Require *finite* numbers, not just `typeof number`: NaN/Infinity are + // typeof 'number' and would poison every aggregate that sums tokens. JSON + // can't carry NaN (a poisoned value serializes to null), so this also + // rejects historical lines written before the loader-side finite guard. 
return ( - typeof t.input === 'number' && - typeof t.output === 'number' && - typeof t.cacheCreation === 'number' && - typeof t.cacheRead === 'number' && - typeof t.reasoning === 'number' + isFiniteNumber(t.input) && + isFiniteNumber(t.output) && + isFiniteNumber(t.cacheCreation) && + isFiniteNumber(t.cacheRead) && + isFiniteNumber(t.reasoning) ); } @@ -236,7 +244,7 @@ function loadFile(file: string, into: LoadOutcome): void { typeof parsed.timestamp !== 'string' || typeof parsed.sessionId !== 'string' || typeof parsed.model !== 'string' || - !parsed.tokens || typeof parsed.tokens !== 'object' + !isTokenCounts(parsed.tokens) ) { into.badSeen++; continue; From 3856dc23fd3ac4581510f9237098209d4c60989c Mon Sep 17 00:00:00 2001 From: matthiasschalk Date: Sun, 17 May 2026 06:51:42 +0200 Subject: [PATCH 4/6] fix(loaders/claude): keep zero-token & cache-only entries (ccusage parity) parseLine dropped any entry with input_tokens === 0 && output_tokens === 0, before reading the cache fields. ccusage's schema accepts 0/0 and calculateTotals still sums cache_creation/cache_read, so a cache-only turn (input=0, output=0, cache_read>0) is real usage. Dropping it undercounted tokens and cost versus ccusage. Remove the guard. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/loaders/claude.test.ts | 23 +++++++++++++++++++++++ src/loaders/claude.ts | 5 ++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/src/loaders/claude.test.ts b/src/loaders/claude.test.ts index b24e24f..6454027 100644 --- a/src/loaders/claude.test.ts +++ b/src/loaders/claude.test.ts @@ -123,4 +123,27 @@ describe('loadClaudeEvents — ccusage usageDataSchema parity', () => { })]); assert.equal((await loadClaudeEvents()).length, 0); }); + + test('keeps a cache-only entry (input=0, output=0, cache_read>0) — ccusage parity', async () => { + writeSession('f.jsonl', [line({ + message: { + id: 'm6', model: 'claude-sonnet-4-20250514', + usage: { input_tokens: 0, output_tokens: 0, cache_read_input_tokens: 1500 }, + }, + requestId: 'r6', + })]); + const events = await loadClaudeEvents(); + assert.equal(events.length, 1); + assert.deepEqual(events[0].tokens, { + input: 0, output: 0, cacheCreation: 0, cacheRead: 1500, reasoning: 0, + }); + }); + + test('keeps a pure zero-token entry (ccusage schema accepts 0/0)', async () => { + writeSession('g.jsonl', [line({ + message: { id: 'm7', model: 'x', usage: { input_tokens: 0, output_tokens: 0 } }, + requestId: 'r7', + })]); + assert.equal((await loadClaudeEvents()).length, 1); + }); }); diff --git a/src/loaders/claude.ts b/src/loaders/claude.ts index 9d9ac31..d133259 100644 --- a/src/loaders/claude.ts +++ b/src/loaders/claude.ts @@ -72,7 +72,10 @@ function parseLine(raw: Record): UnifiedTokenEvent | null { const cacheRead = optionalTokenNumber(usage.cache_read_input_tokens); if (cacheRead === null) return null; - if (input === 0 && output === 0) return null; + // No zero-token drop: ccusage's schema accepts input_tokens/output_tokens + // of 0 and still sums cache_creation/cache_read (calculateTotals). A cache- + // only turn (input=0, output=0, cache_read>0) is real usage; dropping it + // here undercounted tokens and cost versus ccusage. 
return { source: 'claude-code', From cc7114d7113d25b6d2736c35e65eb8ab072d57ff Mon Sep 17 00:00:00 2001 From: matthiasschalk Date: Sun, 17 May 2026 06:52:52 +0200 Subject: [PATCH 5/6] fix(loaders/claude): never dedupe entries missing message/request id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dedupe fallback key `timestamp:model:input:output` (used when messageId or requestId was absent) could collapse genuinely distinct ID-less turns — and it ignored cache tokens entirely — making totals lower than ccusage. ccusage's createUniqueHash returns null when either id is missing and isDuplicateEntry(null) === false: ID-less entries are never deduped. Make dedupeKey `string | null`, drop the synthetic fallback, and skip both the seen-check and seen-insert for null keys (== ccusage markAsProcessed(null) noop). Bump the loader CACHE_VERSION to 2 so v1 records carrying stale synthetic keys are reparsed. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/loaders/cache.ts | 6 +++++- src/loaders/claude.test.ts | 29 +++++++++++++++++++++++++++++ src/loaders/claude.ts | 18 +++++++++++++++--- 3 files changed, 49 insertions(+), 4 deletions(-) diff --git a/src/loaders/cache.ts b/src/loaders/cache.ts index e063fd9..8b0bb18 100644 --- a/src/loaders/cache.ts +++ b/src/loaders/cache.ts @@ -3,7 +3,11 @@ import path from 'node:path'; import { getStoreDir } from '../store.js'; import type { Source, UnifiedTokenEvent } from '../types.js'; -const CACHE_VERSION = 1; +// v2: the Claude loader's dedupeKey is now `string | null` (null for entries +// missing messageId/requestId, never deduped — ccusage parity). v1 records +// carry the old synthetic `ts:model:in:out` fallback string; bumping forces a +// one-time reparse so stale synthetic keys can't suppress ID-less events. 
+const CACHE_VERSION = 2; interface FileCacheEntry { mtimeMs: number; diff --git a/src/loaders/claude.test.ts b/src/loaders/claude.test.ts index 6454027..7a81a41 100644 --- a/src/loaders/claude.test.ts +++ b/src/loaders/claude.test.ts @@ -146,4 +146,33 @@ describe('loadClaudeEvents — ccusage usageDataSchema parity', () => { })]); assert.equal((await loadClaudeEvents()).length, 1); }); + + test('dedupes entries that share messageId:requestId (== ccusage)', async () => { + const dup = { + timestamp: '2026-05-01T10:00:00.000Z', sessionId: 's1', requestId: 'same-req', + message: { id: 'same-msg', model: 'claude-sonnet-4-20250514', usage: { input_tokens: 100, output_tokens: 50 } }, + }; + writeSession('h.jsonl', [dup, { ...dup }]); + assert.equal((await loadClaudeEvents()).length, 1); + }); + + test('NEVER dedupes ID-less entries — even byte-identical ones (ccusage isDuplicateEntry(null)=false)', async () => { + // Same timestamp/model/input/output and NO requestId: the old synthetic + // fallback key collapsed these into 1, undercounting vs ccusage. 
+ const idless = { + timestamp: '2026-05-01T10:00:00.000Z', sessionId: 's1', + message: { model: 'claude-sonnet-4-20250514', usage: { input_tokens: 100, output_tokens: 50 } }, + }; + writeSession('i.jsonl', [idless, { ...idless }, { ...idless }]); + assert.equal((await loadClaudeEvents()).length, 3); + }); + + test('missing only requestId → not deduped (either id absent ⇒ null key)', async () => { + const noReq = { + timestamp: '2026-05-01T10:00:00.000Z', sessionId: 's1', + message: { id: 'msg-x', model: 'claude-sonnet-4-20250514', usage: { input_tokens: 100, output_tokens: 50 } }, + }; + writeSession('j.jsonl', [noReq, { ...noReq }]); + assert.equal((await loadClaudeEvents()).length, 2); + }); }); diff --git a/src/loaders/claude.ts b/src/loaders/claude.ts index d133259..82e8eb6 100644 --- a/src/loaders/claude.ts +++ b/src/loaders/claude.ts @@ -93,8 +93,11 @@ function parseLine(raw: Record): UnifiedTokenEvent | null { }; } +// dedupeKey is null when the upstream entry lacks a messageId or requestId. +// ccusage's createUniqueHash returns null in that case and isDuplicateEntry +// (null) === false — i.e. ID-less entries are NEVER treated as duplicates. type CachedClaudeEvent = { - dedupeKey: string; + dedupeKey: string | null; event: UnifiedTokenEvent; }; @@ -103,7 +106,7 @@ function isCachedClaudeEvent(value: unknown): value is CachedClaudeEvent { const record = value as Record; const event = record.event as Record | undefined; return ( - typeof record.dedupeKey === 'string' && + (typeof record.dedupeKey === 'string' || record.dedupeKey === null) && !!event && typeof event.source === 'string' && typeof event.timestamp === 'string' && @@ -166,9 +169,14 @@ export async function loadClaudeEvents(): Promise { const requestId = String(parsed.requestId ?? ''); const messageId = String((parsed.message as Record)?.id ?? ''); + // Match ccusage exactly: a stable key ONLY when both ids exist; + // otherwise null → never deduplicated. 
The previous synthetic + // `timestamp:model:input:output` fallback could collapse genuinely + // distinct ID-less events (it also ignored cache tokens), making + // totals lower than ccusage. const dedupeKey = requestId && messageId ? `${messageId}:${requestId}` - : `${event.timestamp}:${event.model}:${event.tokens.input}:${event.tokens.output}`; + : null; fileEvents.push({ dedupeKey, event }); } @@ -177,6 +185,10 @@ export async function loadClaudeEvents(): Promise { const seen = new Set(); const events = records.flatMap((record) => { + // null key (missing messageId/requestId) is never a duplicate and is + // never recorded — mirrors ccusage isDuplicateEntry(null)===false + + // markAsProcessed(null)=noop. Only id-bearing entries are deduped. + if (record.dedupeKey === null) return [record.event]; if (seen.has(record.dedupeKey)) return []; seen.add(record.dedupeKey); return [record.event]; From 49a5c0573b75469c4964493178481a8c50a73d79 Mon Sep 17 00:00:00 2001 From: matthiasschalk Date: Sun, 17 May 2026 06:53:46 +0200 Subject: [PATCH 6/6] docs(store): document store-vs-ccusage divergence; guard distinct turns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ccusage is stateless; TokenBBQ persists an append-only store with a content-hash dedup that differs from ccusage's messageId:requestId. Per product decision (harden + document, no risky store-hash migration, no new mode): document the parity invariant on hashEvent — the loader is the dedup authority and matches ccusage; the content hash is only for multi-process safety and is injective for realistically-distinct Claude turns. Add a regression test proving two distinct turns with identical token counts both survive, and record the accepted residuals (post-prune TokenBBQ >= ccusage is intended) in CCUSAGE_PARITY_REVIEW.md. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- CCUSAGE_PARITY_REVIEW.md | 147 +++++++++++++++++++++++++++++++++++++++ src/store.test.ts | 27 +++++++ src/store.ts | 15 ++++ 3 files changed, 189 insertions(+) create mode 100644 CCUSAGE_PARITY_REVIEW.md diff --git a/CCUSAGE_PARITY_REVIEW.md b/CCUSAGE_PARITY_REVIEW.md new file mode 100644 index 0000000..59f8eeb --- /dev/null +++ b/CCUSAGE_PARITY_REVIEW.md @@ -0,0 +1,147 @@ +# 🔥 Code-Review: Token-Berechnung TokenBBQ vs. ccusage + +> **Datum:** 2026-05-17 +> **Scope:** Token- & Kosten-Berechnung im Dashboard (`src/loaders/`, `src/aggregator.ts`, `src/pricing.ts`, `src/store.ts`) +> **Referenz:** ccusage v18.0.8 (vendored unter `ccusage/`) +> **Methode:** Unabhängige Code-Analyse + Cross-Check via Codex; alle Codex-Behauptungen am Code gegengeprüft. +> **Ziel:** Unsere Zahlen sollen mit ccusage übereinstimmen. + +## TL;DR — Stimmen unsere Zahlen mit ccusage überein? + +**Claude Code (ccusage-Kerndomäne):** Token-Summen stimmen *fast* — Formel, Dedup und Datums-Bucketing sind identisch. Aber die **Kosten** weichen systematisch nach unten ab (fehlende >200k-Staffelpreise). **Codex:** Die Dashboard-Token sind **systematisch zu hoch** (Reasoning-Doppelzählung) — das ist die sichtbarste Abweichung beim Vergleich pro Tool. + +| # | Schwere | Bereich | Effekt | +|---|---------|---------|--------| +| 1 | 🔴 Kritisch | Codex-Tokens | Total systematisch **zu hoch** | +| 2 | 🔴 Kritisch | Claude-Kosten | Kosten **zu niedrig** bei großen Kontexten | +| 3 | 🟠 Wichtig | Robustheit | Ein kaputtes Feld kann Total auf `NaN` ziehen | +| 4 | 🟡 Mittel | Claude-Tokens | Cache-only-Events fallen raus (Unterzählung) | +| 5 | 🟡 Mittel | Claude-Tokens | Synthetischer Dedup-Key kollabiert echte Events | +| 6 | 🟢 Architektur | Alle | Persistenter Store driftet vs. stateless ccusage | + +--- + +## 🔴 1. 
Codex: Reasoning-Tokens werden doppelt gezählt + +**Ort:** `src/loaders/codex.ts:166` (`reasoning: raw.reasoning`) + `src/types.ts:165` (`totalTokenCount`) + `src/dashboard.ts:413` + +**Divergenz:** OpenAI/Codex liefert `output_tokens` **inklusive** der Reasoning-Tokens; `reasoning_output_tokens` ist nur eine informative Teilmenge. ccusage implementiert das explizit: + +- `ccusage/apps/codex/src/data-loader.ts:60-62`: *„includes them as a separate field but does not add them to total_tokens"* → `total = input + output` +- `ccusage/apps/codex/src/token-utils.ts:39`: *„Reasoning tokens are already included in output_tokens, so they are not added separately to avoid double-counting"* — die Kostenformel hat **keinen** Reasoning-Term. + +TokenBBQ speichert `reasoning` separat **und** `totalTokenCount` summiert `input+output+cacheCreation+cacheRead+reasoning`. Da `output` die Reasoning-Tokens bereits enthält, zählt jedes Codex-Event seine Reasoning-Tokens **zweimal**. Bei gpt-5/o-Modellen (Codex' Standard) oft 50–90 % der Output-Tokens → massive Überzählung in Dashboard-Total, Heatmap, `topModel` und jeder Tokens-Chart. + +**Fix:** Für Codex `output` als Brutto behalten und `reasoning` nur als Anzeige-Metainfo führen (nicht im Total). ccusage' Weg: Codex' eigenes `total_tokens` (in `normalizeUsage` bereits berechnet, aber nie verwendet) als Wahrheit nehmen statt selbst zu rekonstruieren. + +--- + +## 🔴 2. Claude: Fehlende Staffelpreise >200k Tokens + +**Ort:** `src/pricing.ts:109-127` (`calculateCost`) + +**Divergenz:** ccusage rechnet Claude/Anthropic-Modelle mit **Tiered Pricing** ab — Tokens über 200k pro Token-Typ pro Event zur höheren Rate (`ccusage/packages/internal/src/pricing.ts:284-336`, `calculateTieredCost`). Für Sonnet 4: cache_read 200k+ zu $0,60/M statt $0,30/M (2×), Input 2×, Output 1,5×. + +`pricing.ts` nutzt ausschließlich Flat-Raten. In Claude-Code-Sessions ist `cache_read` pro Turn praktisch der gesamte bisherige Kontext — regelmäßig **>200k**. 
Ergebnis: TokenBBQ **unterberechnet** die Kosten von Heavy-User-Sessions deutlich. Wahrscheinlich der größte moderne Kosten-Mismatch. + +**Fix:** `ModelPricing`-Typ um `*_above_200k_tokens`-Felder erweitern und `calculateTieredCost`-Logik (200k-Schwelle, pro Token-Typ) nachbauen. Schwelle nur für Claude/Anthropic (GPT = flat, Gemini = 128k — Letzteres setzt auch ccusage nicht um). + +**Nebenpunkt:** Bei `auto` nutzt ccusage `data.costUSD`, sobald `!= null`. TokenBBQ rechnet neu, sobald `costUSD <= 0`. Bei modernen Logs ohne `costUSD` rechnen beide → Punkt 2 dominiert. Bei Logs *mit* `costUSD: 0` weicht TokenBBQ ab (TokenBBQ rechnet neu, ccusage nimmt 0). Selten. + +--- + +## 🟠 3. Robustheit: Nicht-numerische Token-Felder vergiften das Total mit `NaN` + +**Ort:** `src/loaders/claude.ts:42-44` + +```ts +const input = Number(usage.input_tokens ?? 0); +const output = Number(usage.output_tokens ?? 0); +if (input === 0 && output === 0) return null; +``` + +**Divergenz:** `?? 0` fängt nur `null`/`undefined`. Ist `usage.input_tokens` ein String/Objekt, wird `Number(...)` → `NaN`. Der Guard `input === 0 && output === 0` ist bei `NaN` `false` → das Event passiert mit `tokens.input = NaN`. `addTokens` propagiert das → **das gesamte Dashboard-Total wird `NaN`**. ccusage' valibot-`v.number()` lehnt solche Einträge hart ab. + +Verschärfend: `store.ts:isTokenCounts` prüft `typeof t.input === 'number'` — `NaN` ist `typeof 'number'`, läuft also durch und wird **dauerhaft persistiert** (Store-Cache vergiftet bis zum manuellen Eingriff). + +**Fix:** Nach Konvertierung `Number.isFinite()` erzwingen für alle 4 Token-Felder (`const input = Number(usage.input_tokens); if (!Number.isFinite(input)) return null;`), zusätzlich in `store.ts:isTokenCounts` `Number.isFinite` statt nur `typeof === 'number'`. + +--- + +## 🟡 4.
Claude: Cache-only-Events werden verworfen + +**Ort:** `src/loaders/claude.ts:44` — `if (input === 0 && output === 0) return null;` + +**Divergenz:** ccusage' `usageDataSchema` verlangt nur, dass `input_tokens`/`output_tokens` Zahlen sind (0 erlaubt) und summiert `cache_creation`/`cache_read` trotzdem (`ccusage/apps/ccusage/src/data-loader.ts:441`). TokenBBQ wirft das ganze Event weg, *bevor* die Cache-Felder gelesen werden — Events mit `input=0, output=0, cacheRead>0` gehen verloren → Unterzählung von Tokens **und** Kosten. Selten, aber real. + +**Fix:** Guard erst nach Berechnung aller fünf Felder anwenden und nur verwerfen, wenn die Gesamtsumme 0 ist (oder ganz weglassen — ccusage filtert hier nicht). + +--- + +## 🟡 5. Claude: Synthetischer Dedup-Fallback kollabiert echte Events + +**Ort:** `src/loaders/claude.ts:135-137` + +```ts +const dedupeKey = requestId && messageId + ? `${messageId}:${requestId}` + : `${event.timestamp}:${event.model}:${event.tokens.input}:${event.tokens.output}`; +``` + +**Divergenz:** ccusage' `createUniqueHash` gibt `null` zurück, wenn messageId **oder** requestId fehlt — `isDuplicateEntry(null)` ist immer `false`, ID-lose Events werden **nie** dedupliziert (alle gezählt). TokenBBQ baut einen synthetischen Key und dedupliziert sie doch → potenzielle Unterzählung. + +**Zusatz:** Der Fallback-Key enthält **nur** `timestamp:model:input:output` — *nicht* `cacheCreation`, `cacheRead`, `costUSD` oder Session/Datei. Zwei ID-lose Events mit gleichem input/output aber unterschiedlichen Cache-Tokens kollidieren und eines wird fälschlich verworfen. Moderne Logs haben immer beide IDs (geringe Praxisrelevanz), aber für exakte ccusage-Parität: Fallback streichen und ID-lose Events wie ccusage immer durchzählen. + +--- + +## 🟢 6. Architektur: Persistenter Store driftet vs. 
stateless ccusage + +**Ort:** `src/store.ts` (`hashEvent`, `appendEvents`) + `src/index.ts:117` + +**Divergenz:** ccusage ist **stateless** — liest bei jedem Lauf die JSONL-Dateien neu. TokenBBQ persistiert append-only und dedupliziert per **Content-Hash** (`source|sessionId|timestamp|model|input|output|cacheRead|cacheCreation|reasoning`) — ein *anderer* Schlüssel als ccusage' `messageId:requestId` (inkl. timestamp, ohne requestId). Folgen: + +1. **Drift nach Log-Rotation:** Löscht/rotiert der User alte Claude-JSONLs, zeigt ccusage weniger, TokenBBQ behält die Historie → TokenBBQ > ccusage. Bewusst so designt, bricht aber exakte Parität. +2. Claude kommt im Dashboard aus `store.events` (nicht aus dem Frisch-Scan), der Store-Hash gewinnt. Kollidieren zwei legitim verschiedene Events im Content-Hash, ist eines **dauerhaft** weg (Hash bleibt in `state.hashes`, Re-Scan heilt nicht). + +**Empfehlung:** Design-Entscheidung dokumentieren („TokenBBQ ≥ ccusage nach Log-Pruning" = erwartet). Falls Bit-Parität gewünscht: optionaler „stateless/ccusage-compat"-Modus, der nur den Frisch-Scan ohne Store rendert. + +--- + +## ✅ Was bereits korrekt mit ccusage übereinstimmt + +- **Claude-Total-Formel** `input+output+cacheCreation+cacheRead` = ccusage `getTotalTokens` (Reasoning bei Claude immer 0) — exakt gleich. +- **Datums-Bucketing:** Beide lokale Zeitzone, `YYYY-MM-DD`. Identisch, solange ccusage ohne explizites `--timezone` läuft. +- **Dedup bei vorhandenen IDs:** `messageId:requestId` — identisch zu ccusage. +- **`isApiErrorMessage`:** ccusage filtert das **nicht** aus den Totals (nur für Reset-Time-Extraktion) — TokenBBQ ebenso. Kein Handlungsbedarf. +- **`<synthetic>`-Modell:** Nur Anzeige-Divergenz (ccusage versteckt die Modellzeile, zählt Tokens mit). Totals unberührt — TokenBBQ zeigt zusätzlich eine `<synthetic>`-Zeile, kann `topModel` beeinflussen. Kosmetisch. +- **Fehlendes `message.usage`:** Beide überspringen. Konsistent. + +--- + +## Empfohlene Fix-Reihenfolge (Aufwand vs. Wirkung) + +1.
**#1 Codex-Reasoning** — größte sichtbare Token-Abweichung, kleiner gezielter Fix. +2. **#3 NaN-Guard** — echter Bug, billig, schützt zusätzlich den Store. +3. **#2 Tiered Pricing** — größte Kosten-Abweichung, mittlerer Aufwand (`calculateTieredCost` portieren). +4. **#4 + #5 Claude-Loader** — zusammen in `claude.ts` erledigbar (Guard nach hinten, Fallback-Key streichen). +5. **#6** — Doku / optionaler Compat-Modus, kein dringender Code-Fix. + +--- + +## Resolution (umgesetzt 2026-05-17, Branch `fix/ccusage-parity`, Codex-abgenommen) + +Alle 6 Findings behoben, je ein Commit, Design von Codex mit **PASS** abgenommen. + +| # | Umsetzung | Tests | +|---|-----------|-------| +| 1 | `totalTokenCount` ohne `reasoning` (types.ts); alle Client-Summen in dashboard.ts; Reasoning als nicht-additiver Info-Wert | `types.test.ts`, Codex-Paritäts-Invariante in `codex.test.ts` | +| 2 | `calculateTieredCost` (faithful port, 200k, pro Token-Typ, pro Event) in `pricing.ts`; `ModelPricing` + FALLBACK erweitert | `pricing.test.ts` (Boundary + flat-fallback) | +| 3 | `parseLine` verlangt finite Zahlen (`v.number()`-Parität); `store.ts` `isTokenCounts`/`loadFile` finite-gehärtet | `claude.test.ts`, `store.test.ts` | +| 4 | Zero-Token-Drop entfernt (Cache-only & 0/0 bleiben wie bei ccusage) | `claude.test.ts` | +| 5 | `dedupeKey=null` bei fehlender msgId/reqId, nie dedupliziert; Fallback entfernt; Loader-`CACHE_VERSION` 1→2 | `claude.test.ts` | +| 6 | Store-Härtungs-Regressionstest + dokumentierte Invariante (keine Hash-Migration, keine neue Betriebsart — Userentscheidung) | `store.test.ts` | + +### Bewusst akzeptierte Rest-Divergenzen (von Codex bestätigt, vom User so entschieden) + +- **#1 Codex-Total-Quelle:** ccusage nutzt die gemeldete `total_tokens`; wir rekonstruieren `freshInput+cacheRead+output`. Da OpenAI `total_tokens ≡ input+output` definiert, sind sie für wohlgeformte Logs **gleich** (Regressionstest sichert das ab).
Eine separate „reported total" durch das vereinheitlichte 5-Feld-Modell zu schleifen wäre invasiv — bewusst nicht umgesetzt. +- **#6 Store vs. stateless:** Nach **manuellem** Log-Pruning behält TokenBBQ Historie (TokenBBQ ≥ ccusage) — gewollt. Keine kryptografische Kollisionsgarantie ohne Store-Hash-Migration — bewusst außerhalb des Scopes. diff --git a/src/store.test.ts b/src/store.test.ts index d6f0997..348476e 100644 --- a/src/store.test.ts +++ b/src/store.test.ts @@ -101,6 +101,33 @@ describe('loadStore', () => { assert.equal(state.events.length, 1); }); + test('does NOT collapse distinct Claude turns (store-vs-ccusage parity guard)', () => { + // The Claude loader is the dedup authority and matches ccusage exactly + // (messageId:requestId, ID-less never deduped). The store adds a content + // hash only for multi-process safety. This guards the invariant that the + // content hash is injective for realistically-distinct Claude turns: + // every assistant turn has its own millisecond timestamp, so two genuine + // turns (even with identical token counts) must both survive — otherwise + // the dashboard would silently undercount vs ccusage. 
+ const t1 = ev({ + source: 'claude-code', sessionId: 'sess', model: 'claude-sonnet-4-20250514', + timestamp: '2026-05-01T10:00:00.000Z', + tokens: { input: 100, output: 50, cacheCreation: 0, cacheRead: 0, reasoning: 0 }, + }); + const t2 = ev({ + source: 'claude-code', sessionId: 'sess', model: 'claude-sonnet-4-20250514', + timestamp: '2026-05-01T10:00:03.000Z', // 3s later — distinct turn + tokens: { input: 100, output: 50, cacheCreation: 0, cacheRead: 0, reasoning: 0 }, + }); + appendFileSync( + legacyPath(), + JSON.stringify({ v: 1, ...t1, eventHash: hashEvent(t1) }) + '\n' + + JSON.stringify({ v: 1, ...t2, eventHash: hashEvent(t2) }) + '\n', + ); + const state = loadStore(); + assert.equal(state.events.length, 2); + }); + test('rejects a NaN-poisoned token line (serializes to null on disk)', () => { const good = ev({ sessionId: 'ok' }); const goodLine = JSON.stringify({ v: 1, ...good, eventHash: hashEvent(good) }) + '\n'; diff --git a/src/store.ts b/src/store.ts index 3932a08..c6e5a4b 100644 --- a/src/store.ts +++ b/src/store.ts @@ -57,6 +57,21 @@ function getLegacyFilePath(): string { return path.join(getStoreDir(), 'events.ndjson'); } +// Store-vs-ccusage parity note: +// ccusage is stateless — it re-reads the JSONL on every run. TokenBBQ persists +// an append-only store. Source-level dedup is the LOADER's job and matches +// ccusage exactly (Claude: messageId:requestId, ID-less never deduped). This +// content hash exists ONLY for multi-process safety (two processes racing to +// persist the same scanned event must collapse to one). It deliberately keeps +// timestamp (ms) + sessionId, so it is injective for realistically-distinct +// Claude turns and cannot drop an event ccusage would keep (see the +// "does NOT collapse distinct Claude turns" store test). +// Known, accepted residual: (1) after a user MANUALLY prunes/rotates Claude +// JSONL, TokenBBQ retains history ccusage forgets (TokenBBQ >= ccusage) — this +// is intended. 
(2) Two genuinely distinct Claude turns that are byte-identical +// on (source,sessionId,timestamp-to-the-ms,model,all token fields) would still +// collapse here; this does not occur in practice and a true cryptographic +// guarantee would need a store-hash migration, deliberately out of scope. export function hashEvent(e: UnifiedTokenEvent): string { const payload = [ e.source,