From a0961ccca86f987d316a32927f96f95d39b5b517 Mon Sep 17 00:00:00 2001
From: ccage-simp
Date: Sat, 2 May 2026 09:33:56 -0500
Subject: [PATCH] feat: classification persistence via classifications.jsonl

---
NOTE(review): this patch was reconstructed from a whitespace-collapsed copy.
Hunk line counts were re-derived and are self-consistent, but indentation and
the position of blank context lines are best-effort (apply with
--ignore-whitespace if needed). The blob hashes on the "index" lines predate
the review edits below and were not recomputed.

Review edits on top of the original change:
  * restore type arguments lost in transit (Promise<number>,
    readJsonLines<ClassificationRecord>)
  * exportClassifications: do not emit [''] for empty CSV columns
  * buildIndex: hydrate from classifications.jsonl newest-first
  * TSDoc for the newly added exports

 src/bookmark-classify-llm.ts | 44 ++++++++++++++++++---
 src/bookmarks-db.ts          | 83 +++++++++++++++++++++++++++++++++++++++++++--
 src/cli.ts                   | 10 +++++
 src/paths.ts                 |  5 +++
 src/types.ts                 | 11 ++++++
 5 files changed, 145 insertions(+), 8 deletions(-)

diff --git a/src/bookmark-classify-llm.ts b/src/bookmark-classify-llm.ts
index 61524ed..22f5a71 100644
--- a/src/bookmark-classify-llm.ts
+++ b/src/bookmark-classify-llm.ts
@@ -7,9 +7,11 @@
  */
 
 import { openDb, saveDb } from './db.js';
-import { twitterBookmarksIndexPath } from './paths.js';
+import { twitterBookmarksIndexPath, twitterClassificationsPath } from './paths.js';
+import { appendLine } from './fs.js';
 import type { ResolvedEngine } from './engine.js';
 import { invokeEngine } from './engine.js';
+import type { ClassificationRecord } from './types.js';
 
 const BATCH_SIZE = 50;
 
@@ -180,7 +182,7 @@ export async function classifyWithLlm(
   try {
     // Fetch unclassified bookmarks
     const rows = db.exec(
-      `SELECT id, text, author_handle, links_json FROM bookmarks
+      `SELECT id, text, author_handle, links_json, domains, primary_domain FROM bookmarks
        WHERE primary_category = 'unclassified' OR primary_category IS NULL
        ORDER BY RANDOM()`
     );
@@ -189,11 +191,13 @@ export async function classifyWithLlm(
       return { engine: engine.name, totalUnclassified: 0, classified: 0, failed: 0, batches: 0 };
     }
 
-    const unclassified: UnclassifiedBookmark[] = rows[0].values.map(r => ({
+    const unclassified: (UnclassifiedBookmark & { domains: string | null; primaryDomain: string | null })[] = rows[0].values.map(r => ({
       id: r[0] as string,
       text: r[1] as string,
       authorHandle: r[2] as string | null,
       links: r[3] as string | null,
+      domains: r[4] as string | null,
+      primaryDomain: r[5] as string | null,
     }));
 
     const totalUnclassified = unclassified.length;
@@ -218,8 +222,23 @@ export async function classifyWithLlm(
       const stmt = db.prepare(
         `UPDATE bookmarks SET categories = ?, primary_category = ? WHERE id = ?`
       );
+      const now = new Date().toISOString();
+      const classificationsPath = twitterClassificationsPath();
+
+      const batchMap = new Map(batch.map(b => [b.id, b]));
+
       for (const r of results) {
         stmt.run([r.categories.join(','), r.primary, r.id]);
+        const b = batchMap.get(r.id);
+        const record: ClassificationRecord = {
+          id: r.id,
+          categories: r.categories,
+          primaryCategory: r.primary,
+          domains: b?.domains ? b.domains.split(',') : undefined,
+          primaryDomain: b?.primaryDomain ?? undefined,
+          classifiedAt: now,
+        };
+        await appendLine(classificationsPath, JSON.stringify(record));
       }
       stmt.free();
 
@@ -299,7 +318,7 @@ export async function classifyDomainsWithLlm(
       ? '1=1'
       : 'primary_domain IS NULL';
     const rows = db.exec(
-      `SELECT id, text, author_handle, categories FROM bookmarks
+      `SELECT id, text, author_handle, categories, primary_category FROM bookmarks
        WHERE ${where} ORDER BY RANDOM()`
     );
 
@@ -307,11 +326,12 @@ export async function classifyDomainsWithLlm(
       return { engine: engine.name, totalUnclassified: 0, classified: 0, failed: 0, batches: 0 };
     }
 
-    const bookmarks: DomainBookmark[] = rows[0].values.map(r => ({
+    const bookmarks: (DomainBookmark & { primaryCategory: string | null })[] = rows[0].values.map(r => ({
       id: r[0] as string,
       text: r[1] as string,
       authorHandle: r[2] as string | null,
       categories: r[3] as string | null,
+      primaryCategory: r[4] as string | null,
     }));
 
     const total = bookmarks.length;
@@ -335,8 +355,22 @@ export async function classifyDomainsWithLlm(
       const stmt = db.prepare(
         `UPDATE bookmarks SET domains = ?, primary_domain = ? WHERE id = ?`
       );
+      const now = new Date().toISOString();
+      const classificationsPath = twitterClassificationsPath();
+      const batchMap = new Map(batch.map(b => [b.id, b]));
+
       for (const r of results) {
         stmt.run([r.categories.join(','), r.primary, r.id]);
+        const b = batchMap.get(r.id);
+        const record: ClassificationRecord = {
+          id: r.id,
+          categories: b?.categories ? b.categories.split(',') : [],
+          primaryCategory: b?.primaryCategory ?? 'unclassified',
+          domains: r.categories,
+          primaryDomain: r.primary,
+          classifiedAt: now,
+        };
+        await appendLine(classificationsPath, JSON.stringify(record));
       }
       stmt.free();
 
diff --git a/src/bookmarks-db.ts b/src/bookmarks-db.ts
index cfc336b..c265916 100644
--- a/src/bookmarks-db.ts
+++ b/src/bookmarks-db.ts
@@ -1,9 +1,9 @@
 import type { Database } from 'sql.js';
 import { openDb, saveDb } from './db.js';
 import { parseTimestampMs, toIsoDate } from './date-utils.js';
-import { readJsonLines } from './fs.js';
-import { twitterBookmarksCachePath, twitterBookmarksIndexPath } from './paths.js';
-import type { BookmarkRecord, QuotedTweetSnapshot } from './types.js';
+import { readJsonLines, writeJsonLines } from './fs.js';
+import { twitterBookmarksCachePath, twitterBookmarksIndexPath, twitterClassificationsPath } from './paths.js';
+import type { BookmarkRecord, QuotedTweetSnapshot, ClassificationRecord } from './types.js';
 import { classifyCorpus, formatClassificationSummary } from './bookmark-classify.js';
 import type { ClassificationSummary } from './bookmark-classify.js';
 
@@ -480,6 +480,43 @@ export async function buildIndex(options?: { force?: boolean }): Promise<{ dbPat
       }
     } catch { /* table may be empty */ }
 
+    // Backup source of truth: classifications.jsonl
+    const classificationsPath = twitterClassificationsPath();
+    const storedClassifications = await readJsonLines<ClassificationRecord>(classificationsPath);
+    // Classifier runs append to this file, so later lines are newer. Walk it newest-first:
+    // hydration below only fills fields that are still empty, so the latest value wins.
+    for (const c of [...storedClassifications].reverse()) {
+      const existing = existingRows.get(c.id);
+      if (existing) {
+        // Hydrate domains if missing in DB but present in JSONL
+        if (!existing.domains && c.domains) {
+          existing.domains = c.domains.join(',');
+          existing.primaryDomain = c.primaryDomain ?? null;
+        }
+        // Hydrate categories if missing in DB but present in JSONL
+        if (!existing.categories && c.categories) {
+          existing.categories = c.categories.join(',');
+          existing.primaryCategory = c.primaryCategory;
+        }
+      } else {
+        // Create new placeholder for untracked record that has classification data
+        existingRows.set(c.id, {
+          categories: c.categories.join(','),
+          primaryCategory: c.primaryCategory,
+          domains: c.domains?.join(',') ?? null,
+          primaryDomain: c.primaryDomain ?? null,
+          githubUrls: null,
+          quotedTweetJson: null,
+          articleTitle: null,
+          articleText: null,
+          articleSite: null,
+          enrichedAt: null,
+          folderIds: null,
+          folderNames: null,
+        });
+      }
+    }
+
     const newRecords: BookmarkRecord[] = records.filter(r => !existingRows.has(r.id));
 
     if (records.length > 0) {
@@ -1235,3 +1272,43 @@ export function formatSearchResults(results: SearchResult[]): string {
     })
     .join('\n\n');
 }
+
+/**
+ * Back up classified bookmarks from the SQLite index to classifications.jsonl.
+ *
+ * Selects every bookmark whose primary_category is set and not 'unclassified' and
+ * hands one ClassificationRecord per row to writeJsonLines.
+ *
+ * @returns the number of records written; 0 (and no write) when no row qualifies.
+ */
+export async function exportClassifications(): Promise<number> {
+  const dbPath = twitterBookmarksIndexPath();
+  const db = await openDb(dbPath);
+  ensureMigrations(db);
+
+  try {
+    const rows = db.exec(
+      `SELECT id, categories, primary_category, domains, primary_domain
+       FROM bookmarks
+       WHERE primary_category IS NOT NULL AND primary_category != 'unclassified'`
+    );
+
+    if (!rows.length || !rows[0].values.length) return 0;
+
+    const now = new Date().toISOString();
+    const records: ClassificationRecord[] = rows[0].values.map((r) => ({
+      id: r[0] as string,
+      // filter(Boolean) drops the '' entry that ''.split(',') would otherwise produce
+      categories: (r[1] as string | null)?.split(',').filter(Boolean) ?? [],
+      primaryCategory: r[2] as string,
+      domains: (r[3] as string | null)?.split(',').filter(Boolean),
+      primaryDomain: (r[4] as string | null) ?? undefined,
+      classifiedAt: now,
+    }));
+
+    await writeJsonLines(twitterClassificationsPath(), records);
+    return records.length;
+  } finally {
+    db.close();
+  }
+}
diff --git a/src/cli.ts b/src/cli.ts
index 35de09f..e58d585 100644
--- a/src/cli.ts
+++ b/src/cli.ts
@@ -19,6 +19,7 @@ import {
   getDomainCounts,
   getFolderCounts,
   listBookmarks,
+  exportClassifications,
   getBookmarkById,
 } from './bookmarks-db.js';
 import { formatClassificationSummary } from './bookmark-classify.js';
@@ -1097,9 +1098,18 @@ export function buildCli() {
     .command('classify')
     .description('Classify bookmarks by category and domain using LLM (requires claude or codex CLI)')
     .option('--regex', 'Use simple regex classification instead of LLM')
+    .option('--export', 'Back up all existing SQLite classifications to classifications.jsonl')
     .addOption(engineOption())
     .action(safe(async (options) => {
       if (!requireData()) return;
+
+      if (options.export) {
+        process.stderr.write('Exporting classifications to JSONL...\n');
+        const count = await exportClassifications();
+        console.log(` \u2713 Exported ${count} classifications to classifications.jsonl`);
+        return;
+      }
+
       if (options.regex) {
         process.stderr.write('Classifying bookmarks (regex)...\n');
         const result = await classifyAndRebuild();
diff --git a/src/paths.ts b/src/paths.ts
index 6a41d63..e405874 100644
--- a/src/paths.ts
+++ b/src/paths.ts
@@ -85,6 +85,11 @@ export function twitterBookmarksIndexPath(): string {
   return path.join(dataDir(), 'bookmarks.db');
 }
 
+/** Path of the classifications.jsonl file inside dataDir(). */
+export function twitterClassificationsPath(): string {
+  return path.join(dataDir(), 'classifications.jsonl');
+}
+
 export function preferencesPath(): string {
   return path.join(dataDir(), '.preferences');
 }
diff --git a/src/types.ts b/src/types.ts
index 75fb71e..2bdbe99 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -102,6 +102,17 @@ export interface BookmarkRecord {
   quotedTweetFailedAt?: string;
 }
 
+/** One line of classifications.jsonl: the category/domain labels recorded for a bookmark id. */
+export interface ClassificationRecord {
+  id: string;
+  categories: string[];
+  primaryCategory: string;
+  domains?: string[];
+  primaryDomain?: string;
+  /** ISO-8601 timestamp taken when the record was written. */
+  classifiedAt: string;
+}
+
 export interface BookmarkFolder {
   id: string;
   name: string;