Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 39 additions & 5 deletions src/bookmark-classify-llm.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
*/

import { openDb, saveDb } from './db.js';
import { twitterBookmarksIndexPath } from './paths.js';
import { twitterBookmarksIndexPath, twitterClassificationsPath } from './paths.js';
import { appendLine } from './fs.js';
import type { ResolvedEngine } from './engine.js';
import { invokeEngine } from './engine.js';
import type { ClassificationRecord } from './types.js';

const BATCH_SIZE = 50;

Expand Down Expand Up @@ -180,7 +182,7 @@ export async function classifyWithLlm(
try {
// Fetch unclassified bookmarks
const rows = db.exec(
`SELECT id, text, author_handle, links_json FROM bookmarks
`SELECT id, text, author_handle, links_json, domains, primary_domain FROM bookmarks
WHERE primary_category = 'unclassified' OR primary_category IS NULL
ORDER BY RANDOM()`
);
Expand All @@ -189,11 +191,13 @@ export async function classifyWithLlm(
return { engine: engine.name, totalUnclassified: 0, classified: 0, failed: 0, batches: 0 };
}

const unclassified: UnclassifiedBookmark[] = rows[0].values.map(r => ({
const unclassified: (UnclassifiedBookmark & { domains: string | null; primaryDomain: string | null })[] = rows[0].values.map(r => ({
id: r[0] as string,
text: r[1] as string,
authorHandle: r[2] as string | null,
links: r[3] as string | null,
domains: r[4] as string | null,
primaryDomain: r[5] as string | null,
}));

const totalUnclassified = unclassified.length;
Expand All @@ -218,8 +222,23 @@ export async function classifyWithLlm(
const stmt = db.prepare(
`UPDATE bookmarks SET categories = ?, primary_category = ? WHERE id = ?`
);
const now = new Date().toISOString();
const classificationsPath = twitterClassificationsPath();

const batchMap = new Map(batch.map(b => [b.id, b]));

for (const r of results) {
stmt.run([r.categories.join(','), r.primary, r.id]);
const b = batchMap.get(r.id);
const record: ClassificationRecord = {
id: r.id,
categories: r.categories,
primaryCategory: r.primary,
domains: b?.domains ? b.domains.split(',') : undefined,
primaryDomain: b?.primaryDomain ?? undefined,
classifiedAt: now,
};
await appendLine(classificationsPath, JSON.stringify(record));
}
stmt.free();

Expand Down Expand Up @@ -299,19 +318,20 @@ export async function classifyDomainsWithLlm(
? '1=1'
: 'primary_domain IS NULL';
const rows = db.exec(
`SELECT id, text, author_handle, categories FROM bookmarks
`SELECT id, text, author_handle, categories, primary_category FROM bookmarks
WHERE ${where} ORDER BY RANDOM()`
);

if (!rows.length || !rows[0].values.length) {
return { engine: engine.name, totalUnclassified: 0, classified: 0, failed: 0, batches: 0 };
}

const bookmarks: DomainBookmark[] = rows[0].values.map(r => ({
const bookmarks: (DomainBookmark & { primaryCategory: string | null })[] = rows[0].values.map(r => ({
id: r[0] as string,
text: r[1] as string,
authorHandle: r[2] as string | null,
categories: r[3] as string | null,
primaryCategory: r[4] as string | null,
}));

const total = bookmarks.length;
Expand All @@ -335,8 +355,22 @@ export async function classifyDomainsWithLlm(
const stmt = db.prepare(
`UPDATE bookmarks SET domains = ?, primary_domain = ? WHERE id = ?`
);
const now = new Date().toISOString();
const classificationsPath = twitterClassificationsPath();
const batchMap = new Map(batch.map(b => [b.id, b]));

for (const r of results) {
stmt.run([r.categories.join(','), r.primary, r.id]);
const b = batchMap.get(r.id);
const record: ClassificationRecord = {
id: r.id,
categories: b?.categories ? b.categories.split(',') : [],
primaryCategory: b?.primaryCategory ?? 'unclassified',
domains: r.categories,
primaryDomain: r.primary,
classifiedAt: now,
};
await appendLine(classificationsPath, JSON.stringify(record));
}
stmt.free();

Expand Down
72 changes: 69 additions & 3 deletions src/bookmarks-db.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import type { Database } from 'sql.js';
import { openDb, saveDb } from './db.js';
import { parseTimestampMs, toIsoDate } from './date-utils.js';
import { readJsonLines } from './fs.js';
import { twitterBookmarksCachePath, twitterBookmarksIndexPath } from './paths.js';
import type { BookmarkRecord, QuotedTweetSnapshot } from './types.js';
import { readJsonLines, writeJsonLines } from './fs.js';
import { twitterBookmarksCachePath, twitterBookmarksIndexPath, twitterClassificationsPath } from './paths.js';
import type { BookmarkRecord, QuotedTweetSnapshot, ClassificationRecord } from './types.js';
import { classifyCorpus, formatClassificationSummary } from './bookmark-classify.js';
import type { ClassificationSummary } from './bookmark-classify.js';

Expand Down Expand Up @@ -480,6 +480,41 @@ export async function buildIndex(options?: { force?: boolean }): Promise<{ dbPat
}
} catch { /* table may be empty */ }

// Backup source of truth: classifications.jsonl
const classificationsPath = twitterClassificationsPath();
const storedClassifications = await readJsonLines<ClassificationRecord>(classificationsPath);
for (const c of storedClassifications) {
const existing = existingRows.get(c.id);
if (existing) {
// Hydrate domains if missing in DB but present in JSONL
if (!existing.domains && c.domains) {
existing.domains = c.domains.join(',');
existing.primaryDomain = c.primaryDomain ?? null;
}
// Hydrate categories if missing in DB but present in JSONL
if (!existing.categories && c.categories) {
existing.categories = c.categories.join(',');
existing.primaryCategory = c.primaryCategory;
}
} else {
// Create new placeholder for untracked record that has classification data
existingRows.set(c.id, {
categories: c.categories.join(','),
primaryCategory: c.primaryCategory,
domains: c.domains?.join(',') ?? null,
primaryDomain: c.primaryDomain ?? null,
githubUrls: null,
quotedTweetJson: null,
articleTitle: null,
articleText: null,
articleSite: null,
enrichedAt: null,
folderIds: null,
folderNames: null,
});
}
}

const newRecords: BookmarkRecord[] = records.filter(r => !existingRows.has(r.id));

if (records.length > 0) {
Expand Down Expand Up @@ -1235,3 +1270,34 @@ export function formatSearchResults(results: SearchResult[]): string {
})
.join('\n\n');
}

/**
 * Back up the classifications stored in the SQLite bookmarks index to
 * `classifications.jsonl`.
 *
 * Every row whose `primary_category` is set and is not `'unclassified'` is
 * converted to a `ClassificationRecord` and the full list is handed to
 * `writeJsonLines`. All exported records share one `classifiedAt` value: the
 * time of the export, not the time the bookmark was originally classified.
 *
 * @returns The number of records written, or 0 when no row matches (in which
 *   case `writeJsonLines` is not called at all).
 */
export async function exportClassifications(): Promise<number> {
  const db = await openDb(twitterBookmarksIndexPath());

  // Comma-joined column -> list. `''.split(',')` yields `['']`, so blank
  // entries are dropped to avoid exporting an empty-string label.
  const splitList = (value: unknown): string[] =>
    typeof value === 'string' ? value.split(',').filter((part) => part.length > 0) : [];

  try {
    // Run migrations inside the try so the handle is still closed if they throw.
    ensureMigrations(db);

    const rows = db.exec(
      `SELECT id, categories, primary_category, domains, primary_domain
FROM bookmarks
WHERE primary_category IS NOT NULL AND primary_category != 'unclassified'`
    );

    const result = rows[0];
    if (!result || !result.values.length) return 0;

    const now = new Date().toISOString();
    const records: ClassificationRecord[] = result.values.map((row) => {
      const domains = splitList(row[3]);
      return {
        id: row[0] as string,
        categories: splitList(row[1]),
        primaryCategory: row[2] as string,
        // Omit the optional fields entirely when the DB has nothing for them.
        domains: domains.length > 0 ? domains : undefined,
        primaryDomain: (row[4] as string | null) ?? undefined,
        classifiedAt: now,
      };
    });

    await writeJsonLines(twitterClassificationsPath(), records);
    return records.length;
  } finally {
    db.close();
  }
}
10 changes: 10 additions & 0 deletions src/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import {
getDomainCounts,
getFolderCounts,
listBookmarks,
exportClassifications,
getBookmarkById,
} from './bookmarks-db.js';
import { formatClassificationSummary } from './bookmark-classify.js';
Expand Down Expand Up @@ -1097,9 +1098,18 @@ export function buildCli() {
.command('classify')
.description('Classify bookmarks by category and domain using LLM (requires claude or codex CLI)')
.option('--regex', 'Use simple regex classification instead of LLM')
.option('--export', 'Back up all existing SQLite classifications to classifications.jsonl')
.addOption(engineOption())
.action(safe(async (options) => {
if (!requireData()) return;

if (options.export) {
process.stderr.write('Exporting classifications to JSONL...\n');
const count = await exportClassifications();
console.log(` \u2713 Exported ${count} classifications to classifications.jsonl`);
return;
}

if (options.regex) {
process.stderr.write('Classifying bookmarks (regex)...\n');
const result = await classifyAndRebuild();
Expand Down
4 changes: 4 additions & 0 deletions src/paths.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,10 @@ export function twitterBookmarksIndexPath(): string {
return path.join(dataDir(), 'bookmarks.db');
}

/** Path of the `classifications.jsonl` file inside the directory returned by `dataDir()`. */
export function twitterClassificationsPath(): string {
  const fileName = 'classifications.jsonl';
  return path.join(dataDir(), fileName);
}

/** Path of the `.preferences` file inside the directory returned by `dataDir()`. */
export function preferencesPath(): string {
  const fileName = '.preferences';
  return path.join(dataDir(), fileName);
}
Expand Down
9 changes: 9 additions & 0 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,15 @@ export interface BookmarkRecord {
quotedTweetFailedAt?: string;
}

/**
 * One line of `classifications.jsonl`: the classification result for a single
 * bookmark, kept as a backup alongside the SQLite index.
 */
export interface ClassificationRecord {
  /** Id of the bookmark this classification belongs to. */
  id: string;
  /** Category labels; the `bookmarks.categories` column stores these comma-joined. */
  categories: string[];
  /** Primary category; writers fall back to `'unclassified'` when none is known. */
  primaryCategory: string;
  /** Domain labels, omitted when the bookmark has no domain classification. */
  domains?: string[];
  /** Primary domain, omitted when the bookmark has no domain classification. */
  primaryDomain?: string;
  /** Timestamp string produced by `Date.prototype.toISOString()` when the record was written. */
  classifiedAt: string;
}

export interface BookmarkFolder {
id: string;
name: string;
Expand Down