Skip to content

Commit 1f8fdeb

Browse files
committed
Add smart dedup: same-origin similarity check before memory writes
Prevents noisy duplicate memories from the same AI tool writing about the same topic repeatedly. Uses Jaccard word similarity on both title (threshold 50%) and content (threshold 40%) within a 48-hour window. Key design: only deduplicates within the SAME origin. Cross-origin memories about the same topic are preserved as different perspectives (e.g. Claude and ChatGPT can both write about beta strategy). When a duplicate is found, the existing memory is updated in-place with the newer content, tags are merged, and the old version is snapshotted to memory_versions for history. Made-with: Cursor
1 parent 74588e6 commit 1f8fdeb

1 file changed

Lines changed: 112 additions & 0 deletions

File tree

src/memory-service.ts

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,11 +115,123 @@ function buildPaginationClause(opts?: PaginationOptions): { sql: string; params:
115115
return { sql, params };
116116
}
117117

118+
// ---------------------------------------------------------------------------
119+
// Deduplication: same-origin similarity check before creating new memories
120+
// ---------------------------------------------------------------------------
121+
122+
/**
 * Splits free text into a set of normalized word tokens for similarity scoring.
 *
 * The text is NFKC-normalized and lower-cased, every character that is not a
 * Unicode letter, combining mark, number, or whitespace is turned into a
 * separator, and tokens of two characters or fewer are discarded.
 *
 * Letters are matched with Unicode property escapes rather than `[a-z0-9]`:
 * an ASCII-only class erases accented and non-Latin words, which leaves such
 * text with few or no tokens and makes unrelated texts look alike. For pure
 * ASCII input the result is the same as with the ASCII-only class.
 *
 * @param text - Raw title or content text.
 * @returns The distinct tokens that are longer than two characters.
 */
function tokenize(text: string): Set<string> {
  return new Set(
    text
      // Compose decomposed accents and fold compatibility forms so that
      // visually identical words produce identical tokens.
      .normalize("NFKC")
      .toLowerCase()
      .replace(/[^\p{L}\p{M}\p{N}\s]/gu, " ")
      .split(/\s+/)
      .filter((w) => w.length > 2),
  );
}
131+
132+
/**
 * Computes the Jaccard similarity |a ∩ b| / |a ∪ b| of two token sets.
 *
 * @param a - First token set.
 * @param b - Second token set.
 * @returns A value in [0, 1]. Two empty sets are treated as identical and
 *   yield 1; an empty set compared with a non-empty one yields 0.
 */
function jaccardSimilarity(a: Set<string>, b: Set<string>): number {
  if (a.size === 0 && b.size === 0) return 1;

  // Walk the smaller set and probe the larger one: same result, fewer lookups.
  const [smaller, larger] = a.size <= b.size ? [a, b] : [b, a];
  let intersection = 0;
  for (const w of smaller) {
    if (larger.has(w)) intersection++;
  }

  // At least one set is non-empty here, so the union size is always positive.
  return intersection / (a.size + b.size - intersection);
}
141+
142+
/** Minimum Jaccard similarity of the title token sets for a candidate to be considered further. */
const TITLE_SIMILARITY_THRESHOLD = 0.5;

/** Minimum Jaccard similarity of the content token sets for a title-matched candidate to count as a duplicate. */
const CONTENT_SIMILARITY_THRESHOLD = 0.4;

/** How far back, in hours, existing memories are looked up as duplicate candidates. */
const DEDUP_WINDOW_HOURS = 48;
145+
146+
/**
 * Describes the outcome of a memory write that went through deduplication.
 *
 * NOTE(review): nothing in the visible part of this file produces or consumes
 * this type yet; the field descriptions below are inferred from the names and
 * should be confirmed against the callers.
 */
export interface DedupResult {
  /** Whether a new memory was written or the write was folded into an existing one. */
  action: "created" | "merged";

  /** The memory entry resulting from the write. */
  memory: MemoryEntry;

  /** Presumably the id of the existing memory that absorbed the write when `action` is "merged" — TODO confirm. */
  mergedInto?: string;
}
151+
152+
/**
 * Looks for a recent memory that is similar enough to `input` to be treated
 * as a duplicate of it.
 *
 * Candidates are the user's non-deleted memories with the same origin as the
 * input, created within the last `DEDUP_WINDOW_HOURS` hours, newest first and
 * capped at 50 rows. A candidate matches when its title similarity reaches
 * `TITLE_SIMILARITY_THRESHOLD` and its content similarity reaches
 * `CONTENT_SIMILARITY_THRESHOLD`; the newest matching candidate wins.
 *
 * @param db - Database handle used to read the `memories` table.
 * @param userId - Owner whose memories are searched.
 * @param input - The memory about to be written.
 * @returns The first matching row, or `null` when nothing qualifies.
 */
function findSimilarMemory(
  db: Database.Database,
  userId: string,
  input: CreateMemoryInput,
): MemoryRow | null {
  const inputTitleTokens = tokenize(input.title);
  const inputContentTokens = tokenize(input.content);

  // A title or content that produces no tokens (only short words or symbols)
  // carries no similarity evidence. jaccardSimilarity scores two empty sets
  // as 1, so without this guard such an input would match any equally
  // token-less candidate and overwrite an unrelated memory. Keeping a possible
  // duplicate is the safer outcome, and it also saves the query.
  if (inputTitleTokens.size === 0 || inputContentTokens.size === 0) {
    return null;
  }

  const cutoff = new Date(Date.now() - DEDUP_WINDOW_HOURS * 60 * 60 * 1000).toISOString();

  const candidates = db
    .prepare(
      `SELECT ${COLUMNS}
       FROM memories
       WHERE user_id = ? AND origin = ? AND deleted_at IS NULL AND created_at > ?
       ORDER BY created_at DESC
       LIMIT 50`,
    )
    .all(userId, input.origin, cutoff) as MemoryRow[];

  for (const candidate of candidates) {
    // The title comparison is cheap, so it gates the content comparison.
    const titleSim = jaccardSimilarity(inputTitleTokens, tokenize(candidate.title));
    if (titleSim < TITLE_SIMILARITY_THRESHOLD) continue;

    const contentSim = jaccardSimilarity(inputContentTokens, tokenize(candidate.content));
    if (contentSim >= CONTENT_SIMILARITY_THRESHOLD) {
      return candidate;
    }
  }

  return null;
}
186+
187+
/**
 * Combines the tags stored on an existing memory with the tags of a new write.
 *
 * @param existingRaw - Stored tags value; it is passed to `safeJsonArray`,
 *   which presumably decodes a JSON-encoded array (defined elsewhere — confirm).
 * @param newTags - Tags supplied with the incoming write.
 * @returns The distinct tags, existing ones first, each in first-seen order.
 */
function mergeTags(existingRaw: string, newTags: string[]): string[] {
  const combined = safeJsonArray(existingRaw).concat(newTags);
  // A Set keeps insertion order, so deduplicating through it preserves the
  // position of each tag's first occurrence.
  return Array.from(new Set(combined));
}
192+
118193
export function createMemory(
119194
db: Database.Database,
120195
userId: string,
121196
input: CreateMemoryInput,
122197
): MemoryEntry {
198+
const similar = findSimilarMemory(db, userId, input);
199+
200+
if (similar) {
201+
const now = new Date().toISOString();
202+
const mergedTags = mergeTags(similar.tags, input.tags);
203+
const tagsJson = JSON.stringify(mergedTags);
204+
205+
const maxVersion = db
206+
.prepare(`SELECT COALESCE(MAX(version_number), 0) as max_ver FROM memory_versions WHERE memory_id = ?`)
207+
.get(similar.id) as { max_ver: number };
208+
209+
db.prepare(
210+
`INSERT INTO memory_versions (id, memory_id, user_id, title, content, tags, memory_type, origin, allowed_vendors, version_number, created_at)
211+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
212+
).run(
213+
randomUUID(), similar.id, userId, similar.title, similar.content,
214+
similar.tags, similar.memory_type ?? "semantic", similar.origin, similar.allowed_vendors,
215+
maxVersion.max_ver + 1, now,
216+
);
217+
218+
db.prepare(
219+
`UPDATE memories SET title = ?, content = ?, tags = ?, updated_at = ? WHERE id = ? AND user_id = ?`,
220+
).run(input.title, input.content, tagsJson, now, similar.id, userId);
221+
222+
return {
223+
id: similar.id,
224+
title: input.title,
225+
content: input.content,
226+
tags: mergedTags,
227+
origin: input.origin,
228+
allowed_vendors: safeJsonArray(similar.allowed_vendors),
229+
memory_type: (similar.memory_type as MemoryType) ?? "semantic",
230+
created_at: similar.created_at,
231+
updated_at: now,
232+
};
233+
}
234+
123235
const id = randomUUID();
124236
const now = new Date().toISOString();
125237
const tagsJson = JSON.stringify(input.tags);

0 commit comments

Comments
 (0)