From 523a47b7da17662e996a8e1a525f4a55ab04d9dc Mon Sep 17 00:00:00 2001 From: GeneAI Date: Thu, 7 May 2026 21:21:44 -0400 Subject: [PATCH] perf(retrieval): memoize per-entry tokens on the entry itself MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Code-review 2026-05-07: the keyword retriever's ``_score_entry`` re-tokenized path / summary / content-preview / aliases — and looped over related entries' summaries — on EVERY query against EVERY entry. For a corpus of N entries answering Q queries, that's N*Q tokenization passes, all redoing identical work. Add a ``_tokens_cache`` field on :class:`RetrievalEntry` (frozen dataclass; ``compare=False, hash=False, repr=False`` so equality, hashing, and repr are unchanged) and route the retriever through two helpers: - ``_entry_field_tokens`` — keyed by ``("field_tokens", CONTENT_PREVIEW_CHARS)``. Computes path/summary/content-preview/aliases once per entry per preview size. Subclasses with different preview sizes get independent cache slots automatically; retrievers sharing a preview size share a slot. - ``_related_summary_tokens`` — keyed by ``("related_tokens", corpus.name)``. Computes the union of related-entry summary tokens once per (entry, corpus name) pair. Cache lives on the entry, so when a corpus rebuilds (new entry instances), it's naturally fresh. Three new tests cover: cache populated on first call (same dict returned on subsequent calls), preview-size-keyed independence, and the bottom line — repeated ``_score_entry`` calls against the same entry don't re-tokenize the entry's fields. 306 passed (up from 303). 
Co-Authored-By: Claude Opus 4.7 --- src/attune_rag/corpus/base.py | 19 +++++++- src/attune_rag/retrieval.py | 62 +++++++++++++++++++----- tests/unit/test_retrieval.py | 89 +++++++++++++++++++++++++++++++++++ 3 files changed, 156 insertions(+), 14 deletions(-) diff --git a/src/attune_rag/corpus/base.py b/src/attune_rag/corpus/base.py index 00770bc..b8dbec1 100644 --- a/src/attune_rag/corpus/base.py +++ b/src/attune_rag/corpus/base.py @@ -9,7 +9,18 @@ @dataclass(frozen=True) class RetrievalEntry: - """A single corpus entry. Task 1.1 shape; task 1.2 wires loaders.""" + """A single corpus entry. Task 1.1 shape; task 1.2 wires loaders. + + The ``_tokens_cache`` field is a per-instance mutable cache used by + :mod:`attune_rag.retrieval` to memoize tokenized representations + (path / summary / content-preview / aliases) so the keyword + retriever doesn't re-tokenize on every query. ``frozen=True`` + prevents the field itself from being reassigned but doesn't stop + callers from mutating its contents — exactly what we want for a + write-once-on-first-access cache. Excluded from hash, equality, + and repr so two entries with the same content compare equal even + if one has cached tokens and the other hasn't. + """ path: str category: str @@ -18,6 +29,12 @@ class RetrievalEntry: related: tuple[str, ...] = () aliases: tuple[str, ...] 
= () metadata: dict[str, Any] = field(default_factory=dict) + _tokens_cache: dict[Any, Any] = field( + default_factory=dict, + compare=False, + hash=False, + repr=False, + ) class AliasInfo(TypedDict): diff --git a/src/attune_rag/retrieval.py b/src/attune_rag/retrieval.py index 6f808db..3bf4ef1 100644 --- a/src/attune_rag/retrieval.py +++ b/src/attune_rag/retrieval.py @@ -170,25 +170,61 @@ class KeywordRetriever: def _category_weight(self, entry: RetrievalEntry) -> float: return self.CATEGORY_WEIGHTS.get(entry.category, self.DEFAULT_CATEGORY_WEIGHT) + def _entry_field_tokens(self, entry: RetrievalEntry) -> dict[str, set[str]]: + """Memoized tokens for the entry's own fields (path/summary/content/aliases). + + Stored on the entry itself via the ``_tokens_cache`` sidecar so + the keyword retriever stops re-tokenizing on every query — the + review's primary perf concern. Keyed by ``CONTENT_PREVIEW_CHARS`` + so a retriever subclass with a different preview size sees + independent cache entries instead of stale ones. + """ + cache_key = ("field_tokens", self.CONTENT_PREVIEW_CHARS) + cached = entry._tokens_cache.get(cache_key) + if cached is not None: + return cached + tokens = { + "path": _tokenize(entry.path), + "summary": _tokenize(entry.summary or ""), + "content_preview": _tokenize(entry.content[: self.CONTENT_PREVIEW_CHARS]), + "aliases": _tokenize(" ".join(entry.aliases)), + } + entry._tokens_cache[cache_key] = tokens + return tokens + + def _related_summary_tokens(self, entry: RetrievalEntry, corpus: CorpusProtocol) -> set[str]: + """Memoized union of related-entry summary tokens. + + Cached on the entry under a key that includes the corpus name so + the same entry surfaced across different corpora gets independent + caches. When a corpus rebuilds, fresh entries are created and + the cache is naturally empty. 
+ """ + cache_key = ("related_tokens", corpus.name) + cached = entry._tokens_cache.get(cache_key) + if cached is not None: + return cached + related_tokens: set[str] = set() + for related_path in entry.related: + related_entry = corpus.get(related_path) + if related_entry is None or not related_entry.summary: + continue + related_tokens |= _tokenize(related_entry.summary) + entry._tokens_cache[cache_key] = related_tokens + return related_tokens + def _score_entry( self, query_tokens: set[str], entry: RetrievalEntry, corpus: CorpusProtocol, ) -> tuple[float, str]: - path_tokens = _tokenize(entry.path) - summary_tokens = _tokenize(entry.summary or "") - content_preview = entry.content[: self.CONTENT_PREVIEW_CHARS] - content_tokens = _tokenize(content_preview) - - related_summary_tokens: set[str] = set() - for related_path in entry.related: - related_entry = corpus.get(related_path) - if related_entry is None or not related_entry.summary: - continue - related_summary_tokens |= _tokenize(related_entry.summary) - - aliases_tokens = _tokenize(" ".join(entry.aliases)) + field_tokens = self._entry_field_tokens(entry) + path_tokens = field_tokens["path"] + summary_tokens = field_tokens["summary"] + content_tokens = field_tokens["content_preview"] + aliases_tokens = field_tokens["aliases"] + related_summary_tokens = self._related_summary_tokens(entry, corpus) path_hits_raw = len(query_tokens & path_tokens) path_hits = min(path_hits_raw, self.PATH_HIT_CAP) diff --git a/tests/unit/test_retrieval.py b/tests/unit/test_retrieval.py index a52a00d..684d8b9 100644 --- a/tests/unit/test_retrieval.py +++ b/tests/unit/test_retrieval.py @@ -344,3 +344,92 @@ def test_stemming_preserves_short_tokens() -> None: assert _stem("is") == "is" assert _stem("on") == "on" assert _stem("bug") == "bug" + + +# --------------------------------------------------------------------------- +# Per-entry token cache (precomputation on first use, reused on subsequent) +# 
--------------------------------------------------------------------------- + + +def test_entry_field_tokens_cached_after_first_call() -> None: + """``_entry_field_tokens`` must populate ``_tokens_cache`` and reuse it.""" + from attune_rag.retrieval import KeywordRetriever + + entry = _entry( + path="concepts/example.md", + category="concepts", + summary="explains the example flow", + content="example flow content", + ) + retriever = KeywordRetriever() + + first = retriever._entry_field_tokens(entry) + second = retriever._entry_field_tokens(entry) + + # Same dict instance returned on both calls — no recomputation + assert first is second + # Cache is keyed by ("field_tokens", CONTENT_PREVIEW_CHARS) + assert ("field_tokens", retriever.CONTENT_PREVIEW_CHARS) in entry._tokens_cache + + +def test_entry_field_tokens_recomputed_when_preview_size_differs() -> None: + """A retriever with a different ``CONTENT_PREVIEW_CHARS`` keys its + own cache slot, so the two don't collide. + """ + from attune_rag.retrieval import KeywordRetriever + + class WidePreview(KeywordRetriever): + CONTENT_PREVIEW_CHARS = 200 # narrower than the 500 default + + entry = _entry( + path="concepts/example.md", + category="concepts", + summary="example", + content="long content " * 100, + ) + + default = KeywordRetriever()._entry_field_tokens(entry) + narrow = WidePreview()._entry_field_tokens(entry) + + assert default is not narrow + assert ("field_tokens", 500) in entry._tokens_cache + assert ("field_tokens", 200) in entry._tokens_cache + + +def test_score_entry_does_not_re_tokenize_on_repeat_calls() -> None: + """The hot path: scoring the same entry against multiple queries + should tokenize the entry once. Patches ``_tokenize`` to count. 
+ """ + from attune_rag import retrieval as rmod + from attune_rag.retrieval import KeywordRetriever + + real_tokenize = rmod._tokenize + calls = 0 + + def counting(text): + nonlocal calls + calls += 1 + return real_tokenize(text) + + entry = _entry( + path="concepts/example.md", + category="concepts", + summary="explains the example flow", + content="example flow content", + ) + corpus = FakeCorpus([entry]) + retriever = KeywordRetriever() + + # Prime the cache by calling once with one query + rmod._tokenize = counting + try: + retriever._score_entry({"example"}, entry, corpus) + first_pass = calls + # Subsequent scorings of the SAME entry against different queries + # must not re-tokenize the entry's fields. + retriever._score_entry({"flow"}, entry, corpus) + retriever._score_entry({"content"}, entry, corpus) + finally: + rmod._tokenize = real_tokenize + + assert calls == first_pass, "field tokens must not be recomputed across queries"