diff --git a/src/attune_rag/corpus/directory.py b/src/attune_rag/corpus/directory.py index 5b567fe..4955541 100644 --- a/src/attune_rag/corpus/directory.py +++ b/src/attune_rag/corpus/directory.py @@ -95,6 +95,10 @@ def __init__( self._cache = cache self._loaded: dict[str, RetrievalEntry] | None = None self._aliases: dict[str, AliasInfo] | None = None + # Cached SHA-256 fingerprint of the loaded corpus. Invalidated + # whenever ``_loaded`` is cleared or rebuilt so it always reflects + # the entries currently in memory. + self._version: str | None = None def _within_root(self, candidate: Path) -> bool: try: @@ -181,6 +185,10 @@ def _ensure_loaded(self) -> dict[str, RetrievalEntry]: # index in sync with the most recent build so alias_index reads # are coherent with the entries the caller just observed. self._aliases = built_aliases + # Any rebuild invalidates the version fingerprint — even when + # caching is off, the previous cached hash no longer matches + # the entries we just produced. + self._version = None return built_entries def entries(self) -> Iterable[RetrievalEntry]: @@ -207,7 +215,17 @@ def name(self) -> str: @property def version(self) -> str: + """Stable SHA-256 fingerprint of the loaded corpus. + + Cached after the first computation and invalidated whenever + ``_ensure_loaded`` rebuilds. The previous implementation hashed + the entire corpus on every call, which made this property an + unwitting hot path for any consumer using ``version`` as a cache + key (every API request in attune-gui's RAG route, for example). + """ entries = self._ensure_loaded() + if self._version is not None: + return self._version hasher = hashlib.sha256() for key in sorted(entries): entry = entries[key] @@ -215,4 +233,5 @@ def version(self) -> str: hasher.update(b"\0") hasher.update(entry.content.encode("utf-8")) hasher.update(b"\0\0") - return hasher.hexdigest()[:16] + self._version = hasher.hexdigest()[:16] + return self._version diff --git a/tests/unit/test_corpus_directory.py b/tests/unit/test_corpus_directory.py index 85fa25c..bac691a 100644 --- a/tests/unit/test_corpus_directory.py +++ b/tests/unit/test_corpus_directory.py @@ -128,6 +128,50 @@ def test_version_changes_when_content_changes(tiny_corpus: Path) -> None: assert v1 != v2 +def test_version_cached_after_first_computation(tiny_corpus: Path, monkeypatch) -> None: + """``version`` hashes once and reuses the result on subsequent reads. + + Without this, every API request that uses ``corpus.version`` as a + cache key (e.g. attune-gui's /api/rag routes) hashes the full corpus. + """ + import hashlib + + corpus = DirectoryCorpus(tiny_corpus) + + sha256_calls = 0 + real_sha256 = hashlib.sha256 + + def counting_sha256(*args, **kwargs): + nonlocal sha256_calls + sha256_calls += 1 + return real_sha256(*args, **kwargs) + + monkeypatch.setattr(hashlib, "sha256", counting_sha256) + + v1 = corpus.version + v2 = corpus.version + v3 = corpus.version + + assert v1 == v2 == v3 + assert sha256_calls == 1, "version should hash exactly once when content is stable" + + +def test_version_invalidated_when_corpus_reloaded(tiny_corpus: Path) -> None: + """A no-cache corpus should produce a fresh version after a rebuild, + even if the content didn't change — proves the invalidation hook + fires inside ``_ensure_loaded`` rather than relying on content drift. + """ + corpus = DirectoryCorpus(tiny_corpus, cache=False) + v1 = corpus.version + # Force the first cached value to a sentinel; after a rebuild the + # real hash should overwrite it. + corpus._version = "STALE_SENTINEL_X" # noqa: SLF001 — testing invalidation + corpus._loaded = None # noqa: SLF001 — force a rebuild on next access + v2 = corpus.version + assert v2 != "STALE_SENTINEL_X" + assert v2 == v1 # content unchanged so the recomputed hash matches + + def test_retrievalentry_is_frozen_and_hashable(tiny_corpus: Path) -> None: from dataclasses import FrozenInstanceError