Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion src/attune_rag/corpus/directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,10 @@ def __init__(
self._cache = cache
self._loaded: dict[str, RetrievalEntry] | None = None
self._aliases: dict[str, AliasInfo] | None = None
# Cached SHA-256 fingerprint of the loaded corpus. Invalidated
# whenever ``_loaded`` is cleared or rebuilt so it always reflects
# the entries currently in memory.
self._version: str | None = None

def _within_root(self, candidate: Path) -> bool:
try:
Expand Down Expand Up @@ -181,6 +185,10 @@ def _ensure_loaded(self) -> dict[str, RetrievalEntry]:
# index in sync with the most recent build so alias_index reads
# are coherent with the entries the caller just observed.
self._aliases = built_aliases
# Any rebuild invalidates the version fingerprint — even when
# caching is off, the previous cached hash no longer matches
# the entries we just produced.
self._version = None
return built_entries

def entries(self) -> Iterable[RetrievalEntry]:
Expand All @@ -207,12 +215,23 @@ def name(self) -> str:

@property
def version(self) -> str:
    """Return a stable, truncated SHA-256 fingerprint of the loaded corpus.

    The fingerprint is computed once and stored on ``self._version``;
    later reads return the stored value. ``_ensure_loaded`` resets
    ``self._version`` to ``None`` whenever it rebuilds the entries, so a
    stored fingerprint always describes the entries currently in memory.

    Hashing the whole corpus on every access made this property an
    unwitting hot path for consumers that use ``version`` as a cache key,
    which is why the result is memoized here.

    Returns:
        The first 16 hexadecimal characters of a SHA-256 digest taken over
        every entry key and its content, visited in sorted key order.
    """
    # Load first: a rebuild inside ``_ensure_loaded`` clears ``_version``,
    # so the cache check below must run *after* this call.
    entries = self._ensure_loaded()
    if self._version is not None:
        return self._version

    hasher = hashlib.sha256()
    # Sorted iteration keeps the digest independent of dict insertion order.
    for key in sorted(entries):
        entry = entries[key]
        hasher.update(key.encode("utf-8"))
        # A single NUL separates the key from its content ...
        hasher.update(b"\0")
        hasher.update(entry.content.encode("utf-8"))
        # ... and a double NUL terminates the record.
        hasher.update(b"\0\0")

    # Store before returning; an early ``return hasher.hexdigest()[:16]``
    # here would leave the cache permanently empty.
    self._version = hasher.hexdigest()[:16]
    return self._version
44 changes: 44 additions & 0 deletions tests/unit/test_corpus_directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,50 @@ def test_version_changes_when_content_changes(tiny_corpus: Path) -> None:
assert v1 != v2


def test_version_cached_after_first_computation(tiny_corpus: Path, monkeypatch) -> None:
    """Repeated ``version`` reads must construct a SHA-256 hasher only once.

    ``hashlib.sha256`` is swapped for a wrapper that records every call and
    then delegates to the real constructor. Three consecutive reads of
    ``corpus.version`` should agree and trigger exactly one recorded call;
    otherwise any consumer using ``corpus.version`` as a cache key (e.g.
    attune-gui's /api/rag routes) would hash the full corpus per request.
    """
    import hashlib

    original_sha256 = hashlib.sha256
    hasher_creations = []

    def tracking_sha256(*args, **kwargs):
        # Record the call, then behave exactly like the real constructor.
        hasher_creations.append(1)
        return original_sha256(*args, **kwargs)

    # Build the corpus before installing the patch, as construction itself
    # is not what this test measures.
    corpus = DirectoryCorpus(tiny_corpus)
    monkeypatch.setattr(hashlib, "sha256", tracking_sha256)

    observed = [corpus.version for _ in range(3)]

    assert observed[0] == observed[1] == observed[2]
    assert len(hasher_creations) == 1, "version should hash exactly once when content is stable"


def test_version_invalidated_when_corpus_reloaded(tiny_corpus: Path) -> None:
    """A rebuild must discard the stored fingerprint, even for equal content.

    The stored value is overwritten with a sentinel and ``_loaded`` is
    cleared to force a rebuild on the next access. If the invalidation hook
    in ``_ensure_loaded`` fires, the sentinel is replaced by a freshly
    computed hash, which equals the first reading because the files on disk
    did not change.
    """
    stale_marker = "STALE_SENTINEL_X"
    corpus = DirectoryCorpus(tiny_corpus, cache=False)
    baseline = corpus.version

    # Plant a value that no real hash can equal, then drop the loaded
    # entries so the next read has to rebuild.
    corpus._version = stale_marker  # noqa: SLF001 — testing invalidation
    corpus._loaded = None  # noqa: SLF001 — force a rebuild on next access

    recomputed = corpus.version
    assert recomputed != stale_marker
    assert recomputed == baseline  # content unchanged so the recomputed hash matches


def test_retrievalentry_is_frozen_and_hashable(tiny_corpus: Path) -> None:
from dataclasses import FrozenInstanceError

Expand Down
Loading