diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json index 4eab899..128069e 100644 --- a/.claude-plugin/plugin.json +++ b/.claude-plugin/plugin.json @@ -1,8 +1,15 @@ { "name": "agentlib", - "version": "1.4.0", - "description": "Agentic Knowledge Navigation — ingest books/papers/databases into chunked metadata layers, then navigate them via a universal skill. No MCP server required.", + "version": "1.8.0", + "description": "Agentic Knowledge Navigation — ingest books and papers into a curated library, then navigate via MCP tools or file-based agent.", "author": { "name": "Nadav Barkai" + }, + "mcpServers": { + "agentlib": { + "command": "/usr/bin/env", + "args": ["uv", "run", "--project", "${CLAUDE_PLUGIN_ROOT}", "python", "${CLAUDE_PLUGIN_ROOT}/server.py"], + "cwd": "${CLAUDE_PLUGIN_ROOT}" + } } } diff --git a/README.md b/README.md index 226dee9..157a11c 100644 --- a/README.md +++ b/README.md @@ -20,36 +20,47 @@ AgentLib changes this. Ingest the books, papers, and documents that matter for y AgentLib has three parts: 1. **Ingestion pipelines** — preprocess books, scientific paper corpora, and databases into small, self-contained chunks with lightweight metadata at multiple layers. -2. **Universal navigation skill** (`agentlib-knowledge`) — teaches the agent to read cheap metadata first, then drill into specific chunks. -3. **Research agent** (`library-researcher`) — runs in an isolated context to keep the main conversation clean. All navigation and chunk reading happens in the agent's context; only a synthesized answer returns. +2. **MCP tools** — the plugin registers an MCP server with 6 tools: `browse_library`, `open_book`, `search_library`, `search_concepts`, `preview_chunks`, `read_chunks`. The agent calls these directly — no sub-agent needed. +3. **Universal navigation skill** (`agentlib-knowledge`) — teaches the agent to search cheap metadata first, then drill into specific chunks via `search_library` → `preview_chunks` → `read_chunks`. -No MCP server required. No tool calls. The agent reads preprocessed files directly from `~/.claude/plugins/agentlib/library/`. +The agent navigates via MCP tool calls against preprocessed files in `~/.claude/plugins/agentlib/library/`. ### How agents navigate the library ```mermaid graph LR - Q["User question"] --> R["library-researcher
(isolated context)"] - R --> NAV["NAVIGATION.md
~50 tok per book"] - R --> CS["concepts.json (Ls)
~200 tok"] - R --> CAT["catalog (L0)
~50 tok per book"] + Q["User question"] --> SL["search_library
concepts + patterns
library_index.json"] + SL --> PC["preview_chunks
chunk metadata
nav.json"] + PC --> RC["read_chunks
2-3 best chunks
300-500 tok each"] + RC --> A["Answer with citations"] +``` - CS --> M{"concept/alias
match?"} - M -- hit --> CH["chunks (L2)
300-500 tok each"] - M -- miss --> MAN["manifest (L1)
~500 tok"] - MAN --> CH +**Fast path (concept hit):** `search_library` → `preview_chunks` → `read_chunks` — **3 tool calls, ~1.5k tokens** - NAV --> CH - CAT --> MAN +**Pattern path (cross-domain):** `search_library` (pattern tags) → `preview_chunks` → `read_chunks` — **3 tool calls, ~2.5k tokens** - CH --> A["Synthesized answer
(returned to user)"] -``` +**Recovery on miss:** related concepts → pattern traversal → `search_concepts` per book → Grep fallback + +#### Unified library index + +`library_index.json` is the single entry point for the entire library. One file, all books and corpora — queried via `search_library`. Each concept carries: + +- **aliases** — abbreviations, acronyms, synonyms (searching "CDX" matches "CycloneDX") +- **related** — directly connected concepts in the same domain ("OAuth 2.0" → "JWT", "access tokens") +- **patterns** — abstract structural fingerprints for cross-domain discovery (see below) +- **sources** — which books/papers contain the concept and their chunk IDs -**Ls hit (fast path):** NAVIGATION → concepts.json → chunks — **2-3 reads, ~1k tokens** +#### Pattern fingerprints — associative recall -**Ls miss (slow path):** NAVIGATION → catalog → manifest → chunks — **5-6 reads, ~5k tokens** +Every concept is tagged with 2-3 **pattern fingerprints**: abstract, domain-independent descriptors of its structural nature. These enable a "this reminds me of..." capability that keyword search can never provide. -The concept index includes **aliases** (abbreviations, acronyms, synonyms) generated by the LLM at ingestion time. Searching "CDX" matches the alias on "CycloneDX"; searching "SBOM" matches "Software Bill of Materials". This turns misses into hits without any runtime cost. +For example, "OAuth token rotation", "TLS certificate renewal", and "SSH key rotation" all share the pattern `credential-cycling`. An agent reading about token rotation can discover structurally analogous solutions in completely different books — without any keyword overlap. + +Pattern tags are integrated directly into `library_index.json` and searchable via `search_library`. A seed vocabulary of ~40 common patterns ensures consistency across books; fuzzy matching merges near-duplicates. + +#### Chunk preview via nav.json + +Each book's `nav.json` lets agents see what's inside each chunk *before* reading it: section title, concepts covered, token count, and prev/next chains. Queried via `preview_chunks`, this eliminates blind reads — the agent picks the 2-3 best chunks from a set of candidates instead of reading 5 and hoping.

[Screenshot: AgentLib proactive library query] @@ -64,37 +75,41 @@ The concept index includes **aliases** (abbreviations, acronyms, synonyms) gener

-### Three metadata layers +### Metadata layers ``` -L0 "What exists?" → catalog/NAVIGATION.md: ~50 tokens per book (cheap) -L1 "What's inside?" → manifest: structure, summaries, concepts (moderate) -L2 "Give me the content" → small self-contained chunks, 300-500 tok (expensive) +Lx "What do I know?" → library_index.json: concepts, patterns, sources (search_library) +Ln "What's in a book?" → nav.json: structure + chunk metadata + concepts (preview_chunks) +L2 "Give me the content" → chunks: 300-500 tok each (read_chunks) +Lf "Full rebuild" → manifest.json: complete archive per book (offline) ``` +Three files instead of six — `library_index.json` (1 file, entire library), `nav.json` (per book), and `manifest.json` (per book, full archive for rebuild). + Chunks are **content-aware**: tables and code fences are kept atomic (soft cap 500, hard cap 1 000 tokens). PDF tables are extracted via PyMuPDF and rendered as markdown pipe tables. Figures are extracted from PDFs with vision-based summarization, appearing as placeholders in chunks. -Plus a **concept index** shortcut (Ls) that jumps directly to relevant chunks when the agent already knows what it's looking for. Each concept carries LLM-generated aliases so the agent can find it by abbreviation, acronym, or alternative phrasing. +The concept index includes LLM-generated **aliases**, **related concepts**, and **pattern fingerprints** — turning keyword misses into graph traversals and enabling cross-domain discovery. ### Library structure ``` library/ -├── NAVIGATION.md ← Start here — index of everything +├── library_index.json ← Lx: unified concept + pattern discovery ├── books/ -│ ├── catalog.json ← L0 +│ ├── catalog.json │ └── {book-id}/ -│ ├── manifest.compact.json ← L1 -│ ├── concepts.json ← Ls +│ ├── nav.json ← Ln: structure + chunk metadata + concepts +│ ├── manifest.json ← Lf: full archive for rebuild │ └── chunks/ │ └── {chunk-id}.md ← L2 └── corpus/ └── {corpus-id}/ - ├── corpus_catalog.json ← L0 (topic clusters) - ├── concept_index.json ← Ls (cross-paper concepts) - ├── clusters/{cluster-id}.json ← L0b (papers per cluster) + ├── corpus_catalog.json + ├── concept_index.json + ├── clusters/{cluster-id}.json └── papers/{paper-id}/ - ├── manifest.compact.json ← L1 + ├── nav.json ← Ln + ├── manifest.json ← Lf └── chunks/{chunk-id}.md ← L2 ``` @@ -165,7 +180,7 @@ Simulated on realistic workloads (15-book library, 487-paper corpus, 80-table da | Wrong reads/queries | 1 | 0 | 1 | 0 | 2 | 0 | | **Token reduction** | | **82%** | | **55%** | | **55%** | -The core principle: *no heavy indexing, no vector databases — just smart, lightweight metadata and small content blobs.* +The core principle: *no vector databases — just smart, interconnected metadata structures. Concepts link to related concepts, abstract patterns connect ideas across domains, and chunk previews eliminate blind reads.* ## Install @@ -211,7 +226,7 @@ Ingestion runs chapter summarization in parallel and batches concept extraction **Explicit invocation** — prefix with `/agentlib-knowledge` when you want the library's answer, not Claude's training data: > /agentlib-knowledge What defensive techniques protect against prompt injection? -The skill delegates to the `library-researcher` agent, which navigates `NAVIGATION.md` → concept indexes → specific chunks in an isolated context. Only the synthesized answer with citations returns to your conversation. +The skill uses MCP tools directly: `search_library` → `preview_chunks` → `read_chunks`. 
The result is a synthesized answer with citations. Pattern tags in `search_library` results enable cross-domain analogies automatically. ## LLM Providers diff --git a/agents/library-researcher.md b/agents/library-researcher.md index cd8d78c..fe787f6 100644 --- a/agents/library-researcher.md +++ b/agents/library-researcher.md @@ -1,42 +1,60 @@ --- name: library-researcher description: "Research questions using the preprocessed knowledge library. Use when answering questions about ingested books, scientific papers, or domain knowledge that may be in the library." -model: haiku +model: sonnet tools: Read, Glob, Grep -maxTurns: 15 +maxTurns: 25 --- You are a research assistant. Follow this sequence to answer questions. **IMPORTANT:** Use ABSOLUTE paths only — never use `~/` (it won't resolve in your context). The library path will be provided in your prompt. -## Step 1: Read the index (1 read) -Read `{library}/NAVIGATION.md`. Identify which books or corpora are relevant. +## Step 1: Unified library search (1 read) +Read `{library}/library_index.json`. This contains ALL concepts across ALL books and corpora with: +- **aliases**: alternative names, abbreviations, acronyms +- **related**: directly connected concepts in the same domain +- **patterns**: abstract structural fingerprints (e.g. "credential-cycling", "retry-with-backoff") +- **sources**: which books/papers contain this concept and their chunk IDs -## Step 2: Find chunk IDs (1-2 reads) +If `library_index.json` doesn't exist, fall back to reading `{library}/NAVIGATION.md` and then per-book `nav.json`. -**Try concepts.json first** (fastest): -- Books: `{library}/books/{book-id}/concepts.json` -- Corpora: `{library}/corpus/{corpus-id}/concept_index.json` +## Step 2: Preview chunks — MANDATORY (1 read) +**NEVER read chunk files without previewing first.** This is the most important efficiency rule. -Each concept has `"chunks"` (list of chunk IDs) and optionally `"aliases"` (alternative names, abbreviations, acronyms). When scanning for your topic, check BOTH the concept name AND its aliases — your search term may match an alias rather than the primary name. +Read `{library}/books/{book-id}/nav.json` to assess candidates: +- The `chunks` section shows each chunk's **section**, **concepts**, **token count**, and **prev/next** links +- The `concepts` section maps concept names to their chunk IDs -If concepts.json has a match → note chunk IDs → go to Step 3. +Pick only the 2-3 most relevant chunks. Skip chunks whose section/concepts don't match your query. Reading unnecessary chunks wastes tokens. -**If no match in concepts**, use Grep on chunks directory: -``` -Grep pattern: "your search term" path: "{library}/books/{book-id}/chunks/" -``` -This finds which chunks contain relevant content. Note the filenames. +## Step 2b: Cross-domain insight (optional) +If the concept has **pattern** tags (e.g. "credential-cycling"), look up the pattern in `library_index.json`'s `patterns` section to discover structurally similar concepts in other domains. This enables "this reminds me of..." connections. + +Only do this when the user's question could benefit from cross-domain analogies. ## Step 3: Read chunks (2-5 reads) Read the specific chunk files identified in Step 2. 
+- If you need more context, follow **prev/next** links from nav.json +- Books: `{library}/books/{book-id}/chunks/{chunk-id}.md` +- Corpora: `{library}/corpus/{corpus-id}/papers/{paper-id}/chunks/{chunk-id}.md` ## Step 4: Return answer -Synthesize a clear answer citing source (book/paper title and chunk IDs). +Synthesize a clear answer citing sources (book/paper title and chunk IDs). Keep your response under 2000 characters. Cite sources but don't include raw chunk text. + +If patterns revealed cross-domain analogies, mention them: "This follows the same structural pattern as [X] in [other book]." + +## Recovery: concept miss +If library_index.json has no match: +1. Check **related** concepts — your term may be a sub-concept of something indexed +2. Check **pattern** tags in library_index.json — search by structural shape instead of name +3. Fall back to `{library}/books/{book-id}/nav.json` concepts section with alias matching +4. Last resort: Grep on chunks directory ## Rules - ALWAYS use absolute paths, never `~/` -- Try concepts.json FIRST, use Grep only as fallback -- Do NOT read manifest.compact.json — it's too large -- Total: max 3 navigation reads + 5 content chunks +- Start with library_index.json (fastest: 1 file covers entire library) +- **NEVER skip the preview step — read nav.json BEFORE any chunk files** +- Total: max 4 navigation reads + 5 content chunks +- Cite the book/paper and chunk ID when answering +- **If you're running low on turns, STOP researching and synthesize an answer from what you have.** A partial answer with citations is better than no answer. Never return mid-thought narration. diff --git a/commands/agentlib-ingest-book.md b/commands/agentlib-ingest-book.md index c786419..d36cc6f 100644 --- a/commands/agentlib-ingest-book.md +++ b/commands/agentlib-ingest-book.md @@ -16,7 +16,9 @@ This will: 1. Parse the PDF/EPUB to extract chapter/section structure 2. Chunk the content into 300-500 token segments 3. Summarise each chapter using the configured LLM provider -4. Build a concept index for fast search -5. Write manifest and update the library catalog +4. Build a concept index with aliases, pattern fingerprints, and related concepts +5. Generate nav.json (per-book navigation: structure, chunk preview, concepts) +6. Update the unified library_index.json (concepts + patterns) +7. Write manifest and update the library catalog -After ingestion, the book is available in the library. The agent navigates it via the `/agentlib-knowledge` skill by reading catalog.json, manifest.compact.json, concepts.json, and chunks/*.md +After ingestion, the book is available in the library. The agent navigates it via the `/agentlib-knowledge` skill, starting with library_index.json for unified cross-library search. diff --git a/commands/agentlib-ingest-corpus.md b/commands/agentlib-ingest-corpus.md index d0520a2..46d083f 100644 --- a/commands/agentlib-ingest-corpus.md +++ b/commands/agentlib-ingest-corpus.md @@ -15,6 +15,7 @@ This will: 2. Parse and chunk each paper into 300-500 token segments 3. Summarise each paper's sections using the configured LLM provider 4. Cluster papers by topic -5. Build a cross-paper concept index +5. Build a cross-paper concept index with pattern fingerprints +6. Update the unified library_index.json (concepts + patterns) -After ingestion, use `/agentlib-knowledge` to query the corpus. +After ingestion, use `/agentlib-knowledge` to query the corpus. 
The agent can discover connections between corpus papers and ingested books through shared pattern fingerprints. diff --git a/commands/agentlib-library.md b/commands/agentlib-library.md index 3d7dc6f..d563841 100644 --- a/commands/agentlib-library.md +++ b/commands/agentlib-library.md @@ -11,4 +11,4 @@ If a book ID is provided (`$ARGUMENTS`), show the detailed structure of that boo Read directly from the library: - No args: Read ~/.claude/plugins/agentlib/library/books/catalog.json and display as a formatted table -- With book ID: Read ~/.claude/plugins/agentlib/library/books/{book-id}/manifest.compact.json and display the chapter structure +- With book ID: Read ~/.claude/plugins/agentlib/library/books/{book-id}/nav.json and display the chapter structure diff --git a/lib/models.py b/lib/models.py index 77b9d3d..c14c3a6 100644 --- a/lib/models.py +++ b/lib/models.py @@ -75,6 +75,183 @@ class ConceptEntry: sec: str chunks: list[str] = field(default_factory=list) aliases: list[str] = field(default_factory=list) + patterns: list[str] = field(default_factory=list) + related: list[str] = field(default_factory=list) + + +# --------------------------------------------------------------------------- +# Library-wide navigation structures +# --------------------------------------------------------------------------- + +@dataclass +class LibraryConceptSource: + """A concept's presence in a specific source (book or corpus paper).""" + source: str # "book:{book-id}" or "corpus:{corpus-id}:{paper-id}" + chunks: list[str] = field(default_factory=list) + + def to_dict(self) -> dict[str, Any]: + return asdict(self) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> LibraryConceptSource: + return cls(**{k: v for k, v in data.items() if k in cls.__dataclass_fields__}) + + +@dataclass +class LibraryConceptEntry: + """A concept in the unified library index with cross-source presence.""" + sources: list[LibraryConceptSource] = field(default_factory=list) + aliases: list[str] = field(default_factory=list) + related: list[str] = field(default_factory=list) + patterns: list[str] = field(default_factory=list) + + def to_dict(self) -> dict[str, Any]: + return { + "sources": [s.to_dict() for s in self.sources], + "aliases": self.aliases, + "related": self.related, + "patterns": self.patterns, + } + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> LibraryConceptEntry: + sources = [LibraryConceptSource.from_dict(s) for s in data.get("sources", [])] + return cls( + sources=sources, + aliases=data.get("aliases", []), + related=data.get("related", []), + patterns=data.get("patterns", []), + ) + + +@dataclass +class LibraryIndex: + """Unified cross-book/corpus concept index for the entire library.""" + concepts: dict[str, LibraryConceptEntry] = field(default_factory=dict) + patterns: dict[str, list[PatternEntry]] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + return { + "concepts": {k: v.to_dict() for k, v in self.concepts.items()}, + "patterns": {k: [e.to_dict() for e in v] for k, v in self.patterns.items()}, + } + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> LibraryIndex: + # Support both old format (flat concept dict) and new format (with "concepts" key) + if "concepts" in data and isinstance(data["concepts"], dict): + concepts = {k: LibraryConceptEntry.from_dict(v) for k, v in data["concepts"].items()} + else: + # Legacy: top-level keys are concepts (no "concepts"/"patterns" wrapper) + concepts = {k: LibraryConceptEntry.from_dict(v) for k, v in data.items() + if k not in
("patterns",)} + patterns = {k: [PatternEntry.from_dict(e) for e in v] for k, v in data.get("patterns", {}).items()} + return cls(concepts=concepts, patterns=patterns) + + def to_json(self) -> str: + return json.dumps(self.to_dict(), indent=2) + + @classmethod + def from_json(cls, text: str) -> LibraryIndex: + return cls.from_dict(json.loads(text)) + + +@dataclass +class PatternEntry: + """A concept linked to a pattern in the pattern index.""" + concept: str + source: str # "book:" or "corpus::" + chunks: list[str] = field(default_factory=list) + + def to_dict(self) -> dict[str, Any]: + return asdict(self) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> PatternEntry: + return cls(**{k: v for k, v in data.items() if k in cls.__dataclass_fields__}) + + +@dataclass +class ChunkIndexEntry: + """Metadata about a single chunk for pre-read assessment.""" + section: str + concepts: list[str] = field(default_factory=list) + tokens: int = 0 + prev: str | None = None + next: str | None = None + + def to_dict(self) -> dict[str, Any]: + d: dict[str, Any] = {"section": self.section, "concepts": self.concepts, "tokens": self.tokens} + if self.prev: + d["prev"] = self.prev + if self.next: + d["next"] = self.next + return d + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> ChunkIndexEntry: + return cls( + section=data.get("section", ""), + concepts=data.get("concepts", []), + tokens=data.get("tokens", 0), + prev=data.get("prev"), + next=data.get("next"), + ) + + +@dataclass +class ChunkIndex: + """Per-book index mapping chunk IDs to preview metadata.""" + book_id: str + chunks: dict[str, ChunkIndexEntry] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + return {k: v.to_dict() for k, v in self.chunks.items()} + + @classmethod + def from_dict(cls, book_id: str, data: dict[str, Any]) -> ChunkIndex: + chunks = {k: ChunkIndexEntry.from_dict(v) for k, v in data.items()} + return cls(book_id=book_id, chunks=chunks) + + def to_json(self) -> str: + return json.dumps(self.to_dict(), indent=2) + + @classmethod + def from_json(cls, book_id: str, text: str) -> ChunkIndex: + return cls.from_dict(book_id, json.loads(text)) + + +@dataclass +class BookNav: + """Per-book navigation file: structure + chunk metadata + concepts.""" + book_id: str + chapters: list[dict] = field(default_factory=list) # compact chapter/section structure + chunks: dict[str, ChunkIndexEntry] = field(default_factory=dict) + concepts: dict[str, dict] = field(default_factory=dict) # concept -> {chunks, aliases, patterns, related} + + def to_dict(self) -> dict: + return { + "book_id": self.book_id, + "chapters": self.chapters, + "chunks": {k: v.to_dict() for k, v in self.chunks.items()}, + "concepts": self.concepts, + } + + @classmethod + def from_dict(cls, data: dict) -> BookNav: + return cls( + book_id=data["book_id"], + chapters=data.get("chapters", []), + chunks={k: ChunkIndexEntry.from_dict(v) for k, v in data.get("chunks", {}).items()}, + concepts=data.get("concepts", {}), + ) + + def to_json(self) -> str: + return json.dumps(self.to_dict(), indent=2) + + @classmethod + def from_json(cls, text: str) -> BookNav: + return cls.from_dict(json.loads(text)) @dataclass @@ -307,6 +484,8 @@ class CorpusConceptEntry: sections: dict[str, str] = field(default_factory=dict) note: str = "" aliases: list[str] = field(default_factory=list) + patterns: list[str] = field(default_factory=list) + related: list[str] = field(default_factory=list) def to_dict(self) -> dict[str, Any]: return asdict(self) @@ -318,6 
+497,8 @@ def from_dict(cls, data: dict[str, Any]) -> CorpusConceptEntry: sections=data.get("sections", {}), note=data.get("note", ""), aliases=data.get("aliases", []), + patterns=data.get("patterns", []), + related=data.get("related", []), ) diff --git a/lib/storage.py b/lib/storage.py index 21af04c..0af3019 100644 --- a/lib/storage.py +++ b/lib/storage.py @@ -1,4 +1,11 @@ -"""Filesystem I/O layer for AgentLib data storage.""" +"""Filesystem I/O layer for AgentLib data storage. + +Superseded functions (removed in v1.8.0): +- write_chunk_index / read_chunk_index -> use nav.json via write_book_nav / read_book_nav +- write_pattern_index / read_pattern_index -> patterns merged into library_index.json +- write_compact_manifest -> use nav.json +- write_concept_index -> concepts merged into nav.json +""" from __future__ import annotations import os @@ -6,10 +13,12 @@ from pathlib import Path from lib.models import ( + BookNav, # pyright: ignore[reportAttributeAccessIssue] Catalog, CatalogEntry, CorpusCatalog, CorpusConceptIndex, + LibraryIndex, # pyright: ignore[reportAttributeAccessIssue] Manifest, PaperManifest, PaperMetadata, @@ -220,74 +229,43 @@ def search_concepts(query: str, book_id: str | None = None) -> dict[str, list[di # --------------------------------------------------------------------------- -# Zero-server mode: compact files for file-based navigation +# Per-book navigation (nav.json — structure + chunks + concepts) # --------------------------------------------------------------------------- -def write_compact_manifest(manifest: Manifest) -> Path: - """Write a compact manifest optimized for agent navigation (~500-2k tokens).""" - import json - - _validate_path_component(manifest.book_id, "book_id") - compact: dict = { - "book_id": manifest.book_id, - "chapters": [], - "concepts": sorted(manifest.concept_index.keys()), - } - for ch in manifest.chapters: - compact["chapters"].append({ - "id": ch.id, - "title": ch.title, - "summary": ch.summary[:100] + "..." if len(ch.summary) > 100 else ch.summary, - "concepts": ch.key_concepts[:3], - "sections": [ - {"id": s.id, "title": s.title, "chunks": len(s.chunk_ids)} - for s in ch.sections - ], - }) - - path = _safe_join(_books_root(), manifest.book_id, "manifest.compact.json") +def write_book_nav(nav: BookNav) -> None: + """Write per-book navigation file (structure + chunks + concepts).""" + path = _safe_join(_books_root(), nav.book_id, "nav.json") path.parent.mkdir(parents=True, exist_ok=True) - path.write_text(json.dumps(compact, indent=2), encoding="utf-8") - return path + path.write_text(nav.to_json()) -def write_concept_index(book_id: str, concept_index: dict) -> Path: - """Write a concept index file with aliases for file-based navigation. +def read_book_nav(book_id: str) -> BookNav | None: + """Read per-book navigation file. Returns None if missing.""" + _validate_path_component(book_id) + path = _safe_join(_books_root(), book_id, "nav.json") + if not path.exists(): + return None + return BookNav.from_json(path.read_text()) - Output format: {concept_name: {"chunks": [...], "aliases": [...]}} - Falls back to {concept_name: {"chunks": [...]}} when no aliases exist. - """ - import json - _validate_path_component(book_id, "book_id") - flat: dict[str, dict[str, list[str]]] = {} - for concept, entries in concept_index.items(): - chunk_ids: list[str] = [] - aliases: list[str] = [] - for entry in entries: - # Duck-type: callers pass ConceptEntry objects (with .chunks attr) - # or dicts (from deserialized JSON). 
Normalizing callers is out of - # scope for this PR. - if hasattr(entry, "chunks"): - chunk_ids.extend(entry.chunks) - elif isinstance(entry, dict): - chunk_ids.extend(entry.get("chunks", [])) - # Collect aliases - if hasattr(entry, "aliases"): - aliases.extend(entry.aliases) - elif isinstance(entry, dict): - aliases.extend(entry.get("aliases", [])) - if chunk_ids: - entry_data: dict[str, list[str]] = {"chunks": chunk_ids} - # Deduplicate aliases - unique_aliases = list(dict.fromkeys(aliases)) - if unique_aliases: - entry_data["aliases"] = unique_aliases - flat[concept] = entry_data - - path = _safe_join(_books_root(), book_id, "concepts.json") +# --------------------------------------------------------------------------- +# Library index (unified cross-book/corpus concept index) +# --------------------------------------------------------------------------- + +def read_library_index() -> LibraryIndex: + """Read the unified library index. Returns empty index if not found.""" + path = _data_root() / "library_index.json" + if not path.exists(): + return LibraryIndex() + return LibraryIndex.from_json(path.read_text(encoding="utf-8")) + + +def write_library_index(index: LibraryIndex) -> Path: + """Write the unified library index.""" + import json + path = _data_root() / "library_index.json" path.parent.mkdir(parents=True, exist_ok=True) - path.write_text(json.dumps(flat, indent=2), encoding="utf-8") + path.write_text(json.dumps(index.to_dict(), indent=2), encoding="utf-8") return path @@ -334,13 +312,23 @@ def write_navigation_md() -> Path: "\n" "## How to navigate\n" "\n" - "### Books -- Quick path (know what you need):\n" - "1. Read `books/{book-id}/concepts.json` -- find chunk IDs for your concept (check aliases too)\n" + "### FASTEST: Unified library search (1 read, covers ALL books + corpora)\n" + "1. Read `library_index.json` -- find concepts across ALL sources with aliases, related concepts, and pattern tags\n" + "2. Read `books/{book-id}/nav.json` -- per-book navigation: structure, chunk preview, and concepts\n" + "3. Read `books/{book-id}/chunks/{chunk-id}.md` -- get the content\n" + "\n" + "### Cross-domain insight: Pattern-based discovery\n" + "1. Find a concept's `patterns` in `library_index.json` (e.g. `credential-cycling`)\n" + "2. Look up the pattern in `library_index.json`'s `patterns` section to find ALL concepts sharing that pattern\n" + "3. Discover structurally similar concepts in different domains\n" + "\n" + "### Books -- Quick path:\n" + "1. Read `books/{book-id}/nav.json` -- find chunk IDs by concept (check aliases, patterns, related), preview chunks\n" "2. Read `books/{book-id}/chunks/{chunk-id}.md` -- get the content\n" "\n" "### Books -- Exploration path (browsing):\n" "1. Read `books/catalog.json` -- see all available books (~50 tokens/book)\n" - "2. Read `books/{book-id}/manifest.compact.json` -- see chapters, summaries, concepts (~500-2k tokens)\n" + "2. Read `books/{book-id}/nav.json` -- see chapters, sections, chunk preview, and concepts\n" "3. Read `books/{book-id}/chunks/{chunk-id}.md` -- get specific content (~300-500 tokens)\n" "\n" "### Corpora -- Paper collections:\n" @@ -348,19 +336,24 @@ def write_navigation_md() -> Path: "2. Read `corpus/{corpus-id}/clusters/{cluster-id}.json` -- see papers with abstracts\n" "3. Read `corpus/{corpus-id}/papers/{paper-id}/manifest.compact.json` -- paper structure\n" "4. Read `corpus/{corpus-id}/papers/{paper-id}/chunks/{chunk-id}.md` -- paper content\n" - "5. 
Read `corpus/{corpus-id}/concept_index.json` -- cross-paper concept search\n" + "5. Read `corpus/{corpus-id}/concept_index.json` -- cross-paper concept search (with patterns)\n" "\n" "## Token budget\n" + "- library_index.json: ~500-1500 tokens (entire library)\n" + "- nav.json: ~500-2000 tokens per book (structure + chunks + concepts)\n" "- catalog.json: ~50 tokens per book\n" - "- manifest.compact.json: ~500-2000 tokens per book\n" + "- manifest.json: full archive (use nav.json instead for navigation)\n" "- Each chunk: ~300-500 tokens\n" - "- concepts.json: ~200-500 tokens\n" "- corpus_catalog.json: ~500-800 tokens\n" "- concept_index.json: ~500-1500 tokens\n" "\n" "## Rules\n" - "- NEVER read the full manifest.json -- use manifest.compact.json instead\n" - "- NEVER read all chunks -- use concepts.json or manifest to find the right ones\n" + "- START with `library_index.json` for cross-library search (fastest path)\n" + "- Use `nav.json` to PREVIEW chunks before reading them\n" + "- Follow `prev`/`next` links in nav.json chunks for adjacent context\n" + "- Pattern discovery is built into `library_index.json` (no separate file needed)\n" + "- For browsing, use `nav.json` instead of manifest.json\n" + "- NEVER read all chunks -- use concepts or nav.json to find the right ones\n" "- Max 10 chunks per question -- if you need more, refine your search\n" "\n" "## Current library\n" diff --git a/lib/summariser.py b/lib/summariser.py index a8bcdd0..745d558 100644 --- a/lib/summariser.py +++ b/lib/summariser.py @@ -38,6 +38,8 @@ class ConceptMapping: sec: str chunks: list[str] = field(default_factory=list) aliases: list[str] = field(default_factory=list) + patterns: list[str] = field(default_factory=list) + related: list[str] = field(default_factory=list) def _get_config(llm_config: LLMConfig | None) -> LLMConfig: @@ -232,6 +234,7 @@ async def async_summarise_chapter( ) + def _format_chapters_text(chapters: list[ChapterSummary]) -> str: """Format chapter summaries into text for the concept extraction prompt.""" text = "" @@ -251,9 +254,13 @@ def _parse_concept_response(data: dict) -> dict[str, list[ConceptMapping]]: for concept, value in data.items(): if isinstance(value, dict) and "locations" in value: aliases = value.get("aliases", []) + patterns = value.get("patterns", []) + related = value.get("related", []) entries = value["locations"] elif isinstance(value, list): aliases = [] + patterns = [] + related = [] entries = value else: continue @@ -264,6 +271,8 @@ def _parse_concept_response(data: dict) -> dict[str, list[ConceptMapping]]: sec=e.get("sec", ""), chunks=e.get("chunks", []), aliases=aliases, + patterns=patterns, + related=related, ) for e in entries ] @@ -319,18 +328,27 @@ def extract_concepts( {chapters_text} -Create a concept index that maps key concepts to their locations. Each concept should appear with all relevant chapters, sections, and chunks where it's discussed. For each concept, include 2-3 aliases: abbreviations, acronyms, or alternative phrasings someone might search for. +Create a concept index that maps key concepts to their locations. Each concept should appear with all relevant chapters, sections, and chunks where it's discussed. + +For each concept, include: +- **aliases** (2-3): abbreviations, acronyms, or alternative phrasings someone might search for. +- **patterns** (1-3): structural or methodological patterns the concept exemplifies (e.g., 'layered-architecture', 'feedback-loop', 'defense-in-depth'). Use lowercase-hyphenated format. 
Use consistent naming across concepts — two concepts that share a pattern are structurally analogous. +- **related** (2-5): names of OTHER concepts in this same index that are closely related to this one. Respond with ONLY valid JSON in this exact format: {{ "concept_name_1": {{ "aliases": ["abbreviation", "synonym"], + "patterns": ["pattern-tag-1", "pattern-tag-2"], + "related": ["concept_name_2"], "locations": [ {{"ch": "ch01", "sec": "ch01-s01", "chunks": ["ch01-s01-001", "ch01-s01-002"]}} ] }}, "concept_name_2": {{ "aliases": ["alt_name"], + "patterns": ["pattern-tag-3"], + "related": ["concept_name_1"], "locations": [ {{"ch": "ch02", "sec": "ch02-s03", "chunks": ["ch02-s03-001"]}} ] @@ -356,21 +374,29 @@ def extract_concepts( concept_key_map[key] = concept all_concepts[concept] = list(mappings) - # Deduplicate locations and aliases per concept + # Deduplicate locations, aliases, patterns, and related per concept for concept in all_concepts: seen: set[tuple] = set() deduped: list[ConceptMapping] = [] all_aliases: list[str] = [] + all_patterns: list[str] = [] + all_related: list[str] = [] for m in all_concepts[concept]: loc_key = (m.ch, m.sec, tuple(sorted(m.chunks))) if loc_key not in seen: seen.add(loc_key) deduped.append(m) all_aliases.extend(m.aliases) - # Dedupe aliases preserving order + all_patterns.extend(m.patterns) + all_related.extend(m.related) + # Dedupe preserving order unique_aliases: list[str] = list(dict.fromkeys(all_aliases)) + unique_patterns: list[str] = list(dict.fromkeys(all_patterns)) + unique_related: list[str] = [r for r in dict.fromkeys(all_related) if r != concept] for m in deduped: m.aliases = unique_aliases + m.patterns = unique_patterns + m.related = unique_related all_concepts[concept] = deduped # Cap total concepts diff --git a/preprocessing/books.py b/preprocessing/books.py index bd21c4b..04dd938 100644 --- a/preprocessing/books.py +++ b/preprocessing/books.py @@ -46,22 +46,29 @@ def _load_env() -> None: from lib.chunker import chunk_sections, Chunk from lib.models import ( + BookNav, CatalogEntry, ChapterInfo, + ChunkIndexEntry, ConceptEntry, + LibraryConceptEntry, + LibraryConceptSource, Manifest, ParsedSection, + PatternEntry, SectionInfo, ) from lib.parser import parse_file from lib.storage import ( list_chunks, read_catalog, + read_chunk, + read_library_index, read_manifest, update_catalog_entry, + write_book_nav, write_chunk, - write_compact_manifest, - write_concept_index, + write_library_index, write_manifest, write_navigation_md, ) @@ -123,6 +130,113 @@ def _group_chunks_by_section( return dict(result) +def _fuzzy_match_pattern(new_pattern: str, existing_patterns: set[str]) -> str: + """Match a new pattern to an existing one if similar enough, else return as-is. + + Uses simple Jaccard similarity on character trigrams. 
+ """ + if new_pattern in existing_patterns: + return new_pattern + + def _trigrams(s: str) -> set[str]: + return {s[i:i + 3] for i in range(max(0, len(s) - 2))} + + new_tri = _trigrams(new_pattern) + if not new_tri: + return new_pattern + + best_match = new_pattern + best_score = 0.0 + for existing in existing_patterns: + existing_tri = _trigrams(existing) + if not existing_tri: + continue + intersection = len(new_tri & existing_tri) + union = len(new_tri | existing_tri) + score = intersection / union if union else 0.0 + if score > best_score and score >= 0.7: + best_score = score + best_match = existing + + return best_match + + +def _update_library_indices(book_id: str, manifest: Manifest) -> None: + """Update library_index.json (concepts + patterns) with this book's concepts.""" + source_prefix = f"book:{book_id}" + + lib_index = read_library_index() + + # Remove stale concept entries for this book + for concept_name, entry in list(lib_index.concepts.items()): + entry.sources = [s for s in entry.sources if s.source != source_prefix] + if not entry.sources: + del lib_index.concepts[concept_name] + + # Remove stale pattern entries for this book + existing_pattern_names = set(lib_index.patterns.keys()) + for pattern_name, entries in list(lib_index.patterns.items()): + lib_index.patterns[pattern_name] = [ + e for e in entries if e.source != source_prefix + ] + if not lib_index.patterns[pattern_name]: + del lib_index.patterns[pattern_name] + existing_pattern_names.discard(pattern_name) + + # Merge this book's concepts + for concept_name, concept_entries in manifest.concept_index.items(): + all_chunks: list[str] = [] + all_aliases: list[str] = [] + all_patterns: list[str] = [] + all_related: list[str] = [] + for ce in concept_entries: + all_chunks.extend(ce.chunks) + all_aliases.extend(ce.aliases) + all_patterns.extend(getattr(ce, "patterns", [])) + all_related.extend(getattr(ce, "related", [])) + + source = LibraryConceptSource(source=source_prefix, chunks=all_chunks) + + if concept_name in lib_index.concepts: + existing = lib_index.concepts[concept_name] + existing.sources.append(source) + # Merge aliases/patterns/related with dedup + existing.aliases = list(dict.fromkeys(existing.aliases + all_aliases)) + existing.patterns = list(dict.fromkeys(existing.patterns + all_patterns)) + existing.related = list(dict.fromkeys( + r for r in existing.related + all_related if r != concept_name + )) + else: + lib_index.concepts[concept_name] = LibraryConceptEntry( + sources=[source], + aliases=list(dict.fromkeys(all_aliases)), + related=list(dict.fromkeys(r for r in all_related if r != concept_name)), + patterns=list(dict.fromkeys(all_patterns)), + ) + + # Merge pattern entries into lib_index.patterns with fuzzy merge + for concept_name, concept_entries in manifest.concept_index.items(): + all_chunks: list[str] = [] + raw_patterns: list[str] = [] + for ce in concept_entries: + all_chunks.extend(ce.chunks) + raw_patterns.extend(getattr(ce, "patterns", [])) + + unique_patterns = list(dict.fromkeys(raw_patterns)) + for raw_pat in unique_patterns: + canonical = _fuzzy_match_pattern(raw_pat, existing_pattern_names) + if canonical not in lib_index.patterns: + lib_index.patterns[canonical] = [] + lib_index.patterns[canonical].append(PatternEntry( + concept=concept_name, + source=source_prefix, + chunks=all_chunks, + )) + existing_pattern_names.add(canonical) + + write_library_index(lib_index) + + def ingest_book( file_path: Path, book_id: str | None = None, @@ -360,7 +474,7 @@ async def _summarise_all() -> 
list[ChapterSummary]: concept_index_raw: dict[str, list[ConceptEntry]] = {} for concept, mappings in concept_mappings.items(): concept_index_raw[concept] = [ - ConceptEntry(ch=m.ch, sec=m.sec, chunks=m.chunks, aliases=m.aliases) + ConceptEntry(ch=m.ch, sec=m.sec, chunks=m.chunks, aliases=m.aliases, patterns=m.patterns, related=m.related) for m in mappings ] @@ -396,11 +510,113 @@ async def _summarise_all() -> list[ChapterSummary]: write_manifest(manifest) logger.info(" Wrote manifest.json") - # Write zero-server navigation files - write_compact_manifest(manifest) - logger.info(" Wrote manifest.compact.json") - write_concept_index(book_id, manifest.concept_index) - logger.info(" Wrote concepts.json") + # Build chunk index entries for nav.json + chunk_idx_entries: dict[str, ChunkIndexEntry] = {} + # Build concept->chunk reverse map + chunk_concepts: dict[str, list[str]] = defaultdict(list) + for concept_name, entries in manifest.concept_index.items(): + for entry in entries: + for cid in entry.chunks: + if concept_name not in chunk_concepts[cid]: + chunk_concepts[cid].append(concept_name) + + # Build section label map from manifest + section_labels: dict[str, str] = {} + for ch in manifest.chapters: + for sec in ch.sections: + for cid in sec.chunk_ids: + section_labels[cid] = f"{ch.title} > {sec.title}" + + # Populate chunk index with prev/next from written chunks + all_chunk_ids = list_chunks(book_id) + for cid in all_chunk_ids: + chunk_idx_entries[cid] = ChunkIndexEntry( + section=section_labels.get(cid, ""), + concepts=chunk_concepts.get(cid, []), + tokens=0, # filled below if available + ) + + # Set prev/next chains per section group and token counts + sec_groups: dict[str, list[str]] = defaultdict(list) + for cid in all_chunk_ids: + sec_id = _section_id_from_chunk(cid) + sec_groups[sec_id].append(cid) + for sec_id, cids in sec_groups.items(): + for i, cid in enumerate(cids): + entry = chunk_idx_entries[cid] + if i > 0: + entry.prev = cids[i - 1] + if i < len(cids) - 1: + entry.next = cids[i + 1] + + # Get token counts from the chunks we just created + if all_chunks: + for chunk in all_chunks: + if chunk.chunk_id in chunk_idx_entries: + chunk_idx_entries[chunk.chunk_id].tokens = chunk.meta.token_count + else: + # Fallback: read token counts from chunk file YAML frontmatter + for cid in all_chunk_ids: + content = read_chunk(book_id, cid) + if content and content.startswith("---"): + parts = content.split("---", 2) + if len(parts) >= 3: + for line in parts[1].splitlines(): + if line.strip().startswith("token_count:"): + try: + token_val = int(line.strip().split(":", 1)[1].strip()) + if cid in chunk_idx_entries: + chunk_idx_entries[cid].tokens = token_val + except (ValueError, IndexError): + pass + break + + # Build nav chapters (compact structure) + nav_chapters = [] + for ch in manifest.chapters: + nav_ch = { + "id": ch.id, + "title": ch.title, + "summary": ch.summary[:100], + "concepts": ch.key_concepts[:3], + "sections": [ + {"id": s.id, "title": s.title, "chunks": len(s.chunk_ids)} + for s in ch.sections + ], + } + nav_chapters.append(nav_ch) + + # Build nav concepts + nav_concepts: dict[str, dict] = {} + for concept, c_entries in manifest.concept_index.items(): + c_all_chunks: list[str] = [] + c_all_aliases: list[str] = [] + c_all_patterns: list[str] = [] + c_all_related: list[str] = [] + for ce in c_entries: + c_all_chunks.extend(ce.chunks) + c_all_aliases.extend(ce.aliases) + c_all_patterns.extend(ce.patterns) + c_all_related.extend(ce.related) + nav_concepts[concept] = { + 
"chunks": list(dict.fromkeys(c_all_chunks)), + "aliases": list(dict.fromkeys(c_all_aliases)), + "patterns": list(dict.fromkeys(c_all_patterns)), + "related": list(dict.fromkeys(c_all_related)), + } + + book_nav = BookNav( + book_id=book_id, + chapters=nav_chapters, + chunks=chunk_idx_entries, + concepts=nav_concepts, + ) + write_book_nav(book_nav) + logger.info(" Wrote nav.json") + + # Update unified library_index.json (concepts + patterns) + _update_library_indices(book_id, manifest) + logger.info(" Updated library_index.json") # Update catalog total_chunks = sum( diff --git a/preprocessing/corpus.py b/preprocessing/corpus.py index bb44029..0b865a1 100644 --- a/preprocessing/corpus.py +++ b/preprocessing/corpus.py @@ -51,20 +51,25 @@ def _load_env() -> None: CorpusCatalog, CorpusConceptEntry, CorpusConceptIndex, + LibraryConceptEntry, + LibraryConceptSource, PaperEntry, PaperManifest, PaperMetadata, ParsedSection, + PatternEntry, SectionInfo, ) from lib.parser import parse_pdf from lib.storage import ( find_paper_by_filename, list_paper_chunks, + read_library_index, read_paper_manifest, write_cluster_list, write_corpus_catalog, write_corpus_concept_index, + write_library_index, write_navigation_md, write_paper_chunk, write_paper_manifest, @@ -246,12 +251,19 @@ def _build_corpus_concept_index( {papers_text} -Create a concept index mapping key concepts to papers and sections. Include 15-40 concepts that appear across multiple papers. For each concept, include 2-3 aliases: abbreviations, acronyms, or alternative phrasings someone might search for. +Create a concept index mapping key concepts to papers and sections. Include 15-40 concepts that appear across multiple papers. + +For each concept, include: +- **aliases** (2-3): abbreviations, acronyms, or alternative phrasings. +- **patterns** (1-3): structural or methodological patterns the concept exemplifies (e.g., 'layered-architecture', 'feedback-loop', 'defense-in-depth'). Use lowercase-hyphenated format. Use consistent naming across concepts — two concepts that share a pattern are structurally analogous. +- **related** (2-5): names of OTHER concepts in this same index that are closely related to this one. Respond with ONLY valid JSON: {{ "concept_name": {{ "aliases": ["abbreviation", "synonym"], + "patterns": ["pattern-tag-1", "pattern-tag-2"], + "related": ["other_concept_1", "other_concept_2"], "papers": ["paper-id-1", "paper-id-2"], "sections": {{"paper-id-1": "ch02", "paper-id-2": "ch03"}}, "note": "brief context" @@ -269,6 +281,8 @@ def _build_corpus_concept_index( sections=entry_data.get("sections", {}), note=entry_data.get("note", ""), aliases=entry_data.get("aliases", []), + patterns=entry_data.get("patterns", []), + related=entry_data.get("related", []), ) return CorpusConceptIndex(corpus_id=corpus_id, concepts=concepts) @@ -278,6 +292,79 @@ def _build_corpus_concept_index( # Main pipeline # --------------------------------------------------------------------------- +def _update_library_indices_corpus( + corpus_id: str, + paper_summaries: dict[str, tuple[PaperMetadata, list]], +) -> None: + """Update library_index.json (concepts + patterns) with corpus concepts. + + Reads the existing concept_index.json for this corpus (just written) and + merges it into the unified library index. 
+ """ + from lib.storage import read_corpus_concept_index + + concept_index = read_corpus_concept_index(corpus_id) + if not concept_index: + return + + source_prefix = f"corpus:{corpus_id}" + + lib_index = read_library_index() + + # Remove stale concept entries for this corpus + for concept_name, entry in list(lib_index.concepts.items()): + entry.sources = [s for s in entry.sources if not s.source.startswith(source_prefix)] + if not entry.sources: + del lib_index.concepts[concept_name] + + # Remove stale pattern entries for this corpus + for pattern_name, entries in list(lib_index.patterns.items()): + lib_index.patterns[pattern_name] = [ + e for e in entries if not e.source.startswith(source_prefix) + ] + if not lib_index.patterns[pattern_name]: + del lib_index.patterns[pattern_name] + + # Merge corpus concepts + for concept_name, ce in concept_index.concepts.items(): + # Create one source entry per paper that covers this concept + new_sources = [] + for paper_id in ce.papers: + new_sources.append(LibraryConceptSource( + source=f"{source_prefix}:{paper_id}", + chunks=[], # corpus concept index doesn't track chunk IDs per paper + )) + + if concept_name in lib_index.concepts: + existing = lib_index.concepts[concept_name] + existing.sources.extend(new_sources) + existing.aliases = list(dict.fromkeys(existing.aliases + ce.aliases)) + existing.patterns = list(dict.fromkeys(existing.patterns + ce.patterns)) + existing.related = list(dict.fromkeys(existing.related + ce.related)) + else: + lib_index.concepts[concept_name] = LibraryConceptEntry( + sources=new_sources, + aliases=list(dict.fromkeys(ce.aliases)), + related=list(dict.fromkeys(ce.related)), + patterns=list(dict.fromkeys(ce.patterns)), + ) + + # Merge pattern entries into lib_index.patterns + for concept_name, ce in concept_index.concepts.items(): + for pat in ce.patterns: + # Simple exact match for corpus (no fuzzy needed since patterns come from same prompt) + if pat not in lib_index.patterns: + lib_index.patterns[pat] = [] + for paper_id in ce.papers: + lib_index.patterns[pat].append(PatternEntry( + concept=concept_name, + source=f"{source_prefix}:{paper_id}", + chunks=[], + )) + + write_library_index(lib_index) + + def ingest_corpus( folder_path: Path, corpus_id: str | None = None, @@ -577,6 +664,10 @@ def ingest_corpus( else: logger.warning(" No paper summaries available, skipping concept index") + # Update unified library_index.json (concepts + patterns) + _update_library_indices_corpus(corpus_id, paper_summaries) + logger.info(" Updated library_index.json") + # Update NAVIGATION.md write_navigation_md() logger.info(" Updated NAVIGATION.md") diff --git a/pyproject.toml b/pyproject.toml index 4d84a27..c6ae100 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "agentlib" -version = "1.7.0" +version = "1.8.0" description = "Agentic Knowledge Navigation System" readme = "README.md" requires-python = ">=3.12" diff --git a/server.py b/server.py index 3afe3b0..0aa0f50 100644 --- a/server.py +++ b/server.py @@ -93,8 +93,10 @@ def open_book(book_id: str) -> str: "title": ch.title, "summary": _truncate(ch.summary, MAX_CHAPTER_SUMMARY_CHARS), "concepts": ch.key_concepts[:MAX_CONCEPTS_PER_CHAPTER], - "sections": len(ch.sections), - "chunks": sum(len(s.chunk_ids) for s in ch.sections), + "sections": [ + {"id": s.id, "title": s.title, "chunk_ids": s.chunk_ids} + for s in ch.sections + ], }) compact: dict = {"book_id": manifest.book_id} @@ -157,5 +159,83 @@ def search_concepts(query: str, book_id: str | None = None) 
-> str: return json.dumps(compact) +@mcp.tool() +def search_library(query: str) -> str: + """Search the unified library index across ALL books and corpora. + Matches concept names, aliases, related concepts, and structural patterns.""" + lib_index = storage.read_library_index() + query_lower = query.strip().lower() + + results: dict = {} + + # 1. Search concepts (name, aliases, related) + for concept, entry in lib_index.concepts.items(): + searchable = [concept.lower()] + searchable.extend(a.lower() for a in entry.aliases) + searchable.extend(r.lower() for r in entry.related) + if any(query_lower in s for s in searchable): + results[concept] = { + "sources": [{"source": s.source, "chunks": s.chunks} for s in entry.sources], + "aliases": entry.aliases, + "related": entry.related, + "patterns": entry.patterns, + } + + # 2. Search patterns — find concepts that share a matching pattern + for pattern, entries in lib_index.patterns.items(): + if query_lower in pattern.lower(): + for pe in entries: + if pe.concept not in results: + # Add this concept from the concepts dict if available + concept_entry = lib_index.concepts.get(pe.concept) + if concept_entry: + results[pe.concept] = { + "sources": [{"source": s.source, "chunks": s.chunks} for s in concept_entry.sources], + "aliases": concept_entry.aliases, + "related": concept_entry.related, + "patterns": concept_entry.patterns, + "matched_via_pattern": pattern, + } + else: + results[pe.concept] = { + "sources": [{"source": pe.source, "chunks": pe.chunks}], + "matched_via_pattern": pattern, + } + + if not results: + # Helpful fallback: list available patterns + available_patterns = sorted(lib_index.patterns.keys())[:20] + if available_patterns: + return f"No matches for '{query}'. Available patterns: {', '.join(available_patterns)}" + return f"No matches for '{query}' in library index." + + # Cap results + if len(results) > MAX_SEARCH_RESULTS: + results = dict(list(results.items())[:MAX_SEARCH_RESULTS]) + + return json.dumps(results, indent=2) + + +@mcp.tool() +def preview_chunks(book_id: str, chunk_ids: list[str]) -> str: + """Preview chunk metadata before reading full content. Shows section, concepts, token count, and prev/next navigation.""" + nav = storage.read_book_nav(book_id) + if nav is None: + return f"No navigation data found for book '{book_id}'." + + MAX_PREVIEW = 20 + chunk_ids = chunk_ids[:MAX_PREVIEW] + + previews: dict = {} + for cid in chunk_ids: + entry = nav.chunks.get(cid) + if entry: + previews[cid] = entry.to_dict() + else: + previews[cid] = None + + return json.dumps(previews, indent=2) + + if __name__ == "__main__": mcp.run() diff --git a/skills/agentlib-knowledge/SKILL.md b/skills/agentlib-knowledge/SKILL.md index 7e86f33..c0c84a4 100644 --- a/skills/agentlib-knowledge/SKILL.md +++ b/skills/agentlib-knowledge/SKILL.md @@ -5,73 +5,21 @@ description: "Knowledge library navigation. Trigger on: research questions, book ## AgentLib — Knowledge Library -**This skill activates automatically** for research/knowledge questions. Users can also invoke it explicitly with `/agentlib-knowledge ` to always consult the library. +**ALWAYS check this library BEFORE web search or answering from training data** when the user asks about topics that could be covered by ingested books or paper corpora. -You have a preprocessed knowledge library at `~/.claude/plugins/agentlib/library/`. +Use the MCP tools provided by the agentlib plugin. Do NOT read library files directly. 
-**IMPORTANT: ALWAYS check this library BEFORE web search or answering from training data when the user asks about topics that could be covered by ingested books or paper corpora.** +### Workflow -**You MUST delegate this research to the `library-researcher` agent using the Agent tool.** Do NOT read library files directly — spawn the agent with the user's question and let it handle all navigation and reading. This keeps your main context clean. +1. **`search_library(query)`** — Try ONCE with a broad 1-2 word query. Do NOT retry with rephrased queries. If no results, go to step 2. -When spawning the agent, include the **absolute library path** in the prompt (expand `~` to the full home directory). Example: -> Research the following question using the library at /Users/nadavbarkai/.claude/plugins/agentlib/library/ -> Question: {user's question} +2. **`browse_library`** to find the right book, then **`open_book(book_id)`** to browse its chapter structure. Identify the relevant chapter/section. -The agent will return a synthesized answer with citations. +3. **`preview_chunks(book_id, chunk_ids)`** — preview candidate chunks (section title, concepts, token count). Pick the 2-3 most relevant. -If the Agent tool is unavailable, fall back to the manual steps below. - -### Manual fallback (only if agent delegation fails) - -#### Step 1: Check what's available -``` -Read ~/.claude/plugins/agentlib/library/NAVIGATION.md -``` -This lists ALL books AND paper corpora. Check BOTH sections — if a book OR corpus covers the topic, continue with the appropriate path below. Only proceed with other tools if nothing is relevant. - ---- - -### Path A: Books - -**Find content (pick one):** - -**A1 — Search by concept (fastest, 2 reads):** -``` -Read ~/.claude/plugins/agentlib/library/books/{book-id}/concepts.json -``` -Find your concept → get chunk IDs → go to Step 3. - -**A2 — Browse chapters (3 reads):** -``` -Read ~/.claude/plugins/agentlib/library/books/{book-id}/manifest.compact.json -``` -Find relevant chapter/section → note chunk IDs → go to Step 3. - ---- - -### Path B: Corpora (scientific papers) - -**B1 — Search by concept across all papers (fastest, 1 read):** -``` -Read ~/.claude/plugins/agentlib/library/corpus/{corpus-id}/concept_index.json -``` -Find your concept → get paper IDs and chunk IDs → go to Step 3. - -**B2 — Browse by topic cluster (2-3 reads):** -1. `corpus_catalog.json` — see topic clusters -2. `clusters/{cluster-id}.json` — see papers with abstracts -3. Pick papers → read `papers/{paper-id}/manifest.compact.json` - ---- - -### Step 3: Read the content -``` -Read ~/.claude/plugins/agentlib/library/books/{book-id}/chunks/{chunk-id}.md -Read ~/.claude/plugins/agentlib/library/corpus/{corpus-id}/papers/{paper-id}/chunks/{chunk-id}.md -``` -Each chunk is ~300-500 tokens. Read up to 5 chunks per question. Chunks have `prev`/`next` links for adjacent context. +4. **`read_chunks(book_id, chunk_ids)`** — read the full content of the selected chunks. 
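A minimal sketch of that flow, calling the tool functions directly the way `tests/test_server.py` does; the query is made up, and the `read_chunks` signature is assumed from step 4 above:

```python
# Hypothetical end-to-end query against the agentlib tools, calling the
# decorated functions directly as tests/test_server.py does. Assumes
# search_library finds a book hit; on a miss it returns a plain-text
# message (not JSON), which is when step 2 (browse_library/open_book) applies.
import json
from server import preview_chunks, read_chunks, search_library

hits = json.loads(search_library("oauth"))                  # step 1
concept = next(iter(hits.values()))
source = concept["sources"][0]
book_id = source["source"].removeprefix("book:")            # "book:{book-id}" form
candidates = source["chunks"]

previews = json.loads(preview_chunks(book_id, candidates))  # step 3
picked = [cid for cid, meta in previews.items() if meta][:3]

print(read_chunks(book_id, picked))                         # step 4
```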
### Rules -- ALWAYS use `manifest.compact.json`, NEVER `manifest.json` -- Max 4 navigation reads, then up to 5 content chunks -- Cite the book/paper and chunk ID when answering + +- Cite the book/paper title and chunk ID when answering +- Max 2-3 content chunks per question — use preview to pick well diff --git a/tests/conftest.py b/tests/conftest.py index 7b2237a..8411299 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -70,12 +70,19 @@ def sample_manifest() -> Manifest: ), ], concept_index={ - "unit testing": [ConceptEntry(ch="ch01", sec="ch01-s01", chunks=["ch01-s01-001"])], - "mocking": [ConceptEntry(ch="ch02", sec="ch02-s01", chunks=["ch02-s01-001"])], + "unit testing": [ConceptEntry( + ch="ch01", sec="ch01-s01", chunks=["ch01-s01-001"], + patterns=["test-driven-development", "fail-fast"], + )], + "mocking": [ConceptEntry( + ch="ch02", sec="ch02-s01", chunks=["ch02-s01-001"], + patterns=["dependency-injection", "inversion-of-control"], + )], "retrieval augmented generation": [ ConceptEntry( ch="ch01", sec="ch01-s01", chunks=["ch01-s01-002"], aliases=["RAG", "RAG pipeline"], + patterns=["pipeline-stages", "fan-out-aggregation"], ), ], }, diff --git a/tests/test_server.py b/tests/test_server.py index 323c99d..5f6a75b 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -5,8 +5,23 @@ import json from pathlib import Path -from lib.models import CatalogEntry, Manifest -from lib.storage import update_catalog_entry, write_chunk, write_manifest +from lib.models import ( + BookNav, + CatalogEntry, + ChunkIndexEntry, + LibraryConceptEntry, + LibraryConceptSource, + LibraryIndex, + Manifest, + PatternEntry, +) +from lib.storage import ( + update_catalog_entry, + write_book_nav, + write_chunk, + write_library_index, + write_manifest, +) class TestBrowseLibrary: @@ -74,3 +89,114 @@ def test_search_empty(self, tmp_data_dir: Path) -> None: from server import search_concepts result = json.loads(search_concepts("anything")) assert result == {} + + +class TestSearchLibrary: + def test_search_unified(self, tmp_data_dir: Path) -> None: + from server import search_library + lib_index = LibraryIndex(concepts={ + "OAuth 2.0": LibraryConceptEntry( + sources=[LibraryConceptSource(source="book:api-sec", chunks=["ch03-001"])], + aliases=["OAuth"], + related=["JWT"], + patterns=["credential-cycling"], + ), + }) + write_library_index(lib_index) + + result = json.loads(search_library("OAuth")) + assert "OAuth 2.0" in result + assert result["OAuth 2.0"]["patterns"] == ["credential-cycling"] + assert result["OAuth 2.0"]["related"] == ["JWT"] + + def test_search_by_related(self, tmp_data_dir: Path) -> None: + from server import search_library + lib_index = LibraryIndex(concepts={ + "token lifecycle": LibraryConceptEntry( + sources=[LibraryConceptSource(source="book:auth", chunks=["ch01-001"])], + aliases=[], + related=["refresh tokens"], + patterns=["credential-cycling"], + ), + }) + write_library_index(lib_index) + + result = json.loads(search_library("refresh tokens")) + assert "token lifecycle" in result + + def test_search_empty_library(self, tmp_data_dir: Path) -> None: + from server import search_library + result = search_library("anything") + assert "No matches" in result + + +class TestSearchLibraryPatterns: + """Tests for pattern search within search_library (migrated from TestExplorePatterns).""" + + def test_search_by_pattern(self, tmp_data_dir: Path) -> None: + from server import search_library + lib_index = LibraryIndex( + concepts={ + "OAuth tokens": LibraryConceptEntry( + 
+                    sources=[LibraryConceptSource(source="book:api-sec", chunks=["ch03-001"])],
+                    aliases=[],
+                    related=[],
+                    patterns=["credential-cycling"],
+                ),
+                "TLS certs": LibraryConceptEntry(
+                    sources=[LibraryConceptSource(source="book:tls", chunks=["ch08-001"])],
+                    aliases=[],
+                    related=[],
+                    patterns=["credential-cycling"],
+                ),
+            },
+            patterns={
+                "credential-cycling": [
+                    PatternEntry(concept="OAuth tokens", source="book:api-sec", chunks=["ch03-001"]),
+                    PatternEntry(concept="TLS certs", source="book:tls", chunks=["ch08-001"]),
+                ],
+            },
+        )
+        write_library_index(lib_index)
+
+        result = json.loads(search_library("credential"))
+        assert "OAuth tokens" in result
+        assert "TLS certs" in result
+
+    def test_no_match_shows_available_patterns(self, tmp_data_dir: Path) -> None:
+        from server import search_library
+        lib_index = LibraryIndex(
+            concepts={},
+            patterns={"retry-with-backoff": []},
+        )
+        write_library_index(lib_index)
+
+        result = search_library("nonexistent")
+        assert "retry-with-backoff" in result
+        assert "No matches" in result
+
+
+class TestPreviewChunks:
+    def test_preview_existing(self, tmp_data_dir: Path) -> None:
+        from server import preview_chunks
+        nav = BookNav(
+            book_id="test-book",
+            chunks={
+                "ch01-001": ChunkIndexEntry(
+                    section="Intro", concepts=["testing"], tokens=420,
+                    prev=None, next="ch01-002",
+                ),
+            },
+        )
+        write_book_nav(nav)
+
+        result = json.loads(preview_chunks("test-book", ["ch01-001", "ch01-999"]))
+        assert result["ch01-001"]["section"] == "Intro"
+        assert result["ch01-001"]["concepts"] == ["testing"]
+        assert result["ch01-001"]["tokens"] == 420
+        assert result["ch01-999"] is None
+
+    def test_preview_nonexistent_book(self, tmp_data_dir: Path) -> None:
+        from server import preview_chunks
+        result = preview_chunks("nonexistent", ["ch01-001"])
+        assert "No navigation data" in result
diff --git a/tests/test_storage.py b/tests/test_storage.py
index 48e0ed6..1b7eb11 100644
--- a/tests/test_storage.py
+++ b/tests/test_storage.py
@@ -4,16 +4,26 @@
from pathlib import Path

-from lib.models import Catalog, CatalogEntry, ConceptEntry, Manifest
-from lib.storage import (
+from lib.models import (  # pyright: ignore[reportAttributeAccessIssue]
+    Catalog,
+    CatalogEntry,
+    ConceptEntry,
+    LibraryConceptEntry,  # pyright: ignore[reportAttributeAccessIssue]
+    LibraryConceptSource,  # pyright: ignore[reportAttributeAccessIssue]
+    LibraryIndex,  # pyright: ignore[reportAttributeAccessIssue]
+    Manifest,
+)
+from lib.storage import (  # pyright: ignore[reportAttributeAccessIssue]
    read_catalog,
    read_chunk,
    read_chunks,
+    read_library_index,  # pyright: ignore[reportAttributeAccessIssue]
    read_manifest,
    search_concepts,
    update_catalog_entry,
    write_catalog,
    write_chunk,
+    write_library_index,  # pyright: ignore[reportAttributeAccessIssue]
    write_manifest,
    list_chunks,
    book_exists,
@@ -150,5 +160,33 @@ def test_backward_compat_no_aliases(self, tmp_data_dir: Path) -> None:
        loaded = read_manifest("old-book")
        assert loaded is not None
        assert loaded.concept_index["testing"][0].aliases == []
-
-
+        assert loaded.concept_index["testing"][0].patterns == []  # pyright: ignore[reportAttributeAccessIssue]
+
+
+class TestLibraryIndex:
+    def test_read_empty(self, tmp_data_dir: Path) -> None:
+        lib_index = read_library_index()
+        assert lib_index.concepts == {}
+
+    def test_write_and_read(self, tmp_data_dir: Path) -> None:
+        lib_index = LibraryIndex(concepts={
+            "OAuth 2.0": LibraryConceptEntry(
+                sources=[
+                    LibraryConceptSource(source="book:api-security", chunks=["ch03-s01-001"]),
LibraryConceptSource(source="book:web-auth", chunks=["ch07-s02-004"]), + ], + aliases=["OAuth", "OAuth2"], + related=["JWT", "access tokens"], + patterns=["credential-cycling", "time-bounded-trust"], + ), + }) + write_library_index(lib_index) + + loaded = read_library_index() + assert "OAuth 2.0" in loaded.concepts + entry = loaded.concepts["OAuth 2.0"] + assert len(entry.sources) == 2 + assert entry.sources[0].source == "book:api-security" + assert entry.aliases == ["OAuth", "OAuth2"] + assert entry.related == ["JWT", "access tokens"] + assert entry.patterns == ["credential-cycling", "time-bounded-trust"] diff --git a/tests/test_summariser.py b/tests/test_summariser.py index 2fead6d..5db98c2 100644 --- a/tests/test_summariser.py +++ b/tests/test_summariser.py @@ -55,6 +55,8 @@ def _llm_response_for_batch(batch_chapters: list[ChapterSummary]) -> str: sec = ch.sections[0] concepts[concept_name] = { "aliases": [f"alias_{ch.chapter_id}"], + "patterns": [f"pattern_{ch.chapter_id}"], + "related": [], "locations": [ {"ch": ch.chapter_id, "sec": sec.section_id, "chunks": sec.chunk_ids} ], @@ -100,6 +102,8 @@ def test_new_format_with_aliases_and_locations(self): data = { "Concept A": { "aliases": ["alias1"], + "patterns": ["credential-cycling"], + "related": ["Concept B"], "locations": [ {"ch": "ch01", "sec": "ch01-s01", "chunks": ["ch01-s01-001"]} ], @@ -117,6 +121,8 @@ def test_new_format_with_aliases_and_locations(self): assert m.sec == "ch01-s01" assert m.chunks == ["ch01-s01-001"] assert m.aliases == ["alias1"] + assert m.patterns == ["credential-cycling"] + assert m.related == ["Concept B"] def test_old_format_list(self): data = { @@ -130,6 +136,8 @@ def test_old_format_list(self): mappings = result["Concept B"] assert len(mappings) == 1 assert mappings[0].aliases == [] + assert mappings[0].patterns == [] + assert mappings[0].related == [] assert mappings[0].ch == "ch02" def test_multiple_locations(self): @@ -215,6 +223,8 @@ def side_effect(config, prompt, **kwargs): return json.dumps({ shared_concept: { "aliases": ["shared"], + "patterns": ["test-pattern"], + "related": [], "locations": [ {"ch": "ch01", "sec": "ch01-s01", "chunks": ["ch01-s01-001"]} ], @@ -226,6 +236,8 @@ def side_effect(config, prompt, **kwargs): return json.dumps({ shared_concept: { "aliases": ["shared"], + "patterns": ["test-pattern"], + "related": [], "locations": [ {"ch": ch_id, "sec": f"{ch_id}-s01", "chunks": [f"{ch_id}-s01-001"]} ], @@ -258,6 +270,8 @@ def side_effect(config, prompt, **kwargs): return json.dumps({ shared_concept: { "aliases": ["shared", "sc"], + "patterns": ["test-pattern"], + "related": [], "locations": [ {"ch": "ch01", "sec": "ch01-s01", "chunks": ["ch01-s01-001"]}, ], @@ -290,6 +304,8 @@ def test_concept_cap_at_50(self, mock_provider, mock_llm): ] concepts[f"Concept {i:03d}"] = { "aliases": [f"c{i}"], + "patterns": [f"pattern-{i}"], + "related": [], "locations": locations, } mock_llm.return_value = json.dumps(concepts)