diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json
index 4eab899..128069e 100644
--- a/.claude-plugin/plugin.json
+++ b/.claude-plugin/plugin.json
@@ -1,8 +1,15 @@
{
"name": "agentlib",
- "version": "1.4.0",
- "description": "Agentic Knowledge Navigation — ingest books/papers/databases into chunked metadata layers, then navigate them via a universal skill. No MCP server required.",
+ "version": "1.8.0",
+ "description": "Agentic Knowledge Navigation — ingest books and papers into a curated library, then navigate via MCP tools or file-based agent.",
"author": {
"name": "Nadav Barkai"
+ },
+ "mcpServers": {
+ "agentlib": {
+ "command": "/usr/bin/env",
+ "args": ["uv", "run", "--project", "${CLAUDE_PLUGIN_ROOT}", "python", "${CLAUDE_PLUGIN_ROOT}/server.py"],
+ "cwd": "${CLAUDE_PLUGIN_ROOT}"
+ }
}
}
diff --git a/README.md b/README.md
index 226dee9..157a11c 100644
--- a/README.md
+++ b/README.md
@@ -20,36 +20,47 @@ AgentLib changes this. Ingest the books, papers, and documents that matter for y
AgentLib has three parts:
1. **Ingestion pipelines** — preprocess books, scientific paper corpora, and databases into small, self-contained chunks with lightweight metadata at multiple layers.
-2. **Universal navigation skill** (`agentlib-knowledge`) — teaches the agent to read cheap metadata first, then drill into specific chunks.
-3. **Research agent** (`library-researcher`) — runs in an isolated context to keep the main conversation clean. All navigation and chunk reading happens in the agent's context; only a synthesized answer returns.
+2. **MCP tools** — the plugin registers an MCP server with 6 tools: `browse_library`, `open_book`, `search_library`, `search_concepts`, `preview_chunks`, `read_chunks`. The agent calls these directly — no sub-agent needed.
+3. **Universal navigation skill** (`agentlib-knowledge`) — teaches the agent to search cheap metadata first, then drill into specific chunks via `search_library` → `preview_chunks` → `read_chunks`.
-No MCP server required. No tool calls. The agent reads preprocessed files directly from `~/.claude/plugins/agentlib/library/`.
+The agent navigates via MCP tool calls against preprocessed files in `~/.claude/plugins/agentlib/library/`.
### How agents navigate the library
```mermaid
graph LR
- Q["User question"] --> R["library-researcher
(isolated context)"]
- R --> NAV["NAVIGATION.md
~50 tok per book"]
- R --> CS["concepts.json (Ls)
~200 tok"]
- R --> CAT["catalog (L0)
~50 tok per book"]
+ Q["User question"] --> SL["search_library
concepts + patterns
library_index.json"]
+ SL --> PC["preview_chunks
chunk metadata
nav.json"]
+ PC --> RC["read_chunks
2-3 best chunks
300-500 tok each"]
+ RC --> A["Answer with citations"]
+```
- CS --> M{"concept/alias<br/>match?"}
- M -- hit --> CH["chunks (L2)<br/>300-500 tok each"]
- M -- miss --> MAN["manifest (L1)<br/>~500 tok"]
- MAN --> CH
+**Fast path (concept hit):** `search_library` → `preview_chunks` → `read_chunks` — **3 tool calls, ~1.5k tokens**
- NAV --> CH
- CAT --> MAN
+**Pattern path (cross-domain):** `search_library` (pattern tags) → `preview_chunks` → `read_chunks` — **3 tool calls, ~2.5k tokens**
- CH --> A["Synthesized answer<br/>(returned to user)"]
-```
+**Recovery on miss:** related concepts → pattern traversal → `search_concepts` per book → Grep fallback
+
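+A minimal sketch of the fast path, calling the tool functions in `server.py` directly (the agent invokes the same tools via MCP; the book and chunk IDs below are illustrative, and `read_chunks` is assumed to take `(book_id, chunk_ids)` as documented in the skill):
+
+```python
+from server import search_library, preview_chunks, read_chunks
+
+hits = search_library("token rotation")  # JSON string: concepts with aliases, related, patterns, sources
+meta = preview_chunks("api-security", ["ch03-s02-001", "ch03-s02-002"])  # section, concepts, tokens, prev/next
+text = read_chunks("api-security", ["ch03-s02-002"])  # full content of the picked chunks
+```
+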
+#### Unified library index
+
+`library_index.json` is the single entry point for the entire library. One file, all books and corpora — queried via `search_library`. Each concept carries (shape sketched after this list):
+
+- **aliases** — abbreviations, acronyms, synonyms (searching "CDX" matches "CycloneDX")
+- **related** — directly connected concepts in the same domain ("OAuth 2.0" → "JWT", "access tokens")
+- **patterns** — abstract structural fingerprints for cross-domain discovery (see below)
+- **sources** — which books/papers contain the concept and their chunk IDs
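+
+A sketch of the on-disk shape, mirroring `LibraryIndex.to_dict()` (the book ID, concept, and chunk IDs are illustrative):
+
+```python
+{
+    "concepts": {
+        "OAuth 2.0": {
+            "sources": [{"source": "book:api-security", "chunks": ["ch03-s02-001"]}],
+            "aliases": ["OAuth"],
+            "related": ["JWT", "access tokens"],
+            "patterns": ["credential-cycling"],
+        }
+    },
+    "patterns": {
+        "credential-cycling": [
+            {"concept": "OAuth 2.0", "source": "book:api-security", "chunks": ["ch03-s02-001"]}
+        ]
+    },
+}
+```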
-**Ls hit (fast path):** NAVIGATION → concepts.json → chunks — **2-3 reads, ~1k tokens**
+#### Pattern fingerprints — associative recall
-**Ls miss (slow path):** NAVIGATION → catalog → manifest → chunks — **5-6 reads, ~5k tokens**
+Every concept is tagged with 1-3 **pattern fingerprints**: abstract, domain-independent descriptors of its structural nature. These enable a "this reminds me of..." capability that keyword search cannot provide.
-The concept index includes **aliases** (abbreviations, acronyms, synonyms) generated by the LLM at ingestion time. Searching "CDX" matches the alias on "CycloneDX"; searching "SBOM" matches "Software Bill of Materials". This turns misses into hits without any runtime cost.
+For example, "OAuth token rotation", "TLS certificate renewal", and "SSH key rotation" all share the pattern `credential-cycling`. An agent reading about token rotation can discover structurally analogous solutions in completely different books — without any keyword overlap.
+
+Pattern tags are integrated directly into `library_index.json` and searchable via `search_library`. A seed vocabulary of ~40 common patterns ensures consistency across books; fuzzy matching merges near-duplicates.
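+
+The merge step uses character-trigram Jaccard similarity with a 0.7 threshold, mirroring the ingestion pipeline's `_fuzzy_match_pattern`; a minimal sketch:
+
+```python
+def trigrams(s: str) -> set[str]:
+    return {s[i:i + 3] for i in range(max(0, len(s) - 2))}
+
+def jaccard(a: str, b: str) -> float:
+    ta, tb = trigrams(a), trigrams(b)
+    return len(ta & tb) / len(ta | tb) if (ta | tb) else 0.0
+
+jaccard("rate-limit", "rate-limiting")     # ~0.73 -> merged (>= 0.7)
+jaccard("feedback-loop", "rate-limiting")  # ~0.0  -> kept separate
+```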
+
+#### Chunk preview via nav.json
+
+Each book's `nav.json` lets agents see what's inside each chunk *before* reading it: section title, concepts covered, token count, and prev/next chains. Queried via `preview_chunks`, this eliminates blind reads — the agent picks the 2-3 best chunks from a set of candidates instead of reading 5 and hoping.
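+
+Each entry follows the shape of `ChunkIndexEntry.to_dict()` (the section title and chunk IDs are illustrative):
+
+```python
+{
+    "ch03-s02-002": {
+        "section": "Tokens > Rotation",
+        "concepts": ["OAuth 2.0", "token lifecycle"],
+        "tokens": 412,
+        "prev": "ch03-s02-001",
+        "next": "ch03-s02-003",
+    }
+}
+```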
@@ -64,37 +75,41 @@ The concept index includes **aliases** (abbreviations, acronyms, synonyms) gener
-### Three metadata layers
+### Metadata layers
```
-L0 "What exists?" → catalog/NAVIGATION.md: ~50 tokens per book (cheap)
-L1 "What's inside?" → manifest: structure, summaries, concepts (moderate)
-L2 "Give me the content" → small self-contained chunks, 300-500 tok (expensive)
+Lx "What do I know?" → library_index.json: concepts, patterns, sources (search_library)
+Ln "What's in a book?" → nav.json: structure + chunk metadata + concepts (preview_chunks)
+L2 "Give me the content" → chunks: 300-500 tok each (read_chunks)
+Lf "Full rebuild" → manifest.json: complete archive per book (offline)
```
+Three files instead of six — `library_index.json` (1 file, entire library), `nav.json` (per book), and `manifest.json` (per book, full archive for rebuild).
+
Chunks are **content-aware**: tables and code fences are kept atomic (soft cap 500, hard cap 1,000 tokens). PDF tables are extracted via PyMuPDF and rendered as markdown pipe tables. Figures are extracted from PDFs with vision-based summarization, appearing as placeholders in chunks.
-Plus a **concept index** shortcut (Ls) that jumps directly to relevant chunks when the agent already knows what it's looking for. Each concept carries LLM-generated aliases so the agent can find it by abbreviation, acronym, or alternative phrasing.
+The concept index includes LLM-generated **aliases**, **related concepts**, and **pattern fingerprints** — turning keyword misses into graph traversals and enabling cross-domain discovery.
### Library structure
```
library/
-├── NAVIGATION.md ← Start here — index of everything
+├── library_index.json ← Lx: unified concept + pattern discovery
├── books/
-│ ├── catalog.json ← L0
+│ ├── catalog.json
│ └── {book-id}/
-│ ├── manifest.compact.json ← L1
-│ ├── concepts.json ← Ls
+│ ├── nav.json ← Ln: structure + chunk metadata + concepts
+│ ├── manifest.json ← Lf: full archive for rebuild
│ └── chunks/
│ └── {chunk-id}.md ← L2
└── corpus/
└── {corpus-id}/
- ├── corpus_catalog.json ← L0 (topic clusters)
- ├── concept_index.json ← Ls (cross-paper concepts)
- ├── clusters/{cluster-id}.json ← L0b (papers per cluster)
+ ├── corpus_catalog.json
+ ├── concept_index.json
+ ├── clusters/{cluster-id}.json
└── papers/{paper-id}/
- ├── manifest.compact.json ← L1
+ ├── nav.json ← Ln
+ ├── manifest.json ← Lf
└── chunks/{chunk-id}.md ← L2
```
@@ -165,7 +180,7 @@ Simulated on realistic workloads (15-book library, 487-paper corpus, 80-table da
| Wrong reads/queries | 1 | 0 | 1 | 0 | 2 | 0 |
| **Token reduction** | | **82%** | | **55%** | | **55%** |
-The core principle: *no heavy indexing, no vector databases — just smart, lightweight metadata and small content blobs.*
+The core principle: *no vector databases — just smart, interconnected metadata structures. Concepts link to related concepts, abstract patterns connect ideas across domains, and chunk previews eliminate blind reads.*
## Install
@@ -211,7 +226,7 @@ Ingestion runs chapter summarization in parallel and batches concept extraction
**Explicit invocation** — prefix with `/agentlib-knowledge` when you want the library's answer, not Claude's training data:
> /agentlib-knowledge What defensive techniques protect against prompt injection?
-The skill delegates to the `library-researcher` agent, which navigates `NAVIGATION.md` → concept indexes → specific chunks in an isolated context. Only the synthesized answer with citations returns to your conversation.
+The skill uses MCP tools directly: `search_library` → `preview_chunks` → `read_chunks`. Only the synthesized answer with citations returns to your conversation. Pattern tags integrated into `search_library` enable cross-domain analogies automatically.
## LLM Providers
diff --git a/agents/library-researcher.md b/agents/library-researcher.md
index cd8d78c..fe787f6 100644
--- a/agents/library-researcher.md
+++ b/agents/library-researcher.md
@@ -1,42 +1,60 @@
---
name: library-researcher
description: "Research questions using the preprocessed knowledge library. Use when answering questions about ingested books, scientific papers, or domain knowledge that may be in the library."
-model: haiku
+model: sonnet
tools: Read, Glob, Grep
-maxTurns: 15
+maxTurns: 25
---
You are a research assistant. Follow this sequence to answer questions.
**IMPORTANT:** Use ABSOLUTE paths only — never use `~/` (it won't resolve in your context). The library path will be provided in your prompt.
-## Step 1: Read the index (1 read)
-Read `{library}/NAVIGATION.md`. Identify which books or corpora are relevant.
+## Step 1: Unified library search (1 read)
+Read `{library}/library_index.json`. This contains ALL concepts across ALL books and corpora with:
+- **aliases**: alternative names, abbreviations, acronyms
+- **related**: directly connected concepts in the same domain
+- **patterns**: abstract structural fingerprints (e.g. "credential-cycling", "retry-with-backoff")
+- **sources**: which books/papers contain this concept and their chunk IDs
-## Step 2: Find chunk IDs (1-2 reads)
+If `library_index.json` doesn't exist, fall back to reading `{library}/NAVIGATION.md` and then per-book `nav.json`.
-**Try concepts.json first** (fastest):
-- Books: `{library}/books/{book-id}/concepts.json`
-- Corpora: `{library}/corpus/{corpus-id}/concept_index.json`
+## Step 2: Preview chunks — MANDATORY (1 read)
+**NEVER read chunk files without previewing first.** This is the most important efficiency rule.
-Each concept has `"chunks"` (list of chunk IDs) and optionally `"aliases"` (alternative names, abbreviations, acronyms). When scanning for your topic, check BOTH the concept name AND its aliases — your search term may match an alias rather than the primary name.
+Read `{library}/books/{book-id}/nav.json` to assess candidates:
+- The `chunks` section shows each chunk's **section**, **concepts**, **token count**, and **prev/next** links
+- The `concepts` section maps concept names to their chunk IDs
-If concepts.json has a match → note chunk IDs → go to Step 3.
+Pick only the 2-3 most relevant chunks. Skip chunks whose section/concepts don't match your query. Reading unnecessary chunks wastes tokens.
-**If no match in concepts**, use Grep on chunks directory:
-```
-Grep pattern: "your search term" path: "{library}/books/{book-id}/chunks/"
-```
-This finds which chunks contain relevant content. Note the filenames.
+## Step 2b: Cross-domain insight (optional)
+If the concept has **pattern** tags (e.g. "credential-cycling"), look up the pattern in `library_index.json`'s `patterns` section to discover structurally similar concepts in other domains. This enables "this reminds me of..." connections.
+
+Only do this when the user's question could benefit from cross-domain analogies.
## Step 3: Read chunks (2-5 reads)
Read the specific chunk files identified in Step 2.
+- If you need more context, follow **prev/next** links from nav.json
+- Books: `{library}/books/{book-id}/chunks/{chunk-id}.md`
+- Corpora: `{library}/corpus/{corpus-id}/papers/{paper-id}/chunks/{chunk-id}.md`
## Step 4: Return answer
-Synthesize a clear answer citing source (book/paper title and chunk IDs).
+Synthesize a clear answer citing sources (book/paper title and chunk IDs). Keep your response under 2000 characters. Cite sources but don't include raw chunk text.
+
+If patterns revealed cross-domain analogies, mention them: "This follows the same structural pattern as [X] in [other book]."
+
+## Recovery: concept miss
+If library_index.json has no match:
+1. Check **related** concepts — your term may be a sub-concept of something indexed
+2. Check **pattern** tags in library_index.json — search by structural shape instead of name
+3. Fall back to `{library}/books/{book-id}/nav.json` concepts section with alias matching
+4. Last resort: Grep on chunks directory
## Rules
- ALWAYS use absolute paths, never `~/`
-- Try concepts.json FIRST, use Grep only as fallback
-- Do NOT read manifest.compact.json — it's too large
-- Total: max 3 navigation reads + 5 content chunks
+- Start with library_index.json (fastest: 1 file covers entire library)
+- **NEVER skip the preview step — read nav.json BEFORE any chunk files**
+- Total: max 4 navigation reads + 5 content chunks
+- Cite the book/paper and chunk ID when answering
+- **If you're running low on turns, STOP researching and synthesize an answer from what you have.** A partial answer with citations is better than no answer. Never return mid-thought narration.
diff --git a/commands/agentlib-ingest-book.md b/commands/agentlib-ingest-book.md
index c786419..d36cc6f 100644
--- a/commands/agentlib-ingest-book.md
+++ b/commands/agentlib-ingest-book.md
@@ -16,7 +16,9 @@ This will:
1. Parse the PDF/EPUB to extract chapter/section structure
2. Chunk the content into 300-500 token segments
3. Summarise each chapter using the configured LLM provider
-4. Build a concept index for fast search
-5. Write manifest and update the library catalog
+4. Build a concept index with aliases, pattern fingerprints, and related concepts
+5. Generate nav.json (per-book navigation: structure, chunk preview, concepts)
+6. Update the unified library_index.json (concepts + patterns)
+7. Write manifest and update the library catalog
-After ingestion, the book is available in the library. The agent navigates it via the `/agentlib-knowledge` skill by reading catalog.json, manifest.compact.json, concepts.json, and chunks/*.md
+After ingestion, the book is available in the library. The agent navigates it via the `/agentlib-knowledge` skill, starting with library_index.json for unified cross-library search.
diff --git a/commands/agentlib-ingest-corpus.md b/commands/agentlib-ingest-corpus.md
index d0520a2..46d083f 100644
--- a/commands/agentlib-ingest-corpus.md
+++ b/commands/agentlib-ingest-corpus.md
@@ -15,6 +15,7 @@ This will:
2. Parse and chunk each paper into 300-500 token segments
3. Summarise each paper's sections using the configured LLM provider
4. Cluster papers by topic
-5. Build a cross-paper concept index
+5. Build a cross-paper concept index with pattern fingerprints
+6. Update the unified library_index.json (concepts + patterns)
-After ingestion, use `/agentlib-knowledge` to query the corpus.
+After ingestion, use `/agentlib-knowledge` to query the corpus. The agent can discover connections between corpus papers and ingested books through shared pattern fingerprints.
diff --git a/commands/agentlib-library.md b/commands/agentlib-library.md
index 3d7dc6f..d563841 100644
--- a/commands/agentlib-library.md
+++ b/commands/agentlib-library.md
@@ -11,4 +11,4 @@ If a book ID is provided (`$ARGUMENTS`), show the detailed structure of that boo
Read directly from the library:
- No args: Read ~/.claude/plugins/agentlib/library/books/catalog.json and display as a formatted table
-- With book ID: Read ~/.claude/plugins/agentlib/library/books/{book-id}/manifest.compact.json and display the chapter structure
+- With book ID: Read ~/.claude/plugins/agentlib/library/books/{book-id}/nav.json and display the chapter structure
diff --git a/lib/models.py b/lib/models.py
index 77b9d3d..c14c3a6 100644
--- a/lib/models.py
+++ b/lib/models.py
@@ -75,6 +75,183 @@ class ConceptEntry:
sec: str
chunks: list[str] = field(default_factory=list)
aliases: list[str] = field(default_factory=list)
+ patterns: list[str] = field(default_factory=list)
+ related: list[str] = field(default_factory=list)
+
+
+# ---------------------------------------------------------------------------
+# Library-wide navigation structures
+# ---------------------------------------------------------------------------
+
+@dataclass
+class LibraryConceptSource:
+ """A concept's presence in a specific source (book or corpus paper)."""
+ source: str  # "book:<book-id>" or "corpus:<corpus-id>:<paper-id>"
+ chunks: list[str] = field(default_factory=list)
+
+ def to_dict(self) -> dict[str, Any]:
+ return asdict(self)
+
+ @classmethod
+ def from_dict(cls, data: dict[str, Any]) -> LibraryConceptSource:
+ return cls(**{k: v for k, v in data.items() if k in cls.__dataclass_fields__})
+
+
+@dataclass
+class LibraryConceptEntry:
+ """A concept in the unified library index with cross-source presence."""
+ sources: list[LibraryConceptSource] = field(default_factory=list)
+ aliases: list[str] = field(default_factory=list)
+ related: list[str] = field(default_factory=list)
+ patterns: list[str] = field(default_factory=list)
+
+ def to_dict(self) -> dict[str, Any]:
+ return {
+ "sources": [s.to_dict() for s in self.sources],
+ "aliases": self.aliases,
+ "related": self.related,
+ "patterns": self.patterns,
+ }
+
+ @classmethod
+ def from_dict(cls, data: dict[str, Any]) -> LibraryConceptEntry:
+ sources = [LibraryConceptSource.from_dict(s) for s in data.get("sources", [])]
+ return cls(
+ sources=sources,
+ aliases=data.get("aliases", []),
+ related=data.get("related", []),
+ patterns=data.get("patterns", []),
+ )
+
+
+@dataclass
+class LibraryIndex:
+ """Unified cross-book/corpus concept index for the entire library."""
+ concepts: dict[str, LibraryConceptEntry] = field(default_factory=dict)
+ patterns: dict[str, list[PatternEntry]] = field(default_factory=dict)
+
+ def to_dict(self) -> dict[str, Any]:
+ return {
+ "concepts": {k: v.to_dict() for k, v in self.concepts.items()},
+ "patterns": {k: [e.to_dict() for e in v] for k, v in self.patterns.items()},
+ }
+
+ @classmethod
+ def from_dict(cls, data: dict[str, Any]) -> LibraryIndex:
+ # Support both old format (flat concept dict) and new format (with "concepts" key)
+ if "concepts" in data and isinstance(data["concepts"], dict):
+ concepts = {k: LibraryConceptEntry.from_dict(v) for k, v in data["concepts"].items()}
+ else:
+ # Legacy: top-level keys are concepts (no "concepts"/"patterns" wrapper)
+ concepts = {k: LibraryConceptEntry.from_dict(v) for k, v in data.items()
+ if k not in ("patterns",)}
+ patterns = {k: [PatternEntry.from_dict(e) for e in v] for k, v in data.get("patterns", {}).items()}
+ return cls(concepts=concepts, patterns=patterns)
+
+ def to_json(self) -> str:
+ return json.dumps(self.to_dict(), indent=2)
+
+ @classmethod
+ def from_json(cls, text: str) -> LibraryIndex:
+ return cls.from_dict(json.loads(text))
+
+
+@dataclass
+class PatternEntry:
+ """A concept linked to a pattern in the pattern index."""
+ concept: str
+ source: str  # "book:<book-id>" or "corpus:<corpus-id>:<paper-id>"
+ chunks: list[str] = field(default_factory=list)
+
+ def to_dict(self) -> dict[str, Any]:
+ return asdict(self)
+
+ @classmethod
+ def from_dict(cls, data: dict[str, Any]) -> PatternEntry:
+ return cls(**{k: v for k, v in data.items() if k in cls.__dataclass_fields__})
+
+
+@dataclass
+class ChunkIndexEntry:
+ """Metadata about a single chunk for pre-read assessment."""
+ section: str
+ concepts: list[str] = field(default_factory=list)
+ tokens: int = 0
+ prev: str | None = None
+ next: str | None = None
+
+ def to_dict(self) -> dict[str, Any]:
+ d: dict[str, Any] = {"section": self.section, "concepts": self.concepts, "tokens": self.tokens}
+ if self.prev:
+ d["prev"] = self.prev
+ if self.next:
+ d["next"] = self.next
+ return d
+
+ @classmethod
+ def from_dict(cls, data: dict[str, Any]) -> ChunkIndexEntry:
+ return cls(
+ section=data.get("section", ""),
+ concepts=data.get("concepts", []),
+ tokens=data.get("tokens", 0),
+ prev=data.get("prev"),
+ next=data.get("next"),
+ )
+
+
+@dataclass
+class ChunkIndex:
+ """Per-book index mapping chunk IDs to preview metadata."""
+ book_id: str
+ chunks: dict[str, ChunkIndexEntry] = field(default_factory=dict)
+
+ def to_dict(self) -> dict[str, Any]:
+ return {k: v.to_dict() for k, v in self.chunks.items()}
+
+ @classmethod
+ def from_dict(cls, book_id: str, data: dict[str, Any]) -> ChunkIndex:
+ chunks = {k: ChunkIndexEntry.from_dict(v) for k, v in data.items()}
+ return cls(book_id=book_id, chunks=chunks)
+
+ def to_json(self) -> str:
+ return json.dumps(self.to_dict(), indent=2)
+
+ @classmethod
+ def from_json(cls, book_id: str, text: str) -> ChunkIndex:
+ return cls.from_dict(book_id, json.loads(text))
+
+
+@dataclass
+class BookNav:
+ """Per-book navigation file: structure + chunk metadata + concepts."""
+ book_id: str
+ chapters: list[dict] = field(default_factory=list) # compact chapter/section structure
+ chunks: dict[str, ChunkIndexEntry] = field(default_factory=dict)
+ concepts: dict[str, dict] = field(default_factory=dict) # concept -> {chunks, aliases, patterns, related}
+
+ def to_dict(self) -> dict:
+ return {
+ "book_id": self.book_id,
+ "chapters": self.chapters,
+ "chunks": {k: v.to_dict() for k, v in self.chunks.items()},
+ "concepts": self.concepts,
+ }
+
+ @classmethod
+ def from_dict(cls, data: dict) -> BookNav:
+ return cls(
+ book_id=data["book_id"],
+ chapters=data.get("chapters", []),
+ chunks={k: ChunkIndexEntry.from_dict(v) for k, v in data.get("chunks", {}).items()},
+ concepts=data.get("concepts", {}),
+ )
+
+ def to_json(self) -> str:
+ return json.dumps(self.to_dict(), indent=2)
+
+ @classmethod
+ def from_json(cls, text: str) -> BookNav:
+ return cls.from_dict(json.loads(text))
@dataclass
@@ -307,6 +484,8 @@ class CorpusConceptEntry:
sections: dict[str, str] = field(default_factory=dict)
note: str = ""
aliases: list[str] = field(default_factory=list)
+ patterns: list[str] = field(default_factory=list)
+ related: list[str] = field(default_factory=list)
def to_dict(self) -> dict[str, Any]:
return asdict(self)
@@ -318,6 +497,8 @@ def from_dict(cls, data: dict[str, Any]) -> CorpusConceptEntry:
sections=data.get("sections", {}),
note=data.get("note", ""),
aliases=data.get("aliases", []),
+ patterns=data.get("patterns", []),
+ related=data.get("related", []),
)
diff --git a/lib/storage.py b/lib/storage.py
index 21af04c..0af3019 100644
--- a/lib/storage.py
+++ b/lib/storage.py
@@ -1,4 +1,11 @@
-"""Filesystem I/O layer for AgentLib data storage."""
+"""Filesystem I/O layer for AgentLib data storage.
+
+Superseded functions (removed in v1.8.0):
+- write_chunk_index / read_chunk_index -> use nav.json via write_book_nav / read_book_nav
+- write_pattern_index / read_pattern_index -> patterns merged into library_index.json
+- write_compact_manifest -> use nav.json
+- write_concept_index -> concepts merged into nav.json
+"""
from __future__ import annotations
import os
@@ -6,10 +13,12 @@
from pathlib import Path
from lib.models import (
+ BookNav, # pyright: ignore[reportAttributeAccessIssue]
Catalog,
CatalogEntry,
CorpusCatalog,
CorpusConceptIndex,
+ LibraryIndex, # pyright: ignore[reportAttributeAccessIssue]
Manifest,
PaperManifest,
PaperMetadata,
@@ -220,74 +229,43 @@ def search_concepts(query: str, book_id: str | None = None) -> dict[str, list[di
# ---------------------------------------------------------------------------
-# Zero-server mode: compact files for file-based navigation
+# Per-book navigation (nav.json — structure + chunks + concepts)
# ---------------------------------------------------------------------------
-def write_compact_manifest(manifest: Manifest) -> Path:
- """Write a compact manifest optimized for agent navigation (~500-2k tokens)."""
- import json
-
- _validate_path_component(manifest.book_id, "book_id")
- compact: dict = {
- "book_id": manifest.book_id,
- "chapters": [],
- "concepts": sorted(manifest.concept_index.keys()),
- }
- for ch in manifest.chapters:
- compact["chapters"].append({
- "id": ch.id,
- "title": ch.title,
- "summary": ch.summary[:100] + "..." if len(ch.summary) > 100 else ch.summary,
- "concepts": ch.key_concepts[:3],
- "sections": [
- {"id": s.id, "title": s.title, "chunks": len(s.chunk_ids)}
- for s in ch.sections
- ],
- })
-
- path = _safe_join(_books_root(), manifest.book_id, "manifest.compact.json")
+def write_book_nav(nav: BookNav) -> None:
+ """Write per-book navigation file (structure + chunks + concepts)."""
+ path = _safe_join(_books_root(), nav.book_id, "nav.json")
path.parent.mkdir(parents=True, exist_ok=True)
- path.write_text(json.dumps(compact, indent=2), encoding="utf-8")
- return path
+ path.write_text(nav.to_json(), encoding="utf-8")
-def write_concept_index(book_id: str, concept_index: dict) -> Path:
- """Write a concept index file with aliases for file-based navigation.
+def read_book_nav(book_id: str) -> BookNav | None:
+ """Read per-book navigation file. Returns None if missing."""
+ _validate_path_component(book_id, "book_id")
+ path = _safe_join(_books_root(), book_id, "nav.json")
+ if not path.exists():
+ return None
+ return BookNav.from_json(path.read_text(encoding="utf-8"))
- Output format: {concept_name: {"chunks": [...], "aliases": [...]}}
- Falls back to {concept_name: {"chunks": [...]}} when no aliases exist.
- """
- import json
- _validate_path_component(book_id, "book_id")
- flat: dict[str, dict[str, list[str]]] = {}
- for concept, entries in concept_index.items():
- chunk_ids: list[str] = []
- aliases: list[str] = []
- for entry in entries:
- # Duck-type: callers pass ConceptEntry objects (with .chunks attr)
- # or dicts (from deserialized JSON). Normalizing callers is out of
- # scope for this PR.
- if hasattr(entry, "chunks"):
- chunk_ids.extend(entry.chunks)
- elif isinstance(entry, dict):
- chunk_ids.extend(entry.get("chunks", []))
- # Collect aliases
- if hasattr(entry, "aliases"):
- aliases.extend(entry.aliases)
- elif isinstance(entry, dict):
- aliases.extend(entry.get("aliases", []))
- if chunk_ids:
- entry_data: dict[str, list[str]] = {"chunks": chunk_ids}
- # Deduplicate aliases
- unique_aliases = list(dict.fromkeys(aliases))
- if unique_aliases:
- entry_data["aliases"] = unique_aliases
- flat[concept] = entry_data
-
- path = _safe_join(_books_root(), book_id, "concepts.json")
+# ---------------------------------------------------------------------------
+# Library index (unified cross-book/corpus concept index)
+# ---------------------------------------------------------------------------
+
+def read_library_index() -> LibraryIndex:
+ """Read the unified library index. Returns empty index if not found."""
+ path = _data_root() / "library_index.json"
+ if not path.exists():
+ return LibraryIndex()
+ return LibraryIndex.from_json(path.read_text(encoding="utf-8"))
+
+
+def write_library_index(index: LibraryIndex) -> Path:
+ """Write the unified library index."""
+ path = _data_root() / "library_index.json"
path.parent.mkdir(parents=True, exist_ok=True)
- path.write_text(json.dumps(flat, indent=2), encoding="utf-8")
+ path.write_text(index.to_json(), encoding="utf-8")
return path
@@ -334,13 +312,23 @@ def write_navigation_md() -> Path:
"\n"
"## How to navigate\n"
"\n"
- "### Books -- Quick path (know what you need):\n"
- "1. Read `books/{book-id}/concepts.json` -- find chunk IDs for your concept (check aliases too)\n"
+ "### FASTEST: Unified library search (1 read, covers ALL books + corpora)\n"
+ "1. Read `library_index.json` -- find concepts across ALL sources with aliases, related concepts, and pattern tags\n"
+ "2. Read `books/{book-id}/nav.json` -- per-book navigation: structure, chunk preview, and concepts\n"
+ "3. Read `books/{book-id}/chunks/{chunk-id}.md` -- get the content\n"
+ "\n"
+ "### Cross-domain insight: Pattern-based discovery\n"
+ "1. Find a concept's `patterns` in `library_index.json` (e.g. `credential-cycling`)\n"
+ "2. Look up the pattern in `library_index.json`'s `patterns` section to find ALL concepts sharing that pattern\n"
+ "3. Discover structurally similar concepts in different domains\n"
+ "\n"
+ "### Books -- Quick path:\n"
+ "1. Read `books/{book-id}/nav.json` -- find chunk IDs by concept (check aliases, patterns, related), preview chunks\n"
"2. Read `books/{book-id}/chunks/{chunk-id}.md` -- get the content\n"
"\n"
"### Books -- Exploration path (browsing):\n"
"1. Read `books/catalog.json` -- see all available books (~50 tokens/book)\n"
- "2. Read `books/{book-id}/manifest.compact.json` -- see chapters, summaries, concepts (~500-2k tokens)\n"
+ "2. Read `books/{book-id}/nav.json` -- see chapters, sections, chunk preview, and concepts\n"
"3. Read `books/{book-id}/chunks/{chunk-id}.md` -- get specific content (~300-500 tokens)\n"
"\n"
"### Corpora -- Paper collections:\n"
@@ -348,19 +336,24 @@ def write_navigation_md() -> Path:
"2. Read `corpus/{corpus-id}/clusters/{cluster-id}.json` -- see papers with abstracts\n"
"3. Read `corpus/{corpus-id}/papers/{paper-id}/manifest.compact.json` -- paper structure\n"
"4. Read `corpus/{corpus-id}/papers/{paper-id}/chunks/{chunk-id}.md` -- paper content\n"
- "5. Read `corpus/{corpus-id}/concept_index.json` -- cross-paper concept search\n"
+ "5. Read `corpus/{corpus-id}/concept_index.json` -- cross-paper concept search (with patterns)\n"
"\n"
"## Token budget\n"
+ "- library_index.json: ~500-1500 tokens (entire library)\n"
+ "- nav.json: ~500-2000 tokens per book (structure + chunks + concepts)\n"
"- catalog.json: ~50 tokens per book\n"
- "- manifest.compact.json: ~500-2000 tokens per book\n"
+ "- manifest.json: full archive (use nav.json instead for navigation)\n"
"- Each chunk: ~300-500 tokens\n"
- "- concepts.json: ~200-500 tokens\n"
"- corpus_catalog.json: ~500-800 tokens\n"
"- concept_index.json: ~500-1500 tokens\n"
"\n"
"## Rules\n"
- "- NEVER read the full manifest.json -- use manifest.compact.json instead\n"
- "- NEVER read all chunks -- use concepts.json or manifest to find the right ones\n"
+ "- START with `library_index.json` for cross-library search (fastest path)\n"
+ "- Use `nav.json` to PREVIEW chunks before reading them\n"
+ "- Follow `prev`/`next` links in nav.json chunks for adjacent context\n"
+ "- Pattern discovery is built into `library_index.json` (no separate file needed)\n"
+ "- For browsing, use `nav.json` instead of manifest.json\n"
+ "- NEVER read all chunks -- use concepts or nav.json to find the right ones\n"
"- Max 10 chunks per question -- if you need more, refine your search\n"
"\n"
"## Current library\n"
diff --git a/lib/summariser.py b/lib/summariser.py
index a8bcdd0..745d558 100644
--- a/lib/summariser.py
+++ b/lib/summariser.py
@@ -38,6 +38,8 @@ class ConceptMapping:
sec: str
chunks: list[str] = field(default_factory=list)
aliases: list[str] = field(default_factory=list)
+ patterns: list[str] = field(default_factory=list)
+ related: list[str] = field(default_factory=list)
def _get_config(llm_config: LLMConfig | None) -> LLMConfig:
@@ -232,6 +234,7 @@ async def async_summarise_chapter(
)
+
def _format_chapters_text(chapters: list[ChapterSummary]) -> str:
"""Format chapter summaries into text for the concept extraction prompt."""
text = ""
@@ -251,9 +254,13 @@ def _parse_concept_response(data: dict) -> dict[str, list[ConceptMapping]]:
for concept, value in data.items():
if isinstance(value, dict) and "locations" in value:
aliases = value.get("aliases", [])
+ patterns = value.get("patterns", [])
+ related = value.get("related", [])
entries = value["locations"]
elif isinstance(value, list):
aliases = []
+ patterns = []
+ related = []
entries = value
else:
continue
@@ -264,6 +271,8 @@ def _parse_concept_response(data: dict) -> dict[str, list[ConceptMapping]]:
sec=e.get("sec", ""),
chunks=e.get("chunks", []),
aliases=aliases,
+ patterns=patterns,
+ related=related,
)
for e in entries
]
@@ -319,18 +328,27 @@ def extract_concepts(
{chapters_text}
-Create a concept index that maps key concepts to their locations. Each concept should appear with all relevant chapters, sections, and chunks where it's discussed. For each concept, include 2-3 aliases: abbreviations, acronyms, or alternative phrasings someone might search for.
+Create a concept index that maps key concepts to their locations. Each concept should appear with all relevant chapters, sections, and chunks where it's discussed.
+
+For each concept, include:
+- **aliases** (2-3): abbreviations, acronyms, or alternative phrasings someone might search for.
+- **patterns** (1-3): structural or methodological patterns the concept exemplifies (e.g., 'layered-architecture', 'feedback-loop', 'defense-in-depth'). Use lowercase-hyphenated format. Use consistent naming across concepts — two concepts that share a pattern are structurally analogous.
+- **related** (2-5): names of OTHER concepts in this same index that are closely related to this one.
Respond with ONLY valid JSON in this exact format:
{{
"concept_name_1": {{
"aliases": ["abbreviation", "synonym"],
+ "patterns": ["pattern-tag-1", "pattern-tag-2"],
+ "related": ["concept_name_2"],
"locations": [
{{"ch": "ch01", "sec": "ch01-s01", "chunks": ["ch01-s01-001", "ch01-s01-002"]}}
]
}},
"concept_name_2": {{
"aliases": ["alt_name"],
+ "patterns": ["pattern-tag-3"],
+ "related": ["concept_name_1"],
"locations": [
{{"ch": "ch02", "sec": "ch02-s03", "chunks": ["ch02-s03-001"]}}
]
@@ -356,21 +374,29 @@ def extract_concepts(
concept_key_map[key] = concept
all_concepts[concept] = list(mappings)
- # Deduplicate locations and aliases per concept
+ # Deduplicate locations, aliases, patterns, and related per concept
for concept in all_concepts:
seen: set[tuple] = set()
deduped: list[ConceptMapping] = []
all_aliases: list[str] = []
+ all_patterns: list[str] = []
+ all_related: list[str] = []
for m in all_concepts[concept]:
loc_key = (m.ch, m.sec, tuple(sorted(m.chunks)))
if loc_key not in seen:
seen.add(loc_key)
deduped.append(m)
all_aliases.extend(m.aliases)
- # Dedupe aliases preserving order
+ all_patterns.extend(m.patterns)
+ all_related.extend(m.related)
+ # Dedupe preserving order
unique_aliases: list[str] = list(dict.fromkeys(all_aliases))
+ unique_patterns: list[str] = list(dict.fromkeys(all_patterns))
+ unique_related: list[str] = [r for r in dict.fromkeys(all_related) if r != concept]
for m in deduped:
m.aliases = unique_aliases
+ m.patterns = unique_patterns
+ m.related = unique_related
all_concepts[concept] = deduped
# Cap total concepts
diff --git a/preprocessing/books.py b/preprocessing/books.py
index bd21c4b..04dd938 100644
--- a/preprocessing/books.py
+++ b/preprocessing/books.py
@@ -46,22 +46,29 @@ def _load_env() -> None:
from lib.chunker import chunk_sections, Chunk
from lib.models import (
+ BookNav,
CatalogEntry,
ChapterInfo,
+ ChunkIndexEntry,
ConceptEntry,
+ LibraryConceptEntry,
+ LibraryConceptSource,
Manifest,
ParsedSection,
+ PatternEntry,
SectionInfo,
)
from lib.parser import parse_file
from lib.storage import (
list_chunks,
read_catalog,
+ read_chunk,
+ read_library_index,
read_manifest,
update_catalog_entry,
+ write_book_nav,
write_chunk,
- write_compact_manifest,
- write_concept_index,
+ write_library_index,
write_manifest,
write_navigation_md,
)
@@ -123,6 +130,113 @@ def _group_chunks_by_section(
return dict(result)
+def _fuzzy_match_pattern(new_pattern: str, existing_patterns: set[str]) -> str:
+ """Match a new pattern to an existing one if similar enough, else return as-is.
+
+ Uses simple Jaccard similarity on character trigrams.
+ """
+ if new_pattern in existing_patterns:
+ return new_pattern
+
+ def _trigrams(s: str) -> set[str]:
+ return {s[i:i + 3] for i in range(max(0, len(s) - 2))}
+
+ new_tri = _trigrams(new_pattern)
+ if not new_tri:
+ return new_pattern
+
+ best_match = new_pattern
+ best_score = 0.0
+ for existing in existing_patterns:
+ existing_tri = _trigrams(existing)
+ if not existing_tri:
+ continue
+ intersection = len(new_tri & existing_tri)
+ union = len(new_tri | existing_tri)
+ score = intersection / union if union else 0.0
+ if score > best_score and score >= 0.7:
+ best_score = score
+ best_match = existing
+
+ return best_match
+
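+# Illustrative behaviour (values computed from the trigram Jaccard above):
+#   _fuzzy_match_pattern("rate-limit", {"rate-limiting"}) -> "rate-limiting" (similarity ~0.73)
+#   _fuzzy_match_pattern("feedback-loop", {"rate-limiting"}) -> "feedback-loop" (no match >= 0.7)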
+
+def _update_library_indices(book_id: str, manifest: Manifest) -> None:
+ """Update library_index.json (concepts + patterns) with this book's concepts."""
+ source_prefix = f"book:{book_id}"
+
+ lib_index = read_library_index()
+
+ # Remove stale concept entries for this book
+ for concept_name, entry in list(lib_index.concepts.items()):
+ entry.sources = [s for s in entry.sources if s.source != source_prefix]
+ if not entry.sources:
+ del lib_index.concepts[concept_name]
+
+ # Remove stale pattern entries for this book
+ existing_pattern_names = set(lib_index.patterns.keys())
+ for pattern_name, entries in list(lib_index.patterns.items()):
+ lib_index.patterns[pattern_name] = [
+ e for e in entries if e.source != source_prefix
+ ]
+ if not lib_index.patterns[pattern_name]:
+ del lib_index.patterns[pattern_name]
+ existing_pattern_names.discard(pattern_name)
+
+ # Merge this book's concepts
+ for concept_name, concept_entries in manifest.concept_index.items():
+ all_chunks: list[str] = []
+ all_aliases: list[str] = []
+ all_patterns: list[str] = []
+ all_related: list[str] = []
+ for ce in concept_entries:
+ all_chunks.extend(ce.chunks)
+ all_aliases.extend(ce.aliases)
+ all_patterns.extend(getattr(ce, "patterns", []))
+ all_related.extend(getattr(ce, "related", []))
+
+ source = LibraryConceptSource(source=source_prefix, chunks=all_chunks)
+
+ if concept_name in lib_index.concepts:
+ existing = lib_index.concepts[concept_name]
+ existing.sources.append(source)
+ # Merge aliases/patterns/related with dedup
+ existing.aliases = list(dict.fromkeys(existing.aliases + all_aliases))
+ existing.patterns = list(dict.fromkeys(existing.patterns + all_patterns))
+ existing.related = list(dict.fromkeys(
+ r for r in existing.related + all_related if r != concept_name
+ ))
+ else:
+ lib_index.concepts[concept_name] = LibraryConceptEntry(
+ sources=[source],
+ aliases=list(dict.fromkeys(all_aliases)),
+ related=list(dict.fromkeys(r for r in all_related if r != concept_name)),
+ patterns=list(dict.fromkeys(all_patterns)),
+ )
+
+ # Merge pattern entries into lib_index.patterns with fuzzy merge
+ for concept_name, concept_entries in manifest.concept_index.items():
+ all_chunks: list[str] = []
+ raw_patterns: list[str] = []
+ for ce in concept_entries:
+ all_chunks.extend(ce.chunks)
+ raw_patterns.extend(getattr(ce, "patterns", []))
+
+ unique_patterns = list(dict.fromkeys(raw_patterns))
+ for raw_pat in unique_patterns:
+ canonical = _fuzzy_match_pattern(raw_pat, existing_pattern_names)
+ if canonical not in lib_index.patterns:
+ lib_index.patterns[canonical] = []
+ lib_index.patterns[canonical].append(PatternEntry(
+ concept=concept_name,
+ source=source_prefix,
+ chunks=all_chunks,
+ ))
+ existing_pattern_names.add(canonical)
+
+ write_library_index(lib_index)
+
+
def ingest_book(
file_path: Path,
book_id: str | None = None,
@@ -360,7 +474,7 @@ async def _summarise_all() -> list[ChapterSummary]:
concept_index_raw: dict[str, list[ConceptEntry]] = {}
for concept, mappings in concept_mappings.items():
concept_index_raw[concept] = [
- ConceptEntry(ch=m.ch, sec=m.sec, chunks=m.chunks, aliases=m.aliases)
+ ConceptEntry(ch=m.ch, sec=m.sec, chunks=m.chunks, aliases=m.aliases, patterns=m.patterns, related=m.related)
for m in mappings
]
@@ -396,11 +510,113 @@ async def _summarise_all() -> list[ChapterSummary]:
write_manifest(manifest)
logger.info(" Wrote manifest.json")
- # Write zero-server navigation files
- write_compact_manifest(manifest)
- logger.info(" Wrote manifest.compact.json")
- write_concept_index(book_id, manifest.concept_index)
- logger.info(" Wrote concepts.json")
+ # Build chunk index entries for nav.json
+ chunk_idx_entries: dict[str, ChunkIndexEntry] = {}
+ # Build concept->chunk reverse map
+ chunk_concepts: dict[str, list[str]] = defaultdict(list)
+ for concept_name, entries in manifest.concept_index.items():
+ for entry in entries:
+ for cid in entry.chunks:
+ if concept_name not in chunk_concepts[cid]:
+ chunk_concepts[cid].append(concept_name)
+
+ # Build section label map from manifest
+ section_labels: dict[str, str] = {}
+ for ch in manifest.chapters:
+ for sec in ch.sections:
+ for cid in sec.chunk_ids:
+ section_labels[cid] = f"{ch.title} > {sec.title}"
+
+ # Populate chunk index with prev/next from written chunks
+ all_chunk_ids = list_chunks(book_id)
+ for cid in all_chunk_ids:
+ chunk_idx_entries[cid] = ChunkIndexEntry(
+ section=section_labels.get(cid, ""),
+ concepts=chunk_concepts.get(cid, []),
+ tokens=0, # filled below if available
+ )
+
+ # Set prev/next chains per section group and token counts
+ sec_groups: dict[str, list[str]] = defaultdict(list)
+ for cid in all_chunk_ids:
+ sec_id = _section_id_from_chunk(cid)
+ sec_groups[sec_id].append(cid)
+ for sec_id, cids in sec_groups.items():
+ for i, cid in enumerate(cids):
+ entry = chunk_idx_entries[cid]
+ if i > 0:
+ entry.prev = cids[i - 1]
+ if i < len(cids) - 1:
+ entry.next = cids[i + 1]
+
+ # Get token counts from the chunks we just created
+ if all_chunks:
+ for chunk in all_chunks:
+ if chunk.chunk_id in chunk_idx_entries:
+ chunk_idx_entries[chunk.chunk_id].tokens = chunk.meta.token_count
+ else:
+ # Fallback: read token counts from chunk file YAML frontmatter
+ for cid in all_chunk_ids:
+ content = read_chunk(book_id, cid)
+ if content and content.startswith("---"):
+ parts = content.split("---", 2)
+ if len(parts) >= 3:
+ for line in parts[1].splitlines():
+ if line.strip().startswith("token_count:"):
+ try:
+ token_val = int(line.strip().split(":", 1)[1].strip())
+ if cid in chunk_idx_entries:
+ chunk_idx_entries[cid].tokens = token_val
+ except (ValueError, IndexError):
+ pass
+ break
+
+ # Build nav chapters (compact structure)
+ nav_chapters = []
+ for ch in manifest.chapters:
+ nav_ch = {
+ "id": ch.id,
+ "title": ch.title,
+ "summary": ch.summary[:100],
+ "concepts": ch.key_concepts[:3],
+ "sections": [
+ {"id": s.id, "title": s.title, "chunks": len(s.chunk_ids)}
+ for s in ch.sections
+ ],
+ }
+ nav_chapters.append(nav_ch)
+
+ # Build nav concepts
+ nav_concepts: dict[str, dict] = {}
+ for concept, c_entries in manifest.concept_index.items():
+ c_all_chunks: list[str] = []
+ c_all_aliases: list[str] = []
+ c_all_patterns: list[str] = []
+ c_all_related: list[str] = []
+ for ce in c_entries:
+ c_all_chunks.extend(ce.chunks)
+ c_all_aliases.extend(ce.aliases)
+ c_all_patterns.extend(ce.patterns)
+ c_all_related.extend(ce.related)
+ nav_concepts[concept] = {
+ "chunks": list(dict.fromkeys(c_all_chunks)),
+ "aliases": list(dict.fromkeys(c_all_aliases)),
+ "patterns": list(dict.fromkeys(c_all_patterns)),
+ "related": list(dict.fromkeys(c_all_related)),
+ }
+
+ book_nav = BookNav(
+ book_id=book_id,
+ chapters=nav_chapters,
+ chunks=chunk_idx_entries,
+ concepts=nav_concepts,
+ )
+ write_book_nav(book_nav)
+ logger.info(" Wrote nav.json")
+
+ # Update unified library_index.json (concepts + patterns)
+ _update_library_indices(book_id, manifest)
+ logger.info(" Updated library_index.json")
# Update catalog
total_chunks = sum(
diff --git a/preprocessing/corpus.py b/preprocessing/corpus.py
index bb44029..0b865a1 100644
--- a/preprocessing/corpus.py
+++ b/preprocessing/corpus.py
@@ -51,20 +51,25 @@ def _load_env() -> None:
CorpusCatalog,
CorpusConceptEntry,
CorpusConceptIndex,
+ LibraryConceptEntry,
+ LibraryConceptSource,
PaperEntry,
PaperManifest,
PaperMetadata,
ParsedSection,
+ PatternEntry,
SectionInfo,
)
from lib.parser import parse_pdf
from lib.storage import (
find_paper_by_filename,
list_paper_chunks,
+ read_library_index,
read_paper_manifest,
write_cluster_list,
write_corpus_catalog,
write_corpus_concept_index,
+ write_library_index,
write_navigation_md,
write_paper_chunk,
write_paper_manifest,
@@ -246,12 +251,19 @@ def _build_corpus_concept_index(
{papers_text}
-Create a concept index mapping key concepts to papers and sections. Include 15-40 concepts that appear across multiple papers. For each concept, include 2-3 aliases: abbreviations, acronyms, or alternative phrasings someone might search for.
+Create a concept index mapping key concepts to papers and sections. Include 15-40 concepts that appear across multiple papers.
+
+For each concept, include:
+- **aliases** (2-3): abbreviations, acronyms, or alternative phrasings.
+- **patterns** (1-3): structural or methodological patterns the concept exemplifies (e.g., 'layered-architecture', 'feedback-loop', 'defense-in-depth'). Use lowercase-hyphenated format. Use consistent naming across concepts — two concepts that share a pattern are structurally analogous.
+- **related** (2-5): names of OTHER concepts in this same index that are closely related to this one.
Respond with ONLY valid JSON:
{{
"concept_name": {{
"aliases": ["abbreviation", "synonym"],
+ "patterns": ["pattern-tag-1", "pattern-tag-2"],
+ "related": ["other_concept_1", "other_concept_2"],
"papers": ["paper-id-1", "paper-id-2"],
"sections": {{"paper-id-1": "ch02", "paper-id-2": "ch03"}},
"note": "brief context"
@@ -269,6 +281,8 @@ def _build_corpus_concept_index(
sections=entry_data.get("sections", {}),
note=entry_data.get("note", ""),
aliases=entry_data.get("aliases", []),
+ patterns=entry_data.get("patterns", []),
+ related=entry_data.get("related", []),
)
return CorpusConceptIndex(corpus_id=corpus_id, concepts=concepts)
@@ -278,6 +292,79 @@ def _build_corpus_concept_index(
# Main pipeline
# ---------------------------------------------------------------------------
+def _update_library_indices_corpus(corpus_id: str) -> None:
+ """Update library_index.json (concepts + patterns) with corpus concepts.
+
+ Reads the existing concept_index.json for this corpus (just written) and
+ merges it into the unified library index.
+ """
+ from lib.storage import read_corpus_concept_index
+
+ concept_index = read_corpus_concept_index(corpus_id)
+ if not concept_index:
+ return
+
+ source_prefix = f"corpus:{corpus_id}"
+
+ lib_index = read_library_index()
+
+ # Remove stale concept entries for this corpus
+ for concept_name, entry in list(lib_index.concepts.items()):
+ # Match "corpus:<id>:" including the colon so e.g. "corpus:ml" doesn't also catch "corpus:ml2"
+ entry.sources = [s for s in entry.sources if not s.source.startswith(source_prefix + ":")]
+ if not entry.sources:
+ del lib_index.concepts[concept_name]
+
+ # Remove stale pattern entries for this corpus
+ for pattern_name, entries in list(lib_index.patterns.items()):
+ lib_index.patterns[pattern_name] = [
+ e for e in entries if not e.source.startswith(source_prefix + ":")
+ ]
+ if not lib_index.patterns[pattern_name]:
+ del lib_index.patterns[pattern_name]
+
+ # Merge corpus concepts
+ for concept_name, ce in concept_index.concepts.items():
+ # Create one source entry per paper that covers this concept
+ new_sources = []
+ for paper_id in ce.papers:
+ new_sources.append(LibraryConceptSource(
+ source=f"{source_prefix}:{paper_id}",
+ chunks=[], # corpus concept index doesn't track chunk IDs per paper
+ ))
+
+ if concept_name in lib_index.concepts:
+ existing = lib_index.concepts[concept_name]
+ existing.sources.extend(new_sources)
+ existing.aliases = list(dict.fromkeys(existing.aliases + ce.aliases))
+ existing.patterns = list(dict.fromkeys(existing.patterns + ce.patterns))
+ existing.related = list(dict.fromkeys(existing.related + ce.related))
+ else:
+ lib_index.concepts[concept_name] = LibraryConceptEntry(
+ sources=new_sources,
+ aliases=list(dict.fromkeys(ce.aliases)),
+ related=list(dict.fromkeys(ce.related)),
+ patterns=list(dict.fromkeys(ce.patterns)),
+ )
+
+ # Merge pattern entries into lib_index.patterns
+ for concept_name, ce in concept_index.concepts.items():
+ for pat in ce.patterns:
+ # Simple exact match for corpus (no fuzzy needed since patterns come from same prompt)
+ if pat not in lib_index.patterns:
+ lib_index.patterns[pat] = []
+ for paper_id in ce.papers:
+ lib_index.patterns[pat].append(PatternEntry(
+ concept=concept_name,
+ source=f"{source_prefix}:{paper_id}",
+ chunks=[],
+ ))
+
+ write_library_index(lib_index)
+
+
def ingest_corpus(
folder_path: Path,
corpus_id: str | None = None,
@@ -577,6 +664,10 @@ def ingest_corpus(
else:
logger.warning(" No paper summaries available, skipping concept index")
+ # Update unified library_index.json (concepts + patterns)
+ _update_library_indices_corpus(corpus_id)
+ logger.info(" Updated library_index.json")
+
# Update NAVIGATION.md
write_navigation_md()
logger.info(" Updated NAVIGATION.md")
diff --git a/pyproject.toml b/pyproject.toml
index 4d84a27..c6ae100 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "agentlib"
-version = "1.7.0"
+version = "1.8.0"
description = "Agentic Knowledge Navigation System"
readme = "README.md"
requires-python = ">=3.12"
diff --git a/server.py b/server.py
index 3afe3b0..0aa0f50 100644
--- a/server.py
+++ b/server.py
@@ -93,8 +93,10 @@ def open_book(book_id: str) -> str:
"title": ch.title,
"summary": _truncate(ch.summary, MAX_CHAPTER_SUMMARY_CHARS),
"concepts": ch.key_concepts[:MAX_CONCEPTS_PER_CHAPTER],
- "sections": len(ch.sections),
- "chunks": sum(len(s.chunk_ids) for s in ch.sections),
+ "sections": [
+ {"id": s.id, "title": s.title, "chunk_ids": s.chunk_ids}
+ for s in ch.sections
+ ],
})
compact: dict = {"book_id": manifest.book_id}
@@ -157,5 +159,83 @@ def search_concepts(query: str, book_id: str | None = None) -> str:
return json.dumps(compact)
+@mcp.tool()
+def search_library(query: str) -> str:
+ """Search the unified library index across ALL books and corpora.
+ Matches concept names, aliases, related concepts, and structural patterns."""
+ lib_index = storage.read_library_index()
+ query_lower = query.strip().lower()
+
+ results: dict = {}
+
+ # 1. Search concepts (name, aliases, related)
+ for concept, entry in lib_index.concepts.items():
+ searchable = [concept.lower()]
+ searchable.extend(a.lower() for a in entry.aliases)
+ searchable.extend(r.lower() for r in entry.related)
+ if any(query_lower in s for s in searchable):
+ results[concept] = {
+ "sources": [{"source": s.source, "chunks": s.chunks} for s in entry.sources],
+ "aliases": entry.aliases,
+ "related": entry.related,
+ "patterns": entry.patterns,
+ }
+
+ # 2. Search patterns — find concepts that share a matching pattern
+ for pattern, entries in lib_index.patterns.items():
+ if query_lower in pattern.lower():
+ for pe in entries:
+ if pe.concept not in results:
+ # Add this concept from the concepts dict if available
+ concept_entry = lib_index.concepts.get(pe.concept)
+ if concept_entry:
+ results[pe.concept] = {
+ "sources": [{"source": s.source, "chunks": s.chunks} for s in concept_entry.sources],
+ "aliases": concept_entry.aliases,
+ "related": concept_entry.related,
+ "patterns": concept_entry.patterns,
+ "matched_via_pattern": pattern,
+ }
+ else:
+ results[pe.concept] = {
+ "sources": [{"source": pe.source, "chunks": pe.chunks}],
+ "matched_via_pattern": pattern,
+ }
+
+ if not results:
+ # Helpful fallback: list available patterns
+ available_patterns = sorted(lib_index.patterns.keys())[:20]
+ if available_patterns:
+ return f"No matches for '{query}'. Available patterns: {', '.join(available_patterns)}"
+ return f"No matches for '{query}' in library index."
+
+ # Cap results
+ if len(results) > MAX_SEARCH_RESULTS:
+ results = dict(list(results.items())[:MAX_SEARCH_RESULTS])
+
+ return json.dumps(results, indent=2)
+
+
+@mcp.tool()
+def preview_chunks(book_id: str, chunk_ids: list[str]) -> str:
+ """Preview chunk metadata before reading full content. Shows section, concepts, token count, and prev/next navigation."""
+ nav = storage.read_book_nav(book_id)
+ if nav is None:
+ return f"No navigation data found for book '{book_id}'."
+
+ MAX_PREVIEW = 20
+ chunk_ids = chunk_ids[:MAX_PREVIEW]
+
+ previews: dict = {}
+ for cid in chunk_ids:
+ entry = nav.chunks.get(cid)
+ if entry:
+ previews[cid] = entry.to_dict()
+ else:
+ previews[cid] = None
+
+ return json.dumps(previews, indent=2)
+
+
if __name__ == "__main__":
mcp.run()
diff --git a/skills/agentlib-knowledge/SKILL.md b/skills/agentlib-knowledge/SKILL.md
index 7e86f33..c0c84a4 100644
--- a/skills/agentlib-knowledge/SKILL.md
+++ b/skills/agentlib-knowledge/SKILL.md
@@ -5,73 +5,21 @@ description: "Knowledge library navigation. Trigger on: research questions, book
## AgentLib — Knowledge Library
-**This skill activates automatically** for research/knowledge questions. Users can also invoke it explicitly with `/agentlib-knowledge ` to always consult the library.
+**ALWAYS check this library BEFORE web search or answering from training data** when the user asks about topics that could be covered by ingested books or paper corpora.
-You have a preprocessed knowledge library at `~/.claude/plugins/agentlib/library/`.
+Use the MCP tools provided by the agentlib plugin. Do NOT read library files directly.
-**IMPORTANT: ALWAYS check this library BEFORE web search or answering from training data when the user asks about topics that could be covered by ingested books or paper corpora.**
+### Workflow
-**You MUST delegate this research to the `library-researcher` agent using the Agent tool.** Do NOT read library files directly — spawn the agent with the user's question and let it handle all navigation and reading. This keeps your main context clean.
+1. **`search_library(query)`** — Try ONCE with a broad 1-2 word query. Do NOT retry with rephrased queries. If no results, go to step 2.
-When spawning the agent, include the **absolute library path** in the prompt (expand `~` to the full home directory). Example:
-> Research the following question using the library at /Users/nadavbarkai/.claude/plugins/agentlib/library/
-> Question: {user's question}
+2. **`browse_library`** to find the right book, then **`open_book(book_id)`** to browse its chapter structure. Identify the relevant chapter/section.
-The agent will return a synthesized answer with citations.
+3. **`preview_chunks(book_id, chunk_ids)`** — preview candidate chunks (section title, concepts, token count). Pick the 2-3 most relevant.
-If the Agent tool is unavailable, fall back to the manual steps below.
-
-### Manual fallback (only if agent delegation fails)
-
-#### Step 1: Check what's available
-```
-Read ~/.claude/plugins/agentlib/library/NAVIGATION.md
-```
-This lists ALL books AND paper corpora. Check BOTH sections — if a book OR corpus covers the topic, continue with the appropriate path below. Only proceed with other tools if nothing is relevant.
-
----
-
-### Path A: Books
-
-**Find content (pick one):**
-
-**A1 — Search by concept (fastest, 2 reads):**
-```
-Read ~/.claude/plugins/agentlib/library/books/{book-id}/concepts.json
-```
-Find your concept → get chunk IDs → go to Step 3.
-
-**A2 — Browse chapters (3 reads):**
-```
-Read ~/.claude/plugins/agentlib/library/books/{book-id}/manifest.compact.json
-```
-Find relevant chapter/section → note chunk IDs → go to Step 3.
-
----
-
-### Path B: Corpora (scientific papers)
-
-**B1 — Search by concept across all papers (fastest, 1 read):**
-```
-Read ~/.claude/plugins/agentlib/library/corpus/{corpus-id}/concept_index.json
-```
-Find your concept → get paper IDs and chunk IDs → go to Step 3.
-
-**B2 — Browse by topic cluster (2-3 reads):**
-1. `corpus_catalog.json` — see topic clusters
-2. `clusters/{cluster-id}.json` — see papers with abstracts
-3. Pick papers → read `papers/{paper-id}/manifest.compact.json`
-
----
-
-### Step 3: Read the content
-```
-Read ~/.claude/plugins/agentlib/library/books/{book-id}/chunks/{chunk-id}.md
-Read ~/.claude/plugins/agentlib/library/corpus/{corpus-id}/papers/{paper-id}/chunks/{chunk-id}.md
-```
-Each chunk is ~300-500 tokens. Read up to 5 chunks per question. Chunks have `prev`/`next` links for adjacent context.
+4. **`read_chunks(book_id, chunk_ids)`** — read the full content of the selected chunks.
### Rules
-- ALWAYS use `manifest.compact.json`, NEVER `manifest.json`
-- Max 4 navigation reads, then up to 5 content chunks
-- Cite the book/paper and chunk ID when answering
+
+- Cite the book/paper title and chunk ID when answering
+- Max 2-3 content chunks per question — use preview to pick well
diff --git a/tests/conftest.py b/tests/conftest.py
index 7b2237a..8411299 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -70,12 +70,19 @@ def sample_manifest() -> Manifest:
),
],
concept_index={
- "unit testing": [ConceptEntry(ch="ch01", sec="ch01-s01", chunks=["ch01-s01-001"])],
- "mocking": [ConceptEntry(ch="ch02", sec="ch02-s01", chunks=["ch02-s01-001"])],
+ "unit testing": [ConceptEntry(
+ ch="ch01", sec="ch01-s01", chunks=["ch01-s01-001"],
+ patterns=["test-driven-development", "fail-fast"],
+ )],
+ "mocking": [ConceptEntry(
+ ch="ch02", sec="ch02-s01", chunks=["ch02-s01-001"],
+ patterns=["dependency-injection", "inversion-of-control"],
+ )],
"retrieval augmented generation": [
ConceptEntry(
ch="ch01", sec="ch01-s01", chunks=["ch01-s01-002"],
aliases=["RAG", "RAG pipeline"],
+ patterns=["pipeline-stages", "fan-out-aggregation"],
),
],
},
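For orientation, the entry shape these fixtures construct. `lib.models` is not part of this diff, so the following is inferred from the tests; the real model may be pydantic rather than a dataclass, and all three list fields must default to empty for the backward-compat test in `tests/test_storage.py` to pass:

```python
# Inferred shape of ConceptEntry (lib.models is not in this diff; the real
# model may be pydantic). aliases/patterns/related default to [] so manifests
# written before these fields existed still load -- see
# test_backward_compat_no_aliases in tests/test_storage.py.
from dataclasses import dataclass, field


@dataclass
class ConceptEntry:
    ch: str                 # chapter id, e.g. "ch01"
    sec: str                # section id, e.g. "ch01-s01"
    chunks: list[str]       # chunk ids locating the concept
    aliases: list[str] = field(default_factory=list)
    patterns: list[str] = field(default_factory=list)
    related: list[str] = field(default_factory=list)
```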
diff --git a/tests/test_server.py b/tests/test_server.py
index 323c99d..5f6a75b 100644
--- a/tests/test_server.py
+++ b/tests/test_server.py
@@ -5,8 +5,23 @@
import json
from pathlib import Path
-from lib.models import CatalogEntry, Manifest
-from lib.storage import update_catalog_entry, write_chunk, write_manifest
+from lib.models import (
+ BookNav,
+ CatalogEntry,
+ ChunkIndexEntry,
+ LibraryConceptEntry,
+ LibraryConceptSource,
+ LibraryIndex,
+ Manifest,
+ PatternEntry,
+)
+from lib.storage import (
+ update_catalog_entry,
+ write_book_nav,
+ write_chunk,
+ write_library_index,
+ write_manifest,
+)
class TestBrowseLibrary:
@@ -74,3 +89,114 @@ def test_search_empty(self, tmp_data_dir: Path) -> None:
from server import search_concepts
result = json.loads(search_concepts("anything"))
assert result == {}
+
+
+class TestSearchLibrary:
+ def test_search_unified(self, tmp_data_dir: Path) -> None:
+ from server import search_library
+ lib_index = LibraryIndex(concepts={
+ "OAuth 2.0": LibraryConceptEntry(
+ sources=[LibraryConceptSource(source="book:api-sec", chunks=["ch03-001"])],
+ aliases=["OAuth"],
+ related=["JWT"],
+ patterns=["credential-cycling"],
+ ),
+ })
+ write_library_index(lib_index)
+
+ result = json.loads(search_library("OAuth"))
+ assert "OAuth 2.0" in result
+ assert result["OAuth 2.0"]["patterns"] == ["credential-cycling"]
+ assert result["OAuth 2.0"]["related"] == ["JWT"]
+
+ def test_search_by_related(self, tmp_data_dir: Path) -> None:
+ from server import search_library
+ lib_index = LibraryIndex(concepts={
+ "token lifecycle": LibraryConceptEntry(
+ sources=[LibraryConceptSource(source="book:auth", chunks=["ch01-001"])],
+ aliases=[],
+ related=["refresh tokens"],
+ patterns=["credential-cycling"],
+ ),
+ })
+ write_library_index(lib_index)
+
+ result = json.loads(search_library("refresh tokens"))
+ assert "token lifecycle" in result
+
+ def test_search_empty_library(self, tmp_data_dir: Path) -> None:
+ from server import search_library
+ result = search_library("anything")
+ assert "No matches" in result
+
+
+class TestSearchLibraryPatterns:
+ """Tests for pattern search within search_library (migrated from TestExplorePatterns)."""
+
+ def test_search_by_pattern(self, tmp_data_dir: Path) -> None:
+ from server import search_library
+ lib_index = LibraryIndex(
+ concepts={
+ "OAuth tokens": LibraryConceptEntry(
+ sources=[LibraryConceptSource(source="book:api-sec", chunks=["ch03-001"])],
+ aliases=[],
+ related=[],
+ patterns=["credential-cycling"],
+ ),
+ "TLS certs": LibraryConceptEntry(
+ sources=[LibraryConceptSource(source="book:tls", chunks=["ch08-001"])],
+ aliases=[],
+ related=[],
+ patterns=["credential-cycling"],
+ ),
+ },
+ patterns={
+ "credential-cycling": [
+ PatternEntry(concept="OAuth tokens", source="book:api-sec", chunks=["ch03-001"]),
+ PatternEntry(concept="TLS certs", source="book:tls", chunks=["ch08-001"]),
+ ],
+ },
+ )
+ write_library_index(lib_index)
+
+ result = json.loads(search_library("credential"))
+ assert "OAuth tokens" in result
+ assert "TLS certs" in result
+
+ def test_no_match_shows_available_patterns(self, tmp_data_dir: Path) -> None:
+ from server import search_library
+ lib_index = LibraryIndex(
+ concepts={},
+ patterns={"retry-with-backoff": []},
+ )
+ write_library_index(lib_index)
+
+ result = search_library("nonexistent")
+ assert "retry-with-backoff" in result
+ assert "No matches" in result
+
+
+class TestPreviewChunks:
+ def test_preview_existing(self, tmp_data_dir: Path) -> None:
+ from server import preview_chunks
+ nav = BookNav(
+ book_id="test-book",
+ chunks={
+ "ch01-001": ChunkIndexEntry(
+ section="Intro", concepts=["testing"], tokens=420,
+ prev=None, next="ch01-002",
+ ),
+ },
+ )
+ write_book_nav(nav)
+
+ result = json.loads(preview_chunks("test-book", ["ch01-001", "ch01-999"]))
+ assert result["ch01-001"]["section"] == "Intro"
+ assert result["ch01-001"]["concepts"] == ["testing"]
+ assert result["ch01-001"]["tokens"] == 420
+ assert result["ch01-999"] is None
+
+ def test_preview_nonexistent_book(self, tmp_data_dir: Path) -> None:
+ from server import preview_chunks
+ result = preview_chunks("nonexistent", ["ch01-001"])
+ assert "No navigation data" in result
diff --git a/tests/test_storage.py b/tests/test_storage.py
index 48e0ed6..1b7eb11 100644
--- a/tests/test_storage.py
+++ b/tests/test_storage.py
@@ -4,16 +4,26 @@
from pathlib import Path
-from lib.models import Catalog, CatalogEntry, ConceptEntry, Manifest
-from lib.storage import (
+from lib.models import ( # pyright: ignore[reportAttributeAccessIssue]
+ Catalog,
+ CatalogEntry,
+ ConceptEntry,
+ LibraryConceptEntry, # pyright: ignore[reportAttributeAccessIssue]
+ LibraryConceptSource, # pyright: ignore[reportAttributeAccessIssue]
+ LibraryIndex, # pyright: ignore[reportAttributeAccessIssue]
+ Manifest,
+)
+from lib.storage import ( # pyright: ignore[reportAttributeAccessIssue]
read_catalog,
read_chunk,
read_chunks,
+ read_library_index, # pyright: ignore[reportAttributeAccessIssue]
read_manifest,
search_concepts,
update_catalog_entry,
write_catalog,
write_chunk,
+ write_library_index, # pyright: ignore[reportAttributeAccessIssue]
write_manifest,
list_chunks,
book_exists,
@@ -150,5 +160,33 @@ def test_backward_compat_no_aliases(self, tmp_data_dir: Path) -> None:
loaded = read_manifest("old-book")
assert loaded is not None
assert loaded.concept_index["testing"][0].aliases == []
-
-
+ assert loaded.concept_index["testing"][0].patterns == [] # pyright: ignore[reportAttributeAccessIssue]
+
+
+class TestLibraryIndex:
+ def test_read_empty(self, tmp_data_dir: Path) -> None:
+ lib_index = read_library_index()
+ assert lib_index.concepts == {}
+
+ def test_write_and_read(self, tmp_data_dir: Path) -> None:
+ lib_index = LibraryIndex(concepts={
+ "OAuth 2.0": LibraryConceptEntry(
+ sources=[
+ LibraryConceptSource(source="book:api-security", chunks=["ch03-s01-001"]),
+ LibraryConceptSource(source="book:web-auth", chunks=["ch07-s02-004"]),
+ ],
+ aliases=["OAuth", "OAuth2"],
+ related=["JWT", "access tokens"],
+ patterns=["credential-cycling", "time-bounded-trust"],
+ ),
+ })
+ write_library_index(lib_index)
+
+ loaded = read_library_index()
+ assert "OAuth 2.0" in loaded.concepts
+ entry = loaded.concepts["OAuth 2.0"]
+ assert len(entry.sources) == 2
+ assert entry.sources[0].source == "book:api-security"
+ assert entry.aliases == ["OAuth", "OAuth2"]
+ assert entry.related == ["JWT", "access tokens"]
+ assert entry.patterns == ["credential-cycling", "time-bounded-trust"]
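The round-trip above implies an on-disk `library_index.json` along these lines; the exact serialization is an assumption, with keys mirroring the model fields:

```python
# Implied on-disk shape of library_index.json, inferred from the round-trip
# test above. Exact serialization is an assumption; keys mirror the models.
EXAMPLE_LIBRARY_INDEX = {
    "concepts": {
        "OAuth 2.0": {
            "sources": [
                {"source": "book:api-security", "chunks": ["ch03-s01-001"]},
                {"source": "book:web-auth", "chunks": ["ch07-s02-004"]},
            ],
            "aliases": ["OAuth", "OAuth2"],
            "related": ["JWT", "access tokens"],
            "patterns": ["credential-cycling", "time-bounded-trust"],
        },
    },
    "patterns": {
        "credential-cycling": [
            {"concept": "OAuth 2.0", "source": "book:api-security", "chunks": ["ch03-s01-001"]},
        ],
    },
}
```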
diff --git a/tests/test_summariser.py b/tests/test_summariser.py
index 2fead6d..5db98c2 100644
--- a/tests/test_summariser.py
+++ b/tests/test_summariser.py
@@ -55,6 +55,8 @@ def _llm_response_for_batch(batch_chapters: list[ChapterSummary]) -> str:
sec = ch.sections[0]
concepts[concept_name] = {
"aliases": [f"alias_{ch.chapter_id}"],
+ "patterns": [f"pattern_{ch.chapter_id}"],
+ "related": [],
"locations": [
{"ch": ch.chapter_id, "sec": sec.section_id, "chunks": sec.chunk_ids}
],
@@ -100,6 +102,8 @@ def test_new_format_with_aliases_and_locations(self):
data = {
"Concept A": {
"aliases": ["alias1"],
+ "patterns": ["credential-cycling"],
+ "related": ["Concept B"],
"locations": [
{"ch": "ch01", "sec": "ch01-s01", "chunks": ["ch01-s01-001"]}
],
@@ -117,6 +121,8 @@ def test_new_format_with_aliases_and_locations(self):
assert m.sec == "ch01-s01"
assert m.chunks == ["ch01-s01-001"]
assert m.aliases == ["alias1"]
+ assert m.patterns == ["credential-cycling"]
+ assert m.related == ["Concept B"]
def test_old_format_list(self):
data = {
@@ -130,6 +136,8 @@ def test_old_format_list(self):
mappings = result["Concept B"]
assert len(mappings) == 1
assert mappings[0].aliases == []
+ assert mappings[0].patterns == []
+ assert mappings[0].related == []
assert mappings[0].ch == "ch02"
def test_multiple_locations(self):
@@ -215,6 +223,8 @@ def side_effect(config, prompt, **kwargs):
return json.dumps({
shared_concept: {
"aliases": ["shared"],
+ "patterns": ["test-pattern"],
+ "related": [],
"locations": [
{"ch": "ch01", "sec": "ch01-s01", "chunks": ["ch01-s01-001"]}
],
@@ -226,6 +236,8 @@ def side_effect(config, prompt, **kwargs):
return json.dumps({
shared_concept: {
"aliases": ["shared"],
+ "patterns": ["test-pattern"],
+ "related": [],
"locations": [
{"ch": ch_id, "sec": f"{ch_id}-s01", "chunks": [f"{ch_id}-s01-001"]}
],
@@ -258,6 +270,8 @@ def side_effect(config, prompt, **kwargs):
return json.dumps({
shared_concept: {
"aliases": ["shared", "sc"],
+ "patterns": ["test-pattern"],
+ "related": [],
"locations": [
{"ch": "ch01", "sec": "ch01-s01", "chunks": ["ch01-s01-001"]},
],
@@ -290,6 +304,8 @@ def test_concept_cap_at_50(self, mock_provider, mock_llm):
]
concepts[f"Concept {i:03d}"] = {
"aliases": [f"c{i}"],
+ "patterns": [f"pattern-{i}"],
+ "related": [],
"locations": locations,
}
mock_llm.return_value = json.dumps(concepts)
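For reference, the two concept formats the summariser's parser now has to accept, per the tests above. The new dict format is taken from the test data; the old bare-list format is reconstructed from the assertions, so treat it as a sketch:

```python
# The two LLM response formats the summariser parser accepts, per the tests
# above. The old bare-list format is reconstructed from the assertions
# (empty aliases/patterns/related defaults), so treat it as a sketch.
NEW_FORMAT = {
    "Concept A": {
        "aliases": ["alias1"],
        "patterns": ["credential-cycling"],
        "related": ["Concept B"],
        "locations": [
            {"ch": "ch01", "sec": "ch01-s01", "chunks": ["ch01-s01-001"]},
        ],
    },
}

OLD_FORMAT = {
    # A bare list of locations; parsing must fill aliases/patterns/related
    # with empty lists (see test_old_format_list).
    "Concept B": [
        {"ch": "ch02", "sec": "ch02-s01", "chunks": ["ch02-s01-001"]},
    ],
}
```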