Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
869 changes: 869 additions & 0 deletions PRPs/PRP-23-rag-corpus-manager.md

Large diffs are not rendered by default.

87 changes: 87 additions & 0 deletions app/features/rag/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from app.features.rag.embeddings import EmbeddingError
from app.features.rag.schemas import (
DeleteResponse,
IndexProjectDocsRequest,
IndexProjectDocsResponse,
IndexRequest,
IndexResponse,
RetrieveRequest,
Expand Down Expand Up @@ -133,6 +135,91 @@ async def index_document(
) from e


@router.post(
    "/index/project-docs",
    response_model=IndexProjectDocsResponse,
    summary="Index bundled project documentation",
    description="""
Discover and bulk-index the repository's own bundled markdown.

**Discovery roots (all toggleable, all default on):**
- `include_docs`: every `docs/**/*.md`
- `include_prps`: every `PRPs/**/*.md`
- `include_root`: `README.md`, `AGENTS.md`, `CHANGELOG.md`

Each file is indexed through the same path as `POST /rag/index`, so chunking,
embedding, the SHA-256 content-hash idempotency short-circuit, and upsert are
all reused. Re-runs return every unchanged file as `status: "unchanged"`.

**Returns:** per-file results plus aggregate counts (indexed / updated /
unchanged / failed / total_chunks). A single unreadable file is reported
`status: "failed"` without aborting the batch; an embedding-provider or
database failure is batch-fatal and surfaces as `502` / problem+json.
""",
)
async def index_project_docs(
    request: IndexProjectDocsRequest,
    db: AsyncSession = Depends(get_db),
) -> IndexProjectDocsResponse:
    """Handle POST /index/project-docs by delegating to RAGService.

    Logs the received toggles, runs the bulk index through
    ``RAGService.index_project_docs`` and logs the aggregate counts of the
    returned response before handing it back to the client.

    Args:
        request: Toggles selecting which doc roots to index.
        db: Async database session from dependency.

    Returns:
        The service's response: per-file results plus aggregate statistics.

    Raises:
        HTTPException: 502 when the service raises EmbeddingError.
        DatabaseError: When the service raises SQLAlchemyError.
    """
    logger.info(
        "rag.index_project_docs_request_received",
        include_docs=request.include_docs,
        include_prps=request.include_prps,
        include_root=request.include_root,
    )

    # Constructed outside the try so only the indexing call itself is guarded.
    rag_service = RAGService()

    try:
        batch = await rag_service.index_project_docs(db=db, request=request)
    except (EmbeddingError, SQLAlchemyError) as exc:
        # Both batch-fatal failure kinds share the same log event; only the
        # exception surfaced to the caller differs.
        logger.error(
            "rag.index_project_docs_request_failed",
            error=str(exc),
            error_type=type(exc).__name__,
            exc_info=True,
        )
        if isinstance(exc, EmbeddingError):
            raise HTTPException(
                status_code=status.HTTP_502_BAD_GATEWAY,
                detail=f"Embedding generation failed: {exc}",
            ) from exc
        raise DatabaseError(
            message="Failed to index project docs",
            details={"error": str(exc)},
        ) from exc

    logger.info(
        "rag.index_project_docs_request_completed",
        total_files=batch.total_files,
        total_chunks=batch.total_chunks,
        failed=batch.failed,
    )
    return batch


# =============================================================================
# Retrieve Endpoint
# =============================================================================
Expand Down
60 changes: 60 additions & 0 deletions app/features/rag/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,3 +179,63 @@ class DeleteResponse(BaseModel):
source_id: str
chunks_deleted: int
status: Literal["deleted"]


class IndexProjectDocsRequest(BaseModel):
    """Request to bulk-index bundled project documentation.

    All fields default to True so an empty ``{}`` body indexes every root.
    The model is configured with ``extra="forbid"``, so keys other than the
    three toggles below are a validation error rather than silently ignored.

    Args:
        include_docs: Index markdown discovered under docs/**.
        include_prps: Index markdown discovered under PRPs/**.
        include_root: Index the root allow-list (README/AGENTS/CHANGELOG).
    """

    # Reject unknown keys so a misspelled toggle is not mistaken for "default on".
    model_config = ConfigDict(extra="forbid")

    # Each toggle enables one discovery root; all are on by default.
    include_docs: bool = Field(default=True, description="Index docs/**/*.md")
    include_prps: bool = Field(default=True, description="Index PRPs/**/*.md")
    include_root: bool = Field(
        default=True, description="Index README.md / AGENTS.md / CHANGELOG.md"
    )


class ProjectDocResult(BaseModel):
    """Per-file outcome of a project-docs index run.

    One instance is produced for every discovered file, whether indexing
    succeeded or not.

    Args:
        source_path: Relative POSIX path of the file (the source identifier).
        status: Outcome — indexed, updated, unchanged, or failed.
        chunks_created: Number of chunks created (0 when unchanged or failed).
        error: Error message when status is "failed", otherwise None.
    """

    # Path relative to the service base directory, with "/" separators.
    source_path: str
    # "failed" is set by the bulk indexer itself; the other values are copied
    # from the per-file index response.
    status: Literal["indexed", "updated", "unchanged", "failed"]
    chunks_created: int
    # Stringified exception for a failed file; None on success.
    error: str | None = None


class IndexProjectDocsResponse(BaseModel):
    """Aggregate result of POST /rag/index/project-docs.

    The counters are derived from ``results``: ``total_files`` is its length
    and each status counter is the number of entries with that status.

    Args:
        results: Per-file outcomes.
        total_files: Total files discovered and processed.
        indexed: Count of newly indexed files.
        updated: Count of re-indexed (changed) files.
        unchanged: Count of files skipped by the content-hash short-circuit.
        failed: Count of files whose read or indexing raised an OSError or
            ValueError (e.g. unreadable or non-UTF-8 content).
        total_chunks: Total chunks created across all files.
        duration_ms: Wall-clock time taken for the batch, in milliseconds.
    """

    results: list[ProjectDocResult]
    total_files: int
    # Per-status tallies over ``results``.
    indexed: int
    updated: int
    unchanged: int
    failed: int
    # Sum of ``chunks_created`` over ``results``.
    total_chunks: int
    duration_ms: float
136 changes: 136 additions & 0 deletions app/features/rag/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,11 @@
from app.features.rag.schemas import (
ChunkResult,
DeleteResponse,
IndexProjectDocsRequest,
IndexProjectDocsResponse,
IndexRequest,
IndexResponse,
ProjectDocResult,
RetrieveRequest,
RetrieveResponse,
SourceListResponse,
Expand All @@ -39,6 +42,10 @@

# structlog logger used for the rag.* events emitted by this module.
logger = structlog.get_logger()

# Allow-listed root markdown files indexed by index_project_docs. CLAUDE.md is
# deliberately excluded — it is an operating index that @imports AGENTS.md.
# Names are resolved against the service base directory at discovery time and
# only those that exist as regular files are picked up (category "root").
_PROJECT_ROOT_FILES: tuple[str, ...] = ("README.md", "AGENTS.md", "CHANGELOG.md")


class SourceNotFoundError(ValueError):
"""Source not found in the knowledge base."""
Expand Down Expand Up @@ -250,6 +257,135 @@ async def index_document(
status=status,
)

def _discover_project_doc_files(
    self, request: IndexProjectDocsRequest
) -> list[tuple[Path, str]]:
    """Discover bundled markdown under the allow-listed project-doc roots.

    Pure and synchronous — no DB, no network. ``rglob`` on a non-existent
    directory yields nothing (no exception), so an absent docs/ or PRPs/
    root simply contributes 0 files.

    Directories whose name happens to match ``*.md`` are skipped: they are
    not documents, and letting them through would make every run report a
    spurious ``failed`` file when the caller tries to read them. Other
    non-readable entries (for example a dangling symlink) are still
    returned so the caller can report them.

    Args:
        request: Toggles selecting which roots to discover.

    Returns:
        A deterministically sorted list of (absolute_path, category) pairs
        where category is "docs", "prp", or "root".
    """
    found: list[tuple[Path, str]] = []

    # (enabled, directory under the base dir, category label) for the roots
    # that are searched recursively.
    globbed_roots = (
        (request.include_docs, "docs", "docs"),
        (request.include_prps, "PRPs", "prp"),
    )
    for enabled, dirname, category in globbed_roots:
        if not enabled:
            continue
        found.extend(
            (path, category)
            for path in (self._base_dir / dirname).rglob("*.md")
            # The glob matches on name only, so a directory called "x.md"
            # would otherwise be handed to the indexer as if it were a file.
            if not path.is_dir()
        )

    if request.include_root:
        for name in _PROJECT_ROOT_FILES:
            candidate = self._base_dir / name
            # Root files are optional; only pick up the ones that exist.
            if candidate.is_file():
                found.append((candidate, "root"))

    # rglob order is filesystem-dependent — sort for stable, reproducible runs.
    return sorted(found, key=lambda pair: str(pair[0]))

async def index_project_docs(
    self,
    db: AsyncSession,
    request: IndexProjectDocsRequest,
) -> IndexProjectDocsResponse:
    """Index every discovered project doc and summarise the outcomes.

    Files come from ``_discover_project_doc_files`` and each one is pushed
    through ``index_document`` as a markdown source whose identifier is its
    POSIX path relative to the base directory. An OSError or ValueError
    raised while reading or indexing a file (this covers missing files and
    non-UTF-8 content) is recorded as a ``failed`` result and the loop
    carries on. Any other exception — notably EmbeddingError and
    SQLAlchemyError — is not handled here and aborts the whole batch.

    Args:
        db: Database session.
        request: Toggles selecting which roots to index.

    Returns:
        Per-file results plus aggregate counts and the elapsed time in ms.
    """
    started_at = time.time()

    logger.info(
        "rag.index_project_docs_started",
        include_docs=request.include_docs,
        include_prps=request.include_prps,
        include_root=request.include_root,
    )

    outcomes: list[ProjectDocResult] = []

    for file_path, category in self._discover_project_doc_files(request):
        # Discovery only yields paths under the base dir, so this cannot fail.
        source_id = file_path.relative_to(self._base_dir).as_posix()
        try:
            text = file_path.read_text(encoding="utf-8")
            payload = IndexRequest(
                source_type="markdown",
                source_path=source_id,
                content=text,
                metadata={"category": category},
            )
            indexed = await self.index_document(db, payload)
            outcome = ProjectDocResult(
                source_path=source_id,
                status=indexed.status,
                chunks_created=indexed.chunks_created,
                error=None,
            )
        except (OSError, ValueError) as exc:
            # FileNotFoundError is an OSError; UnicodeDecodeError is a ValueError.
            logger.warning(
                "rag.index_project_docs_file_failed",
                source_path=source_id,
                error=str(exc),
                error_type=type(exc).__name__,
            )
            outcome = ProjectDocResult(
                source_path=source_id,
                status="failed",
                chunks_created=0,
                error=str(exc),
            )
        outcomes.append(outcome)

    duration_ms = (time.time() - started_at) * 1000

    def tally(label: str) -> int:
        """Count the per-file outcomes carrying the given status."""
        return sum(1 for item in outcomes if item.status == label)

    summary = IndexProjectDocsResponse(
        results=outcomes,
        total_files=len(outcomes),
        indexed=tally("indexed"),
        updated=tally("updated"),
        unchanged=tally("unchanged"),
        failed=tally("failed"),
        total_chunks=sum(item.chunks_created for item in outcomes),
        duration_ms=duration_ms,
    )

    logger.info(
        "rag.index_project_docs_completed",
        total_files=summary.total_files,
        indexed=summary.indexed,
        updated=summary.updated,
        unchanged=summary.unchanged,
        failed=summary.failed,
        total_chunks=summary.total_chunks,
        duration_ms=duration_ms,
    )

    return summary

async def retrieve(
self,
db: AsyncSession,
Expand Down
5 changes: 3 additions & 2 deletions app/features/rag/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,10 @@ async def db_session() -> AsyncGenerator[AsyncSession, None]:
try:
yield session
finally:
# Clean up test data (delete sources with test- prefix)
# Clean up test data (delete sources whose path contains a test- token,
# including nested project-doc fixture paths like docs/test-*.md)
test_source_ids = delete(DocumentSource).where(
DocumentSource.source_path.like("test-%")
DocumentSource.source_path.like("%test-%")
)
await session.execute(test_source_ids)
await session.commit()
Expand Down
Loading