From 9471b6ae596f5ce26206e96f09032172aed6c305 Mon Sep 17 00:00:00 2001 From: khushi shukla <149226589+khushishukla2813@users.noreply.github.com> Date: Sun, 15 Mar 2026 00:48:00 +0530 Subject: [PATCH] Fix #622: Add Neo4j indexing and performance documentation (Clean) --- application/database/db.py | 10 ++--- application/prompt_client/prompt_client.py | 3 ++ docs/neo4j-indexing.md | 46 ++++++++++++++++++++++ 3 files changed, 54 insertions(+), 5 deletions(-) create mode 100644 docs/neo4j-indexing.md diff --git a/application/database/db.py b/application/database/db.py index 6c1613277..696a555b0 100644 --- a/application/database/db.py +++ b/application/database/db.py @@ -214,7 +214,7 @@ class SameRel(StructuredRel): class NeoDocument(StructuredNode): document_id = UniqueIdProperty() - name = StringProperty(required=True) + name = StringProperty(required=True, index=True) description = StringProperty(required=True) tags = ArrayProperty(StringProperty()) doctype = StringProperty(required=True) @@ -248,9 +248,9 @@ def to_cre_def(self, node, parse_links=True): class NeoStandard(NeoNode): - section = StringProperty() - subsection = StringProperty() - section_id = StringProperty() + section = StringProperty(index=True) + subsection = StringProperty(index=True) + section_id = StringProperty(index=True) @classmethod def to_cre_def(self, node, parse_links=True) -> cre_defs.Standard: @@ -324,7 +324,7 @@ def to_cre_def(self, node, parse_links=True) -> cre_defs.Code: class NeoCRE(NeoDocument): # type: ignore - external_id = StringProperty() + external_id = StringProperty(index=True) contains = RelationshipTo("NeoCRE", "CONTAINS", model=ContainsRel) contained_in = RelationshipFrom("NeoCRE", "CONTAINS", model=ContainsRel) linked = RelationshipTo("NeoStandard", "LINKED_TO", model=LinkedToRel) diff --git a/application/prompt_client/prompt_client.py b/application/prompt_client/prompt_client.py index 09546c204..8b2cca917 100644 --- a/application/prompt_client/prompt_client.py +++ 
b/application/prompt_client/prompt_client.py @@ -101,6 +101,9 @@ def find_missing_embeddings(self, database: db.Node_collection) -> List[str]: Returns: List[str]: a list of db ids which do not have embeddings + + PERFORMANCE NOTE: This method iterates over all document types and performs lookups. + Ensure Neo4j indexes on name and external_id are active to speed up node resolution. """ logger.info(f"syncing nodes with embeddings") missing_embeddings = [] diff --git a/docs/neo4j-indexing.md b/docs/neo4j-indexing.md new file mode 100644 index 000000000..a286c5f1d --- /dev/null +++ b/docs/neo4j-indexing.md @@ -0,0 +1,46 @@ +# Neo4j Indexing Strategies for OpenCRE + +To keep graph queries fast, especially on large datasets (e.g., gap analysis or AI mapping), the node properties listed below must be indexed. + +## Recommended Primary Indexes + +The following properties are frequently used in `MATCH` and `WHERE` clauses and should always be indexed: + +| Node Label | Property | Reason | +|------------|----------|--------| +| `NeoDocument` | `name` | Used extensively in Gap Analysis to locate start/end nodes. | +| `NeoCRE` | `external_id` | Used for mapping standards to CREs and AI pipeline lookups. | +| `NeoStandard` | `section` | Used for filtering standards by section. | +| `NeoStandard` | `section_id` | Used for precise standard section lookups. | +| `NeoStandard` | `subsection` | Used for granular standard filtering. | + +> [!NOTE] +> `document_id` is automatically indexed by `neomodel` via `UniqueIdProperty`. + +## Suggested Composite Indexes + +For patterns that frequently filter by multiple properties simultaneously, consider creating composite indexes. Neo4j only uses a composite index when the query constrains every property in the index: + +```cypher +CREATE INDEX standard_lookup_idx IF NOT EXISTS +FOR (n:NeoStandard) +ON (n.name, n.section, n.section_id); +``` + +## Performance Tips & Best Practices + +### 1. Avoid Deep or Unbounded Traversals +Variable-length patterns like `[:REL*..20]` (and unbounded ones like `[:REL*]`) can be extremely expensive on dense graphs. 
+- **Tip**: Keep the maximum depth as low as the use case allows. +- **Tip**: Use "Tiered Pruning" (as seen in `db.py`) to search for strong links before falling back to complex traversals. + +### 2. Use `PROFILE` for Query Analysis +When a query is slow, prefix it with `PROFILE` in the Neo4j Browser to see the execution plan. +- Look for **"NodeByLabelScan"** (indicates that no index was used for the lookup). +- Aim for **"NodeIndexSeek"** or **"NodeUniqueIndexSeek"**. + +### 3. AI Mapping Performance +The AI embedding pipeline performs frequent `document_id` lookups. Ensure its `MATCH` clauses filter on indexed properties to avoid full-label scans during high-concurrency embedding generation. + +## How to Apply Indexes +Indexes are declared in `application/database/db.py` by passing `index=True` to `neomodel` property definitions. If you add new models or properties that are used for filtering, ensure they are marked as indexed. Note that the declaration alone does not create the index: it is created in Neo4j when neomodel installs the labels (e.g., via the `neomodel_install_labels` script or `install_all_labels()`).