From 9471b6ae596f5ce26206e96f09032172aed6c305 Mon Sep 17 00:00:00 2001
From: khushi shukla <149226589+khushishukla2813@users.noreply.github.com>
Date: Sun, 15 Mar 2026 00:48:00 +0530
Subject: [PATCH] Fix #622: Add Neo4j indexing and performance documentation
 (Clean)

---
 application/database/db.py                 | 10 ++---
 application/prompt_client/prompt_client.py |  3 ++
 docs/neo4j-indexing.md                     | 46 ++++++++++++++++++++++
 3 files changed, 54 insertions(+), 5 deletions(-)
 create mode 100644 docs/neo4j-indexing.md

diff --git a/application/database/db.py b/application/database/db.py
index 6c1613277..696a555b0 100644
--- a/application/database/db.py
+++ b/application/database/db.py
@@ -214,7 +214,7 @@ class SameRel(StructuredRel):
 
 class NeoDocument(StructuredNode):
     document_id = UniqueIdProperty()
-    name = StringProperty(required=True)
+    name = StringProperty(required=True, index=True)
     description = StringProperty(required=True)
     tags = ArrayProperty(StringProperty())
     doctype = StringProperty(required=True)
@@ -248,9 +248,9 @@ def to_cre_def(self, node, parse_links=True):
 
 
 class NeoStandard(NeoNode):
-    section = StringProperty()
-    subsection = StringProperty()
-    section_id = StringProperty()
+    section = StringProperty(index=True)
+    subsection = StringProperty(index=True)
+    section_id = StringProperty(index=True)
 
     @classmethod
     def to_cre_def(self, node, parse_links=True) -> cre_defs.Standard:
@@ -324,7 +324,7 @@ def to_cre_def(self, node, parse_links=True) -> cre_defs.Code:
 
 
 class NeoCRE(NeoDocument):  # type: ignore
-    external_id = StringProperty()
+    external_id = StringProperty(index=True)
     contains = RelationshipTo("NeoCRE", "CONTAINS", model=ContainsRel)
     contained_in = RelationshipFrom("NeoCRE", "CONTAINS", model=ContainsRel)
     linked = RelationshipTo("NeoStandard", "LINKED_TO", model=LinkedToRel)
diff --git a/application/prompt_client/prompt_client.py b/application/prompt_client/prompt_client.py
index 09546c204..8b2cca917 100644
--- a/application/prompt_client/prompt_client.py
+++ b/application/prompt_client/prompt_client.py
@@ -101,6 +101,9 @@ def find_missing_embeddings(self, database: db.Node_collection) -> List[str]:
 
         Returns:
             List[str]: a list of db ids which do not have embeddings
+
+        PERFORMANCE NOTE: This method iterates over all document types and performs lookups.
+        Ensure Neo4j indexes on name and external_id are active to speed up node resolution.
         """
         logger.info(f"syncing nodes with embeddings")
         missing_embeddings = []
diff --git a/docs/neo4j-indexing.md b/docs/neo4j-indexing.md
new file mode 100644
index 000000000..a286c5f1d
--- /dev/null
+++ b/docs/neo4j-indexing.md
@@ -0,0 +1,46 @@
+# Neo4j Indexing Strategies for OpenCRE
+
+To ensure high performance of graph queries, especially when dealing with large datasets (e.g., gap analysis or AI mapping), specific Neo4j properties must be indexed.
+
+## Recommended Primary Indexes
+
+The following properties are frequently used in `MATCH` and `WHERE` clauses and should always be indexed:
+
+| Node Label | Property | Reason |
+|------------|----------|--------|
+| `NeoDocument` | `name` | Used extensively in Gap Analysis to locate start/end nodes. |
+| `NeoCRE` | `external_id` | Used for mapping standards to CREs and AI pipeline lookups. |
+| `NeoStandard` | `section` | Used for filtering standards by section. |
+| `NeoStandard` | `section_id` | Used for precise standard section lookups. |
+| `NeoStandard` | `subsection` | Used for granular standard filtering. |
+
+> [!NOTE]
+> `document_id` does not need an explicit `index=True`: `neomodel` creates a uniqueness constraint (and its backing index) for every `UniqueIdProperty`.
+
+## Suggested Composite Indexes
+
+For patterns that frequently filter by multiple properties simultaneously, consider creating composite indexes:
+
+```cypher
+CREATE INDEX standard_lookup_idx IF NOT EXISTS
+FOR (n:NeoStandard)
+ON (n.name, n.section, n.section_id);
+```
+
+## Performance Tips & Best Practices
+
+### 1. Avoid Deep or Unbounded Traversals
+Variable-length patterns such as `[:REL*..20]` (or the fully unbounded `[:REL*]`) can be extremely expensive on dense graphs.
+- **Tip**: Limit the max depth as much as possible.
+- **Tip**: Use "Tiered Pruning" (as seen in `db.py`) to search for strong links first before falling back to complex traversals.
+
+### 2. Use `PROFILE` for Query Analysis
+When a query is slow, prefix it with `PROFILE` in the Neo4j Browser to see the execution plan.
+- Look for **"NodeByLabelScan"** (indicates lack of index).
+- Aim for **"NodeIndexSeek"** or **"NodeUniqueIndexSeek"**.
+
+### 3. AI Mapping Performance
+The AI embedding pipeline performs frequent `document_id` lookups. Make sure these lookups match on indexed properties to avoid full label scans during high-concurrency embedding generation.
+
+## How to Apply Indexes
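+Indexes are declared in `application/database/db.py` by passing `index=True` to the `neomodel` property definition. The index only exists in the database once neomodel's labels have been installed (e.g. with the `neomodel_install_labels` script or `install_all_labels()`), so make sure that step runs after changing the models. If you add new models or properties that are used for filtering, ensure they are marked as indexed.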