diff --git a/README.md b/README.md index ead7228..8575013 100644 --- a/README.md +++ b/README.md @@ -104,6 +104,24 @@ python main.py --query "artificial intelligence" --retrieval_model all-minilm -- python main.py --query "research papers" --corpus_dir ./my_documents --retrieval_model colpali ``` +### ✅ Easy UI (No Command Line) + +If you prefer a simple UI, use the Streamlit app: + +1. **Install dependencies** (one time): + ```bash + pip install -r requirements.txt + ``` +2. **Double-click** `start_ui.bat` (Windows) to launch the UI. +3. A browser window will open with the NanoSage UI. + +The UI lets you select: +- Query text +- FAISS root folder or a single index/meta pair +- Retrieval model (must match your index embedding model) + +**Note for Windows users:** The UI launcher prefers Python 3.10/3.11. If you installed Python 3.13+ (e.g., 3.14) and see `%1 is not a valid Win32 application`, install Python 3.11 and try again. The launcher will skip Python 3.13+ on PATH and fall back automatically if other versions are installed. + **Parameters**: - `--query`: Main search query (natural language). - `--web_search`: Enables web-based retrieval via Tavily API. @@ -163,6 +181,29 @@ python main.py --query "AI in finance" \ Now the system searches **both** local docs and web data (if `--web_search` is enabled). +#### ✅ Using a FAISS Index (index.faiss + meta.jsonl) + +If you already have FAISS indexes and JSONL metadata (one line per document), you can load them directly: + +```bash +pip install faiss-cpu + +python main.py --query "contract law precedents" \ + --faiss_index_path "C:\path\to\index.faiss" \ + --faiss_meta_path "C:\path\to\meta.jsonl" \ + --retrieval_model all-minilm +``` + +Make sure the FAISS index was built with embeddings from the same retrieval model you select (e.g., `all-minilm`). The metadata lines should include `file_path` and optional `snippet` fields for best report output. 
def load_config(config_path: str):
    """Read the YAML configuration file at *config_path*.

    Returns an empty dict when the path is empty, does not point to an
    existing file, or the file parses to a falsy value (e.g. it is empty).
    """
    if config_path and os.path.isfile(config_path):
        with open(config_path, "r", encoding="utf-8") as handle:
            parsed = yaml.safe_load(handle)
        return parsed or {}
    return {}


def run_search_session(
    query: str,
    config: dict,
    corpus_dir: str,
    device: str,
    retrieval_model: str,
    top_k: int,
    web_search_enabled: bool,
    personality: str,
    rag_model: str,
    max_depth: int,
    llm_provider: str,
    llm_model: str,
    faiss_index_path: str,
    faiss_meta_path: str,
    faiss_root_dir: str,
):
    """Build a SearchSession, run it to completion and save its report.

    Text fields that may arrive as empty strings (corpus_dir, personality,
    llm_model and the three FAISS paths) are passed to SearchSession as
    None instead.

    Returns:
        A ``(final_answer, output_path)`` tuple: the value produced by
        ``session.run_session()`` and whatever ``session.save_report``
        returns for it.
    """
    session_kwargs = {
        "query": query,
        "config": config,
        "corpus_dir": corpus_dir or None,
        "device": device,
        "retrieval_model": retrieval_model,
        "top_k": top_k,
        "web_search_enabled": web_search_enabled,
        "personality": personality or None,
        "rag_model": rag_model,
        "max_depth": max_depth,
        "llm_provider": llm_provider,
        "llm_model": llm_model or None,
        "faiss_index_path": faiss_index_path or None,
        "faiss_meta_path": faiss_meta_path or None,
        "faiss_root_dir": faiss_root_dir or None,
    }
    session = SearchSession(**session_kwargs)

    # run_session is a coroutine; drive it on a fresh event loop.
    answer = asyncio.run(session.run_session())
    report_path = session.save_report(answer)
    return answer, report_path
retrieval_model = st.selectbox( + "Retrieval model", + ["colpali", "all-minilm", "siglip", "clip"], + index=1, + ) + top_k = st.number_input("Top K", min_value=1, max_value=20, value=3, step=1) + web_search_enabled = st.checkbox("Enable web search", value=False) + personality = st.text_input("Personality (optional)") + rag_model = st.text_input("RAG model", value="gemma") + max_depth = st.number_input("Max depth", min_value=0, max_value=5, value=1, step=1) + llm_provider = st.selectbox("LLM provider", ["ollama", "openai", "anthropic"], index=0) + llm_model = st.text_input("LLM model (optional)") + +st.subheader("Query") +query = st.text_area("Enter your question", height=120) + +st.subheader("Local Data Sources") +corpus_dir = st.text_input("Corpus folder (optional)") +faiss_root_dir = st.text_input("FAISS root folder (optional)") +faiss_index_path = st.text_input("FAISS index path (optional)") +faiss_meta_path = st.text_input("FAISS meta.jsonl path (optional)") + +run_button = st.button("Run Search") + +if run_button: + if not query.strip(): + st.error("Please enter a query.") + elif faiss_root_dir and (faiss_index_path or faiss_meta_path): + st.error("Provide either a FAISS root folder or a single FAISS index/meta pair, not both.") + elif (faiss_index_path and not faiss_meta_path) or (faiss_meta_path and not faiss_index_path): + st.error("Provide both FAISS index and metadata paths together.") + else: + try: + config = load_config(config_path) + if web_search_enabled: + config.update({ + "web_concurrency": config.get("web_concurrency", 8), + "include_wikipedia": config.get("include_wikipedia", False), + }) + with st.spinner("Running search..."): + answer, output_path = run_search_session( + query=query.strip(), + config=config, + corpus_dir=corpus_dir, + device=device, + retrieval_model=retrieval_model, + top_k=top_k, + web_search_enabled=web_search_enabled, + personality=personality, + rag_model=rag_model, + max_depth=max_depth, + llm_provider=llm_provider, + 
def load_faiss_index(index_path: str, meta_path: str):
    """Load a FAISS index and its JSONL metadata file.

    The metadata list is consumed positionally: entry ``i`` describes the
    vector with FAISS id ``i``. To keep that alignment intact, a line that
    fails to parse as JSON is recorded as ``None`` instead of being dropped
    (dropping it would shift every later row onto the wrong vector).
    Blank lines are ignored.

    Args:
        index_path: Path to the FAISS index file.
        meta_path: Path to the JSONL metadata file (one JSON value per line).

    Returns:
        ``(index, metadata)`` where ``metadata`` is a list holding one parsed
        JSON value, or ``None`` for a malformed line, per non-blank line.

    Raises:
        ValueError: If either path is empty.
        FileNotFoundError: If either file does not exist.
        ImportError: If the ``faiss`` package is not installed.
    """
    if not index_path or not meta_path:
        raise ValueError("Both index_path and meta_path are required to load a FAISS index.")
    if not os.path.isfile(index_path):
        raise FileNotFoundError(f"FAISS index file not found: {index_path}")
    if not os.path.isfile(meta_path):
        raise FileNotFoundError(f"FAISS metadata file not found: {meta_path}")
    try:
        import faiss
    except ImportError as exc:
        raise ImportError("faiss is required to load FAISS indexes. Install faiss-cpu.") from exc

    index = faiss.read_index(index_path)
    metadata = []
    with open(meta_path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as exc:
                # Keep a placeholder so the rows after this one still line up
                # with their FAISS vector ids.
                print(
                    f"[WARN] Malformed JSONL line {line_no} in {meta_path}; "
                    f"keeping an empty placeholder to preserve row alignment: {exc}"
                )
                record = None
            metadata.append(record)

    # A count mismatch means lookups by vector id may hit the wrong row (or
    # none at all); surface it early instead of returning misattributed hits.
    ntotal = getattr(index, "ntotal", None)
    if ntotal is not None and ntotal != len(metadata):
        print(
            f"[WARN] FAISS index {index_path} holds {ntotal} vectors but "
            f"{meta_path} has {len(metadata)} metadata rows."
        )
    return index, metadata


def find_faiss_pairs(root_dir: str, index_name: str = "index.faiss", meta_name: str = "meta.jsonl"):
    """Recursively collect index/metadata file pairs under *root_dir*.

    A directory contributes a pair only when it contains both *index_name*
    and *meta_name*.

    Args:
        root_dir: Directory to walk. An empty value yields an empty list.
        index_name: File name of the FAISS index inside each directory.
        meta_name: File name of the JSONL metadata inside each directory.

    Returns:
        A list of ``(index_path, meta_path)`` tuples, sorted so the load
        order does not depend on filesystem enumeration order.

    Raises:
        FileNotFoundError: If *root_dir* is non-empty but not a directory.
    """
    if not root_dir:
        return []
    if not os.path.isdir(root_dir):
        raise FileNotFoundError(f"FAISS root directory not found: {root_dir}")

    pairs = [
        (os.path.join(dirpath, index_name), os.path.join(dirpath, meta_name))
        for dirpath, _, filenames in os.walk(root_dir)
        if index_name in filenames and meta_name in filenames
    ]
    pairs.sort()
    return pairs
cheerful)") parser.add_argument("--rag_model", type=str, default="gemma", help="Which model to use for final RAG steps") - parser.add_argument("--max_depth", type=int, default=1, help="Depth limit for subquery expansions") - parser.add_argument("--llm_provider", type=str, choices=["ollama", "openai", "anthropic"], default="ollama", help="LLM provider to use") - parser.add_argument("--llm_model", type=str, default=None, help="Specific LLM model to use (overrides provider default)") - parser.add_argument("--web_concurrency", type=int, default=8, help="Concurrent web downloads") - parser.add_argument("--include_wikipedia", action="store_true", help="Include Wikipedia in web search") - args = parser.parse_args() + parser.add_argument("--max_depth", type=int, default=1, help="Depth limit for subquery expansions") + parser.add_argument("--llm_provider", type=str, choices=["ollama", "openai", "anthropic"], default="ollama", help="LLM provider to use") + parser.add_argument("--llm_model", type=str, default=None, help="Specific LLM model to use (overrides provider default)") + parser.add_argument("--web_concurrency", type=int, default=8, help="Concurrent web downloads") + parser.add_argument("--include_wikipedia", action="store_true", help="Include Wikipedia in web search") + parser.add_argument("--faiss_index_path", type=str, default=None, help="Path to FAISS index file") + parser.add_argument("--faiss_meta_path", type=str, default=None, help="Path to FAISS metadata JSONL file") + parser.add_argument("--faiss_root_dir", type=str, default=None, help="Root directory containing FAISS index/meta pairs") + args = parser.parse_args() config = load_config(args.config) @@ -49,10 +52,13 @@ def main(): web_search_enabled=args.web_search, personality=args.personality, rag_model=args.rag_model, - max_depth=args.max_depth, - llm_provider=args.llm_provider, - llm_model=args.llm_model - ) + max_depth=args.max_depth, + llm_provider=args.llm_provider, + llm_model=args.llm_model, + 
faiss_index_path=args.faiss_index_path, + faiss_meta_path=args.faiss_meta_path, + faiss_root_dir=args.faiss_root_dir + ) loop = asyncio.get_event_loop() final_answer = loop.run_until_complete(session.run_session()) diff --git a/requirements.txt b/requirements.txt index eee8a29..a215bd5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -30,3 +30,6 @@ ollama # Tavily Search API langchain-tavily python-dotenv + +# UI +streamlit diff --git a/search_session.py b/search_session.py index 6722d6c..a7836fd 100644 --- a/search_session.py +++ b/search_session.py @@ -10,7 +10,15 @@ import torch from datetime import datetime -from knowledge_base import KnowledgeBase, late_interaction_score, load_corpus_from_dir, load_retrieval_model, embed_text +from knowledge_base import ( + KnowledgeBase, + late_interaction_score, + load_corpus_from_dir, + find_faiss_pairs, + load_faiss_index, + load_retrieval_model, + embed_text, +) from web_crawler import search_and_download, parse_any_to_text, sanitize_filename import json from aggregator import aggregate_results @@ -260,9 +268,10 @@ def save_toc_to_json(toc_nodes, output_path, include_analytics=True): ######################################################### class SearchSession: - def __init__(self, query, config, corpus_dir=None, device="cpu", - retrieval_model="colpali", top_k=3, web_search_enabled=False, - personality=None, rag_model="gemma", max_depth=1, llm_provider="ollama", llm_model=None): + def __init__(self, query, config, corpus_dir=None, device="cpu", + retrieval_model="colpali", top_k=3, web_search_enabled=False, + personality=None, rag_model="gemma", max_depth=1, llm_provider="ollama", llm_model=None, + faiss_index_path=None, faiss_meta_path=None, faiss_root_dir=None): """ :param max_depth: Maximum recursion depth for subquery expansion. 
:param llm_provider: LLM provider to use ("ollama", "openai", "anthropic") @@ -276,8 +285,11 @@ def __init__(self, query, config, corpus_dir=None, device="cpu", self.top_k = top_k self.web_search_enabled = web_search_enabled self.personality = personality - self.rag_model = rag_model - self.max_depth = max_depth + self.rag_model = rag_model + self.max_depth = max_depth + self.faiss_index_path = faiss_index_path + self.faiss_meta_path = faiss_meta_path + self.faiss_root_dir = faiss_root_dir # Initialize LLM manager llm_config = { @@ -322,13 +334,41 @@ def __init__(self, query, config, corpus_dir=None, device="cpu", print("[INFO] Creating KnowledgeBase...") self.kb = KnowledgeBase(self.model, self.processor, model_type=self.model_type, device=self.device, text_model=self.text_model) - # Load local corpus if available. - self.corpus = [] - if self.corpus_dir: - print(f"[INFO] Loading local documents from {self.corpus_dir}") - local_docs = load_corpus_from_dir(self.corpus_dir, self.model, self.processor, self.device, self.model_type) - self.corpus.extend(local_docs) - self.kb.add_documents(self.corpus) + # Load local corpus if available. 
+ self.corpus = [] + if self.corpus_dir: + print(f"[INFO] Loading local documents from {self.corpus_dir}") + local_docs = load_corpus_from_dir(self.corpus_dir, self.model, self.processor, self.device, self.model_type) + self.corpus.extend(local_docs) + self.kb.add_documents(self.corpus) + + self.faiss_indexes = [] + if self.faiss_root_dir and (self.faiss_index_path or self.faiss_meta_path): + raise ValueError("Provide either faiss_root_dir or faiss_index_path/faiss_meta_path, not both.") + if self.faiss_root_dir: + pairs = find_faiss_pairs(self.faiss_root_dir) + if not pairs: + raise FileNotFoundError(f"No FAISS index pairs found under: {self.faiss_root_dir}") + print(f"[INFO] Loading {len(pairs)} FAISS index pair(s) from {self.faiss_root_dir}") + for index_path, meta_path in pairs: + index, metadata = load_faiss_index(index_path, meta_path) + self.faiss_indexes.append({ + "index": index, + "metadata": metadata, + "index_path": index_path, + "meta_path": meta_path + }) + elif self.faiss_index_path or self.faiss_meta_path: + if not (self.faiss_index_path and self.faiss_meta_path): + raise ValueError("Both faiss_index_path and faiss_meta_path must be provided together.") + print(f"[INFO] Loading FAISS index from {self.faiss_index_path}") + index, metadata = load_faiss_index(self.faiss_index_path, self.faiss_meta_path) + self.faiss_indexes.append({ + "index": index, + "metadata": metadata, + "index_path": self.faiss_index_path, + "meta_path": self.faiss_meta_path + }) # Placeholders for web search results and TOC tree. 
self.web_results = [] @@ -376,16 +416,74 @@ async def run_session(self): else: print("[INFO] Web search is disabled or max_depth < 1, skipping web expansion.") - # 4) Local retrieval - print(f"[INFO] Retrieving top {self.top_k} local documents for final answer.") - self.local_results = self.kb.search(self.enhanced_query, top_k=self.top_k) + # 4) Local retrieval + print(f"[INFO] Retrieving top {self.top_k} local documents for final answer.") + if self.faiss_indexes: + self.local_results = self._search_faiss(self.enhanced_query, top_k=self.top_k) + else: + self.local_results = self.kb.search(self.enhanced_query, top_k=self.top_k) # 5) Summaries and final RAG generation summarized_web = self._summarize_web_results(self.web_results) summarized_local = self._summarize_local_results(self.local_results) final_answer = self._build_final_answer(summarized_web, summarized_local) print("[INFO] Finished building final advanced report.") - return final_answer + return final_answer + + def _search_faiss(self, query, top_k=3): + if self.model_type in ["siglip", "clip"] and self.text_model: + query_embedding = self.text_model.encode(query, convert_to_tensor=True) + else: + query_embedding = embed_text(query, self.model, self.processor, self.model_type, self.device) + query_vector = query_embedding.detach().cpu().numpy().astype("float32").reshape(1, -1) + results = [] + for entry in self.faiss_indexes: + index = entry["index"] + metadata = entry["metadata"] + if getattr(index, "d", None) != query_vector.shape[1]: + raise RuntimeError( + "Embedding dim mismatch: index.d=" + f"{getattr(index, 'd', 'unknown')} but query embedding dim={query_vector.shape[1]} " + f"for index {entry['index_path']}. " + "Fix: run with the SAME embedding model used to build the index " + "(e.g., set --retrieval_model to the model used during indexing)." 
+ ) + distances, indices = index.search(query_vector, top_k) + metric_type = getattr(index, "metric_type", None) + if metric_type is None: + sort_multiplier = -1.0 + else: + try: + import faiss + except ImportError: + faiss = None + if faiss and metric_type == faiss.METRIC_L2: + sort_multiplier = 1.0 + else: + sort_multiplier = -1.0 + + for rank, idx in enumerate(indices[0]): + if idx < 0 or idx >= len(metadata): + continue + meta = metadata[idx] or {} + snippet = meta.get("snippet") or meta.get("text_preview") or "" + score = float(distances[0][rank]) + results.append({ + "embedding": query_embedding, + "metadata": { + "file_path": meta.get("file_path") or meta.get("path", ""), + "type": meta.get("type", "faiss"), + "snippet": snippet, + "score": score, + "index_path": entry["index_path"] + }, + "_sort_score": score * sort_multiplier + }) + + results.sort(key=lambda item: item.get("_sort_score", 0.0), reverse=True) + for item in results: + item.pop("_sort_score", None) + return results[:top_k] def perform_monte_carlo_subqueries(self, parent_query, subqueries): """ diff --git a/start_ui.bat b/start_ui.bat new file mode 100644 index 0000000..0c40689 --- /dev/null +++ b/start_ui.bat @@ -0,0 +1,28 @@ +@echo off +setlocal +cd /d "%~dp0" + +where py >nul 2>nul +if %errorlevel%==0 ( + py -3.11 -m streamlit run app.py + if %errorlevel%==0 goto :eof + py -3.10 -m streamlit run app.py + if %errorlevel%==0 goto :eof + py -3 -m streamlit run app.py + if %errorlevel%==0 goto :eof +) + +where python >nul 2>nul +if %errorlevel%==0 ( + python -c "import sys; sys.exit(0 if sys.version_info < (3, 13) else 1)" + if %errorlevel%==0 ( + python -m streamlit run app.py + if %errorlevel%==0 goto :eof + ) else ( + echo Detected Python 3.13+ on PATH. Streamlit may fail with it on Windows. + ) +) + +echo Failed to launch Streamlit. +echo Please install Python 3.10 or 3.11 and Streamlit (pip install -r requirements.txt). +pause