From 5dad71474a5757bcc3769379e9de7ca35803033d Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 5 Mar 2026 14:14:19 +0530 Subject: [PATCH 01/56] feat: add kubernetes app role selection Signed-off-by: Anupam Kumar --- appinfo/info.xml | 14 ++++++++++++++ context_chat_backend/controller.py | 15 ++++++++------- context_chat_backend/task_fetcher.py | 4 ++++ context_chat_backend/types.py | 8 ++++++++ context_chat_backend/utils.py | 13 ++++++++++++- 5 files changed, 46 insertions(+), 8 deletions(-) create mode 100644 context_chat_backend/task_fetcher.py diff --git a/appinfo/info.xml b/appinfo/info.xml index 9760cd29..30194baa 100644 --- a/appinfo/info.xml +++ b/appinfo/info.xml @@ -82,5 +82,19 @@ Setup background job workers as described here: https://docs.nextcloud.com/serve Password to be used for authenticating requests to the OpenAI-compatible endpoint set in CC_EM_BASE_URL. + + + rp + Request Processing Mode + APP_ROLE=rp + true + + + indexing + Indexing Mode + APP_ROLE=indexing + false + + diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index c26b930a..0b6b53dc 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -75,6 +75,7 @@ def enabled_handler(enabled: bool, _: NextcloudApp | AsyncNextcloudApp) -> str: if enabled: app_enabled.set() + # todo: start bg threads to fetch docs, updates and requests to process else: app_enabled.clear() @@ -213,6 +214,13 @@ def _(): return JSONResponse(content={'enabled': app_enabled.is_set()}, status_code=200) +@app.post('/countIndexedDocuments') +@enabled_guard(app) +def _(): + counts = exec_in_proc(target=count_documents_by_provider, args=(vectordb_loader,)) + return JSONResponse(counts) + + @app.post('/updateAccessDeclarative') @enabled_guard(app) def _( @@ -328,13 +336,6 @@ def _(userId: str = Body(embed=True)): return JSONResponse('User deleted') -@app.post('/countIndexedDocuments') -@enabled_guard(app) -def _(): - counts = 
exec_in_proc(target=count_documents_by_provider, args=(vectordb_loader,)) - return JSONResponse(counts) - - @app.put('/loadSources') @enabled_guard(app) def _(sources: list[UploadFile]): diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py new file mode 100644 index 00000000..5e2f317f --- /dev/null +++ b/context_chat_backend/task_fetcher.py @@ -0,0 +1,4 @@ +# +# SPDX-FileCopyrightText: 2026 Nextcloud GmbH and Nextcloud contributors +# SPDX-License-Identifier: AGPL-3.0-or-later +# diff --git a/context_chat_backend/types.py b/context_chat_backend/types.py index 500a97d0..78680866 100644 --- a/context_chat_backend/types.py +++ b/context_chat_backend/types.py @@ -2,6 +2,8 @@ # SPDX-FileCopyrightText: 2024 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # +from enum import Enum + from pydantic import BaseModel __all__ = [ @@ -71,3 +73,9 @@ class FatalEmbeddingException(EmbeddingException): Either malformed request, authentication error, or other non-retryable error. 
""" + + +class AppRole(str, Enum): + NORMAL = 'normal' + INDEXING = 'indexing' + RP = 'rp' diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index f6d6e672..224f466e 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -4,6 +4,7 @@ # import logging import multiprocessing as mp +import os import re import traceback from collections.abc import Callable @@ -14,7 +15,7 @@ from fastapi.responses import JSONResponse as FastAPIJSONResponse -from .types import TConfig, TEmbeddingAuthApiKey, TEmbeddingAuthBasic, TEmbeddingConfig +from .types import AppRole, TConfig, TEmbeddingAuthApiKey, TEmbeddingAuthBasic, TEmbeddingConfig T = TypeVar('T') _logger = logging.getLogger('ccb.utils') @@ -144,3 +145,13 @@ def redact_config(config: TConfig | TEmbeddingConfig) -> TConfig | TEmbeddingCon em_conf.auth.password = '***REDACTED***' # noqa: S105 return config_copy + + +def get_app_role() -> AppRole: + role = os.getenv('APP_ROLE', '').lower() + if role == '': + return AppRole.NORMAL + if role not in ['indexing', 'rp']: + _logger.warning(f'Invalid app role: {role}, defaulting to all roles') + return AppRole.NORMAL + return AppRole(role) From 089d27a41643c165d0474258c840ba6e048279a9 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 5 Mar 2026 16:42:41 +0530 Subject: [PATCH 02/56] feat: add thread start and stop logic Signed-off-by: Anupam Kumar --- context_chat_backend/controller.py | 17 ++++-- context_chat_backend/task_fetcher.py | 82 ++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+), 5 deletions(-) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 0b6b53dc..fadc5f83 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -42,6 +42,7 @@ from .models.types import LlmException from nc_py_api.ex_app import AppAPIAuthMiddleware from .utils import JSONResponse, exec_in_proc, is_valid_provider_id, is_valid_source_id, value_of +from .task_fetcher 
import start_bg_threads, stop_bg_threads from .vectordb.service import ( count_documents_by_provider, decl_update_access, @@ -73,11 +74,16 @@ app_enabled = Event() def enabled_handler(enabled: bool, _: NextcloudApp | AsyncNextcloudApp) -> str: - if enabled: - app_enabled.set() - # todo: start bg threads to fetch docs, updates and requests to process - else: - app_enabled.clear() + try: + if enabled: + app_enabled.set() + start_bg_threads() + else: + app_enabled.clear() + stop_bg_threads() + except Exception as e: + logger.exception('Error in enabled handler:', exc_info=e) + return f'Error in enabled handler: {e}' logger.info(f'App {("disabled", "enabled")[enabled]}') return '' @@ -95,6 +101,7 @@ async def lifespan(app: FastAPI): yield vectordb_loader.offload() llm_loader.offload() + stop_bg_threads() app_config = get_config(os.environ['CC_CONFIG_PATH']) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 5e2f317f..9660b44c 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -2,3 +2,85 @@ # SPDX-FileCopyrightText: 2026 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # + +from enum import Enum +from threading import Thread + +from .types import AppRole +from .utils import get_app_role + +APP_ROLE = get_app_role() +THREADS = {} +THREADS_STOP_EVENTS = {} + + +class ThreadType(Enum): + FILES_INDEXING = 'files_indexing' + UPDATES_PROCESSING = 'updates_processing' + REQUEST_PROCESSING = 'request_processing' + + +def files_indexing_thread(): + ... + + +def updates_processing_thread(): + ... + + +def request_processing_thread(): + ... 
def start_bg_threads():
	"""Start the background worker threads appropriate for this app role.

	NORMAL runs every thread; INDEXING only the files-indexing and
	updates-processing threads; RP only the request-processing thread.

	Fixes over the previous revision:
	- `match APP_ROLE` with `case AppRole.INDEXING | AppRole.NORMAL`
	  followed by `case AppRole.RP | AppRole.NORMAL` runs only the FIRST
	  matching arm, so NORMAL never started the request-processing
	  thread.  Two independent `if` checks apply both branches.
	- A stop Event is now registered in THREADS_STOP_EVENTS for every
	  started thread; previously nothing ever populated that dict, so
	  stop_bg_threads() always hit its early return and never joined.
	"""
	from threading import Event  # not imported at module level in this revision

	thread_specs: list[tuple[ThreadType, Callable[[], None], str]] = []
	if APP_ROLE in (AppRole.INDEXING, AppRole.NORMAL):
		thread_specs += [
			(ThreadType.FILES_INDEXING, files_indexing_thread, 'FilesIndexingThread'),
			(ThreadType.UPDATES_PROCESSING, updates_processing_thread, 'UpdatesProcessingThread'),
		]
	if APP_ROLE in (AppRole.RP, AppRole.NORMAL):
		thread_specs.append(
			(ThreadType.REQUEST_PROCESSING, request_processing_thread, 'RequestProcessingThread')
		)

	for thread_type, target, name in thread_specs:
		THREADS_STOP_EVENTS[thread_type] = Event()
		THREADS[thread_type] = Thread(target=target, name=name, daemon=True)
		THREADS[thread_type].start()


def stop_bg_threads():
	"""Signal all running background threads to stop and join them.

	Safe to call when no threads are running (e.g. a disable request
	before any enable).  All stop events are set before any join so the
	threads can shut down in parallel; afterwards each thread's
	bookkeeping entries are removed so a later start_bg_threads() begins
	from a clean slate.

	Note: stopping is driven by what is actually registered rather than
	by APP_ROLE — the previous role-based match/case skipped the
	request-processing thread for NORMAL (first-match-only semantics)
	and leaked it.
	"""
	stoppable = [t for t in ThreadType if t in THREADS and t in THREADS_STOP_EVENTS]
	for thread_type in stoppable:
		THREADS_STOP_EVENTS[thread_type].set()
	for thread_type in stoppable:
		THREADS[thread_type].join()
		THREADS.pop(thread_type)
		THREADS_STOP_EVENTS.pop(thread_type)
.../chain/ingest/doc_loader.py | 53 +-- context_chat_backend/chain/ingest/injest.py | 201 ++++++----- context_chat_backend/controller.py | 165 +++++----- .../{chain/ingest => }/mimetype_list.py | 0 context_chat_backend/task_fetcher.py | 311 ++++++++++++++++-- context_chat_backend/types.py | 121 ++++++- context_chat_backend/vectordb/base.py | 9 +- context_chat_backend/vectordb/pgvector.py | 61 ++-- 8 files changed, 659 insertions(+), 262 deletions(-) rename context_chat_backend/{chain/ingest => }/mimetype_list.py (100%) diff --git a/context_chat_backend/chain/ingest/doc_loader.py b/context_chat_backend/chain/ingest/doc_loader.py index efb81b6d..d26f74b1 100644 --- a/context_chat_backend/chain/ingest/doc_loader.py +++ b/context_chat_backend/chain/ingest/doc_loader.py @@ -7,11 +7,10 @@ import re import tempfile from collections.abc import Callable -from typing import BinaryIO +from io import BytesIO import docx2txt from epub2txt import epub2txt -from fastapi import UploadFile from langchain_unstructured import UnstructuredLoader from odfdo import Document from pandas import read_csv, read_excel @@ -19,9 +18,11 @@ from pypdf.errors import FileNotDecryptedError as PdfFileNotDecryptedError from striprtf import striprtf +from ...types import SourceItem + logger = logging.getLogger('ccb.doc_loader') -def _temp_file_wrapper(file: BinaryIO, loader: Callable, sep: str = '\n') -> str: +def _temp_file_wrapper(file: BytesIO, loader: Callable, sep: str = '\n') -> str: raw_bytes = file.read() with tempfile.NamedTemporaryFile(mode='wb') as tmp: tmp.write(raw_bytes) @@ -35,46 +36,46 @@ def _temp_file_wrapper(file: BinaryIO, loader: Callable, sep: str = '\n') -> str # -- LOADERS -- # -def _load_pdf(file: BinaryIO) -> str: +def _load_pdf(file: BytesIO) -> str: pdf_reader = PdfReader(file) return '\n\n'.join([page.extract_text().strip() for page in pdf_reader.pages]) -def _load_csv(file: BinaryIO) -> str: +def _load_csv(file: BytesIO) -> str: return 
read_csv(file).to_string(header=False, na_rep='') -def _load_epub(file: BinaryIO) -> str: +def _load_epub(file: BytesIO) -> str: return _temp_file_wrapper(file, epub2txt).strip() -def _load_docx(file: BinaryIO) -> str: +def _load_docx(file: BytesIO) -> str: return docx2txt.process(file).strip() -def _load_odt(file: BinaryIO) -> str: +def _load_odt(file: BytesIO) -> str: return _temp_file_wrapper(file, lambda fp: Document(fp).get_formatted_text()).strip() -def _load_ppt_x(file: BinaryIO) -> str: +def _load_ppt_x(file: BytesIO) -> str: return _temp_file_wrapper(file, lambda fp: UnstructuredLoader(fp).load()).strip() -def _load_rtf(file: BinaryIO) -> str: +def _load_rtf(file: BytesIO) -> str: return striprtf.rtf_to_text(file.read().decode('utf-8', 'ignore')).strip() -def _load_xml(file: BinaryIO) -> str: +def _load_xml(file: BytesIO) -> str: data = file.read().decode('utf-8', 'ignore') data = re.sub(r'', '', data) return data.strip() -def _load_xlsx(file: BinaryIO) -> str: +def _load_xlsx(file: BytesIO) -> str: return read_excel(file, na_filter=False).to_string(header=False, na_rep='') -def _load_email(file: BinaryIO, ext: str = 'eml') -> str | None: +def _load_email(file: BytesIO, ext: str = 'eml') -> str | None: # NOTE: msg format is not tested if ext not in ['eml', 'msg']: return None @@ -115,30 +116,34 @@ def attachment_partitioner( } -def decode_source(source: UploadFile) -> str | None: +def decode_source(source: SourceItem) -> str | None: + io_obj: BytesIO | None = None try: # .pot files are powerpoint templates but also plain text files, # so we skip them to prevent decoding errors - if source.headers['title'].endswith('.pot'): + if source.title.endswith('.pot'): return None - mimetype = source.headers['type'] + mimetype = source.type if mimetype is None: return None + if isinstance(source.content, str): + io_obj = BytesIO(source.content.encode('utf-8', 'ignore')) + else: + io_obj = source.content + if _loader_map.get(mimetype): - result = 
_loader_map[mimetype](source.file) - source.file.close() + result = _loader_map[mimetype](io_obj) return result.encode('utf-8', 'ignore').decode('utf-8', 'ignore') - result = source.file.read().decode('utf-8', 'ignore') - source.file.close() - return result + return io_obj.read().decode('utf-8', 'ignore') except PdfFileNotDecryptedError: - logger.warning(f'PDF file ({source.filename}) is encrypted and cannot be read') + logger.warning(f'PDF file ({source.reference}) is encrypted and cannot be read') return None except Exception: - logger.exception(f'Error decoding source file ({source.filename})', stack_info=True) + logger.exception(f'Error decoding source file ({source.reference})', stack_info=True) return None finally: - source.file.close() # Ensure file is closed after processing + if io_obj is not None: + io_obj.close() diff --git a/context_chat_backend/chain/ingest/injest.py b/context_chat_backend/chain/ingest/injest.py index 5871ebb8..0eb70e0b 100644 --- a/context_chat_backend/chain/ingest/injest.py +++ b/context_chat_backend/chain/ingest/injest.py @@ -5,29 +5,23 @@ import logging import re -from fastapi.datastructures import UploadFile from langchain.schema import Document from ...dyn_loader import VectorDBLoader -from ...types import TConfig -from ...utils import is_valid_source_id, to_int +from ...types import IndexingError, SourceItem, TConfig from ...vectordb.base import BaseVectorDB from ...vectordb.types import DbException, SafeDbException, UpdateAccessOp from ..types import InDocument from .doc_loader import decode_source from .doc_splitter import get_splitter_for -from .mimetype_list import SUPPORTED_MIMETYPES logger = logging.getLogger('ccb.injest') -def _allowed_file(file: UploadFile) -> bool: - return file.headers['type'] in SUPPORTED_MIMETYPES - def _filter_sources( vectordb: BaseVectorDB, - sources: list[UploadFile] -) -> tuple[list[UploadFile], list[UploadFile]]: + sources: dict[int, SourceItem] +) -> tuple[dict[int, SourceItem], dict[int, 
SourceItem]]: ''' Returns ------- @@ -37,30 +31,42 @@ def _filter_sources( ''' try: - existing_sources, new_sources = vectordb.check_sources(sources) + existing_source_ids, to_embed_source_ids = vectordb.check_sources(sources) except Exception as e: - raise DbException('Error: Vectordb sources_to_embed error') from e + raise DbException('Error: Vectordb error while checking existing sources in indexing') from e + + existing_sources = {} + to_embed_sources = {} - return ([ - source for source in sources - if source.filename in existing_sources - ], [ - source for source in sources - if source.filename in new_sources - ]) + for db_id, source in sources.items(): + if source.reference in existing_source_ids: + existing_sources[db_id] = source + elif source.reference in to_embed_source_ids: + to_embed_sources[db_id] = source + return existing_sources, to_embed_sources -def _sources_to_indocuments(config: TConfig, sources: list[UploadFile]) -> list[InDocument]: - indocuments = [] - for source in sources: - logger.debug('processing source', extra={ 'source_id': source.filename }) +def _sources_to_indocuments( + config: TConfig, + sources: dict[int, SourceItem] +) -> tuple[dict[int, InDocument], dict[int, IndexingError]]: + indocuments = {} + errored_docs = {} + for db_id, source in sources.items(): + logger.debug('processing source', extra={ 'source_id': source.reference }) + + # todo: maybe fetch the content of the files here # transform the source to have text data content = decode_source(source) if content is None or (content := content.strip()) == '': - logger.debug('decoded empty source', extra={ 'source_id': source.filename }) + logger.debug('decoded empty source', extra={ 'source_id': source.reference }) + errored_docs[db_id] = IndexingError( + error='Decoded content is empty', + retryable=False, + ) continue # replace more than two newlines with two newlines (also blank spaces, more than 4) @@ -71,94 +77,123 @@ def _sources_to_indocuments(config: TConfig, sources: 
list[UploadFile]) -> list[ content = content.replace('\0', '') if content is None or content == '': - logger.debug('decoded empty source after cleanup', extra={ 'source_id': source.filename }) + logger.debug('decoded empty source after cleanup', extra={ 'source_id': source.reference }) + errored_docs[db_id] = IndexingError( + error='Decoded content is empty', + retryable=False, + ) continue - logger.debug('decoded non empty source', extra={ 'source_id': source.filename }) + logger.debug('decoded non empty source', extra={ 'source_id': source.reference }) metadata = { - 'source': source.filename, - 'title': _decode_latin_1(source.headers['title']), - 'type': source.headers['type'], + 'source': source.reference, + 'title': _decode_latin_1(source.title), + 'type': source.type, } doc = Document(page_content=content, metadata=metadata) - splitter = get_splitter_for(config.embedding_chunk_size, source.headers['type']) + splitter = get_splitter_for(config.embedding_chunk_size, source.type) split_docs = splitter.split_documents([doc]) logger.debug('split document into chunks', extra={ - 'source_id': source.filename, + 'source_id': source.reference, 'len(split_docs)': len(split_docs), }) - indocuments.append(InDocument( + indocuments[db_id] = InDocument( documents=split_docs, - userIds=list(map(_decode_latin_1, source.headers['userIds'].split(','))), - source_id=source.filename, # pyright: ignore[reportArgumentType] - provider=source.headers['provider'], - modified=to_int(source.headers['modified']), - )) + userIds=list(map(_decode_latin_1, source.userIds)), + source_id=source.reference, + provider=source.provider, + modified=source.modified, # pyright: ignore[reportArgumentType] + ) + + return indocuments, errored_docs + + +def _increase_access_for_existing_sources( + vectordb: BaseVectorDB, + existing_sources: dict[int, SourceItem] +) -> dict[int, IndexingError | None]: + ''' + update userIds for existing sources + allow the userIds as additional users, not as the only 
users + ''' + if len(existing_sources) == 0: + return {} - return indocuments + results = {} + logger.debug('Increasing access for existing sources', extra={ + 'source_ids': [source.reference for source in existing_sources.values()] + }) + for db_id, source in existing_sources.items(): + try: + vectordb.update_access( + UpdateAccessOp.allow, + list(map(_decode_latin_1, source.userIds)), + source.reference, + ) + results[db_id] = None + except SafeDbException as e: + logger.error(f'Failed to update access for source ({source.reference}): {e.args[0]}') + results[db_id] = IndexingError( + error=str(e), + retryable=False, + ) + continue + except Exception as e: + logger.error(f'Unexpected error while updating access for source ({source.reference}): {e}') + results[db_id] = IndexingError( + error='Unexpected error while updating access', + retryable=True, + ) + continue + return results def _process_sources( vectordb: BaseVectorDB, config: TConfig, - sources: list[UploadFile], -) -> tuple[list[str],list[str]]: + sources: dict[int, SourceItem] +) -> dict[int, IndexingError | None]: ''' Processes the sources and adds them to the vectordb. Returns the list of source ids that were successfully added and those that need to be retried. 
''' - existing_sources, filtered_sources = _filter_sources(vectordb, sources) + existing_sources, to_embed_sources = _filter_sources(vectordb, sources) logger.debug('db filter source results', extra={ 'len(existing_sources)': len(existing_sources), 'existing_sources': existing_sources, - 'len(filtered_sources)': len(filtered_sources), - 'filtered_sources': filtered_sources, + 'len(to_embed_sources)': len(to_embed_sources), + 'to_embed_sources': to_embed_sources, }) - loaded_source_ids = [source.filename for source in existing_sources] - # update userIds for existing sources - # allow the userIds as additional users, not as the only users - if len(existing_sources) > 0: - logger.debug('Increasing access for existing sources', extra={ - 'source_ids': [source.filename for source in existing_sources] - }) - for source in existing_sources: - try: - vectordb.update_access( - UpdateAccessOp.allow, - list(map(_decode_latin_1, source.headers['userIds'].split(','))), - source.filename, # pyright: ignore[reportArgumentType] - ) - except SafeDbException as e: - logger.error(f'Failed to update access for source ({source.filename}): {e.args[0]}') - continue - - if len(filtered_sources) == 0: + source_proc_results = _increase_access_for_existing_sources(vectordb, existing_sources) + + if len(to_embed_sources) == 0: # no new sources to embed logger.debug('Filtered all sources, nothing to embed') - return loaded_source_ids, [] # pyright: ignore[reportReturnType] + return source_proc_results logger.debug('Filtered sources:', extra={ - 'source_ids': [source.filename for source in filtered_sources] + 'source_ids': [source.reference for source in to_embed_sources.values()] }) # invalid/empty sources are filtered out here and not counted in loaded/retryable - indocuments = _sources_to_indocuments(config, filtered_sources) + indocuments, errored_docs = _sources_to_indocuments(config, to_embed_sources) - logger.debug('Converted all sources to documents') + 
source_proc_results.update(errored_docs) + logger.debug('Converted sources to documents') if len(indocuments) == 0: # filtered document(s) were invalid/empty, not an error logger.debug('All documents were found empty after being processed') - return loaded_source_ids, [] # pyright: ignore[reportReturnType] + return source_proc_results - added_source_ids, retry_source_ids = vectordb.add_indocuments(indocuments) - loaded_source_ids.extend(added_source_ids) + doc_add_results = vectordb.add_indocuments(indocuments) + source_proc_results.update(doc_add_results) logger.debug('Added documents to vectordb') - return loaded_source_ids, retry_source_ids # pyright: ignore[reportReturnType] + return source_proc_results def _decode_latin_1(s: str) -> str: @@ -172,31 +207,15 @@ def _decode_latin_1(s: str) -> str: def embed_sources( vectordb_loader: VectorDBLoader, config: TConfig, - sources: list[UploadFile], -) -> tuple[list[str],list[str]]: - # either not a file or a file that is allowed - sources_filtered = [ - source for source in sources - if is_valid_source_id(source.filename) # pyright: ignore[reportArgumentType] - or _allowed_file(source) - ] - + sources: dict[int, SourceItem] +) -> dict[int, IndexingError | None]: logger.debug('Embedding sources:', extra={ 'source_ids': [ - f'{source.filename} ({_decode_latin_1(source.headers["title"])})' - for source in sources_filtered - ], - 'invalid_source_ids': [ - source.filename for source in sources - if not is_valid_source_id(source.filename) # pyright: ignore[reportArgumentType] - ], - 'not_allowed_file_ids': [ - source.filename for source in sources - if not _allowed_file(source) + f'{source.reference} ({_decode_latin_1(source.title)})' + for source in sources.values() ], - 'len(source_ids)': len(sources_filtered), - 'len(total_source_ids)': len(sources), + 'len(source_ids)': len(sources), }) vectordb = vectordb_loader.load() - return _process_sources(vectordb, config, sources_filtered) + return _process_sources(vectordb, 
config, sources) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index fadc5f83..3e70ee1b 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -27,7 +27,7 @@ from time import sleep from typing import Annotated, Any -from fastapi import Body, FastAPI, Request, UploadFile +from fastapi import Body, FastAPI, Request from langchain.llms.base import LLM from nc_py_api import AsyncNextcloudApp, NextcloudApp from nc_py_api.ex_app import persistent_storage, set_handlers @@ -35,14 +35,13 @@ from starlette.responses import FileResponse from .chain.context import do_doc_search -from .chain.ingest.injest import embed_sources from .chain.one_shot import process_context_query, process_query from .config_parser import get_config from .dyn_loader import LLMModelLoader, VectorDBLoader from .models.types import LlmException from nc_py_api.ex_app import AppAPIAuthMiddleware from .utils import JSONResponse, exec_in_proc, is_valid_provider_id, is_valid_source_id, value_of -from .task_fetcher import start_bg_threads, stop_bg_threads +from .task_fetcher import start_bg_threads, wait_for_bg_threads from .vectordb.service import ( count_documents_by_provider, decl_update_access, @@ -57,6 +56,7 @@ repair_run() ensure_config_file() logger = logging.getLogger('ccb.controller') +app_config = get_config(os.environ['CC_CONFIG_PATH']) __download_models_from_hf = os.environ.get('CC_DOWNLOAD_MODELS_FROM_HF', 'true').lower() in ('1', 'true', 'yes') models_to_fetch = { @@ -77,10 +77,10 @@ def enabled_handler(enabled: bool, _: NextcloudApp | AsyncNextcloudApp) -> str: try: if enabled: app_enabled.set() - start_bg_threads() + start_bg_threads(app_config, app_enabled) else: app_enabled.clear() - stop_bg_threads() + wait_for_bg_threads() except Exception as e: logger.exception('Error in enabled handler:', exc_info=e) return f'Error in enabled handler: {e}' @@ -101,10 +101,9 @@ async def lifespan(app: FastAPI): yield 
vectordb_loader.offload() llm_loader.offload() - stop_bg_threads() + wait_for_bg_threads() -app_config = get_config(os.environ['CC_CONFIG_PATH']) app = FastAPI(debug=app_config.debug, lifespan=lifespan) # pyright: ignore[reportArgumentType] app.extra['CONFIG'] = app_config @@ -343,86 +342,78 @@ def _(userId: str = Body(embed=True)): return JSONResponse('User deleted') -@app.put('/loadSources') -@enabled_guard(app) -def _(sources: list[UploadFile]): - global _indexing - - if len(sources) == 0: - return JSONResponse('No sources provided', 400) - - filtered_sources = [] - - for source in sources: - if not value_of(source.filename): - logger.warning('Skipping source with invalid source_id', extra={ - 'source_id': source.filename, - 'title': source.headers.get('title'), - }) - continue - - with index_lock: - if source.filename in _indexing: - # this request will be retried by the client - return JSONResponse( - f'This source ({source.filename}) is already being processed in another request, try again later', - 503, - headers={'cc-retry': 'true'}, - ) - - if not ( - value_of(source.headers.get('userIds')) - and source.headers.get('title', None) is not None - and value_of(source.headers.get('type')) - and value_of(source.headers.get('modified')) - and source.headers['modified'].isdigit() - and value_of(source.headers.get('provider')) - ): - logger.warning('Skipping source with invalid/missing headers', extra={ - 'source_id': source.filename, - 'title': source.headers.get('title'), - 'headers': source.headers, - }) - continue - - filtered_sources.append(source) - - # wait for 10 minutes before failing the request - semres = doc_parse_semaphore.acquire(block=True, timeout=10*60) - if not semres: - return JSONResponse( - 'Document parser worker limit reached, try again in some time or consider increasing the limit', - 503, - headers={'cc-retry': 'true'} - ) - - with index_lock: - for source in filtered_sources: - _indexing[source.filename] = source.size - - try: - 
loaded_sources, not_added_sources = exec_in_proc( - target=embed_sources, - args=(vectordb_loader, app.extra['CONFIG'], filtered_sources) - ) - except (DbException, EmbeddingException): - raise - except Exception as e: - raise DbException('Error: failed to load sources') from e - finally: - with index_lock: - for source in filtered_sources: - _indexing.pop(source.filename, None) - doc_parse_semaphore.release() - - if len(loaded_sources) != len(filtered_sources): - logger.debug('Some sources were not loaded', extra={ - 'Count of loaded sources': f'{len(loaded_sources)}/{len(filtered_sources)}', - 'source_ids': loaded_sources, - }) - - # loaded sources include the existing sources that may only have their access updated - return JSONResponse({'loaded_sources': loaded_sources, 'sources_to_retry': not_added_sources}) +# @app.put('/loadSources') +# @enabled_guard(app) +# def _(sources: list[UploadFile]): +# global _indexing + +# if len(sources) == 0: +# return JSONResponse('No sources provided', 400) + +# for source in sources: +# if not value_of(source.filename): +# return JSONResponse(f'Invalid source filename for: {source.headers.get("title")}', 400) + +# with index_lock: +# if source.filename in _indexing: +# # this request will be retried by the client +# return JSONResponse( +# f'This source ({source.filename}) is already being processed in another request, try again later', +# 503, +# headers={'cc-retry': 'true'}, +# ) + +# if not ( +# value_of(source.headers.get('userIds')) +# and source.headers.get('title', None) is not None +# and value_of(source.headers.get('type')) +# and value_of(source.headers.get('modified')) +# and source.headers['modified'].isdigit() +# and value_of(source.headers.get('provider')) +# ): +# logger.error('Invalid/missing headers received', extra={ +# 'source_id': source.filename, +# 'title': source.headers.get('title'), +# 'headers': source.headers, +# }) +# return JSONResponse(f'Invaild/missing headers for: {source.filename}', 400) + +# 
# wait for 10 minutes before failing the request +# semres = doc_parse_semaphore.acquire(block=True, timeout=10*60) +# if not semres: +# return JSONResponse( +# 'Document parser worker limit reached, try again in some time or consider increasing the limit', +# 503, +# headers={'cc-retry': 'true'} +# ) + +# with index_lock: +# for source in sources: +# _indexing[source.filename] = source.size + +# try: +# loaded_sources, not_added_sources = exec_in_proc( +# target=embed_sources, +# args=(vectordb_loader, app.extra['CONFIG'], sources) +# ) +# except (DbException, EmbeddingException): +# raise +# except Exception as e: +# raise DbException('Error: failed to load sources') from e +# finally: +# with index_lock: +# for source in sources: +# _indexing.pop(source.filename, None) +# doc_parse_semaphore.release() + +# if len(loaded_sources) != len(sources): +# logger.debug('Some sources were not loaded', extra={ +# 'Count of loaded sources': f'{len(loaded_sources)}/{len(sources)}', +# 'source_ids': loaded_sources, +# }) + +# # loaded sources include the existing sources that may only have their access updated +# return JSONResponse({'loaded_sources': loaded_sources, 'sources_to_retry': not_added_sources}) class Query(BaseModel): diff --git a/context_chat_backend/chain/ingest/mimetype_list.py b/context_chat_backend/mimetype_list.py similarity index 100% rename from context_chat_backend/chain/ingest/mimetype_list.py rename to context_chat_backend/mimetype_list.py diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 9660b44c..a548bcfd 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -3,15 +3,41 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # +import asyncio +import logging +from contextlib import suppress from enum import Enum -from threading import Thread +from io import BytesIO +from threading import Event, Thread +from time import sleep -from .types import AppRole -from .utils import 
get_app_role +import niquests +from nc_py_api import AsyncNextcloudApp, NextcloudApp +from pydantic import ValidationError + +from .chain.ingest.injest import embed_sources +from .dyn_loader import VectorDBLoader +from .types import ( + AppRole, + EmbeddingException, + FilesQueueItem, + IndexingError, + IndexingException, + LoaderException, + ReceivedFileItem, + SourceItem, + TConfig, +) +from .utils import exec_in_proc, get_app_role +from .vectordb.types import DbException APP_ROLE = get_app_role() THREADS = {} -THREADS_STOP_EVENTS = {} +LOGGER = logging.getLogger('ccb.task_fetcher') +FILES_INDEXING_BATCH_SIZE = 64 # todo: config? +# max concurrent fetches to avoid overloading the NC server or hitting rate limits +CONCURRENT_FILE_FETCHES = 10 # todo: config? +MAX_FILE_SIZE = 100 * 1024 * 1024 # 100 MB, todo: config? class ThreadType(Enum): @@ -20,67 +46,294 @@ class ThreadType(Enum): REQUEST_PROCESSING = 'request_processing' -def files_indexing_thread(): - ... +async def __fetch_file_content( + semaphore: asyncio.Semaphore, + file_id: int, + user_id: str, + _rlimit = 3, +) -> BytesIO: + ''' + Raises + ------ + IndexingException + ''' + + async with semaphore: + nc = AsyncNextcloudApp() + try: + # a file pointer for storing the stream in memory until it is consumed + fp = BytesIO() + await nc._session.download2fp( + url_path=f'/apps/context_chat/files/{file_id}', + fp=fp, + dav=False, + params={ 'userId': user_id }, + ) + return fp + except niquests.exceptions.RequestException as e: + # todo: raise IndexingException with retryable=True for rate limit errors, + # todo: and handle it in the caller to not delete the source from the queue and retry later through + # todo: the normal lock expiry mechanism + if e.response is None: + raise + + if e.response.status_code == niquests.codes.too_many_requests: # pyright: ignore[reportAttributeAccessIssue] + # todo: implement rate limits in php CC? 
+ wait_for = int(e.response.headers.get('Retry-After', '30')) + if _rlimit <= 0: + raise IndexingException( + f'Rate limited when fetching content for file id {file_id}, user id {user_id},' + ' max retries exceeded', + retryable=True, + ) from e + LOGGER.warning( + f'Rate limited when fetching content for file id {file_id}, user id {user_id},' + f' waiting {wait_for} before retrying', + exc_info=e, + ) + await asyncio.sleep(wait_for) + return await __fetch_file_content(semaphore, file_id, user_id, _rlimit - 1) + + raise + except IndexingException: + raise + except Exception as e: + LOGGER.error(f'Error fetching content for file id {file_id}, user id {user_id}: {e}', exc_info=e) + raise IndexingException(f'Error fetching content for file id {file_id}, user id {user_id}: {e}') from e + + +async def __fetch_files_content( + files: dict[int, ReceivedFileItem] +) -> dict[int, SourceItem | IndexingError]: + source_items = {} + semaphore = asyncio.Semaphore(CONCURRENT_FILE_FETCHES) + tasks = [] + + for file_id, file_item in files.items(): + if file_item.size > MAX_FILE_SIZE: + LOGGER.info( + f'Skipping file id {file_id}, source id {file_item.reference} due to size' + f' {(file_item.size/(1024*1024)):.2f} MiB exceeding the limit {(MAX_FILE_SIZE/(1024*1024)):.2f} MiB', + ) + source_items[file_id] = IndexingError( + error=( + f'File size {(file_item.size/(1024*1024)):.2f} MiB' + f' exceeds the limit {(MAX_FILE_SIZE/(1024*1024)):.2f} MiB' + ), + retryable=False, + ) + continue + # todo: perform the existing file check before fetching the content to avoid unnecessary fetches + # any user id from the list should have read access to the file + tasks.append(asyncio.ensure_future(__fetch_file_content(semaphore, file_id, file_item.userIds[0]))) + results = await asyncio.gather(*tasks, return_exceptions=True) + for (file_id, file_item), result in zip(files.items(), results, strict=True): + if isinstance(result, IndexingException): + LOGGER.error( + f'Error fetching content for file 
id {file_id}, reference {file_item.reference}: {result}', + exc_info=result, + ) + source_items[file_id] = IndexingError( + error=str(result), + retryable=result.retryable, + ) + elif isinstance(result, str) or isinstance(result, BytesIO): + source_items[file_id] = SourceItem( + **file_item.model_dump(), + content=result, + ) + elif isinstance(result, BaseException): + LOGGER.error( + f'Unexpected error fetching content for file id {file_id}, reference {file_item.reference}: {result}', + exc_info=result, + ) + source_items[file_id] = IndexingError( + error=f'Unexpected error: {result}', + retryable=True, + ) + else: + LOGGER.error( + f'Unknown error fetching content for file id {file_id}, reference {file_item.reference}: {result}', + exc_info=True, + ) + source_items[file_id] = IndexingError( + error='Unknown error', + retryable=True, + ) + return source_items + + +def files_indexing_thread(app_config: TConfig, app_enabled: Event) -> None: + try: + vectordb_loader = VectorDBLoader(app_config) + except LoaderException as e: + LOGGER.error('Error initializing vector DB loader, files indexing thread will not start:', exc_info=e) + return + + def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingError | None]: + try: + return exec_in_proc( + target=embed_sources, + args=(vectordb_loader, app_config, source_items), + ) + except (DbException, EmbeddingException): + raise + except Exception as e: + raise DbException('Error: failed to load sources') from e -def updates_processing_thread(): + + while True: + if not app_enabled.is_set(): + LOGGER.info('Files indexing thread is stopping as the app is disabled') + return + + try: + nc = NextcloudApp() + # todo: add the 'size' param to the return of this call. 
+ q_items_res = nc.ocs( + 'GET', + '/apps/context_chat/queues/documents', + params={ 'n': FILES_INDEXING_BATCH_SIZE } + ) + + try: + q_items = FilesQueueItem.model_validate(q_items_res) + except ValidationError as e: + raise Exception(f'Error validating queue items response: {e}\nResponse content: {q_items_res}') from e + + # populate files content and convert to source items + fetched_files = {} + source_files = {} + # unified error structure for files and content providers + source_errors = {} + + if q_items.files: + fetched_files = asyncio.run(__fetch_files_content(q_items.files)) + + for file_id, result in fetched_files.items(): + if isinstance(result, SourceItem): + source_files[file_id] = result + else: + source_errors[file_id] = result + + files_result = _load_sources(source_files) + providers_result = _load_sources(q_items.content_providers) + + if ( + any(isinstance(res, IndexingError) for res in files_result.values()) + or any(isinstance(res, IndexingError) for res in providers_result.values()) + ): + LOGGER.error('Some sources failed to index', extra={ + 'file_errors': { + file_id: error + for file_id, error in files_result.items() + if isinstance(error, IndexingError) + }, + 'provider_errors': { + provider_id: error + for provider_id, error in providers_result.items() + if isinstance(error, IndexingError) + }, + }) + except ( + niquests.exceptions.ConnectionError, + niquests.exceptions.Timeout, + ) as e: + LOGGER.info('Temporary error fetching documents to index, will retry:', exc_info=e) + sleep(5) + continue + except Exception as e: + LOGGER.exception('Error fetching documents to index:', exc_info=e) + sleep(5) + continue + + # delete the entries from the PHP side queue where indexing succeeded or the error is not retryable + to_delete_file_ids = [ + file_id for file_id, result in files_result.items() + if result is None or (isinstance(result, IndexingError) and not result.retryable) + ] + to_delete_provider_ids = [ + provider_id for provider_id, 
result in providers_result.items() + if result is None or (isinstance(result, IndexingError) and not result.retryable) + ] + + try: + nc.ocs( + 'DELETE', + '/apps/context_chat/queues/documents/', + json={ + 'files': to_delete_file_ids, + 'content_providers': to_delete_provider_ids, + }, + ) + except ( + niquests.exceptions.ConnectionError, + niquests.exceptions.Timeout, + ) as e: + LOGGER.info('Temporary error reporting indexing results, will retry:', exc_info=e) + sleep(5) + with suppress(Exception): + nc = NextcloudApp() + nc.ocs( + 'DELETE', + '/apps/context_chat/queues/documents/', + json={ + 'files': to_delete_file_ids, + 'content_providers': to_delete_provider_ids, + }, + ) + continue + except Exception as e: + LOGGER.exception('Error reporting indexing results:', exc_info=e) + sleep(5) + continue + + + +def updates_processing_thread(app_config: TConfig): ... -def request_processing_thread(): +def request_processing_thread(app_config: TConfig): ... -def start_bg_threads(): +def start_bg_threads(app_config: TConfig, app_enabled: Event): match APP_ROLE: case AppRole.INDEXING | AppRole.NORMAL: THREADS[ThreadType.FILES_INDEXING] = Thread( target=files_indexing_thread, + args=(app_config, Event), name='FilesIndexingThread', - daemon=True, ) THREADS[ThreadType.UPDATES_PROCESSING] = Thread( target=updates_processing_thread, + args=(app_config, Event), name='UpdatesProcessingThread', - daemon=True, ) THREADS[ThreadType.FILES_INDEXING].start() THREADS[ThreadType.UPDATES_PROCESSING].start() case AppRole.RP | AppRole.NORMAL: THREADS[ThreadType.REQUEST_PROCESSING] = Thread( target=request_processing_thread, + args=(app_config, Event), name='RequestProcessingThread', - daemon=True, ) THREADS[ThreadType.REQUEST_PROCESSING].start() -def stop_bg_threads(): +def wait_for_bg_threads(): match APP_ROLE: case AppRole.INDEXING | AppRole.NORMAL: - if ( - ThreadType.FILES_INDEXING not in THREADS - or ThreadType.UPDATES_PROCESSING not in THREADS - or ThreadType.FILES_INDEXING not in 
THREADS_STOP_EVENTS - or ThreadType.UPDATES_PROCESSING not in THREADS_STOP_EVENTS - ): + if (ThreadType.FILES_INDEXING not in THREADS or ThreadType.UPDATES_PROCESSING not in THREADS): return - THREADS_STOP_EVENTS[ThreadType.FILES_INDEXING].set() - THREADS_STOP_EVENTS[ThreadType.UPDATES_PROCESSING].set() THREADS[ThreadType.FILES_INDEXING].join() THREADS[ThreadType.UPDATES_PROCESSING].join() THREADS.pop(ThreadType.FILES_INDEXING) THREADS.pop(ThreadType.UPDATES_PROCESSING) - THREADS_STOP_EVENTS.pop(ThreadType.FILES_INDEXING) - THREADS_STOP_EVENTS.pop(ThreadType.UPDATES_PROCESSING) case AppRole.RP | AppRole.NORMAL: - if ( - ThreadType.REQUEST_PROCESSING not in THREADS - or ThreadType.REQUEST_PROCESSING not in THREADS_STOP_EVENTS - ): + if (ThreadType.REQUEST_PROCESSING not in THREADS): return - THREADS_STOP_EVENTS[ThreadType.REQUEST_PROCESSING].set() THREADS[ThreadType.REQUEST_PROCESSING].join() THREADS.pop(ThreadType.REQUEST_PROCESSING) - THREADS_STOP_EVENTS.pop(ThreadType.REQUEST_PROCESSING) diff --git a/context_chat_backend/types.py b/context_chat_backend/types.py index 78680866..97d48ce6 100644 --- a/context_chat_backend/types.py +++ b/context_chat_backend/types.py @@ -3,8 +3,13 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # from enum import Enum +from io import BytesIO +from typing import Self -from pydantic import BaseModel +from pydantic import BaseModel, field_validator + +from .mimetype_list import SUPPORTED_MIMETYPES +from .utils import is_valid_provider_id, is_valid_source_id __all__ = [ 'DEFAULT_EM_MODEL_ALIAS', @@ -17,6 +22,7 @@ ] DEFAULT_EM_MODEL_ALIAS = 'em_model' +FILES_PROVIDER_ID = 'files__default' class TEmbeddingAuthApiKey(BaseModel): @@ -79,3 +85,116 @@ class AppRole(str, Enum): NORMAL = 'normal' INDEXING = 'indexing' RP = 'rp' + + +class CommonSourceItem(BaseModel): + userIds: list[str] + reference: str # source_id of the form "appId__providerId: itemId" + title: str + modified: int | str # todo: int/string? 
+ type: str + provider: str + size: int + + @field_validator('modified', mode='before') + @classmethod + def validate_modified(cls, v): + if isinstance(v, int): + return v + if isinstance(v, str): + try: + return int(v) + except ValueError as e: + raise ValueError(f'Invalid modified value: {v}') from e + raise ValueError(f'Invalid modified type: {type(v)}') + + @field_validator('reference', 'title', 'type', 'provider') + @classmethod + def validate_strings_non_empty(cls, v): + if not isinstance(v, str) or v.strip() == '': + raise ValueError('Must be a non-empty string') + return v.strip() + + @field_validator('userIds', mode='after') + def validate_user_ids(self) -> Self: + if ( + not isinstance(self.userIds, list) + or not all( + isinstance(uid, str) + and uid.strip() != '' + for uid in self.userIds + ) + or len(self.userIds) == 0 + ): + raise ValueError('userIds must be a non-empty list of non-empty strings') + self.userIds = [uid.strip() for uid in self.userIds] + return self + + @field_validator('reference', mode='after') + def validate_reference_format(self) -> Self: + # validate reference format: "appId__providerId: itemId" + if not is_valid_source_id(self.reference): + raise ValueError('Invalid reference format, must be "appId__providerId: itemId"') + return self + + @field_validator('provider', mode='after') + def validate_provider_format(self) -> Self: + # validate provider format: "appId__providerId" + if not is_valid_provider_id(self.provider): + raise ValueError('Invalid provider format, must be "appId__providerId"') + return self + + @field_validator('type', mode='after') + def validate_type(self) -> Self: + if self.reference.startswith(FILES_PROVIDER_ID) and self.type not in SUPPORTED_MIMETYPES: + raise ValueError(f'Unsupported file type: {self.type} for reference {self.reference}') + return self + + @field_validator('size', mode='after') + def validate_size(self) -> Self: + if not isinstance(self.size, int) or self.size < 0: + raise 
ValueError(f'Invalid size value: {self.size}, must be a non-negative integer') + return self + + +class ReceivedFileItem(CommonSourceItem): + content: None + + +class SourceItem(CommonSourceItem): + ''' + Used for the unified queue of items to process, after fetching the content for files + and for directly fetched content providers. + ''' + content: str | BytesIO + + @field_validator('content') + @classmethod + def validate_content(cls, v): + if isinstance(v, str): + if v.strip() == '': + raise ValueError('Content must be a non-empty string') + return v.strip() + if isinstance(v, BytesIO): + if v.getbuffer().nbytes == 0: + raise ValueError('Content must be a non-empty BytesIO') + return v + raise ValueError('Content must be either a non-empty string or a non-empty BytesIO') + + +class FilesQueueItem(BaseModel): + files: dict[int, ReceivedFileItem] # [db id]: FileItem + content_providers: dict[int, SourceItem] # [db id]: SourceItem + + +class IndexingException(Exception): + retryable: bool = False + + def __init__(self, message: str, retryable: bool = False): + super().__init__(message) + self.retryable = retryable + + +class IndexingError(BaseModel): + error: str + retryable: bool = False diff --git a/context_chat_backend/vectordb/base.py b/context_chat_backend/vectordb/base.py index 0bf10200..ebd54075 100644 --- a/context_chat_backend/vectordb/base.py +++ b/context_chat_backend/vectordb/base.py @@ -5,12 +5,12 @@ from abc import ABC, abstractmethod from typing import Any -from fastapi import UploadFile from langchain.schema import Document from langchain.schema.embeddings import Embeddings from langchain.schema.vectorstore import VectorStore from ..chain.types import InDocument, ScopeType +from ..types import IndexingError, SourceItem from ..utils import timed from .types import UpdateAccessOp @@ -62,7 +62,7 @@ def get_instance(self) -> VectorStore: ''' @abstractmethod - def add_indocuments(self, indocuments: list[InDocument]) -> tuple[list[str],list[str]]: + def 
add_indocuments(self, indocuments: dict[int, InDocument]) -> dict[int, IndexingError | None]: ''' Adds the given indocuments to the vectordb and updates the docs + access tables. @@ -79,10 +79,7 @@ def add_indocuments(self, indocuments: list[InDocument]) -> tuple[list[str],list @timed @abstractmethod - def check_sources( - self, - sources: list[UploadFile], - ) -> tuple[list[str], list[str]]: + def check_sources(self, sources: dict[int, SourceItem]) -> tuple[list[str], list[str]]: ''' Checks the sources in the vectordb if they are already embedded and are up to date. diff --git a/context_chat_backend/vectordb/pgvector.py b/context_chat_backend/vectordb/pgvector.py index 2b7fc060..f5879feb 100644 --- a/context_chat_backend/vectordb/pgvector.py +++ b/context_chat_backend/vectordb/pgvector.py @@ -11,14 +11,13 @@ import sqlalchemy.dialects.postgresql as postgresql_dialects import sqlalchemy.orm as orm from dotenv import load_dotenv -from fastapi import UploadFile from langchain.schema import Document from langchain.vectorstores import VectorStore from langchain_core.embeddings import Embeddings from langchain_postgres.vectorstores import Base, PGVector from ..chain.types import InDocument, ScopeType -from ..types import EmbeddingException, RetryableEmbeddingException +from ..types import EmbeddingException, IndexingError, RetryableEmbeddingException, SourceItem from ..utils import timed from .base import BaseVectorDB from .types import DbException, SafeDbException, UpdateAccessOp @@ -130,17 +129,16 @@ def get_users(self) -> list[str]: except Exception as e: raise DbException('Error: getting a list of all users from access list') from e - def add_indocuments(self, indocuments: list[InDocument]) -> tuple[list[str], list[str]]: + def add_indocuments(self, indocuments: dict[int, InDocument]) -> dict[int, IndexingError | None]: """ Raises EmbeddingException: if the embedding request definitively fails """ - added_sources = [] - retry_sources = [] + results = {} batch_size = 
PG_BATCH_SIZE // 5 with self.session_maker() as session: - for indoc in indocuments: + for php_db_id, indoc in indocuments.items(): try: # query paramerters limitation in postgres is 65535 (https://www.postgresql.org/docs/current/limits.html) # so we chunk the documents into (5 values * 10k) chunks @@ -170,7 +168,7 @@ def add_indocuments(self, indocuments: list[InDocument]) -> tuple[list[str], lis ) self.decl_update_access(indoc.userIds, indoc.source_id, session) - added_sources.append(indoc.source_id) + results[php_db_id] = None session.commit() except SafeDbException as e: # for when the source_id is not found. This here can be an error in the DB @@ -178,51 +176,67 @@ def add_indocuments(self, indocuments: list[InDocument]) -> tuple[list[str], lis logger.exception('Error adding documents to vectordb', exc_info=e, extra={ 'source_id': indoc.source_id, }) - retry_sources.append(indoc.source_id) + results[php_db_id] = IndexingError( + error=str(e), + retryable=True, + ) continue except RetryableEmbeddingException as e: # temporary error, continue with the next document logger.exception('Error adding documents to vectordb, should be retried later.', exc_info=e, extra={ 'source_id': indoc.source_id, }) - retry_sources.append(indoc.source_id) + results[php_db_id] = IndexingError( + error=str(e), + retryable=True, + ) continue except EmbeddingException as e: logger.exception('Error adding documents to vectordb', exc_info=e, extra={ 'source_id': indoc.source_id, }) - raise + results[php_db_id] = IndexingError( + error=str(e), + retryable=False, + ) + continue except Exception as e: logger.exception('Error adding documents to vectordb', exc_info=e, extra={ 'source_id': indoc.source_id, }) - retry_sources.append(indoc.source_id) + results[php_db_id] = IndexingError( + error='An unexpected error occurred while adding documents to the database.', + retryable=True, + ) continue - return added_sources, retry_sources + return results @timed - def check_sources(self, sources: 
list[UploadFile]) -> tuple[list[str], list[str]]: + def check_sources(self, sources: dict[int, SourceItem]) -> tuple[list[str], list[str]]: + ''' + returns a tuple of (existing_source_ids, to_embed_source_ids) + ''' with self.session_maker() as session: try: stmt = ( sa.select(DocumentsStore.source_id) - .filter(DocumentsStore.source_id.in_([source.filename for source in sources])) + .filter(DocumentsStore.source_id.in_([source.reference for source in sources.values()])) .with_for_update() ) results = session.execute(stmt).fetchall() existing_sources = {r.source_id for r in results} - to_embed = [source.filename for source in sources if source.filename not in existing_sources] + to_embed = [source.reference for source in sources.values() if source.reference not in existing_sources] to_delete = [] - for source in sources: + for source in sources.values(): stmt = ( sa.select(DocumentsStore.source_id) - .filter(DocumentsStore.source_id == source.filename) + .filter(DocumentsStore.source_id == source.reference) .filter(DocumentsStore.modified < sa.cast( - datetime.fromtimestamp(int(source.headers['modified'])), + datetime.fromtimestamp(int(source.modified)), sa.DateTime, )) ) @@ -239,14 +253,13 @@ def check_sources(self, sources: list[UploadFile]) -> tuple[list[str], list[str] session.rollback() raise DbException('Error: checking sources in vectordb') from e - still_existing_sources = [ - source - for source in existing_sources - if source not in to_delete + still_existing_source_ids = [ + source_id + for source_id in existing_sources + if source_id not in to_delete ] - # the pyright issue stems from source.filename, which has already been validated - return list(still_existing_sources), to_embed # pyright: ignore[reportReturnType] + return list(still_existing_source_ids), to_embed def decl_update_access(self, user_ids: list[str], source_id: str, session_: orm.Session | None = None): session = session_ or self.session_maker() From 
03a3f433caccdf7121c3171538828c8f6fefa5af Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Mon, 9 Mar 2026 19:42:21 +0530 Subject: [PATCH 04/56] wip: parallelize file parsing and processing based on cpu count Signed-off-by: Anupam Kumar --- context_chat_backend/task_fetcher.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index a548bcfd..853a68c8 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -5,6 +5,7 @@ import asyncio import logging +import os from contextlib import suppress from enum import Enum from io import BytesIO @@ -35,6 +36,8 @@ THREADS = {} LOGGER = logging.getLogger('ccb.task_fetcher') FILES_INDEXING_BATCH_SIZE = 64 # todo: config? +# divides the batch into these many chunks +PARALLEL_FILE_PARSING = max(1, (os.cpu_count() or 2) - 1) # todo: config? # max concurrent fetches to avoid overloading the NC server or hitting rate limits CONCURRENT_FILE_FETCHES = 10 # todo: config? MAX_FILE_SIZE = 100 * 1024 * 1024 # 100 MB, todo: config? 
@@ -217,8 +220,18 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro else: source_errors[file_id] = result - files_result = _load_sources(source_files) - providers_result = _load_sources(q_items.content_providers) + files_result = {} + providers_result = {} + chunk_size = FILES_INDEXING_BATCH_SIZE // PARALLEL_FILE_PARSING + + # chunk file parsing for better file operation parallelism + for i in range(0, len(source_files), chunk_size): + chunk = dict(list(source_files.items())[i:i+chunk_size]) + files_result.update(_load_sources(chunk)) + + for i in range(0, len(q_items.content_providers), chunk_size): + chunk = dict(list(q_items.content_providers.items())[i:i+chunk_size]) + providers_result.update(_load_sources(chunk)) if ( any(isinstance(res, IndexingError) for res in files_result.values()) From 0dc404bf48cff0e358b723bcb12775956d0c2eac Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 10 Mar 2026 17:36:03 +0530 Subject: [PATCH 05/56] ci: use the kubernetes branch of context_chat Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 10e2d61b..fb06bafa 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -113,6 +113,8 @@ jobs: repository: nextcloud/context_chat path: apps/context_chat persist-credentials: false + # todo: remove later + ref: feat/reverse-content-flow - name: Checkout backend uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 From c7339828818ff49e8a2c44aa7896b4b2fdf495fb Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 10 Mar 2026 17:43:27 +0530 Subject: [PATCH 06/56] fix typo Signed-off-by: Anupam Kumar --- context_chat_backend/task_fetcher.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/context_chat_backend/task_fetcher.py 
b/context_chat_backend/task_fetcher.py index 853a68c8..cfa9293c 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -304,11 +304,11 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro -def updates_processing_thread(app_config: TConfig): +def updates_processing_thread(app_config: TConfig, app_enabled: Event) -> None: ... -def request_processing_thread(app_config: TConfig): +def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: ... @@ -317,12 +317,12 @@ def start_bg_threads(app_config: TConfig, app_enabled: Event): case AppRole.INDEXING | AppRole.NORMAL: THREADS[ThreadType.FILES_INDEXING] = Thread( target=files_indexing_thread, - args=(app_config, Event), + args=(app_config, app_enabled), name='FilesIndexingThread', ) THREADS[ThreadType.UPDATES_PROCESSING] = Thread( target=updates_processing_thread, - args=(app_config, Event), + args=(app_config, app_enabled), name='UpdatesProcessingThread', ) THREADS[ThreadType.FILES_INDEXING].start() @@ -330,7 +330,7 @@ def start_bg_threads(app_config: TConfig, app_enabled: Event): case AppRole.RP | AppRole.NORMAL: THREADS[ThreadType.REQUEST_PROCESSING] = Thread( target=request_processing_thread, - args=(app_config, Event), + args=(app_config, app_enabled), name='RequestProcessingThread', ) THREADS[ThreadType.REQUEST_PROCESSING].start() From dda312f21f74955d70e6f5f74840a31b26bb3f9d Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Wed, 11 Mar 2026 11:58:50 +0530 Subject: [PATCH 07/56] migrate the update process to be thread based Signed-off-by: Anupam Kumar --- context_chat_backend/chain/ingest/injest.py | 2 +- context_chat_backend/controller.py | 203 ++++++++++---------- context_chat_backend/task_fetcher.py | 183 +++++++++++++++++- context_chat_backend/types.py | 183 +++++++++++++++++- context_chat_backend/vectordb/pgvector.py | 27 ++- context_chat_backend/vectordb/service.py | 54 +++++- context_chat_backend/vectordb/types.py | 4 
+- 7 files changed, 531 insertions(+), 125 deletions(-) diff --git a/context_chat_backend/chain/ingest/injest.py b/context_chat_backend/chain/ingest/injest.py index 0eb70e0b..7369f452 100644 --- a/context_chat_backend/chain/ingest/injest.py +++ b/context_chat_backend/chain/ingest/injest.py @@ -129,7 +129,7 @@ def _increase_access_for_existing_sources( for db_id, source in existing_sources.items(): try: vectordb.update_access( - UpdateAccessOp.allow, + UpdateAccessOp.ALLOW, list(map(_decode_latin_1, source.userIds)), source.reference, ) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 3e70ee1b..580416f7 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -6,7 +6,7 @@ # isort: off from .chain.types import ContextException, LLMOutput, ScopeType, SearchResult from .types import LoaderException, EmbeddingException -from .vectordb.types import DbException, SafeDbException, UpdateAccessOp +from .vectordb.types import DbException, SafeDbException from .setup_functions import ensure_config_file, repair_run, setup_env_vars # setup env vars before importing other modules @@ -25,9 +25,9 @@ from functools import wraps from threading import Event, Thread from time import sleep -from typing import Annotated, Any +from typing import Any -from fastapi import Body, FastAPI, Request +from fastapi import FastAPI, Request from langchain.llms.base import LLM from nc_py_api import AsyncNextcloudApp, NextcloudApp from nc_py_api.ex_app import persistent_storage, set_handlers @@ -40,16 +40,9 @@ from .dyn_loader import LLMModelLoader, VectorDBLoader from .models.types import LlmException from nc_py_api.ex_app import AppAPIAuthMiddleware -from .utils import JSONResponse, exec_in_proc, is_valid_provider_id, is_valid_source_id, value_of +from .utils import JSONResponse, exec_in_proc, value_of from .task_fetcher import start_bg_threads, wait_for_bg_threads -from .vectordb.service import ( - 
count_documents_by_provider, - decl_update_access, - delete_by_provider, - delete_by_source, - delete_user, - update_access, -) +from .vectordb.service import count_documents_by_provider # setup @@ -227,119 +220,131 @@ def _(): return JSONResponse(counts) -@app.post('/updateAccessDeclarative') -@enabled_guard(app) -def _( - userIds: Annotated[list[str], Body()], - sourceId: Annotated[str, Body()], -): - logger.debug('Update access declarative request:', extra={ - 'user_ids': userIds, - 'source_id': sourceId, - }) +@app.get('/downloadLogs') +def download_logs() -> FileResponse: + with tempfile.NamedTemporaryFile('wb', delete=False) as tmp: + with zipfile.ZipFile(tmp, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file: + files = os.listdir(os.path.join(persistent_storage(), 'logs')) + for file in files: + file_path = os.path.join(persistent_storage(), 'logs', file) + if os.path.isfile(file_path): # Might be a folder (just skip it then) + zip_file.write(file_path) + return FileResponse(tmp.name, media_type='application/zip', filename='docker_logs.zip') - if len(userIds) == 0: - return JSONResponse('Empty list of user ids', 400) - if not is_valid_source_id(sourceId): - return JSONResponse('Invalid source id', 400) +# @app.post('/updateAccessDeclarative') +# @enabled_guard(app) +# def _( +# userIds: Annotated[list[str], Body()], +# sourceId: Annotated[str, Body()], +# ): +# logger.debug('Update access declarative request:', extra={ +# 'user_ids': userIds, +# 'source_id': sourceId, +# }) - exec_in_proc(target=decl_update_access, args=(vectordb_loader, userIds, sourceId)) +# if len(userIds) == 0: +# return JSONResponse('Empty list of user ids', 400) - return JSONResponse('Access updated') +# if not is_valid_source_id(sourceId): +# return JSONResponse('Invalid source id', 400) +# exec_in_proc(target=decl_update_access, args=(vectordb_loader, userIds, sourceId)) -@app.post('/updateAccess') -@enabled_guard(app) -def _( - op: Annotated[UpdateAccessOp, Body()], - userIds: 
Annotated[list[str], Body()], - sourceId: Annotated[str, Body()], -): - logger.debug('Update access request', extra={ - 'op': op, - 'user_ids': userIds, - 'source_id': sourceId, - }) +# return JSONResponse('Access updated') - if len(userIds) == 0: - return JSONResponse('Empty list of user ids', 400) - if not is_valid_source_id(sourceId): - return JSONResponse('Invalid source id', 400) +# @app.post('/updateAccess') +# @enabled_guard(app) +# def _( +# op: Annotated[UpdateAccessOp, Body()], +# userIds: Annotated[list[str], Body()], +# sourceId: Annotated[str, Body()], +# ): +# logger.debug('Update access request', extra={ +# 'op': op, +# 'user_ids': userIds, +# 'source_id': sourceId, +# }) - exec_in_proc(target=update_access, args=(vectordb_loader, op, userIds, sourceId)) +# if len(userIds) == 0: +# return JSONResponse('Empty list of user ids', 400) - return JSONResponse('Access updated') +# if not is_valid_source_id(sourceId): +# return JSONResponse('Invalid source id', 400) +# exec_in_proc(target=update_access, args=(vectordb_loader, op, userIds, sourceId)) -@app.post('/updateAccessProvider') -@enabled_guard(app) -def _( - op: Annotated[UpdateAccessOp, Body()], - userIds: Annotated[list[str], Body()], - providerId: Annotated[str, Body()], -): - logger.debug('Update access by provider request', extra={ - 'op': op, - 'user_ids': userIds, - 'provider_id': providerId, - }) +# return JSONResponse('Access updated') - if len(userIds) == 0: - return JSONResponse('Empty list of user ids', 400) - if not is_valid_provider_id(providerId): - return JSONResponse('Invalid provider id', 400) +# @app.post('/updateAccessProvider') +# @enabled_guard(app) +# def _( +# op: Annotated[UpdateAccessOp, Body()], +# userIds: Annotated[list[str], Body()], +# providerId: Annotated[str, Body()], +# ): +# logger.debug('Update access by provider request', extra={ +# 'op': op, +# 'user_ids': userIds, +# 'provider_id': providerId, +# }) - exec_in_proc(target=update_access, args=(vectordb_loader, op, 
userIds, providerId)) +# if len(userIds) == 0: +# return JSONResponse('Empty list of user ids', 400) - return JSONResponse('Access updated') +# if not is_valid_provider_id(providerId): +# return JSONResponse('Invalid provider id', 400) +# exec_in_proc(target=update_access_provider, args=(vectordb_loader, op, userIds, providerId)) -@app.post('/deleteSources') -@enabled_guard(app) -def _(sourceIds: Annotated[list[str], Body(embed=True)]): - logger.debug('Delete sources request', extra={ - 'source_ids': sourceIds, - }) +# return JSONResponse('Access updated') - sourceIds = [source.strip() for source in sourceIds if source.strip() != ''] - if len(sourceIds) == 0: - return JSONResponse('No sources provided', 400) +# @app.post('/deleteSources') +# @enabled_guard(app) +# def _(sourceIds: Annotated[list[str], Body(embed=True)]): +# logger.debug('Delete sources request', extra={ +# 'source_ids': sourceIds, +# }) - res = exec_in_proc(target=delete_by_source, args=(vectordb_loader, sourceIds)) - if res is False: - return JSONResponse('Error: VectorDB delete failed, check vectordb logs for more info.', 400) +# sourceIds = [source.strip() for source in sourceIds if source.strip() != ''] - return JSONResponse('All valid sources deleted') +# if len(sourceIds) == 0: +# return JSONResponse('No sources provided', 400) +# res = exec_in_proc(target=delete_by_source, args=(vectordb_loader, sourceIds)) +# if res is False: +# return JSONResponse('Error: VectorDB delete failed, check vectordb logs for more info.', 400) -@app.post('/deleteProvider') -@enabled_guard(app) -def _(providerKey: str = Body(embed=True)): - logger.debug('Delete sources by provider for all users request', extra={ 'provider_key': providerKey }) +# return JSONResponse('All valid sources deleted') - if value_of(providerKey) is None: - return JSONResponse('Invalid provider key provided', 400) - exec_in_proc(target=delete_by_provider, args=(vectordb_loader, providerKey)) +# @app.post('/deleteProvider') +# 
@enabled_guard(app) +# def _(providerKey: str = Body(embed=True)): +# logger.debug('Delete sources by provider for all users request', extra={ 'provider_key': providerKey }) - return JSONResponse('All valid sources deleted') +# if value_of(providerKey) is None: +# return JSONResponse('Invalid provider key provided', 400) +# exec_in_proc(target=delete_by_provider, args=(vectordb_loader, providerKey)) -@app.post('/deleteUser') -@enabled_guard(app) -def _(userId: str = Body(embed=True)): - logger.debug('Remove access list for user, and orphaned sources', extra={ 'user_id': userId }) +# return JSONResponse('All valid sources deleted') - if value_of(userId) is None: - return JSONResponse('Invalid userId provided', 400) - exec_in_proc(target=delete_user, args=(vectordb_loader, userId)) +# @app.post('/deleteUser') +# @enabled_guard(app) +# def _(userId: str = Body(embed=True)): +# logger.debug('Remove access list for user, and orphaned sources', extra={ 'user_id': userId }) + +# if value_of(userId) is None: +# return JSONResponse('Invalid userId provided', 400) - return JSONResponse('User deleted') +# exec_in_proc(target=delete_user, args=(vectordb_loader, userId)) + +# return JSONResponse('User deleted') # @app.put('/loadSources') @@ -503,15 +508,3 @@ def _(query: Query) -> list[SearchResult]: query.scopeType, query.scopeList, )) - - -@app.get('/downloadLogs') -def download_logs() -> FileResponse: - with tempfile.NamedTemporaryFile('wb', delete=False) as tmp: - with zipfile.ZipFile(tmp, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file: - files = os.listdir(os.path.join(persistent_storage(), 'logs')) - for file in files: - file_path = os.path.join(persistent_storage(), 'logs', file) - if os.path.isfile(file_path): # Might be a folder (just skip it then) - zip_file.write(file_path) - return FileResponse(tmp.name, media_type='application/zip', filename='docker_logs.zip') diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 
cfa9293c..84b974b2 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -19,9 +19,11 @@ from .chain.ingest.injest import embed_sources from .dyn_loader import VectorDBLoader from .types import ( + ActionsQueueItems, + ActionType, AppRole, EmbeddingException, - FilesQueueItem, + FilesQueueItems, IndexingError, IndexingException, LoaderException, @@ -30,7 +32,15 @@ TConfig, ) from .utils import exec_in_proc, get_app_role -from .vectordb.types import DbException +from .vectordb.service import ( + decl_update_access, + delete_by_provider, + delete_by_source, + delete_user, + update_access, + update_access_provider, +) +from .vectordb.types import DbException, SafeDbException APP_ROLE = get_app_role() THREADS = {} @@ -41,6 +51,8 @@ # max concurrent fetches to avoid overloading the NC server or hitting rate limits CONCURRENT_FILE_FETCHES = 10 # todo: config? MAX_FILE_SIZE = 100 * 1024 * 1024 # 100 MB, todo: config? +ACTIONS_BATCH_SIZE = 512 # todo: config? +POLLING_COOLDOWN = 30 class ThreadType(Enum): @@ -201,10 +213,15 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro ) try: - q_items = FilesQueueItem.model_validate(q_items_res) + q_items: FilesQueueItems = FilesQueueItems.model_validate(q_items_res) except ValidationError as e: raise Exception(f'Error validating queue items response: {e}\nResponse content: {q_items_res}') from e + if not q_items.files and not q_items.content_providers: + LOGGER.debug('No documents to index') + sleep(POLLING_COOLDOWN) + continue + # populate files content and convert to source items fetched_files = {} source_files = {} @@ -305,7 +322,165 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro def updates_processing_thread(app_config: TConfig, app_enabled: Event) -> None: - ... 
+ try: + vectordb_loader = VectorDBLoader(app_config) + except LoaderException as e: + LOGGER.error('Error initializing vector DB loader, files indexing thread will not start:', exc_info=e) + return + + while True: + if not app_enabled.is_set(): + LOGGER.info('Files indexing thread is stopping as the app is disabled') + return + + try: + nc = NextcloudApp() + q_items_res = nc.ocs( + 'GET', + '/apps/context_chat/queues/actions', + params={ 'n': ACTIONS_BATCH_SIZE } + ) + + try: + q_items: ActionsQueueItems = ActionsQueueItems.model_validate(q_items_res) + except ValidationError as e: + raise Exception(f'Error validating queue items response: {e}\nResponse content: {q_items_res}') from e + except ( + niquests.exceptions.ConnectionError, + niquests.exceptions.Timeout, + ) as e: + LOGGER.info('Temporary error fetching updates to process, will retry:', exc_info=e) + sleep(5) + continue + except Exception as e: + LOGGER.exception('Error fetching updates to process:', exc_info=e) + sleep(5) + continue + + if not q_items.actions: + LOGGER.debug('No updates to process') + sleep(POLLING_COOLDOWN) + continue + + processed_event_ids = [] + errored_events = {} + for i, (db_id, action_item) in enumerate(q_items.actions.items()): + try: + match action_item.type: + case ActionType.DELETE_SOURCE_IDS: + exec_in_proc(target=delete_by_source, args=(vectordb_loader, action_item.payload.sourceIds)) + + case ActionType.DELETE_PROVIDER_ID: + exec_in_proc(target=delete_by_provider, args=(vectordb_loader, action_item.payload.providerId)) + + case ActionType.DELETE_USER_ID: + exec_in_proc(target=delete_user, args=(vectordb_loader, action_item.payload.userId)) + + case ActionType.UPDATE_ACCESS_SOURCE_ID: + exec_in_proc( + target=update_access, + args=( + vectordb_loader, + action_item.payload.op, + action_item.payload.userIds, + action_item.payload.sourceId, + ), + ) + + case ActionType.UPDATE_ACCESS_PROVIDER_ID: + exec_in_proc( + target=update_access_provider, + args=( + vectordb_loader, + 
action_item.payload.op, + action_item.payload.userIds, + action_item.payload.providerId, + ), + ) + + case ActionType.UPDATE_ACCESS_DECL_SOURCE_ID: + exec_in_proc( + target=decl_update_access, + args=( + vectordb_loader, + action_item.payload.userIds, + action_item.payload.sourceId, + ), + ) + + case _: + LOGGER.warning( + f'Unknown action type {action_item.type} for action id {db_id},' + f' type {action_item.type}, skipping and marking as processed', + extra={ 'action_item': action_item }, + ) + continue + + processed_event_ids.append(db_id) + except SafeDbException as e: + LOGGER.debug( + f'Safe DB error thrown while processing action id {db_id}, type {action_item.type},' + " it's safe to ignore and mark as processed.", + exc_info=e, + extra={ 'action_item': action_item }, + ) + processed_event_ids.append(db_id) + continue + + except (LoaderException, DbException) as e: + LOGGER.error( + f'Error deleting source for action id {db_id}, type {action_item.type}: {e}', + exc_info=e, + extra={ 'action_item': action_item }, + ) + errored_events[db_id] = str(e) + continue + + except Exception as e: + LOGGER.error( + f'Unexpected error processing action id {db_id}, type {action_item.type}: {e}', + exc_info=e, + extra={ 'action_item': action_item }, + ) + errored_events[db_id] = f'Unexpected error: {e}' + continue + + if (i + 1) % 20 == 0: + LOGGER.debug(f'Processed {i + 1} updates, sleeping for a bit to allow other operations to proceed') + sleep(2) + + LOGGER.info(f'Processed {len(processed_event_ids)} updates with {len(errored_events)} errors', extra={ + 'errored_events': errored_events, + }) + + if len(processed_event_ids) == 0: + LOGGER.debug('No updates processed, skipping reporting to the server') + continue + + try: + nc.ocs( + 'DELETE', + '/apps/context_chat/queues/actions/', + json={ 'actions': processed_event_ids }, + ) + except ( + niquests.exceptions.ConnectionError, + niquests.exceptions.Timeout, + ) as e: + LOGGER.info('Temporary error reporting processed 
updates, will retry:', exc_info=e) + sleep(5) + with suppress(Exception): + nc = NextcloudApp() + nc.ocs( + 'DELETE', + '/apps/context_chat/queues/actions/', + json={ 'ids': processed_event_ids }, + ) + continue + except Exception as e: + LOGGER.exception('Error reporting processed updates:', exc_info=e) + sleep(5) + continue def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: diff --git a/context_chat_backend/types.py b/context_chat_backend/types.py index 97d48ce6..849c2e31 100644 --- a/context_chat_backend/types.py +++ b/context_chat_backend/types.py @@ -4,12 +4,13 @@ # from enum import Enum from io import BytesIO -from typing import Self +from typing import Annotated, Literal, Self -from pydantic import BaseModel, field_validator +from pydantic import BaseModel, Discriminator, field_validator from .mimetype_list import SUPPORTED_MIMETYPES from .utils import is_valid_provider_id, is_valid_source_id +from .vectordb.types import UpdateAccessOp __all__ = [ 'DEFAULT_EM_MODEL_ALIAS', @@ -182,7 +183,7 @@ def validate_content(cls, v): raise ValueError('Content must be either a non-empty string or a non-empty BytesIO') -class FilesQueueItem(BaseModel): +class FilesQueueItems(BaseModel): files: dict[int, ReceivedFileItem] # [db id]: FileItem content_providers: dict[int, SourceItem] # [db id]: SourceItem @@ -198,3 +199,179 @@ def __init__(self, message: str, retryable: bool = False): class IndexingError(BaseModel): error: str retryable: bool = False + + +# PHP equivalent for reference: + +# class ActionType { +# // { sourceIds: array } +# public const DELETE_SOURCE_IDS = 'delete_source_ids'; +# // { providerId: string } +# public const DELETE_PROVIDER_ID = 'delete_provider_id'; +# // { userId: string } +# public const DELETE_USER_ID = 'delete_user_id'; +# // { op: string, userIds: array, sourceId: string } +# public const UPDATE_ACCESS_SOURCE_ID = 'update_access_source_id'; +# // { op: string, userIds: array, providerId: string } +# public const 
UPDATE_ACCESS_PROVIDER_ID = 'update_access_provider_id'; +# // { userIds: array, sourceId: string } +# public const UPDATE_ACCESS_DECL_SOURCE_ID = 'update_access_decl_source_id'; +# } + + +def _validate_source_ids(source_ids: list[str]) -> list[str]: + if ( + not isinstance(source_ids, list) + or not all(isinstance(sid, str) and sid.strip() != '' for sid in source_ids) + or len(source_ids) == 0 + ): + raise ValueError('sourceIds must be a non-empty list of non-empty strings') + return [sid.strip() for sid in source_ids] + + +def _validate_provider_id(provider_id: str) -> str: + if not isinstance(provider_id, str) or not is_valid_provider_id(provider_id): + raise ValueError('providerId must be a valid provider ID string') + return provider_id + + +def _validate_user_ids(user_ids: list[str]) -> list[str]: + if ( + not isinstance(user_ids, list) + or not all(isinstance(uid, str) and uid.strip() != '' for uid in user_ids) + or len(user_ids) == 0 + ): + raise ValueError('userIds must be a non-empty list of non-empty strings') + return [uid.strip() for uid in user_ids] + + +class ActionPayloadDeleteSourceIds(BaseModel): + sourceIds: list[str] + + @field_validator('sourceIds', mode='after') + def validate_source_ids(self) -> Self: + self.sourceIds = _validate_source_ids(self.sourceIds) + return self + + +class ActionPayloadDeleteProviderId(BaseModel): + providerId: str + + @field_validator('providerId') + def validate_provider_id(self) -> Self: + self.providerId = _validate_provider_id(self.providerId) + return self + + +class ActionPayloadDeleteUserId(BaseModel): + userId: str + + @field_validator('userId') + def validate_user_id(self) -> Self: + self.userId = _validate_user_ids([self.userId])[0] + return self + + +class ActionPayloadUpdateAccessSourceId(BaseModel): + op: UpdateAccessOp + userIds: list[str] + sourceId: str + + @field_validator('userIds', mode='after') + def validate_user_ids(self) -> Self: + self.userIds = _validate_user_ids(self.userIds) + return self + 
+ @field_validator('sourceId') + def validate_source_id(self) -> Self: + self.sourceId = _validate_source_ids([self.sourceId])[0] + return self + + +class ActionPayloadUpdateAccessProviderId(BaseModel): + op: UpdateAccessOp + userIds: list[str] + providerId: str + + @field_validator('userIds', mode='after') + def validate_user_ids(self) -> Self: + self.userIds = _validate_user_ids(self.userIds) + return self + + @field_validator('providerId') + def validate_provider_id(self) -> Self: + self.providerId = _validate_provider_id(self.providerId) + return self + + +class ActionPayloadUpdateAccessDeclSourceId(BaseModel): + userIds: list[str] + sourceId: str + + @field_validator('userIds', mode='after') + def validate_user_ids(self) -> Self: + self.userIds = _validate_user_ids(self.userIds) + return self + + @field_validator('sourceId') + def validate_source_id(self) -> Self: + self.sourceId = _validate_source_ids([self.sourceId])[0] + return self + + +class ActionType(str, Enum): + DELETE_SOURCE_IDS = 'delete_source_ids' + DELETE_PROVIDER_ID = 'delete_provider_id' + DELETE_USER_ID = 'delete_user_id' + UPDATE_ACCESS_SOURCE_ID = 'update_access_source_id' + UPDATE_ACCESS_PROVIDER_ID = 'update_access_provider_id' + UPDATE_ACCESS_DECL_SOURCE_ID = 'update_access_decl_source_id' + + +class CommonActionsQueueItem(BaseModel): + id: int + + +class ActionsQueueItemDeleteSourceIds(CommonActionsQueueItem): + type: Literal[ActionType.DELETE_SOURCE_IDS] + payload: ActionPayloadDeleteSourceIds + + +class ActionsQueueItemDeleteProviderId(CommonActionsQueueItem): + type: Literal[ActionType.DELETE_PROVIDER_ID] + payload: ActionPayloadDeleteProviderId + + +class ActionsQueueItemDeleteUserId(CommonActionsQueueItem): + type: Literal[ActionType.DELETE_USER_ID] + payload: ActionPayloadDeleteUserId + + +class ActionsQueueItemUpdateAccessSourceId(CommonActionsQueueItem): + type: Literal[ActionType.UPDATE_ACCESS_SOURCE_ID] + payload: ActionPayloadUpdateAccessSourceId + + +class 
ActionsQueueItemUpdateAccessProviderId(CommonActionsQueueItem): + type: Literal[ActionType.UPDATE_ACCESS_PROVIDER_ID] + payload: ActionPayloadUpdateAccessProviderId + + +class ActionsQueueItemUpdateAccessDeclSourceId(CommonActionsQueueItem): + type: Literal[ActionType.UPDATE_ACCESS_DECL_SOURCE_ID] + payload: ActionPayloadUpdateAccessDeclSourceId + + +ActionsQueueItem = Annotated[ + ActionsQueueItemDeleteSourceIds + | ActionsQueueItemDeleteProviderId + | ActionsQueueItemDeleteUserId + | ActionsQueueItemUpdateAccessSourceId + | ActionsQueueItemUpdateAccessProviderId + | ActionsQueueItemUpdateAccessDeclSourceId, + Discriminator('type'), +] + + +class ActionsQueueItems(BaseModel): + actions: dict[int, ActionsQueueItem] diff --git a/context_chat_backend/vectordb/pgvector.py b/context_chat_backend/vectordb/pgvector.py index f5879feb..8bcc6f4c 100644 --- a/context_chat_backend/vectordb/pgvector.py +++ b/context_chat_backend/vectordb/pgvector.py @@ -338,7 +338,7 @@ def update_access( ) match op: - case UpdateAccessOp.allow: + case UpdateAccessOp.ALLOW: for i in range(0, len(user_ids), PG_BATCH_SIZE): batched_uids = user_ids[i:i+PG_BATCH_SIZE] stmt = ( @@ -355,7 +355,7 @@ def update_access( session.execute(stmt) session.commit() - case UpdateAccessOp.deny: + case UpdateAccessOp.DENY: for i in range(0, len(user_ids), PG_BATCH_SIZE): batched_uids = user_ids[i:i+PG_BATCH_SIZE] stmt = ( @@ -448,15 +448,17 @@ def delete_source_ids(self, source_ids: list[str], session_: orm.Session | None # entry from "AccessListStore" is deleted automatically due to the foreign key constraint # batch the deletion to avoid hitting the query parameter limit chunks_to_delete = [] + deleted_source_ids = [] for i in range(0, len(source_ids), PG_BATCH_SIZE): batched_ids = source_ids[i:i+PG_BATCH_SIZE] stmt_doc = ( sa.delete(DocumentsStore) .filter(DocumentsStore.source_id.in_(batched_ids)) - .returning(DocumentsStore.chunks) + .returning(DocumentsStore.chunks, DocumentsStore.source_id) ) doc_result = 
session.execute(stmt_doc) chunks_to_delete.extend(str(c) for res in doc_result for c in res.chunks) + deleted_source_ids.extend(str(res.source_id) for res in doc_result) for i in range(0, len(chunks_to_delete), PG_BATCH_SIZE): batched_chunks = chunks_to_delete[i:i+PG_BATCH_SIZE] @@ -476,6 +478,14 @@ def delete_source_ids(self, source_ids: list[str], session_: orm.Session | None if session_ is None: session.close() + undeleted_source_ids = set(source_ids) - set(deleted_source_ids) + if len(undeleted_source_ids) > 0: + logger.info( + f'Source ids {undeleted_source_ids} were not deleted from documents store.' + ' This can be due to the source ids not existing in the documents store due to' + ' already being deleted or not having been added yet.' + ) + def delete_provider(self, provider_key: str): with self.session_maker() as session: try: @@ -519,7 +529,16 @@ def delete_user(self, user_id: str): session.rollback() raise DbException('Error: deleting user from access list') from e - self._cleanup_if_orphaned(list(source_ids), session) + try: + self._cleanup_if_orphaned(list(source_ids), session) + except Exception as e: + session.rollback() + logger.error( + 'Error cleaning up orphaned source ids after deleting user, manual cleanup might be required', + exc_info=e, + extra={ 'source_ids': list(source_ids) }, + ) + raise DbException('Error: cleaning up orphaned source ids after deleting user') from e def count_documents_by_provider(self) -> dict[str, int]: try: diff --git a/context_chat_backend/vectordb/service.py b/context_chat_backend/vectordb/service.py index 620a0b39..06a8e19e 100644 --- a/context_chat_backend/vectordb/service.py +++ b/context_chat_backend/vectordb/service.py @@ -6,27 +6,42 @@ from ..dyn_loader import VectorDBLoader from .base import BaseVectorDB -from .types import DbException, UpdateAccessOp +from .types import UpdateAccessOp logger = logging.getLogger('ccb.vectordb') -# todo: return source ids that were successfully deleted + def 
delete_by_source(vectordb_loader: VectorDBLoader, source_ids: list[str]): + ''' + Raises + ------ + DbException + LoaderException + ''' db: BaseVectorDB = vectordb_loader.load() logger.debug('deleting sources by id', extra={ 'source_ids': source_ids }) - try: - db.delete_source_ids(source_ids) - except Exception as e: - raise DbException('Error: Vectordb delete_source_ids error') from e + db.delete_source_ids(source_ids) def delete_by_provider(vectordb_loader: VectorDBLoader, provider_key: str): + ''' + Raises + ------ + DbException + LoaderException + ''' db: BaseVectorDB = vectordb_loader.load() logger.debug(f'deleting sources by provider: {provider_key}') db.delete_provider(provider_key) def delete_user(vectordb_loader: VectorDBLoader, user_id: str): + ''' + Raises + ------ + DbException + LoaderException + ''' db: BaseVectorDB = vectordb_loader.load() logger.debug(f'deleting user from db: {user_id}') db.delete_user(user_id) @@ -38,6 +53,13 @@ def update_access( user_ids: list[str], source_id: str, ): + ''' + Raises + ------ + DbException + LoaderException + SafeDbException + ''' db: BaseVectorDB = vectordb_loader.load() logger.debug('updating access', extra={ 'op': op, 'user_ids': user_ids, 'source_id': source_id }) db.update_access(op, user_ids, source_id) @@ -49,6 +71,13 @@ def update_access_provider( user_ids: list[str], provider_id: str, ): + ''' + Raises + ------ + DbException + LoaderException + SafeDbException + ''' db: BaseVectorDB = vectordb_loader.load() logger.debug('updating access by provider', extra={ 'op': op, 'user_ids': user_ids, 'provider_id': provider_id }) db.update_access_provider(op, user_ids, provider_id) @@ -59,11 +88,24 @@ def decl_update_access( user_ids: list[str], source_id: str, ): + ''' + Raises + ------ + DbException + LoaderException + SafeDbException + ''' db: BaseVectorDB = vectordb_loader.load() logger.debug('decl update access', extra={ 'user_ids': user_ids, 'source_id': source_id }) db.decl_update_access(user_ids, source_id) 
def count_documents_by_provider(vectordb_loader: VectorDBLoader): + ''' + Raises + ------ + DbException + LoaderException + ''' db: BaseVectorDB = vectordb_loader.load() logger.debug('counting documents by provider') return db.count_documents_by_provider() diff --git a/context_chat_backend/vectordb/types.py b/context_chat_backend/vectordb/types.py index df5c6dd7..30811797 100644 --- a/context_chat_backend/vectordb/types.py +++ b/context_chat_backend/vectordb/types.py @@ -14,5 +14,5 @@ class SafeDbException(Exception): class UpdateAccessOp(Enum): - allow = 'allow' - deny = 'deny' + ALLOW = 'allow' + DENY = 'deny' From b09a93cafda6726b706f11c8e7815b4a91acfc43 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Wed, 11 Mar 2026 14:33:39 +0530 Subject: [PATCH 08/56] fix pydantic types Signed-off-by: Anupam Kumar --- context_chat_backend/types.py | 180 ++++++++++++---------------------- context_chat_backend/utils.py | 10 -- 2 files changed, 64 insertions(+), 126 deletions(-) diff --git a/context_chat_backend/types.py b/context_chat_backend/types.py index 849c2e31..8577c931 100644 --- a/context_chat_backend/types.py +++ b/context_chat_backend/types.py @@ -2,14 +2,14 @@ # SPDX-FileCopyrightText: 2024 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # +import re from enum import Enum from io import BytesIO from typing import Annotated, Literal, Self -from pydantic import BaseModel, Discriminator, field_validator +from pydantic import AfterValidator, BaseModel, Discriminator, field_validator, model_validator from .mimetype_list import SUPPORTED_MIMETYPES -from .utils import is_valid_provider_id, is_valid_source_id from .vectordb.types import UpdateAccessOp __all__ = [ @@ -26,6 +26,49 @@ FILES_PROVIDER_ID = 'files__default' +def is_valid_source_id(source_id: str) -> bool: + # note the ":" in the item id part + return re.match(r'^[a-zA-Z0-9_-]+__[a-zA-Z0-9_-]+: [a-zA-Z0-9:-]+$', source_id) is not None + + +def 
is_valid_provider_id(provider_id: str) -> bool: + return re.match(r'^[a-zA-Z0-9_-]+__[a-zA-Z0-9_-]+$', provider_id) is not None + + +def _validate_source_ids(source_ids: list[str]) -> list[str]: + if ( + not isinstance(source_ids, list) + or not all(isinstance(sid, str) and sid.strip() != '' for sid in source_ids) + or len(source_ids) == 0 + ): + raise ValueError('sourceIds must be a non-empty list of non-empty strings') + return [sid.strip() for sid in source_ids] + + +def _validate_source_id(source_id: str) -> str: + return _validate_source_ids([source_id])[0] + + +def _validate_provider_id(provider_id: str) -> str: + if not isinstance(provider_id, str) or not is_valid_provider_id(provider_id): + raise ValueError('providerId must be a valid provider ID string') + return provider_id + + +def _validate_user_ids(user_ids: list[str]) -> list[str]: + if ( + not isinstance(user_ids, list) + or not all(isinstance(uid, str) and uid.strip() != '' for uid in user_ids) + or len(user_ids) == 0 + ): + raise ValueError('userIds must be a non-empty list of non-empty strings') + return [uid.strip() for uid in user_ids] + + +def _validate_user_id(user_id: str) -> str: + return _validate_user_ids([user_id])[0] + + class TEmbeddingAuthApiKey(BaseModel): apikey: str @@ -89,12 +132,13 @@ class AppRole(str, Enum): class CommonSourceItem(BaseModel): - userIds: list[str] - reference: str # source_id of the form "appId__providerId: itemId" + userIds: Annotated[list[str], AfterValidator(_validate_user_ids)] + # source_id of the form "appId__providerId: itemId" + reference: Annotated[str, AfterValidator(_validate_source_id)] title: str modified: int | str # todo: int/string? 
type: str - provider: str + provider: Annotated[str, AfterValidator(_validate_provider_id)] size: int @field_validator('modified', mode='before') @@ -116,42 +160,13 @@ def validate_strings_non_empty(cls, v): raise ValueError('Must be a non-empty string') return v.strip() - @field_validator('userIds', mode='after') - def validate_user_ids(self) -> Self: - if ( - not isinstance(self.userIds, list) - or not all( - isinstance(uid, str) - and uid.strip() != '' - for uid in self.userIds - ) - or len(self.userIds) == 0 - ): - raise ValueError('userIds must be a non-empty list of non-empty strings') - self.userIds = [uid.strip() for uid in self.userIds] - return self - - @field_validator('reference', mode='after') - def validate_reference_format(self) -> Self: - # validate reference format: "appId__providerId: itemId" - if not is_valid_source_id(self.reference): - raise ValueError('Invalid reference format, must be "appId__providerId: itemId"') - return self - - @field_validator('provider', mode='after') - def validate_provider_format(self) -> Self: - # validate provider format: "appId__providerId" - if not is_valid_provider_id(self.provider): - raise ValueError('Invalid provider format, must be "appId__providerId"') - return self - - @field_validator('type', mode='after') + @model_validator(mode='after') def validate_type(self) -> Self: if self.reference.startswith(FILES_PROVIDER_ID) and self.type not in SUPPORTED_MIMETYPES: raise ValueError(f'Unsupported file type: {self.type} for reference {self.reference}') return self - @field_validator('size', mode='after') + @model_validator(mode='after') def validate_size(self) -> Self: if not isinstance(self.size, int) or self.size < 0: raise ValueError(f'Invalid size value: {self.size}, must be a non-negative integer') @@ -182,6 +197,10 @@ def validate_content(cls, v): return v raise ValueError('Content must be either a non-empty string or a non-empty BytesIO') + class Config: + # to allow BytesIO in content field + 
arbitrary_types_allowed = True + class FilesQueueItems(BaseModel): files: dict[int, ReceivedFileItem] # [db id]: FileItem @@ -219,104 +238,33 @@ class IndexingError(BaseModel): # } -def _validate_source_ids(source_ids: list[str]) -> list[str]: - if ( - not isinstance(source_ids, list) - or not all(isinstance(sid, str) and sid.strip() != '' for sid in source_ids) - or len(source_ids) == 0 - ): - raise ValueError('sourceIds must be a non-empty list of non-empty strings') - return [sid.strip() for sid in source_ids] - - -def _validate_provider_id(provider_id: str) -> str: - if not isinstance(provider_id, str) or not is_valid_provider_id(provider_id): - raise ValueError('providerId must be a valid provider ID string') - return provider_id - - -def _validate_user_ids(user_ids: list[str]) -> list[str]: - if ( - not isinstance(user_ids, list) - or not all(isinstance(uid, str) and uid.strip() != '' for uid in user_ids) - or len(user_ids) == 0 - ): - raise ValueError('userIds must be a non-empty list of non-empty strings') - return [uid.strip() for uid in user_ids] - - class ActionPayloadDeleteSourceIds(BaseModel): - sourceIds: list[str] - - @field_validator('sourceIds', mode='after') - def validate_source_ids(self) -> Self: - self.sourceIds = _validate_source_ids(self.sourceIds) - return self + sourceIds: Annotated[list[str], AfterValidator(_validate_source_ids)] class ActionPayloadDeleteProviderId(BaseModel): - providerId: str - - @field_validator('providerId') - def validate_provider_id(self) -> Self: - self.providerId = _validate_provider_id(self.providerId) - return self + providerId: Annotated[str, AfterValidator(_validate_provider_id)] class ActionPayloadDeleteUserId(BaseModel): - userId: str - - @field_validator('userId') - def validate_user_id(self) -> Self: - self.userId = _validate_user_ids([self.userId])[0] - return self + userId: Annotated[str, AfterValidator(_validate_user_id)] class ActionPayloadUpdateAccessSourceId(BaseModel): op: UpdateAccessOp - userIds: 
list[str] - sourceId: str - - @field_validator('userIds', mode='after') - def validate_user_ids(self) -> Self: - self.userIds = _validate_user_ids(self.userIds) - return self - - @field_validator('sourceId') - def validate_source_id(self) -> Self: - self.sourceId = _validate_source_ids([self.sourceId])[0] - return self + userIds: Annotated[list[str], AfterValidator(_validate_user_ids)] + sourceId: Annotated[str, AfterValidator(_validate_source_id)] class ActionPayloadUpdateAccessProviderId(BaseModel): op: UpdateAccessOp - userIds: list[str] - providerId: str - - @field_validator('userIds', mode='after') - def validate_user_ids(self) -> Self: - self.userIds = _validate_user_ids(self.userIds) - return self - - @field_validator('providerId') - def validate_provider_id(self) -> Self: - self.providerId = _validate_provider_id(self.providerId) - return self + userIds: Annotated[list[str], AfterValidator(_validate_user_ids)] + providerId: Annotated[str, AfterValidator(_validate_provider_id)] class ActionPayloadUpdateAccessDeclSourceId(BaseModel): - userIds: list[str] - sourceId: str - - @field_validator('userIds', mode='after') - def validate_user_ids(self) -> Self: - self.userIds = _validate_user_ids(self.userIds) - return self - - @field_validator('sourceId') - def validate_source_id(self) -> Self: - self.sourceId = _validate_source_ids([self.sourceId])[0] - return self + userIds: Annotated[list[str], AfterValidator(_validate_user_ids)] + sourceId: Annotated[str, AfterValidator(_validate_source_id)] class ActionType(str, Enum): diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index 224f466e..c7e588b3 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -5,7 +5,6 @@ import logging import multiprocessing as mp import os -import re import traceback from collections.abc import Callable from functools import partial, wraps @@ -102,15 +101,6 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, 
daem return result['value'] -def is_valid_source_id(source_id: str) -> bool: - # note the ":" in the item id part - return re.match(r'^[a-zA-Z0-9_-]+__[a-zA-Z0-9_-]+: [a-zA-Z0-9:-]+$', source_id) is not None - - -def is_valid_provider_id(provider_id: str) -> bool: - return re.match(r'^[a-zA-Z0-9_-]+__[a-zA-Z0-9_-]+$', provider_id) is not None - - def timed(func: Callable): ''' Decorator to time a function From 11b436c8ce43778dbf6beda8a7e3978626e7aee5 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Wed, 11 Mar 2026 14:34:55 +0530 Subject: [PATCH 09/56] fix: use a dedicated event to allow app halt without app being disabled Signed-off-by: Anupam Kumar --- context_chat_backend/controller.py | 1 + context_chat_backend/task_fetcher.py | 28 ++++++++++++++++++++++++---- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 580416f7..55206ca0 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -88,6 +88,7 @@ async def lifespan(app: FastAPI): nc = NextcloudApp() if nc.enabled_state: app_enabled.set() + start_bg_threads(app_config, app_enabled) logger.info(f'App enable state at startup: {app_enabled.is_set()}') t = Thread(target=background_thread_task, args=()) t.start() diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 84b974b2..e93eac34 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -44,6 +44,7 @@ APP_ROLE = get_app_role() THREADS = {} +THREAD_STOP_EVENT = Event() LOGGER = logging.getLogger('ccb.task_fetcher') FILES_INDEXING_BATCH_SIZE = 64 # todo: config? 
# divides the batch into these many chunks @@ -199,8 +200,8 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro while True: - if not app_enabled.is_set(): - LOGGER.info('Files indexing thread is stopping as the app is disabled') + if THREAD_STOP_EVENT.is_set(): + LOGGER.info('Files indexing thread is stopping due to stop event being set') return try: @@ -329,8 +330,8 @@ def updates_processing_thread(app_config: TConfig, app_enabled: Event) -> None: return while True: - if not app_enabled.is_set(): - LOGGER.info('Files indexing thread is stopping as the app is disabled') + if THREAD_STOP_EVENT.is_set(): + LOGGER.info('Updates processing thread is stopping due to stop event being set') return try: @@ -490,6 +491,14 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: def start_bg_threads(app_config: TConfig, app_enabled: Event): match APP_ROLE: case AppRole.INDEXING | AppRole.NORMAL: + if ( + ThreadType.FILES_INDEXING in THREADS + or ThreadType.UPDATES_PROCESSING in THREADS + ): + LOGGER.info('Background threads already running, skipping start') + return + + THREAD_STOP_EVENT.clear() THREADS[ThreadType.FILES_INDEXING] = Thread( target=files_indexing_thread, args=(app_config, app_enabled), @@ -502,7 +511,13 @@ def start_bg_threads(app_config: TConfig, app_enabled: Event): ) THREADS[ThreadType.FILES_INDEXING].start() THREADS[ThreadType.UPDATES_PROCESSING].start() + case AppRole.RP | AppRole.NORMAL: + if ThreadType.REQUEST_PROCESSING in THREADS: + LOGGER.info('Background threads already running, skipping start') + return + + THREAD_STOP_EVENT.clear() THREADS[ThreadType.REQUEST_PROCESSING] = Thread( target=request_processing_thread, args=(app_config, app_enabled), @@ -516,12 +531,17 @@ def wait_for_bg_threads(): case AppRole.INDEXING | AppRole.NORMAL: if (ThreadType.FILES_INDEXING not in THREADS or ThreadType.UPDATES_PROCESSING not in THREADS): return + + THREAD_STOP_EVENT.set() 
THREADS[ThreadType.FILES_INDEXING].join() THREADS[ThreadType.UPDATES_PROCESSING].join() THREADS.pop(ThreadType.FILES_INDEXING) THREADS.pop(ThreadType.UPDATES_PROCESSING) + case AppRole.RP | AppRole.NORMAL: if (ThreadType.REQUEST_PROCESSING not in THREADS): return + + THREAD_STOP_EVENT.set() THREADS[ThreadType.REQUEST_PROCESSING].join() THREADS.pop(ThreadType.REQUEST_PROCESSING) From c88e15364d53764257f7fddaca76505cf27c80d9 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Wed, 11 Mar 2026 17:54:48 +0530 Subject: [PATCH 10/56] fix fetch url and pydantic types Signed-off-by: Anupam Kumar --- context_chat_backend/task_fetcher.py | 14 +++++++------- context_chat_backend/types.py | 17 +++++++++-------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index e93eac34..5784d12b 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -80,7 +80,7 @@ async def __fetch_file_content( # a file pointer for storing the stream in memory until it is consumed fp = BytesIO() await nc._session.download2fp( - url_path=f'/apps/context_chat/files/{file_id}', + url_path=f'/ocs/v2.php/apps/context_chat/files/{file_id}', fp=fp, dav=False, params={ 'userId': user_id }, @@ -209,7 +209,7 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro # todo: add the 'size' param to the return of this call. 
q_items_res = nc.ocs( 'GET', - '/apps/context_chat/queues/documents', + '/ocs/v2.php/apps/context_chat/queues/documents', params={ 'n': FILES_INDEXING_BATCH_SIZE } ) @@ -292,7 +292,7 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro try: nc.ocs( 'DELETE', - '/apps/context_chat/queues/documents/', + '/ocs/v2.php/apps/context_chat/queues/documents/', json={ 'files': to_delete_file_ids, 'content_providers': to_delete_provider_ids, @@ -308,7 +308,7 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro nc = NextcloudApp() nc.ocs( 'DELETE', - '/apps/context_chat/queues/documents/', + '/ocs/v2.php/apps/context_chat/queues/documents/', json={ 'files': to_delete_file_ids, 'content_providers': to_delete_provider_ids, @@ -338,7 +338,7 @@ def updates_processing_thread(app_config: TConfig, app_enabled: Event) -> None: nc = NextcloudApp() q_items_res = nc.ocs( 'GET', - '/apps/context_chat/queues/actions', + '/ocs/v2.php/apps/context_chat/queues/actions', params={ 'n': ACTIONS_BATCH_SIZE } ) @@ -461,7 +461,7 @@ def updates_processing_thread(app_config: TConfig, app_enabled: Event) -> None: try: nc.ocs( 'DELETE', - '/apps/context_chat/queues/actions/', + '/ocs/v2.php/apps/context_chat/queues/actions/', json={ 'actions': processed_event_ids }, ) except ( @@ -474,7 +474,7 @@ def updates_processing_thread(app_config: TConfig, app_enabled: Event) -> None: nc = NextcloudApp() nc.ocs( 'DELETE', - '/apps/context_chat/queues/actions/', + '/ocs/v2.php/apps/context_chat/queues/actions/', json={ 'ids': processed_event_ids }, ) continue diff --git a/context_chat_backend/types.py b/context_chat_backend/types.py index 8577c931..972756fa 100644 --- a/context_chat_backend/types.py +++ b/context_chat_backend/types.py @@ -136,10 +136,10 @@ class CommonSourceItem(BaseModel): # source_id of the form "appId__providerId: itemId" reference: Annotated[str, AfterValidator(_validate_source_id)] title: str - modified: int | str # todo: 
int/string? + modified: int type: str provider: Annotated[str, AfterValidator(_validate_provider_id)] - size: int + size: float @field_validator('modified', mode='before') @classmethod @@ -160,18 +160,19 @@ def validate_strings_non_empty(cls, v): raise ValueError('Must be a non-empty string') return v.strip() + @field_validator('size') + @classmethod + def validate_size(cls, v): + if isinstance(v, int | float) and v >= 0: + return float(v) + raise ValueError(f'Invalid size value: {v}, must be a non-negative number') + @model_validator(mode='after') def validate_type(self) -> Self: if self.reference.startswith(FILES_PROVIDER_ID) and self.type not in SUPPORTED_MIMETYPES: raise ValueError(f'Unsupported file type: {self.type} for reference {self.reference}') return self - @model_validator(mode='after') - def validate_size(self) -> Self: - if not isinstance(self.size, int) or self.size < 0: - raise ValueError(f'Invalid size value: {self.size}, must be a non-negative integer') - return self - class ReceivedFileItem(CommonSourceItem): content: None From cd5241e199a2ae2316d4f8f3841aa27bb7c12842 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Wed, 11 Mar 2026 18:52:35 +0530 Subject: [PATCH 11/56] fix: use the correct file id Signed-off-by: Anupam Kumar --- context_chat_backend/controller.py | 9 ++-- context_chat_backend/task_fetcher.py | 79 +++++++++++++++++----------- context_chat_backend/types.py | 22 +++++++- 3 files changed, 75 insertions(+), 35 deletions(-) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 55206ca0..797ba201 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -24,7 +24,6 @@ from contextlib import asynccontextmanager from functools import wraps from threading import Event, Thread -from time import sleep from typing import Any from fastapi import FastAPI, Request @@ -130,9 +129,11 @@ async def lifespan(app: FastAPI): # logger background thread def background_thread_task(): 
- while(True): - logger.info(f'Currently indexing {len(_indexing)} documents (filename, size): ', extra={'_indexing': _indexing}) - sleep(10) + # todo + # while(True): + # logger.info(f'Currently indexing {len(_indexing)} documents (filename, size): ', extra={'_indexing': _indexing}) + # sleep(10) + ... # exception handlers diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 5784d12b..0442cd53 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -125,15 +125,29 @@ async def __fetch_files_content( semaphore = asyncio.Semaphore(CONCURRENT_FILE_FETCHES) tasks = [] - for file_id, file_item in files.items(): - if file_item.size > MAX_FILE_SIZE: + for db_id, file in files.items(): + try: + # to detect any validation errors but it should not happen since file.reference is validated + file.file_id # noqa: B018 + except ValueError as e: + LOGGER.error( + f'Invalid file reference format for db id {db_id}, file reference {file.reference}: {e}', + exc_info=e, + ) + source_items[db_id] = IndexingError( + error=f'Invalid file reference format: {file.reference}', + retryable=False, + ) + continue + + if file.size > MAX_FILE_SIZE: LOGGER.info( - f'Skipping file id {file_id}, source id {file_item.reference} due to size' - f' {(file_item.size/(1024*1024)):.2f} MiB exceeding the limit {(MAX_FILE_SIZE/(1024*1024)):.2f} MiB', + f'Skipping db id {db_id}, file id {file.file_id}, source id {file.reference} due to size' + f' {(file.size/(1024*1024)):.2f} MiB exceeding the limit {(MAX_FILE_SIZE/(1024*1024)):.2f} MiB', ) - source_items[file_id] = IndexingError( + source_items[db_id] = IndexingError( error=( - f'File size {(file_item.size/(1024*1024)):.2f} MiB' + f'File size {(file.size/(1024*1024)):.2f} MiB' f' exceeds the limit {(MAX_FILE_SIZE/(1024*1024)):.2f} MiB' ), retryable=False, @@ -141,39 +155,44 @@ async def __fetch_files_content( continue # todo: perform the existing file check before fetching 
the content to avoid unnecessary fetches # any user id from the list should have read access to the file - tasks.append(asyncio.ensure_future(__fetch_file_content(semaphore, file_id, file_item.userIds[0]))) + tasks.append(asyncio.ensure_future(__fetch_file_content(semaphore, file.file_id, file.userIds[0]))) results = await asyncio.gather(*tasks, return_exceptions=True) - for (file_id, file_item), result in zip(files.items(), results, strict=True): + for (db_id, file), result in zip(files.items(), results, strict=True): if isinstance(result, IndexingException): LOGGER.error( - f'Error fetching content for file id {file_id}, reference {file_item.reference}: {result}', + f'Error fetching content for db id {db_id}, file id {file.file_id}, reference {file.reference}' + f': {result}', exc_info=result, ) - source_items[file_id] = IndexingError( + source_items[db_id] = IndexingError( error=str(result), retryable=result.retryable, ) elif isinstance(result, str) or isinstance(result, BytesIO): - source_items[file_id] = SourceItem( - **file_item.model_dump(), - content=result, + source_items[db_id] = SourceItem( + **{ + **file.model_dump(), + 'content': result, + } ) elif isinstance(result, BaseException): LOGGER.error( - f'Unexpected error fetching content for file id {file_id}, reference {file_item.reference}: {result}', + f'Unexpected error fetching content for db id {db_id}, file id {file.file_id},' + f' reference {file.reference}: {result}', exc_info=result, ) - source_items[file_id] = IndexingError( + source_items[db_id] = IndexingError( error=f'Unexpected error: {result}', retryable=True, ) else: LOGGER.error( - f'Unknown error fetching content for file id {file_id}, reference {file_item.reference}: {result}', + f'Unknown error fetching content for db id {db_id}, file id {file.file_id}, reference {file.reference}' + f': {result}', exc_info=True, ) - source_items[file_id] = IndexingError( + source_items[db_id] = IndexingError( error='Unknown error', retryable=True, ) @@ 
-232,11 +251,11 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro if q_items.files: fetched_files = asyncio.run(__fetch_files_content(q_items.files)) - for file_id, result in fetched_files.items(): + for db_id, result in fetched_files.items(): if isinstance(result, SourceItem): - source_files[file_id] = result + source_files[db_id] = result else: - source_errors[file_id] = result + source_errors[db_id] = result files_result = {} providers_result = {} @@ -257,8 +276,8 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro ): LOGGER.error('Some sources failed to index', extra={ 'file_errors': { - file_id: error - for file_id, error in files_result.items() + db_id: error + for db_id, error in files_result.items() if isinstance(error, IndexingError) }, 'provider_errors': { @@ -280,12 +299,12 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro continue # delete the entries from the PHP side queue where indexing succeeded or the error is not retryable - to_delete_file_ids = [ - file_id for file_id, result in files_result.items() + to_delete_files_db_ids = [ + db_id for db_id, result in files_result.items() if result is None or (isinstance(result, IndexingError) and not result.retryable) ] - to_delete_provider_ids = [ - provider_id for provider_id, result in providers_result.items() + to_delete_provider_db_ids = [ + db_id for db_id, result in providers_result.items() if result is None or (isinstance(result, IndexingError) and not result.retryable) ] @@ -294,8 +313,8 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro 'DELETE', '/ocs/v2.php/apps/context_chat/queues/documents/', json={ - 'files': to_delete_file_ids, - 'content_providers': to_delete_provider_ids, + 'files': to_delete_files_db_ids, + 'content_providers': to_delete_provider_db_ids, }, ) except ( @@ -310,8 +329,8 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, 
IndexingErro 'DELETE', '/ocs/v2.php/apps/context_chat/queues/documents/', json={ - 'files': to_delete_file_ids, - 'content_providers': to_delete_provider_ids, + 'files': to_delete_files_db_ids, + 'content_providers': to_delete_provider_db_ids, }, ) continue diff --git a/context_chat_backend/types.py b/context_chat_backend/types.py index 972756fa..9f23e14f 100644 --- a/context_chat_backend/types.py +++ b/context_chat_backend/types.py @@ -7,7 +7,7 @@ from io import BytesIO from typing import Annotated, Literal, Self -from pydantic import AfterValidator, BaseModel, Discriminator, field_validator, model_validator +from pydantic import AfterValidator, BaseModel, Discriminator, computed_field, field_validator, model_validator from .mimetype_list import SUPPORTED_MIMETYPES from .vectordb.types import UpdateAccessOp @@ -69,6 +69,21 @@ def _validate_user_id(user_id: str) -> str: return _validate_user_ids([user_id])[0] +def _get_file_id_from_source_ref(source_ref: str) -> int: + ''' + source reference is in the format "FILES_PROVIDER_ID: ". 
+ ''' + if not source_ref.startswith(f'{FILES_PROVIDER_ID}: '): + raise ValueError(f'Source reference does not start with expected prefix: {source_ref}') + + try: + return int(source_ref[len(f'{FILES_PROVIDER_ID}: '):]) + except ValueError as e: + raise ValueError( + f'Invalid source reference format for extracting file_id: {source_ref}' + ) from e + + class TEmbeddingAuthApiKey(BaseModel): apikey: str @@ -177,6 +192,11 @@ def validate_type(self) -> Self: class ReceivedFileItem(CommonSourceItem): content: None + @computed_field + @property + def file_id(self) -> int: + return _get_file_id_from_source_ref(self.reference) + class SourceItem(CommonSourceItem): ''' From 4958d1d980b0d0741762ffc9c3eac3ff91e5c2b0 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Wed, 11 Mar 2026 19:24:51 +0530 Subject: [PATCH 12/56] fix: wip: improve embeddings exception handling Signed-off-by: Anupam Kumar --- context_chat_backend/network_em.py | 13 +++++++++---- context_chat_backend/task_fetcher.py | 1 + context_chat_backend/vectordb/pgvector.py | 17 ++++++----------- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/context_chat_backend/network_em.py b/context_chat_backend/network_em.py index 18bb11f4..d39ea56a 100644 --- a/context_chat_backend/network_em.py +++ b/context_chat_backend/network_em.py @@ -79,6 +79,7 @@ def _get_embedding(self, input_: str | list[str], try_: int = 3) -> list[float] raise FatalEmbeddingException(response.text) if response.status_code // 100 != 2: raise EmbeddingException(response.text) + # todo: rework exception handling and their downstream interpretation except FatalEmbeddingException as e: logger.error('Fatal error while getting embeddings: %s', str(e), exc_info=e) raise e @@ -108,10 +109,14 @@ def _get_embedding(self, input_: str | list[str], try_: int = 3) -> list[float] logger.error('Unexpected error while getting embeddings', exc_info=e) raise EmbeddingException('Error: unexpected error while getting embeddings') from e - # converts 
TypedDict to a pydantic model - resp = CreateEmbeddingResponse(**response.json()) - if isinstance(input_, str): - return resp['data'][0]['embedding'] + try: + # converts TypedDict to a pydantic model + resp = CreateEmbeddingResponse(**response.json()) + if isinstance(input_, str): + return resp['data'][0]['embedding'] + except Exception as e: + logger.error('Error parsing embedding response', exc_info=e) + raise EmbeddingException('Error: failed to parse embedding response') from e # only one embedding in d['embedding'] since truncate is True return [d['embedding'] for d in resp['data']] # pyright: ignore[reportReturnType] diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 0442cd53..51f98e7d 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -261,6 +261,7 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro providers_result = {} chunk_size = FILES_INDEXING_BATCH_SIZE // PARALLEL_FILE_PARSING + # todo: do it in asyncio, it's not truly parallel yet # chunk file parsing for better file operation parallelism for i in range(0, len(source_files), chunk_size): chunk = dict(list(source_files.items())[i:i+chunk_size]) diff --git a/context_chat_backend/vectordb/pgvector.py b/context_chat_backend/vectordb/pgvector.py index 8bcc6f4c..bfca0bb6 100644 --- a/context_chat_backend/vectordb/pgvector.py +++ b/context_chat_backend/vectordb/pgvector.py @@ -17,7 +17,7 @@ from langchain_postgres.vectorstores import Base, PGVector from ..chain.types import InDocument, ScopeType -from ..types import EmbeddingException, IndexingError, RetryableEmbeddingException, SourceItem +from ..types import EmbeddingException, FatalEmbeddingException, IndexingError, RetryableEmbeddingException, SourceItem from ..utils import timed from .base import BaseVectorDB from .types import DbException, SafeDbException, UpdateAccessOp @@ -181,7 +181,11 @@ def add_indocuments(self, indocuments: 
dict[int, InDocument]) -> dict[int, Index retryable=True, ) continue - except RetryableEmbeddingException as e: + except FatalEmbeddingException as e: + raise EmbeddingException( + f'Fatal error while embedding documents for source {indoc.source_id}: {e}' + ) from e + except (RetryableEmbeddingException, EmbeddingException) as e: # temporary error, continue with the next document logger.exception('Error adding documents to vectordb, should be retried later.', exc_info=e, extra={ 'source_id': indoc.source_id, @@ -191,15 +195,6 @@ def add_indocuments(self, indocuments: dict[int, InDocument]) -> dict[int, Index retryable=True, ) continue - except EmbeddingException as e: - logger.exception('Error adding documents to vectordb', exc_info=e, extra={ - 'source_id': indoc.source_id, - }) - results[php_db_id] = IndexingError( - error=str(e), - retryable=False, - ) - continue except Exception as e: logger.exception('Error adding documents to vectordb', exc_info=e, extra={ 'source_id': indoc.source_id, From a04912120965d8ff9a285eac559794b716a595ce Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Wed, 11 Mar 2026 19:44:06 +0530 Subject: [PATCH 13/56] fix(ci): update to the latest changes Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 104 ++++++++++++++++++------- 1 file changed, 76 insertions(+), 28 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index fb06bafa..9563bcdd 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -199,26 +199,87 @@ jobs: ls -la context_chat_backend/persistent_storage/* sleep 30 # Wait for the em server to get ready - - name: Scan files, baseline - run: | - ./occ files:scan admin - ./occ context_chat:scan admin -m text/plain - - - name: Check python memory usage + - name: Initial memory usage check run: | ps -p $(cat pid.txt) -o pid,cmd,%mem,rss --sort=-%mem ps -p $(cat pid.txt) -o %mem --no-headers > initial_mem.txt - - 
name: Scan files - run: | - ./occ files:scan admin - ./occ context_chat:scan admin -m text/markdown & - ./occ context_chat:scan admin -m text/x-rst - - - name: Check python memory usage + - name: Periodically check context_chat stats for 15 minutes to allow the backend to index the files run: | - ps -p $(cat pid.txt) -o pid,cmd,%mem,rss --sort=-%mem - ps -p $(cat pid.txt) -o %mem --no-headers > after_scan_mem.txt + success=0 + for i in {1..90}; do + echo "Checking stats, attempt $i..." + + mkfifo error_pipe + stats=$(timeout 5 ./occ context_chat:stats 2>error_pipe) + echo "Stats output:" + echo "$stats" + echo "---" + + # Check for critical errors in output + if echo "$stats" | grep -q "Error during request"; then + echo "Backend connection error detected, retrying..." + rm -f error_pipe + sleep 10 + continue + fi + + # Extract Total eligible files + total_files=$(echo "$stats" | grep -oP 'Total eligible files:\s*\K\d+' || echo "") + + # Extract Indexed documents count (files__default) + indexed_count=$(echo "$stats" | grep -oP "'files__default'\s*=>\s*\K\d+" || echo "") + + # Validate parsed values + if [ -z "$total_files" ] || [ -z "$indexed_count" ]; then + echo "Error: Could not parse stats output properly" + if echo "$stats" | grep -q "Indexed documents:"; then + echo " Indexed documents section found but could not extract count" + fi + rm -f error_pipe + sleep 10 + continue + fi + + echo "Total eligible files: $total_files" + echo "Indexed documents (files__default): $indexed_count" + + # Calculate absolute difference + diff=$((total_files - indexed_count)) + if [ $diff -lt 0 ]; then + diff=$((-diff)) + fi + + # Calculate 2% threshold using bc for floating point support + threshold=$(echo "scale=4; $total_files * 0.02" | bc) + + # Check if difference is within tolerance + if (( $(echo "$diff <= $threshold" | bc -l) )); then + echo "Indexing within 2% tolerance (diff=$diff, threshold=$threshold)" + rm -f error_pipe + success=1 + break + else + pct=$(echo 
"scale=2; ($diff / $total_files) * 100" | bc) + echo "Outside 2% tolerance: diff=$diff (${pct}%), threshold=$threshold" + fi + + # Check if backend is still alive + ccb_alive=$(ps -p $(cat pid.txt) -o cmd= | grep -c "main.py" || echo "0") + if [ "$ccb_alive" -eq 0 ]; then + echo "Error: Context Chat Backend process is not running. Exiting." + rm -f error_pipe + exit 1 + fi + + rm -f error_pipe + sleep 10 + done + + if [ $success -ne 1 ]; then + echo "Max attempts reached" + exit 1 + fi - name: Run the prompts run: | @@ -252,19 +313,6 @@ jobs: echo "Memory usage during scan is stable. No memory leak detected." fi - - name: Compare memory usage and detect leak - run: | - initial_mem=$(cat after_scan_mem.txt | tr -d ' ') - final_mem=$(cat after_prompt_mem.txt | tr -d ' ') - echo "Initial Memory Usage: $initial_mem%" - echo "Memory Usage after prompt: $final_mem%" - - if (( $(echo "$final_mem > $initial_mem" | bc -l) )); then - echo "Memory usage has increased during prompt. Possible memory leak detected!" - else - echo "Memory usage during prompt is stable. No memory leak detected." - fi - - name: Show server logs if: always() run: | From 795380c7c62ce5f60f80aa16ffa1e7568133f03e Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 12 Mar 2026 16:10:58 +0530 Subject: [PATCH 14/56] fix(ci): use file to store stderr Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 9563bcdd..de0f4659 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -210,16 +210,21 @@ jobs: for i in {1..90}; do echo "Checking stats, attempt $i..." - mkfifo error_pipe - stats=$(timeout 5 ./occ context_chat:stats 2>error_pipe) + stats_err=$(mktemp) + stats=$(timeout 5 ./occ context_chat:stats 2>"$stats_err") + stats_exit=$? 
echo "Stats output:" echo "$stats" + if [ -s "$stats_err" ]; then + echo "Stderr:" + cat "$stats_err" + fi echo "---" + rm -f "$stats_err" # Check for critical errors in output - if echo "$stats" | grep -q "Error during request"; then - echo "Backend connection error detected, retrying..." - rm -f error_pipe + if [ $stats_exit -ne 0 ] || echo "$stats" | grep -q "Error during request"; then + echo "Backend connection error detected (exit=$stats_exit), retrying..." sleep 10 continue fi @@ -236,7 +241,6 @@ jobs: if echo "$stats" | grep -q "Indexed documents:"; then echo " Indexed documents section found but could not extract count" fi - rm -f error_pipe sleep 10 continue fi @@ -256,7 +260,6 @@ jobs: # Check if difference is within tolerance if (( $(echo "$diff <= $threshold" | bc -l) )); then echo "Indexing within 2% tolerance (diff=$diff, threshold=$threshold)" - rm -f error_pipe success=1 break else @@ -268,11 +271,9 @@ jobs: ccb_alive=$(ps -p $(cat pid.txt) -o cmd= | grep -c "main.py" || echo "0") if [ "$ccb_alive" -eq 0 ]; then echo "Error: Context Chat Backend process is not running. Exiting." 
- rm -f error_pipe exit 1 fi - rm -f error_pipe sleep 10 done From 7bc0ed7c3c535f930f03cc38c4dd884b5370696c Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 12 Mar 2026 17:17:38 +0530 Subject: [PATCH 15/56] fix(ci): add cron jobs Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index de0f4659..0d8e4229 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -204,9 +204,18 @@ jobs: ps -p $(cat pid.txt) -o pid,cmd,%mem,rss --sort=-%mem ps -p $(cat pid.txt) -o %mem --no-headers > initial_mem.txt + - name: Run cron jobs + run: | + # every 10 seconds indefinitely + while true; do + php cron.php + sleep 10 + done & + - name: Periodically check context_chat stats for 15 minutes to allow the backend to index the files run: | success=0 + echo "::group::Checking stats periodically for 15 minutes to allow the backend to index the files" for i in {1..90}; do echo "Checking stats, attempt $i..." @@ -277,6 +286,10 @@ jobs: sleep 10 done + echo "::endgroup::" + + ./occ context_chat:stats + if [ $success -ne 1 ]; then echo "Max attempts reached" exit 1 From d94c687e057a7049e6b0f1f32b580f326692acd3 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 12 Mar 2026 17:35:47 +0530 Subject: [PATCH 16/56] fix(ci): do a occ files scan before cron jobs Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 0d8e4229..58f9f50c 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -169,6 +169,10 @@ jobs: cd .. 
rm -rf documentation + - name: Run files scan + run: | + ./occ files:scan --all + - name: Setup python 3.11 uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5 with: From dadc8fa7d193f40ddacffecf6266d8a2b37a6817 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Mon, 16 Mar 2026 20:09:30 +0530 Subject: [PATCH 17/56] feat: record indexing errors in content decode function Signed-off-by: Anupam Kumar --- .../chain/ingest/doc_loader.py | 44 +++++++++---------- context_chat_backend/chain/ingest/injest.py | 20 ++++++--- 2 files changed, 36 insertions(+), 28 deletions(-) diff --git a/context_chat_backend/chain/ingest/doc_loader.py b/context_chat_backend/chain/ingest/doc_loader.py index d26f74b1..832c8331 100644 --- a/context_chat_backend/chain/ingest/doc_loader.py +++ b/context_chat_backend/chain/ingest/doc_loader.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # -import logging import re import tempfile from collections.abc import Callable @@ -18,9 +17,8 @@ from pypdf.errors import FileNotDecryptedError as PdfFileNotDecryptedError from striprtf import striprtf -from ...types import SourceItem +from ...types import IndexingException, SourceItem -logger = logging.getLogger('ccb.doc_loader') def _temp_file_wrapper(file: BytesIO, loader: Callable, sep: str = '\n') -> str: raw_bytes = file.read() @@ -75,10 +73,10 @@ def _load_xlsx(file: BytesIO) -> str: return read_excel(file, na_filter=False).to_string(header=False, na_rep='') -def _load_email(file: BytesIO, ext: str = 'eml') -> str | None: +def _load_email(file: BytesIO, ext: str = 'eml') -> str: # NOTE: msg format is not tested if ext not in ['eml', 'msg']: - return None + raise IndexingException(f'Unsupported email format: {ext}') # TODO: implement attachment partitioner using unstructured.partition.partition_{email,msg} # since langchain does not pass through the attachment_partitioner kwarg @@ -116,34 +114,36 @@ def attachment_partitioner( } -def decode_source(source: SourceItem) -> 
str | None: +def decode_source(source: SourceItem) -> str: + ''' + Raises + ------ + IndexingException + ''' + io_obj: BytesIO | None = None try: # .pot files are powerpoint templates but also plain text files, # so we skip them to prevent decoding errors if source.title.endswith('.pot'): - return None - - mimetype = source.type - if mimetype is None: - return None + raise IndexingException('PowerPoint template files (.pot) are not supported') if isinstance(source.content, str): io_obj = BytesIO(source.content.encode('utf-8', 'ignore')) else: io_obj = source.content - if _loader_map.get(mimetype): - result = _loader_map[mimetype](io_obj) - return result.encode('utf-8', 'ignore').decode('utf-8', 'ignore') - - return io_obj.read().decode('utf-8', 'ignore') - except PdfFileNotDecryptedError: - logger.warning(f'PDF file ({source.reference}) is encrypted and cannot be read') - return None - except Exception: - logger.exception(f'Error decoding source file ({source.reference})', stack_info=True) - return None + if _loader_map.get(source.type): + result = _loader_map[source.type](io_obj) + return result.encode('utf-8', 'ignore').decode('utf-8', 'ignore').strip() + + return io_obj.read().decode('utf-8', 'ignore').strip() + except IndexingException: + raise + except PdfFileNotDecryptedError as e: + raise IndexingException('PDF file is encrypted and cannot be read') from e + except Exception as e: + raise IndexingException(f'Error decoding source file: {e}') from e finally: if io_obj is not None: io_obj.close() diff --git a/context_chat_backend/chain/ingest/injest.py b/context_chat_backend/chain/ingest/injest.py index 7369f452..d9ea5433 100644 --- a/context_chat_backend/chain/ingest/injest.py +++ b/context_chat_backend/chain/ingest/injest.py @@ -8,7 +8,7 @@ from langchain.schema import Document from ...dyn_loader import VectorDBLoader -from ...types import IndexingError, SourceItem, TConfig +from ...types import IndexingError, IndexingException, SourceItem, TConfig from 
...vectordb.base import BaseVectorDB from ...vectordb.types import DbException, SafeDbException, UpdateAccessOp from ..types import InDocument @@ -59,9 +59,17 @@ def _sources_to_indocuments( # todo: maybe fetch the content of the files here # transform the source to have text data - content = decode_source(source) + try: + content = decode_source(source) + except IndexingException as e: + logger.error(f'Error decoding source ({source.reference}): {e}', exc_info=e) + errored_docs[db_id] = IndexingError( + error=str(e), + retryable=False, + ) + continue - if content is None or (content := content.strip()) == '': + if content == '': logger.debug('decoded empty source', extra={ 'source_id': source.reference }) errored_docs[db_id] = IndexingError( error='Decoded content is empty', @@ -74,12 +82,12 @@ def _sources_to_indocuments( # NOTE: do not use this with all docs when programming files are added content = re.sub(r'(\s){5,}', r'\g<1>', content) # filter out null bytes - content = content.replace('\0', '') + content = content.replace('\0', '').strip() - if content is None or content == '': + if content == '': logger.debug('decoded empty source after cleanup', extra={ 'source_id': source.reference }) errored_docs[db_id] = IndexingError( - error='Decoded content is empty', + error='Cleaned up content is empty', retryable=False, ) continue From f9d86dcf1ddac21e61edcc3698b79e0a69475a24 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 17 Mar 2026 20:27:10 +0530 Subject: [PATCH 18/56] chore: move file fetch inside injest Signed-off-by: Anupam Kumar --- context_chat_backend/chain/ingest/injest.py | 197 ++++++++++++++++++-- context_chat_backend/task_fetcher.py | 173 +---------------- context_chat_backend/types.py | 7 +- context_chat_backend/vectordb/base.py | 11 +- context_chat_backend/vectordb/pgvector.py | 14 +- 5 files changed, 208 insertions(+), 194 deletions(-) diff --git a/context_chat_backend/chain/ingest/injest.py b/context_chat_backend/chain/ingest/injest.py 
index d9ea5433..18a37b4b 100644 --- a/context_chat_backend/chain/ingest/injest.py +++ b/context_chat_backend/chain/ingest/injest.py @@ -2,13 +2,18 @@ # SPDX-FileCopyrightText: 2023 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # +import asyncio import logging import re +from collections.abc import Mapping +from io import BytesIO +import niquests from langchain.schema import Document +from nc_py_api import AsyncNextcloudApp from ...dyn_loader import VectorDBLoader -from ...types import IndexingError, IndexingException, SourceItem, TConfig +from ...types import IndexingError, IndexingException, ReceivedFileItem, SourceItem, TConfig from ...vectordb.base import BaseVectorDB from ...vectordb.types import DbException, SafeDbException, UpdateAccessOp from ..types import InDocument @@ -17,15 +22,165 @@ logger = logging.getLogger('ccb.injest') +# max concurrent fetches to avoid overloading the NC server or hitting rate limits +CONCURRENT_FILE_FETCHES = 10 # todo: config? +MAX_FILE_SIZE = 100 * 1024 * 1024 # 100 MB, all loaded in RAM at once, todo: config? + + +async def __fetch_file_content( + semaphore: asyncio.Semaphore, + file_id: int, + user_id: str, + _rlimit = 3, +) -> BytesIO: + ''' + Raises + ------ + IndexingException + ''' + + async with semaphore: + nc = AsyncNextcloudApp() + try: + # a file pointer for storing the stream in memory until it is consumed + fp = BytesIO() + await nc._session.download2fp( + url_path=f'/ocs/v2.php/apps/context_chat/files/{file_id}', + fp=fp, + dav=False, + params={ 'userId': user_id }, + ) + return fp + except niquests.exceptions.RequestException as e: + if e.response is None: + raise + + if e.response.status_code == niquests.codes.too_many_requests: # pyright: ignore[reportAttributeAccessIssue] + # todo: implement rate limits in php CC? 
+ wait_for = int(e.response.headers.get('Retry-After', '30')) + if _rlimit <= 0: + raise IndexingException( + f'Rate limited when fetching content for file id {file_id}, user id {user_id},' + ' max retries exceeded', + retryable=True, + ) from e + logger.warning( + f'Rate limited when fetching content for file id {file_id}, user id {user_id},' + f' waiting {wait_for} before retrying', + exc_info=e, + ) + await asyncio.sleep(wait_for) + return await __fetch_file_content(semaphore, file_id, user_id, _rlimit - 1) + + raise + except IndexingException: + raise + except Exception as e: + logger.error(f'Error fetching content for file id {file_id}, user id {user_id}: {e}', exc_info=e) + raise IndexingException(f'Error fetching content for file id {file_id}, user id {user_id}: {e}') from e + + +async def __fetch_files_content( + sources: Mapping[int, SourceItem | ReceivedFileItem] +) -> tuple[Mapping[int, SourceItem], Mapping[int, IndexingError]]: + source_items = {} + error_items = {} + semaphore = asyncio.Semaphore(CONCURRENT_FILE_FETCHES) + tasks = [] + + for db_id, file in sources.items(): + if isinstance(file, SourceItem): + continue + + try: + # to detect any validation errors but it should not happen since file.reference is validated + file.file_id # noqa: B018 + except ValueError as e: + logger.error( + f'Invalid file reference format for db id {db_id}, file reference {file.reference}: {e}', + exc_info=e, + ) + error_items[db_id] = IndexingError( + error=f'Invalid file reference format: {file.reference}', + retryable=False, + ) + continue + + if file.size > MAX_FILE_SIZE: + logger.info( + f'Skipping db id {db_id}, file id {file.file_id}, source id {file.reference} due to size' + f' {(file.size/(1024*1024)):.2f} MiB exceeding the limit {(MAX_FILE_SIZE/(1024*1024)):.2f} MiB', + ) + error_items[db_id] = IndexingError( + error=( + f'File size {(file.size/(1024*1024)):.2f} MiB' + f' exceeds the limit {(MAX_FILE_SIZE/(1024*1024)):.2f} MiB' + ), + retryable=False, + ) + 
continue + # any user id from the list should have read access to the file + tasks.append(asyncio.ensure_future(__fetch_file_content(semaphore, file.file_id, file.userIds[0]))) + + results = await asyncio.gather(*tasks, return_exceptions=True) + for (db_id, file), result in zip(sources.items(), results, strict=True): + if isinstance(file, SourceItem): + continue + + if isinstance(result, IndexingException): + logger.error( + f'Error fetching content for db id {db_id}, file id {file.file_id}, reference {file.reference}' + f': {result}', + exc_info=result, + ) + error_items[db_id] = IndexingError( + error=str(result), + retryable=result.retryable, + ) + elif isinstance(result, str) or isinstance(result, BytesIO): + source_items[db_id] = SourceItem( + **{ + **file.model_dump(), + 'content': result, + } + ) + elif isinstance(result, BaseException): + logger.error( + f'Unexpected error fetching content for db id {db_id}, file id {file.file_id},' + f' reference {file.reference}: {result}', + exc_info=result, + ) + error_items[db_id] = IndexingError( + error=f'Unexpected error: {result}', + retryable=True, + ) + else: + logger.error( + f'Unknown error fetching content for db id {db_id}, file id {file.file_id}, reference {file.reference}' + f': {result}', + exc_info=True, + ) + error_items[db_id] = IndexingError( + error='Unknown error', + retryable=True, + ) + + # add the content providers from the orginal "sources" to the result unprocessed + for db_id, source in sources.items(): + if isinstance(source, SourceItem): + source_items[db_id] = source + + return source_items, error_items + def _filter_sources( vectordb: BaseVectorDB, - sources: dict[int, SourceItem] -) -> tuple[dict[int, SourceItem], dict[int, SourceItem]]: + sources: Mapping[int, SourceItem | ReceivedFileItem] +) -> tuple[Mapping[int, SourceItem | ReceivedFileItem], Mapping[int, SourceItem | ReceivedFileItem]]: ''' Returns ------- - tuple[list[str], list[UploadFile]] + tuple[Mapping[int, SourceItem | 
ReceivedFileItem], Mapping[int, SourceItem | ReceivedFileItem]]: First value is a list of sources that already exist in the vectordb. Second value is a list of sources that are new and should be embedded. ''' @@ -49,15 +204,14 @@ def _filter_sources( def _sources_to_indocuments( config: TConfig, - sources: dict[int, SourceItem] -) -> tuple[dict[int, InDocument], dict[int, IndexingError]]: + sources: Mapping[int, SourceItem] +) -> tuple[Mapping[int, InDocument], Mapping[int, IndexingError]]: indocuments = {} errored_docs = {} for db_id, source in sources.items(): logger.debug('processing source', extra={ 'source_id': source.reference }) - # todo: maybe fetch the content of the files here # transform the source to have text data try: content = decode_source(source) @@ -121,8 +275,8 @@ def _sources_to_indocuments( def _increase_access_for_existing_sources( vectordb: BaseVectorDB, - existing_sources: dict[int, SourceItem] -) -> dict[int, IndexingError | None]: + existing_sources: Mapping[int, SourceItem | ReceivedFileItem] +) -> Mapping[int, IndexingError | None]: ''' update userIds for existing sources allow the userIds as additional users, not as the only users @@ -162,8 +316,8 @@ def _increase_access_for_existing_sources( def _process_sources( vectordb: BaseVectorDB, config: TConfig, - sources: dict[int, SourceItem] -) -> dict[int, IndexingError | None]: + sources: Mapping[int, SourceItem | ReceivedFileItem] +) -> Mapping[int, IndexingError | None]: ''' Processes the sources and adds them to the vectordb. Returns the list of source ids that were successfully added and those that need to be retried. 
@@ -178,18 +332,21 @@ def _process_sources( source_proc_results = _increase_access_for_existing_sources(vectordb, existing_sources) - if len(to_embed_sources) == 0: + populated_to_embed_sources, errored_sources = asyncio.run(__fetch_files_content(to_embed_sources)) + source_proc_results.update(errored_sources) # pyright: ignore[reportAttributeAccessIssue] + + if len(populated_to_embed_sources) == 0: # no new sources to embed logger.debug('Filtered all sources, nothing to embed') return source_proc_results logger.debug('Filtered sources:', extra={ - 'source_ids': [source.reference for source in to_embed_sources.values()] + 'source_ids': [source.reference for source in populated_to_embed_sources.values()] }) # invalid/empty sources are filtered out here and not counted in loaded/retryable - indocuments, errored_docs = _sources_to_indocuments(config, to_embed_sources) + indocuments, errored_docs = _sources_to_indocuments(config, populated_to_embed_sources) - source_proc_results.update(errored_docs) + source_proc_results.update(errored_docs) # pyright: ignore[reportAttributeAccessIssue] logger.debug('Converted sources to documents') if len(indocuments) == 0: @@ -197,8 +354,12 @@ def _process_sources( logger.debug('All documents were found empty after being processed') return source_proc_results + logger.debug('Adding documents to vectordb', extra={ + 'source_ids': [indoc.source_id for indoc in indocuments.values()] + }) + doc_add_results = vectordb.add_indocuments(indocuments) - source_proc_results.update(doc_add_results) + source_proc_results.update(doc_add_results) # pyright: ignore[reportAttributeAccessIssue] logger.debug('Added documents to vectordb') return source_proc_results @@ -215,8 +376,8 @@ def _decode_latin_1(s: str) -> str: def embed_sources( vectordb_loader: VectorDBLoader, config: TConfig, - sources: dict[int, SourceItem] -) -> dict[int, IndexingError | None]: + sources: Mapping[int, SourceItem | ReceivedFileItem] +) -> Mapping[int, IndexingError | 
None]: logger.debug('Embedding sources:', extra={ 'source_ids': [ f'{source.reference} ({_decode_latin_1(source.title)})' diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 51f98e7d..28aff6a0 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -3,17 +3,16 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # -import asyncio import logging import os +from collections.abc import Mapping from contextlib import suppress from enum import Enum -from io import BytesIO from threading import Event, Thread from time import sleep import niquests -from nc_py_api import AsyncNextcloudApp, NextcloudApp +from nc_py_api import NextcloudApp from pydantic import ValidationError from .chain.ingest.injest import embed_sources @@ -25,7 +24,6 @@ EmbeddingException, FilesQueueItems, IndexingError, - IndexingException, LoaderException, ReceivedFileItem, SourceItem, @@ -46,12 +44,10 @@ THREADS = {} THREAD_STOP_EVENT = Event() LOGGER = logging.getLogger('ccb.task_fetcher') -FILES_INDEXING_BATCH_SIZE = 64 # todo: config? +FILES_INDEXING_BATCH_SIZE = 16 # theoretical max RAM usage: 16 * 100 MiB, todo: config? +MIN_FILES_PER_CPU = 4 # divides the batch into these many chunks PARALLEL_FILE_PARSING = max(1, (os.cpu_count() or 2) - 1) # todo: config? -# max concurrent fetches to avoid overloading the NC server or hitting rate limits -CONCURRENT_FILE_FETCHES = 10 # todo: config? -MAX_FILE_SIZE = 100 * 1024 * 1024 # 100 MB, todo: config? ACTIONS_BATCH_SIZE = 512 # todo: config? 
POLLING_COOLDOWN = 30 @@ -62,143 +58,6 @@ class ThreadType(Enum): REQUEST_PROCESSING = 'request_processing' -async def __fetch_file_content( - semaphore: asyncio.Semaphore, - file_id: int, - user_id: str, - _rlimit = 3, -) -> BytesIO: - ''' - Raises - ------ - IndexingException - ''' - - async with semaphore: - nc = AsyncNextcloudApp() - try: - # a file pointer for storing the stream in memory until it is consumed - fp = BytesIO() - await nc._session.download2fp( - url_path=f'/ocs/v2.php/apps/context_chat/files/{file_id}', - fp=fp, - dav=False, - params={ 'userId': user_id }, - ) - return fp - except niquests.exceptions.RequestException as e: - # todo: raise IndexingException with retryable=True for rate limit errors, - # todo: and handle it in the caller to not delete the source from the queue and retry later through - # todo: the normal lock expiry mechanism - if e.response is None: - raise - - if e.response.status_code == niquests.codes.too_many_requests: # pyright: ignore[reportAttributeAccessIssue] - # todo: implement rate limits in php CC? 
- wait_for = int(e.response.headers.get('Retry-After', '30')) - if _rlimit <= 0: - raise IndexingException( - f'Rate limited when fetching content for file id {file_id}, user id {user_id},' - ' max retries exceeded', - retryable=True, - ) from e - LOGGER.warning( - f'Rate limited when fetching content for file id {file_id}, user id {user_id},' - f' waiting {wait_for} before retrying', - exc_info=e, - ) - await asyncio.sleep(wait_for) - return await __fetch_file_content(semaphore, file_id, user_id, _rlimit - 1) - - raise - except IndexingException: - raise - except Exception as e: - LOGGER.error(f'Error fetching content for file id {file_id}, user id {user_id}: {e}', exc_info=e) - raise IndexingException(f'Error fetching content for file id {file_id}, user id {user_id}: {e}') from e - - -async def __fetch_files_content( - files: dict[int, ReceivedFileItem] -) -> dict[int, SourceItem | IndexingError]: - source_items = {} - semaphore = asyncio.Semaphore(CONCURRENT_FILE_FETCHES) - tasks = [] - - for db_id, file in files.items(): - try: - # to detect any validation errors but it should not happen since file.reference is validated - file.file_id # noqa: B018 - except ValueError as e: - LOGGER.error( - f'Invalid file reference format for db id {db_id}, file reference {file.reference}: {e}', - exc_info=e, - ) - source_items[db_id] = IndexingError( - error=f'Invalid file reference format: {file.reference}', - retryable=False, - ) - continue - - if file.size > MAX_FILE_SIZE: - LOGGER.info( - f'Skipping db id {db_id}, file id {file.file_id}, source id {file.reference} due to size' - f' {(file.size/(1024*1024)):.2f} MiB exceeding the limit {(MAX_FILE_SIZE/(1024*1024)):.2f} MiB', - ) - source_items[db_id] = IndexingError( - error=( - f'File size {(file.size/(1024*1024)):.2f} MiB' - f' exceeds the limit {(MAX_FILE_SIZE/(1024*1024)):.2f} MiB' - ), - retryable=False, - ) - continue - # todo: perform the existing file check before fetching the content to avoid unnecessary fetches - 
# any user id from the list should have read access to the file - tasks.append(asyncio.ensure_future(__fetch_file_content(semaphore, file.file_id, file.userIds[0]))) - - results = await asyncio.gather(*tasks, return_exceptions=True) - for (db_id, file), result in zip(files.items(), results, strict=True): - if isinstance(result, IndexingException): - LOGGER.error( - f'Error fetching content for db id {db_id}, file id {file.file_id}, reference {file.reference}' - f': {result}', - exc_info=result, - ) - source_items[db_id] = IndexingError( - error=str(result), - retryable=result.retryable, - ) - elif isinstance(result, str) or isinstance(result, BytesIO): - source_items[db_id] = SourceItem( - **{ - **file.model_dump(), - 'content': result, - } - ) - elif isinstance(result, BaseException): - LOGGER.error( - f'Unexpected error fetching content for db id {db_id}, file id {file.file_id},' - f' reference {file.reference}: {result}', - exc_info=result, - ) - source_items[db_id] = IndexingError( - error=f'Unexpected error: {result}', - retryable=True, - ) - else: - LOGGER.error( - f'Unknown error fetching content for db id {db_id}, file id {file.file_id}, reference {file.reference}' - f': {result}', - exc_info=True, - ) - source_items[db_id] = IndexingError( - error='Unknown error', - retryable=True, - ) - return source_items - - def files_indexing_thread(app_config: TConfig, app_enabled: Event) -> None: try: vectordb_loader = VectorDBLoader(app_config) @@ -206,7 +65,7 @@ def files_indexing_thread(app_config: TConfig, app_enabled: Event) -> None: LOGGER.error('Error initializing vector DB loader, files indexing thread will not start:', exc_info=e) return - def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingError | None]: + def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> Mapping[int, IndexingError | None]: try: return exec_in_proc( target=embed_sources, @@ -225,7 +84,6 @@ def _load_sources(source_items: dict[int, 
SourceItem]) -> dict[int, IndexingErro try: nc = NextcloudApp() - # todo: add the 'size' param to the return of this call. q_items_res = nc.ocs( 'GET', '/ocs/v2.php/apps/context_chat/queues/documents', @@ -242,29 +100,14 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro sleep(POLLING_COOLDOWN) continue - # populate files content and convert to source items - fetched_files = {} - source_files = {} - # unified error structure for files and content providers - source_errors = {} - - if q_items.files: - fetched_files = asyncio.run(__fetch_files_content(q_items.files)) - - for db_id, result in fetched_files.items(): - if isinstance(result, SourceItem): - source_files[db_id] = result - else: - source_errors[db_id] = result - files_result = {} providers_result = {} - chunk_size = FILES_INDEXING_BATCH_SIZE // PARALLEL_FILE_PARSING + chunk_size = max(MIN_FILES_PER_CPU, FILES_INDEXING_BATCH_SIZE // PARALLEL_FILE_PARSING) # todo: do it in asyncio, it's not truly parallel yet # chunk file parsing for better file operation parallelism - for i in range(0, len(source_files), chunk_size): - chunk = dict(list(source_files.items())[i:i+chunk_size]) + for i in range(0, len(q_items.files), chunk_size): + chunk = dict(list(q_items.files.items())[i:i+chunk_size]) files_result.update(_load_sources(chunk)) for i in range(0, len(q_items.content_providers), chunk_size): diff --git a/context_chat_backend/types.py b/context_chat_backend/types.py index 9f23e14f..59d2568f 100644 --- a/context_chat_backend/types.py +++ b/context_chat_backend/types.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # import re +from collections.abc import Mapping from enum import Enum from io import BytesIO from typing import Annotated, Literal, Self @@ -224,8 +225,8 @@ class Config: class FilesQueueItems(BaseModel): - files: dict[int, ReceivedFileItem] # [db id]: FileItem - content_providers: dict[int, SourceItem] # [db id]: SourceItem + files: Mapping[int, 
ReceivedFileItem] # [db id]: FileItem + content_providers: Mapping[int, SourceItem] # [db id]: SourceItem class IndexingException(Exception): @@ -343,4 +344,4 @@ class ActionsQueueItemUpdateAccessDeclSourceId(CommonActionsQueueItem): class ActionsQueueItems(BaseModel): - actions: dict[int, ActionsQueueItem] + actions: Mapping[int, ActionsQueueItem] diff --git a/context_chat_backend/vectordb/base.py b/context_chat_backend/vectordb/base.py index ebd54075..2b4aa35e 100644 --- a/context_chat_backend/vectordb/base.py +++ b/context_chat_backend/vectordb/base.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # from abc import ABC, abstractmethod +from collections.abc import Mapping from typing import Any from langchain.schema import Document @@ -10,7 +11,7 @@ from langchain.schema.vectorstore import VectorStore from ..chain.types import InDocument, ScopeType -from ..types import IndexingError, SourceItem +from ..types import IndexingError, ReceivedFileItem, SourceItem from ..utils import timed from .types import UpdateAccessOp @@ -62,7 +63,7 @@ def get_instance(self) -> VectorStore: ''' @abstractmethod - def add_indocuments(self, indocuments: dict[int, InDocument]) -> dict[int, IndexingError | None]: + def add_indocuments(self, indocuments: Mapping[int, InDocument]) -> Mapping[int, IndexingError | None]: ''' Adds the given indocuments to the vectordb and updates the docs + access tables. @@ -79,7 +80,7 @@ def add_indocuments(self, indocuments: dict[int, InDocument]) -> dict[int, Index @timed @abstractmethod - def check_sources(self, sources: dict[int, SourceItem]) -> tuple[list[str], list[str]]: + def check_sources(self, sources: Mapping[int, SourceItem | ReceivedFileItem]) -> tuple[list[str], list[str]]: ''' Checks the sources in the vectordb if they are already embedded and are up to date. @@ -88,8 +89,8 @@ def check_sources(self, sources: dict[int, SourceItem]) -> tuple[list[str], list Args ---- - sources: list[UploadFile] - List of source ids to check. 
+ sources: Mapping[int, SourceItem | ReceivedFileItem] + Dict of sources to check. Returns ------- diff --git a/context_chat_backend/vectordb/pgvector.py b/context_chat_backend/vectordb/pgvector.py index bfca0bb6..86f636be 100644 --- a/context_chat_backend/vectordb/pgvector.py +++ b/context_chat_backend/vectordb/pgvector.py @@ -4,6 +4,7 @@ # import logging import os +from collections.abc import Mapping from datetime import datetime import psycopg @@ -17,7 +18,14 @@ from langchain_postgres.vectorstores import Base, PGVector from ..chain.types import InDocument, ScopeType -from ..types import EmbeddingException, FatalEmbeddingException, IndexingError, RetryableEmbeddingException, SourceItem +from ..types import ( + EmbeddingException, + FatalEmbeddingException, + IndexingError, + ReceivedFileItem, + RetryableEmbeddingException, + SourceItem, +) from ..utils import timed from .base import BaseVectorDB from .types import DbException, SafeDbException, UpdateAccessOp @@ -129,7 +137,7 @@ def get_users(self) -> list[str]: except Exception as e: raise DbException('Error: getting a list of all users from access list') from e - def add_indocuments(self, indocuments: dict[int, InDocument]) -> dict[int, IndexingError | None]: + def add_indocuments(self, indocuments: Mapping[int, InDocument]) -> Mapping[int, IndexingError | None]: """ Raises EmbeddingException: if the embedding request definitively fails @@ -208,7 +216,7 @@ def add_indocuments(self, indocuments: dict[int, InDocument]) -> dict[int, Index return results @timed - def check_sources(self, sources: dict[int, SourceItem]) -> tuple[list[str], list[str]]: + def check_sources(self, sources: Mapping[int, SourceItem | ReceivedFileItem]) -> tuple[list[str], list[str]]: ''' returns a tuple of (existing_source_ids, to_embed_source_ids) ''' From 1ade19186593193a5005d2aadc97a83b25f601b8 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Wed, 18 Mar 2026 16:49:09 +0530 Subject: [PATCH 19/56] fix: truly parallel file parsing and 
indexing Signed-off-by: Anupam Kumar --- context_chat_backend/task_fetcher.py | 48 ++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 28aff6a0..f07f5012 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -4,8 +4,10 @@ # import logging +import math import os from collections.abc import Mapping +from concurrent.futures import ThreadPoolExecutor from contextlib import suppress from enum import Enum from threading import Event, Thread @@ -47,7 +49,7 @@ FILES_INDEXING_BATCH_SIZE = 16 # theoretical max RAM usage: 16 * 100 MiB, todo: config? MIN_FILES_PER_CPU = 4 # divides the batch into these many chunks -PARALLEL_FILE_PARSING = max(1, (os.cpu_count() or 2) - 1) # todo: config? +PARALLEL_FILE_PARSING_COUNT = max(1, (os.cpu_count() or 2) - 1) # todo: config? ACTIONS_BATCH_SIZE = 512 # todo: config? POLLING_COOLDOWN = 30 @@ -71,10 +73,14 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> target=embed_sources, args=(vectordb_loader, app_config, source_items), ) - except (DbException, EmbeddingException): - raise except Exception as e: - raise DbException('Error: failed to load sources') from e + err_name = {DbException: "DB", EmbeddingException: "Embedding"}.get(type(e), "Unknown") + source_ids = (s.reference for s in source_items.values()) + err = IndexingError( + error=f'{err_name} Error occurred, the sources {source_ids} will be retried: {e}', + retryable=True, + ) + return dict.fromkeys(source_items, err) while True: @@ -102,17 +108,33 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> files_result = {} providers_result = {} - chunk_size = max(MIN_FILES_PER_CPU, FILES_INDEXING_BATCH_SIZE // PARALLEL_FILE_PARSING) - # todo: do it in asyncio, it's not truly parallel yet # chunk file parsing for better file operation parallelism - for i in 
range(0, len(q_items.files), chunk_size): - chunk = dict(list(q_items.files.items())[i:i+chunk_size]) - files_result.update(_load_sources(chunk)) - - for i in range(0, len(q_items.content_providers), chunk_size): - chunk = dict(list(q_items.content_providers.items())[i:i+chunk_size]) - providers_result.update(_load_sources(chunk)) + file_chunk_size = max(MIN_FILES_PER_CPU, math.ceil(len(q_items.files) / PARALLEL_FILE_PARSING_COUNT)) + file_chunks = [ + dict(list(q_items.files.items())[i:i+file_chunk_size]) + for i in range(0, len(q_items.files), file_chunk_size) + ] + provider_chunk_size = max( + MIN_FILES_PER_CPU, + math.ceil(len(q_items.content_providers) / PARALLEL_FILE_PARSING_COUNT), + ) + provider_chunks = [ + dict(list(q_items.content_providers.items())[i:i+provider_chunk_size]) + for i in range(0, len(q_items.content_providers), provider_chunk_size) + ] + + with ThreadPoolExecutor( + max_workers=PARALLEL_FILE_PARSING_COUNT, + thread_name_prefix='IndexingPool', + ) as executor: + file_futures = [executor.submit(_load_sources, chunk) for chunk in file_chunks] + provider_futures = [executor.submit(_load_sources, chunk) for chunk in provider_chunks] + + for future in file_futures: + files_result.update(future.result()) + for future in provider_futures: + providers_result.update(future.result()) if ( any(isinstance(res, IndexingError) for res in files_result.values()) From 12fd1ca00fc6d3fab6e91b8bb4dbc6c11488ca74 Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Tue, 24 Mar 2026 10:36:04 +0100 Subject: [PATCH 20/56] initial pass at request processing --- context_chat_backend/controller.py | 4 +- context_chat_backend/task_fetcher.py | 362 +++++++++++++++++++++++++-- 2 files changed, 350 insertions(+), 16 deletions(-) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 797ba201..3ebdc8ae 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -40,7 +40,7 @@ from .models.types import 
LlmException from nc_py_api.ex_app import AppAPIAuthMiddleware from .utils import JSONResponse, exec_in_proc, value_of -from .task_fetcher import start_bg_threads, wait_for_bg_threads +from .task_fetcher import start_bg_threads, trigger_handler, wait_for_bg_threads from .vectordb.service import count_documents_by_provider # setup @@ -83,7 +83,7 @@ def enabled_handler(enabled: bool, _: NextcloudApp | AsyncNextcloudApp) -> str: @asynccontextmanager async def lifespan(app: FastAPI): - set_handlers(app, enabled_handler, models_to_fetch=models_to_fetch) + set_handlers(app, enabled_handler, models_to_fetch=models_to_fetch, trigger_handler=trigger_handler) nc = NextcloudApp() if nc.enabled_state: app_enabled.set() diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index f07f5012..a5028029 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -12,26 +12,25 @@ from enum import Enum from threading import Event, Thread from time import sleep +from typing import Any import niquests -from nc_py_api import NextcloudApp +from langchain.llms.base import LLM +from langchain.schema import Document +from nc_py_api import NextcloudApp, NextcloudException +from niquests import JSONDecodeError, RequestException from pydantic import ValidationError +from .chain.context import get_context_chunks, get_context_docs from .chain.ingest.injest import embed_sources +from .chain.query_proc import get_pruned_query +from .chain.types import ContextException, LLMOutput, ScopeType +from .controller import llm_loader from .dyn_loader import VectorDBLoader -from .types import ( - ActionsQueueItems, - ActionType, - AppRole, - EmbeddingException, - FilesQueueItems, - IndexingError, - LoaderException, - ReceivedFileItem, - SourceItem, - TConfig, -) +from .types import ActionType, ActionsQueueItems, AppRole, EmbeddingException, FilesQueueItems, IndexingError, \ + LoaderException, ReceivedFileItem, SourceItem, TConfig from 
.utils import exec_in_proc, get_app_role +from .vectordb.base import BaseVectorDB from .vectordb.service import ( decl_update_access, delete_by_provider, @@ -52,6 +51,10 @@ PARALLEL_FILE_PARSING_COUNT = max(1, (os.cpu_count() or 2) - 1) # todo: config? ACTIONS_BATCH_SIZE = 512 # todo: config? POLLING_COOLDOWN = 30 +TRIGGER = Event() +CHECK_INTERVAL = 5 +CHECK_INTERVAL_WITH_TRIGGER = 5 * 60 +CHECK_INTERVAL_ON_ERROR = 15 class ThreadType(Enum): @@ -370,7 +373,78 @@ def updates_processing_thread(app_config: TConfig, app_enabled: Event) -> None: def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: - ... + logger.info('Starting task fetcher loop') + + try: + vectordb_loader = VectorDBLoader(app_config) + except LoaderException as e: + LOGGER.error('Error initializing vector DB loader, files indexing thread will not start:', exc_info=e) + return + + nc = NextcloudApp() + llm: LLM = llm_loader.load() + + while True: + if THREAD_STOP_EVENT.is_set(): + LOGGER.info('Updates processing thread is stopping due to stop event being set') + return + + try: + # Fetch pending task + try: + response = nc.providers.task_processing.next_task(list(provider_ids), list(task_type_ids)) + if not response: + wait_for_tasks() + continue + except (NextcloudException, RequestException, JSONDecodeError) as e: + LOGGER.error(f"Network error fetching the next task {e}", exc_info=e) + wait_for_tasks(CHECK_INTERVAL_ON_ERROR) + continue + + # Process task + task = response["task"] + provider = response["provider"] + + try: + logger.debug(f'Processing task {task["id"]}') + result = process_task(task, vectordb_loader, llm, app_config) + + # Return result to Nextcloud + success = return_result_to_nextcloud(task_id, result) + + if success: + LOGGER.info(f'Task {task["id"]} completed successfully') + else: + LOGGER.error(f'Failed to return result for task {task["id"]}') + + except ContextException as e: + LOGGER.warning(f'Context error for task {task["id"]}: {e}') + # TODO: 
Return error to Nextcloud + except ValueError as e: + LOGGER.warning(f'Validation error for task {task["id"]}: {e}') + # TODO: Return error to Nextcloud + except Exception as e: + LOGGER.exception(f'Unexpected error processing task {task["id"]}', exc_info=e) + # TODO: Return error to Nextcloud + + except Exception as e: + logger.exception('Error in task fetcher loop', exc_info=e) + # TODO: Add appropriate error handling and backoff + +def trigger_handler(providerId: str): + global TRIGGER + print('TRIGGER called') + TRIGGER.set() + +def wait_for_tasks(interval = None): + global TRIGGER + global CHECK_INTERVAL + global CHECK_INTERVAL_WITH_TRIGGER + actual_interval = CHECK_INTERVAL if interval is None else interval + if TRIGGER.wait(timeout=actual_interval): + CHECK_INTERVAL = CHECK_INTERVAL_WITH_TRIGGER + TRIGGER.clear() + def start_bg_threads(app_config: TConfig, app_enabled: Event): @@ -430,3 +504,263 @@ def wait_for_bg_threads(): THREAD_STOP_EVENT.set() THREADS[ThreadType.REQUEST_PROCESSING].join() THREADS.pop(ThreadType.REQUEST_PROCESSING) + + +# Default LLM template for context-based queries +_LLM_TEMPLATE = '''Answer based only on this context and do not add any imaginative details. Make sure to use the same language as the question in your answer. +{context} + +{question} +''' + +def query_vector_database( + user_id: str, + query: str, + vectordb: BaseVectorDB, + ctx_limit: int, + scope_type: ScopeType | None = None, + scope_list: list[str] | None = None, +) -> list[Document]: + """ + Query the vector database to retrieve relevant documents. 
+ + Args: + user_id: User ID for scoping the search + query: The search query text + vectordb: Vector database instance + ctx_limit: Maximum number of documents to return + scope_type: Optional scope type (PROVIDER or SOURCE) + scope_list: Optional list of scope identifiers + + Returns: + List of relevant Document objects + + Raises: + ContextException: If scope type is provided without scope list + """ + context_docs = get_context_docs(user_id, query, vectordb, ctx_limit, scope_type, scope_list) + logger.debug('Retrieved context documents', extra={ + 'user_id': user_id, + 'num_docs': len(context_docs), + 'ctx_limit': ctx_limit, + }) + return context_docs + + +def prepare_context_chunks(context_docs: list[Document]) -> list[str]: + """ + Extract and format text chunks from documents for LLM context. + + Args: + context_docs: List of Document objects from vector DB + + Returns: + List of formatted text chunks including titles and content + """ + return get_context_chunks(context_docs) + + +def generate_llm_response( + llm: LLM, + app_config: TConfig, + user_id: str, + query: str, + template: str, + context_chunks: list[str], + end_separator: str = '', +) -> str: + """ + Generate LLM response using the pruned query and context. 
+ + Args: + llm: Language model instance + app_config: Application configuration + user_id: User ID for the request + query: The original query text + template: Template for formatting the prompt + context_chunks: Context chunks to include in the prompt + end_separator: Optional separator to stop generation + + Returns: + Generated LLM output text + + Raises: + ValueError: If context length is too small to fit the query + """ + pruned_query_text = get_pruned_query(llm, app_config, query, template, context_chunks) + + stop = [end_separator] if end_separator else None + output = llm.invoke( + pruned_query_text, + stop=stop, + userid=user_id, + ).strip() + + logger.debug('Generated LLM response', extra={ + 'user_id': user_id, + 'output_length': len(output), + }) + return output + + +def extract_unique_sources(context_docs: list[Document]) -> list[str]: + """ + Extract unique source IDs from context documents. + + Args: + context_docs: List of Document objects + + Returns: + List of unique source IDs + """ + unique_sources: list[str] = list({ + source for d in context_docs if (source := d.metadata.get('source')) + }) + return unique_sources + +def execute_context_query( + user_id: str, + vectordb_loader: VectorDBLoader, + llm: LLM, + app_config: TConfig, + query: str, + ctx_limit: int = 20, + scope_type: ScopeType | None = None, + scope_list: list[str] | None = None, + template: str | None = None, + end_separator: str = '', +) -> LLMOutput: + """ + Execute a RAG query with context retrieval from vector database. + + This is the main function for processing queries that require context + from the vector database. It orchestrates the entire RAG pipeline: + 1. Query vector database for relevant documents + 2. Extract and format context chunks + 3. Generate LLM response with context + 4. 
Return output with source references + + Args: + user_id: User ID for the request + vectordb_loader: Vector database loader instance + llm: Language model instance + app_config: Application configuration + query: The query text + ctx_limit: Maximum number of context documents (default: 20) + scope_type: Optional scope type for filtering + scope_list: Optional list of scope identifiers + template: Optional custom prompt template + end_separator: Optional separator to stop generation + + Returns: + LLMOutput with generated text and source references + + Raises: + ContextException: If no documents are retrieved + ValueError: If context length is too small to fit the query + """ + logger.info('Executing context query', extra={ + 'user_id': user_id, + 'query_length': len(query), + 'ctx_limit': ctx_limit, + }) + + # Step 1: Load vector database and retrieve relevant documents + db = vectordb_loader.load() + context_docs = query_vector_database(user_id, query, db, ctx_limit, scope_type, scope_list) + + if len(context_docs) == 0: + raise ContextException('No documents retrieved, please index a few documents first') + + # Step 2: Prepare context chunks for LLM + context_chunks = prepare_context_chunks(context_docs) + logger.debug('Prepared context chunks', extra={ + 'num_docs': len(context_docs), + 'num_chunks': len(context_chunks), + }) + + # Step 3: Generate LLM response + output = generate_llm_response( + llm, + app_config, + user_id, + query, + template or _LLM_TEMPLATE, + context_chunks, + end_separator, + ) + + # Step 4: Extract unique sources for citation + unique_sources = extract_unique_sources(context_docs) + + logger.info('Context query completed', extra={ + 'user_id': user_id, + 'num_sources': len(unique_sources), + }) + + return LLMOutput(output=output, sources=unique_sources) + +# ============================================================================ +# Task Queue Processing +# ============================================================================ 
+ + +def return_result_to_nextcloud(task_id: str, result: LLMOutput) -> bool: + """ + Return query result back to Nextcloud. + + STUB: This function should be implemented to send results back + to Nextcloud's task queue or API endpoint. + + Args: + task_id: Unique task identifier + result: The LLMOutput result to return + + Returns: + True if successful, False otherwise + """ + logger.debug('Returning result to Nextcloud (STUB)', extra={ + 'task_id': task_id, + 'output_length': len(result['output']), + 'num_sources': len(result['sources']), + }) + # TODO: Implement actual Nextcloud result submission + return True + + +def process_task( + task: dict[str, Any], + vectordb_loader: VectorDBLoader, + llm: LLM, + app_config: TConfig, +) -> LLMOutput: + """ + Process a single query task. + + Args: + task: Task dictionary from fetch_query_tasks_from_nextcloud + vectordb_loader: Vector database loader instance + llm: Language model instance + app_config: Application configuration + + Returns: + LLMOutput with generated text and sources + + Raises: + Various exceptions from query execution + """ + user_id = task['user_id'] + query = task['query'] + + return execute_context_query( + user_id=user_id, + vectordb_loader=vectordb_loader, + llm=llm, + app_config=app_config, + query=query, + ctx_limit=task.get('ctx_limit', 20), + scope_type=task.get('scope_type'), + scope_list=task.get('scope_list'), + template=task.get('template'), # TODO: Somehow get the real template, tasks don't have it + end_separator=task.get('end_separator', ''), # TODO: same here + ) \ No newline at end of file From 8aa2471080c10ea7b0a97a9d2dac4023e005464c Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Wed, 25 Mar 2026 10:42:40 +0100 Subject: [PATCH 21/56] implement request processing --- context_chat_backend/chain/one_shot.py | 1 + context_chat_backend/chain/types.py | 12 + context_chat_backend/controller.py | 19 +- context_chat_backend/task_fetcher.py | 292 +++++++++++++++---------- 4 files changed, 
201 insertions(+), 123 deletions(-) diff --git a/context_chat_backend/chain/one_shot.py b/context_chat_backend/chain/one_shot.py index 1c0521bf..d0f5bbed 100644 --- a/context_chat_backend/chain/one_shot.py +++ b/context_chat_backend/chain/one_shot.py @@ -20,6 +20,7 @@ logger = logging.getLogger('ccb.chain') +# todo: remove this maybe def process_query( user_id: str, llm: LLM, diff --git a/context_chat_backend/chain/types.py b/context_chat_backend/chain/types.py index b006ad1a..c5277563 100644 --- a/context_chat_backend/chain/types.py +++ b/context_chat_backend/chain/types.py @@ -42,3 +42,15 @@ class LLMOutput(TypedDict): class SearchResult(TypedDict): source_id: str title: str + +class EnrichedSource(BaseModel): + id: str + label: str + icon: str + url: str + +class EnrichedSourceList(BaseModel): + sources: list[EnrichedSource] + +class ScopeList(BaseModel): + source_ids: list[str] \ No newline at end of file diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 3ebdc8ae..1e0d2773 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: 2023 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # +from nc_py_api.ex_app.providers.task_processing import TaskProcessingProvider # isort: off from .chain.types import ContextException, LLMOutput, ScopeType, SearchResult @@ -65,9 +66,23 @@ } if __download_models_from_hf else {} app_enabled = Event() -def enabled_handler(enabled: bool, _: NextcloudApp | AsyncNextcloudApp) -> str: +def enabled_handler(enabled: bool, nc: NextcloudApp | AsyncNextcloudApp) -> str: try: if enabled: + provider = TaskProcessingProvider( + id="context_chat-context_chat_search", + name="Context Chat", + task_type="context_chat:context_chat_search", + expected_runtime=30, + ) + nc.providers.task_processing.register(provider) + provider = TaskProcessingProvider( + id="context_chat-context_chat", + 
name="Context Chat", + task_type="context_chat:context_chat", + expected_runtime=30, + ) + nc.providers.task_processing.register(provider) app_enabled.set() start_bg_threads(app_config, app_enabled) else: @@ -383,7 +398,7 @@ def download_logs() -> FileResponse: # 'title': source.headers.get('title'), # 'headers': source.headers, # }) -# return JSONResponse(f'Invaild/missing headers for: {source.filename}', 400) +# return JSONResponse(f'Invaild/missing headers for:provider_ids {source.filename}', 400) # # wait for 10 minutes before failing the request # semres = doc_parse_semaphore.acquire(block=True, timeout=10*60) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index a5028029..7951f067 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: 2026 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # - +import json import logging import math import os @@ -21,11 +21,13 @@ from niquests import JSONDecodeError, RequestException from pydantic import ValidationError -from .chain.context import get_context_chunks, get_context_docs +from .chain.context import do_doc_search, get_context_chunks, get_context_docs from .chain.ingest.injest import embed_sources +from .chain.one_shot import process_context_query from .chain.query_proc import get_pruned_query -from .chain.types import ContextException, LLMOutput, ScopeType -from .controller import llm_loader +from .chain.types import ContextException, EnrichedSource, EnrichedSourceList, LLMOutput, ScopeList, ScopeType, \ + SearchResult +from .controller import Query, execute_query, llm_loader from .dyn_loader import VectorDBLoader from .types import ActionType, ActionsQueueItems, AppRole, EmbeddingException, FilesQueueItems, IndexingError, \ LoaderException, ReceivedFileItem, SourceItem, TConfig @@ -55,6 +57,7 @@ CHECK_INTERVAL = 5 CHECK_INTERVAL_WITH_TRIGGER = 5 * 60 
CHECK_INTERVAL_ON_ERROR = 15 +CONTEXT_LIMIT=20 class ThreadType(Enum): @@ -372,8 +375,25 @@ def updates_processing_thread(app_config: TConfig, app_enabled: Event) -> None: continue +def resolve_scope_list(source_ids: list[str], userId: str) -> list[str]: + """ + + Parameters + ---------- + source_ids + + Returns + ------- + source_ids with only files, no folders (or source_ids in case of non-file provider) + """ + nc = NextcloudApp() + data = nc.ocs('POST', f'/ocs/v2.php/apps/context_chat/resolve_scope_list', json={'source_ids': source_ids, 'userId': userId}) + sources = ScopeList.model_validate(data).source_ids + return sources + + def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: - logger.info('Starting task fetcher loop') + LOGGER.info('Starting task fetcher loop') try: vectordb_loader = VectorDBLoader(app_config) @@ -392,7 +412,7 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: try: # Fetch pending task try: - response = nc.providers.task_processing.next_task(list(provider_ids), list(task_type_ids)) + response = nc.providers.task_processing.next_task(['context_chat-context_chat', 'context_chat-context_chat_search'], ['context_chat:context_chat', 'context_chat:context_chat_search']) if not response: wait_for_tasks() continue @@ -403,14 +423,26 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: # Process task task = response["task"] - provider = response["provider"] + userId = task['userId'] try: - logger.debug(f'Processing task {task["id"]}') - result = process_task(task, vectordb_loader, llm, app_config) - - # Return result to Nextcloud - success = return_result_to_nextcloud(task_id, result) + LOGGER.debug(f'Processing task {task["id"]}') + + if task['input'].get('scopeType') == 'source': + # Resolve scope list to only files, no folders + task['input']['scopeList'] = resolve_scope_list(task['input'].get('scopeList'), userId) + + if task['type'] == 
'context_chat:context_chat': + result: LLMOutput = process_normal_task(task, vectordb_loader, llm, app_config) + # Return result to Nextcloud + success = return_normal_result_to_nextcloud(task['id'], userId, result) + elif task['type'] == 'context_chat:context_chat_search': + result: list[SearchResult] = process_search_task(task, vectordb_loader) + # Return result to Nextcloud + success = return_search_result_to_nextcloud(task['id'], userId, result) + else: + LOGGER.error(f'Unknown task type {task["type"]}') + success = return_error_to_nextcloud(task['id'], Exception(f'Unknown task type {task["type"]}')) if success: LOGGER.info(f'Task {task["id"]} completed successfully') @@ -419,17 +451,17 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: except ContextException as e: LOGGER.warning(f'Context error for task {task["id"]}: {e}') - # TODO: Return error to Nextcloud + return_error_to_nextcloud(task['id'], e) except ValueError as e: LOGGER.warning(f'Validation error for task {task["id"]}: {e}') - # TODO: Return error to Nextcloud + return_error_to_nextcloud(task['id'], e) except Exception as e: LOGGER.exception(f'Unexpected error processing task {task["id"]}', exc_info=e) - # TODO: Return error to Nextcloud + return_error_to_nextcloud(task['id'], e) except Exception as e: - logger.exception('Error in task fetcher loop', exc_info=e) - # TODO: Add appropriate error handling and backoff + LOGGER.exception('Error in task fetcher loop', exc_info=e) + wait_for_tasks(CHECK_INTERVAL_ON_ERROR) def trigger_handler(providerId: str): global TRIGGER @@ -506,13 +538,6 @@ def wait_for_bg_threads(): THREADS.pop(ThreadType.REQUEST_PROCESSING) -# Default LLM template for context-based queries -_LLM_TEMPLATE = '''Answer based only on this context and do not add any imaginative details. Make sure to use the same language as the question in your answer. 
-{context} - -{question} -''' - def query_vector_database( user_id: str, query: str, @@ -539,7 +564,7 @@ def query_vector_database( ContextException: If scope type is provided without scope list """ context_docs = get_context_docs(user_id, query, vectordb, ctx_limit, scope_type, scope_list) - logger.debug('Retrieved context documents', extra={ + LOGGER.debug('Retrieved context documents', extra={ 'user_id': user_id, 'num_docs': len(context_docs), 'ctx_limit': ctx_limit, @@ -596,7 +621,7 @@ def generate_llm_response( userid=user_id, ).strip() - logger.debug('Generated LLM response', extra={ + LOGGER.debug('Generated LLM response', extra={ 'user_id': user_id, 'output_length': len(output), }) @@ -618,117 +643,112 @@ def extract_unique_sources(context_docs: list[Document]) -> list[str]: }) return unique_sources -def execute_context_query( - user_id: str, - vectordb_loader: VectorDBLoader, - llm: LLM, - app_config: TConfig, - query: str, - ctx_limit: int = 20, - scope_type: ScopeType | None = None, - scope_list: list[str] | None = None, - template: str | None = None, - end_separator: str = '', -) -> LLMOutput: +def return_normal_result_to_nextcloud(task_id: int, userId: str, result: LLMOutput) -> bool: """ - Execute a RAG query with context retrieval from vector database. - - This is the main function for processing queries that require context - from the vector database. It orchestrates the entire RAG pipeline: - 1. Query vector database for relevant documents - 2. Extract and format context chunks - 3. Generate LLM response with context - 4. Return output with source references + Return query result back to Nextcloud. 
Args: - user_id: User ID for the request - vectordb_loader: Vector database loader instance - llm: Language model instance - app_config: Application configuration - query: The query text - ctx_limit: Maximum number of context documents (default: 20) - scope_type: Optional scope type for filtering - scope_list: Optional list of scope identifiers - template: Optional custom prompt template - end_separator: Optional separator to stop generation + task_id: Unique task identifier + result: The LLMOutput result to return Returns: - LLMOutput with generated text and source references - - Raises: - ContextException: If no documents are retrieved - ValueError: If context length is too small to fit the query + True if successful, False otherwise """ - logger.info('Executing context query', extra={ - 'user_id': user_id, - 'query_length': len(query), - 'ctx_limit': ctx_limit, + LOGGER.debug('Returning result to Nextcloud', extra={ + 'task_id': task_id, + 'output_length': len(result['output']), + 'num_sources': len(result['sources']), }) - # Step 1: Load vector database and retrieve relevant documents - db = vectordb_loader.load() - context_docs = query_vector_database(user_id, query, db, ctx_limit, scope_type, scope_list) + nc = NextcloudApp() - if len(context_docs) == 0: - raise ContextException('No documents retrieved, please index a few documents first') + try: + nc.providers.task_processing.report_result(task_id, { + 'output': result['output'], + 'sources': enrich_sources(result['sources'], userId), + }) + except (NextcloudException, RequestException, JSONDecodeError) as e: + LOGGER.error(f"Network error reporting task result {e}", exc_info=e) + return False - # Step 2: Prepare context chunks for LLM - context_chunks = prepare_context_chunks(context_docs) - logger.debug('Prepared context chunks', extra={ - 'num_docs': len(context_docs), - 'num_chunks': len(context_chunks), - }) + return True - # Step 3: Generate LLM response - output = generate_llm_response( - llm, - 
app_config, - user_id, - query, - template or _LLM_TEMPLATE, - context_chunks, - end_separator, - ) +def enrich_sources(results: list[str], userId: str) -> list[EnrichedSource]: + nc = NextcloudApp() + # todo: refactor to include title here + data = nc.ocs('POST', f'/ocs/v2.php/apps/context_chat/enrich_sources', json={'sources': [{'source_id': id} for id in results], 'userId': userId}) + sources = EnrichedSourceList.model_validate(data).sources + return sources - # Step 4: Extract unique sources for citation - unique_sources = extract_unique_sources(context_docs) +def enrich_search_sources(results: list[SearchResult], userId: str) -> list[EnrichedSource]: + nc = NextcloudApp() + data = nc.ocs('POST', f'/ocs/v2.php/apps/context_chat/enrich_sources', json={'sources': results, 'userId': userId}) + sources = EnrichedSourceList.model_validate(data).sources + return sources - logger.info('Context query completed', extra={ - 'user_id': user_id, - 'num_sources': len(unique_sources), + +def return_search_result_to_nextcloud(task_id: int, userId: str, result: list[SearchResult]) -> bool: + """ + Return search result back to Nextcloud. 
+ + Args: + task_id: Unique task identifier + result: The list of search results to return + + Returns: + True if successful, False otherwise + """ + LOGGER.debug('Returning search result to Nextcloud', extra={ + 'task_id': task_id, + 'num_sources': len(result), }) - return LLMOutput(output=output, sources=unique_sources) + nc = NextcloudApp() -# ============================================================================ -# Task Queue Processing -# ============================================================================ + try: + sources = [json.dumps(source) for source in enrich_search_sources(result, userId)] + nc.providers.task_processing.report_result(task_id, { + 'sources': sources, + }) + except (NextcloudException, RequestException, JSONDecodeError) as e: + LOGGER.error(f"Network error reporting search task result {e}", exc_info=e) + return False -def return_result_to_nextcloud(task_id: str, result: LLMOutput) -> bool: - """ - Return query result back to Nextcloud. + return True - STUB: This function should be implemented to send results back - to Nextcloud's task queue or API endpoint. +def return_error_to_nextcloud(task_id: int, e: Exception) -> bool: + """ + Return error result back to Nextcloud. 
Args: task_id: Unique task identifier - result: The LLMOutput result to return + e: error object Returns: True if successful, False otherwise """ - logger.debug('Returning result to Nextcloud (STUB)', extra={ - 'task_id': task_id, - 'output_length': len(result['output']), - 'num_sources': len(result['sources']), - }) - # TODO: Implement actual Nextcloud result submission + LOGGER.debug('Returning error to Nextcloud', exc_info=e) + + nc = NextcloudApp() + + if isinstance(e, ValueError): + message = "Validation error: " + str(e) + elif isinstance(e, ContextException): + message = "Context error" + str(e) + else: + message = "Unexpected error" + str(e) + + try: + nc.providers.task_processing.report_result(task_id, None, message) + except (NextcloudException, RequestException, JSONDecodeError) as e: + LOGGER.error(f"Network error reporting task result {e}", exc_info=e) + return False + return True -def process_task( +def process_normal_task( task: dict[str, Any], vectordb_loader: VectorDBLoader, llm: LLM, @@ -750,17 +770,47 @@ def process_task( Various exceptions from query execution """ user_id = task['user_id'] - query = task['query'] - - return execute_context_query( - user_id=user_id, - vectordb_loader=vectordb_loader, - llm=llm, - app_config=app_config, - query=query, - ctx_limit=task.get('ctx_limit', 20), - scope_type=task.get('scope_type'), - scope_list=task.get('scope_list'), - template=task.get('template'), # TODO: Somehow get the real template, tasks don't have it - end_separator=task.get('end_separator', ''), # TODO: same here + task_input = task['input'] + + return exec_in_proc(target=process_context_query, + args=( + user_id, + vectordb_loader, + llm, + app_config, + task_input.get('prompt'), + CONTEXT_LIMIT, + task_input.get('scopeType'), + task_input.get('scopeList'), + ) + ) + +def process_search_task( + task: dict[str, Any], + vectordb_loader: VectorDBLoader, +) -> list[SearchResult]: + """ + Process a single search task. 
+ + Args: + task: Task dictionary from fetch_query_tasks_from_nextcloud + vectordb_loader: Vector database loader instance + + Returns: + list of Search results + + Raises: + Various exceptions from query execution + """ + user_id = task['user_id'] + task_input = task['input'] + return exec_in_proc(target=do_doc_search, + args=( + user_id, + task_input.get('prompt'), + vectordb_loader, + CONTEXT_LIMIT, + task_input.get('scopeType'), + task_input.get('scopeList'), + ) ) \ No newline at end of file From 2093936913c08e55c5aca01b559314df731b4bb4 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 26 Mar 2026 22:43:48 +0530 Subject: [PATCH 22/56] request processing fixes Signed-off-by: Anupam Kumar --- context_chat_backend/chain/one_shot.py | 7 +- context_chat_backend/chain/types.py | 14 +- context_chat_backend/controller.py | 179 ++++++++++++------------- context_chat_backend/dyn_loader.py | 16 +-- context_chat_backend/task_fetcher.py | 164 +++++++++++----------- 5 files changed, 185 insertions(+), 195 deletions(-) diff --git a/context_chat_backend/chain/one_shot.py b/context_chat_backend/chain/one_shot.py index d0f5bbed..c79f272e 100644 --- a/context_chat_backend/chain/one_shot.py +++ b/context_chat_backend/chain/one_shot.py @@ -10,7 +10,7 @@ from ..types import TConfig from .context import get_context_chunks, get_context_docs from .query_proc import get_pruned_query -from .types import ContextException, LLMOutput, ScopeType +from .types import ContextException, LLMOutput, ScopeType, SearchResult _LLM_TEMPLATE = '''Answer based only on this context and do not add any imaginative details. Make sure to use the same language as the question in your answer. 
{context} @@ -79,6 +79,9 @@ def process_context_query( stop=[end_separator], userid=user_id, ).strip() - unique_sources: list[str] = list({source for d in context_docs if (source := d.metadata.get('source'))}) + unique_sources = [SearchResult( + source_id=source, + title=d.metadata.get('title', ''), + ) for d in context_docs if (source := d.metadata.get('source'))] return LLMOutput(output=output, sources=unique_sources) diff --git a/context_chat_backend/chain/types.py b/context_chat_backend/chain/types.py index c5277563..3afdf297 100644 --- a/context_chat_backend/chain/types.py +++ b/context_chat_backend/chain/types.py @@ -33,16 +33,16 @@ class ContextException(Exception): ... -class LLMOutput(TypedDict): - output: str - sources: list[str] - # todo: add "titles" field - - class SearchResult(TypedDict): source_id: str title: str + +class LLMOutput(TypedDict): + output: str + sources: list[SearchResult] + + class EnrichedSource(BaseModel): id: str label: str @@ -53,4 +53,4 @@ class EnrichedSourceList(BaseModel): sources: list[EnrichedSource] class ScopeList(BaseModel): - source_ids: list[str] \ No newline at end of file + source_ids: list[str] diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 1e0d2773..33e3cad4 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -5,7 +5,7 @@ from nc_py_api.ex_app.providers.task_processing import TaskProcessingProvider # isort: off -from .chain.types import ContextException, LLMOutput, ScopeType, SearchResult +from .chain.types import ContextException from .types import LoaderException, EmbeddingException from .vectordb.types import DbException, SafeDbException from .setup_functions import ensure_config_file, repair_run, setup_env_vars @@ -25,22 +25,17 @@ from contextlib import asynccontextmanager from functools import wraps from threading import Event, Thread -from typing import Any from fastapi import FastAPI, Request -from langchain.llms.base import 
LLM from nc_py_api import AsyncNextcloudApp, NextcloudApp from nc_py_api.ex_app import persistent_storage, set_handlers -from pydantic import BaseModel, ValidationInfo, field_validator from starlette.responses import FileResponse -from .chain.context import do_doc_search -from .chain.one_shot import process_context_query, process_query from .config_parser import get_config -from .dyn_loader import LLMModelLoader, VectorDBLoader +from .dyn_loader import VectorDBLoader from .models.types import LlmException from nc_py_api.ex_app import AppAPIAuthMiddleware -from .utils import JSONResponse, exec_in_proc, value_of +from .utils import JSONResponse, exec_in_proc from .task_fetcher import start_bg_threads, trigger_handler, wait_for_bg_threads from .vectordb.service import count_documents_by_provider @@ -108,7 +103,6 @@ async def lifespan(app: FastAPI): t.start() yield vectordb_loader.offload() - llm_loader.offload() wait_for_bg_threads() @@ -120,7 +114,6 @@ async def lifespan(app: FastAPI): # loaders vectordb_loader = VectorDBLoader(app_config) -llm_loader = LLMModelLoader(app, app_config) # locks and semaphores @@ -438,90 +431,90 @@ def download_logs() -> FileResponse: # return JSONResponse({'loaded_sources': loaded_sources, 'sources_to_retry': not_added_sources}) -class Query(BaseModel): - userId: str - query: str - useContext: bool = True - scopeType: ScopeType | None = None - scopeList: list[str] | None = None - ctxLimit: int = 20 - - @field_validator('userId', 'query', 'ctxLimit') - @classmethod - def check_empty_values(cls, value: Any, info: ValidationInfo): - if value_of(value) is None: - raise ValueError('Empty value for field', info.field_name) - - return value - - @field_validator('ctxLimit') - @classmethod - def at_least_one_context(cls, value: int): - if value < 1: - raise ValueError('Invalid context chunk limit') - - return value - - -def execute_query(query: Query, in_proc: bool = True) -> LLMOutput: - llm: LLM = llm_loader.load() - template = 
app.extra.get('LLM_TEMPLATE') - no_ctx_template = app.extra['LLM_NO_CTX_TEMPLATE'] - # todo: array - end_separator = app.extra.get('LLM_END_SEPARATOR', '') - - if query.useContext: - target = process_context_query - args=( - query.userId, - vectordb_loader, - llm, - app_config, - query.query, - query.ctxLimit, - query.scopeType, - query.scopeList, - template, - end_separator, - ) - else: - target=process_query - args=( - query.userId, - llm, - app_config, - query.query, - no_ctx_template, - end_separator, - ) - - if in_proc: - return exec_in_proc(target=target, args=args) - - return target(*args) # pyright: ignore - - -@app.post('/query') -@enabled_guard(app) -def _(query: Query) -> LLMOutput: - logger.debug('received query request', extra={ 'query': query.dict() }) +# class Query(BaseModel): +# userId: str +# query: str +# useContext: bool = True +# scopeType: ScopeType | None = None +# scopeList: list[str] | None = None +# ctxLimit: int = 20 + +# @field_validator('userId', 'query', 'ctxLimit') +# @classmethod +# def check_empty_values(cls, value: Any, info: ValidationInfo): +# if value_of(value) is None: +# raise ValueError('Empty value for field', info.field_name) + +# return value + +# @field_validator('ctxLimit') +# @classmethod +# def at_least_one_context(cls, value: int): +# if value < 1: +# raise ValueError('Invalid context chunk limit') + +# return value + + +# def execute_query(query: Query, in_proc: bool = True) -> LLMOutput: +# llm: LLM = llm_loader.load() +# template = app.extra.get('LLM_TEMPLATE') +# no_ctx_template = app.extra['LLM_NO_CTX_TEMPLATE'] +# # todo: array +# end_separator = app.extra.get('LLM_END_SEPARATOR', '') + +# if query.useContext: +# target = process_context_query +# args=( +# query.userId, +# vectordb_loader, +# llm, +# app_config, +# query.query, +# query.ctxLimit, +# query.scopeType, +# query.scopeList, +# template, +# end_separator, +# ) +# else: +# target=process_query +# args=( +# query.userId, +# llm, +# app_config, +# 
query.query, +# no_ctx_template, +# end_separator, +# ) - if app_config.llm[0] == 'nc_texttotext': - return execute_query(query) +# if in_proc: +# return exec_in_proc(target=target, args=args) - with llm_lock: - return execute_query(query, in_proc=False) +# return target(*args) # pyright: ignore -@app.post('/docSearch') -@enabled_guard(app) -def _(query: Query) -> list[SearchResult]: - # useContext from Query is not used here - return exec_in_proc(target=do_doc_search, args=( - query.userId, - query.query, - vectordb_loader, - query.ctxLimit, - query.scopeType, - query.scopeList, - )) +# @app.post('/query') +# @enabled_guard(app) +# def _(query: Query) -> LLMOutput: +# logger.debug('received query request', extra={ 'query': query.dict() }) + +# if app_config.llm[0] == 'nc_texttotext': +# return execute_query(query) + +# with llm_lock: +# return execute_query(query, in_proc=False) + + +# @app.post('/docSearch') +# @enabled_guard(app) +# def _(query: Query) -> list[SearchResult]: +# # useContext from Query is not used here +# return exec_in_proc(target=do_doc_search, args=( +# query.userId, +# query.query, +# vectordb_loader, +# query.ctxLimit, +# query.scopeType, +# query.scopeList, +# )) diff --git a/context_chat_backend/dyn_loader.py b/context_chat_backend/dyn_loader.py index d67310ff..47b19575 100644 --- a/context_chat_backend/dyn_loader.py +++ b/context_chat_backend/dyn_loader.py @@ -7,11 +7,9 @@ import gc import logging from abc import ABC, abstractmethod -from time import time from typing import Any import torch -from fastapi import FastAPI from langchain.llms.base import LLM from .models.loader import init_model @@ -54,19 +52,11 @@ def offload(self) -> None: class LLMModelLoader(Loader): - def __init__(self, app: FastAPI, config: TConfig) -> None: + def __init__(self, config: TConfig) -> None: self.config = config - self.app = app def load(self) -> LLM: - if self.app.extra.get('LLM_MODEL') is not None: - self.app.extra['LLM_LAST_ACCESSED'] = time() - return 
self.app.extra['LLM_MODEL'] - llm_name, llm_config = self.config.llm - self.app.extra['LLM_TEMPLATE'] = llm_config.pop('template', '') - self.app.extra['LLM_NO_CTX_TEMPLATE'] = llm_config.pop('no_ctx_template', '') - self.app.extra['LLM_END_SEPARATOR'] = llm_config.pop('end_separator', '') try: model = init_model('llm', (llm_name, llm_config)) @@ -75,13 +65,9 @@ def load(self) -> LLM: if not isinstance(model, LLM): raise LoaderException(f'Error: {model} does not implement "llm" type or has returned an invalid object') - self.app.extra['LLM_MODEL'] = model - self.app.extra['LLM_LAST_ACCESSED'] = time() return model def offload(self) -> None: - if self.app.extra.get('LLM_MODEL') is not None: - del self.app.extra['LLM_MODEL'] clear_cache() diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 7951f067..634b51cd 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: 2026 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # -import json import logging import math import os @@ -25,12 +24,20 @@ from .chain.ingest.injest import embed_sources from .chain.one_shot import process_context_query from .chain.query_proc import get_pruned_query -from .chain.types import ContextException, EnrichedSource, EnrichedSourceList, LLMOutput, ScopeList, ScopeType, \ - SearchResult -from .controller import Query, execute_query, llm_loader -from .dyn_loader import VectorDBLoader -from .types import ActionType, ActionsQueueItems, AppRole, EmbeddingException, FilesQueueItems, IndexingError, \ - LoaderException, ReceivedFileItem, SourceItem, TConfig +from .chain.types import ContextException, EnrichedSourceList, LLMOutput, ScopeList, ScopeType, SearchResult +from .dyn_loader import LLMModelLoader, VectorDBLoader +from .types import ( + ActionsQueueItems, + ActionType, + AppRole, + EmbeddingException, + FilesQueueItems, + IndexingError, + 
LoaderException, + ReceivedFileItem, + SourceItem, + TConfig, +) from .utils import exec_in_proc, get_app_role from .vectordb.base import BaseVectorDB from .vectordb.service import ( @@ -387,9 +394,11 @@ def resolve_scope_list(source_ids: list[str], userId: str) -> list[str]: source_ids with only files, no folders (or source_ids in case of non-file provider) """ nc = NextcloudApp() - data = nc.ocs('POST', f'/ocs/v2.php/apps/context_chat/resolve_scope_list', json={'source_ids': source_ids, 'userId': userId}) - sources = ScopeList.model_validate(data).source_ids - return sources + data = nc.ocs('POST', '/ocs/v2.php/apps/context_chat/resolve_scope_list', json={ + 'source_ids': source_ids, + 'userId': userId, + }) + return ScopeList.model_validate(data).source_ids def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: @@ -397,6 +406,7 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: try: vectordb_loader = VectorDBLoader(app_config) + llm_loader = LLMModelLoader(app_config) except LoaderException as e: LOGGER.error('Error initializing vector DB loader, files indexing thread will not start:', exc_info=e) return @@ -412,7 +422,10 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: try: # Fetch pending task try: - response = nc.providers.task_processing.next_task(['context_chat-context_chat', 'context_chat-context_chat_search'], ['context_chat:context_chat', 'context_chat:context_chat_search']) + response = nc.providers.task_processing.next_task( + ['context_chat-context_chat', 'context_chat-context_chat_search'], + ['context_chat:context_chat', 'context_chat:context_chat_search'], + ) if not response: wait_for_tasks() continue @@ -437,9 +450,9 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: # Return result to Nextcloud success = return_normal_result_to_nextcloud(task['id'], userId, result) elif task['type'] == 'context_chat:context_chat_search': - 
result: list[SearchResult] = process_search_task(task, vectordb_loader) + search_result: list[SearchResult] = process_search_task(task, vectordb_loader) # Return result to Nextcloud - success = return_search_result_to_nextcloud(task['id'], userId, result) + success = return_search_result_to_nextcloud(task['id'], userId, search_result) else: LOGGER.error(f'Unknown task type {task["type"]}') success = return_error_to_nextcloud(task['id'], Exception(f'Unknown task type {task["type"]}')) @@ -480,62 +493,60 @@ def wait_for_tasks(interval = None): def start_bg_threads(app_config: TConfig, app_enabled: Event): - match APP_ROLE: - case AppRole.INDEXING | AppRole.NORMAL: - if ( - ThreadType.FILES_INDEXING in THREADS - or ThreadType.UPDATES_PROCESSING in THREADS - ): - LOGGER.info('Background threads already running, skipping start') - return - - THREAD_STOP_EVENT.clear() - THREADS[ThreadType.FILES_INDEXING] = Thread( - target=files_indexing_thread, - args=(app_config, app_enabled), - name='FilesIndexingThread', - ) - THREADS[ThreadType.UPDATES_PROCESSING] = Thread( - target=updates_processing_thread, - args=(app_config, app_enabled), - name='UpdatesProcessingThread', - ) - THREADS[ThreadType.FILES_INDEXING].start() - THREADS[ThreadType.UPDATES_PROCESSING].start() - - case AppRole.RP | AppRole.NORMAL: - if ThreadType.REQUEST_PROCESSING in THREADS: - LOGGER.info('Background threads already running, skipping start') - return - - THREAD_STOP_EVENT.clear() - THREADS[ThreadType.REQUEST_PROCESSING] = Thread( - target=request_processing_thread, - args=(app_config, app_enabled), - name='RequestProcessingThread', - ) - THREADS[ThreadType.REQUEST_PROCESSING].start() + if APP_ROLE == AppRole.INDEXING or APP_ROLE == AppRole.NORMAL: + if ( + ThreadType.FILES_INDEXING in THREADS + or ThreadType.UPDATES_PROCESSING in THREADS + ): + LOGGER.info('Background threads already running, skipping start') + return + + THREAD_STOP_EVENT.clear() + THREADS[ThreadType.FILES_INDEXING] = Thread( + 
target=files_indexing_thread, + args=(app_config, app_enabled), + name='FilesIndexingThread', + ) + THREADS[ThreadType.UPDATES_PROCESSING] = Thread( + target=updates_processing_thread, + args=(app_config, app_enabled), + name='UpdatesProcessingThread', + ) + THREADS[ThreadType.FILES_INDEXING].start() + THREADS[ThreadType.UPDATES_PROCESSING].start() + + if APP_ROLE == AppRole.RP or APP_ROLE == AppRole.NORMAL: + if ThreadType.REQUEST_PROCESSING in THREADS: + LOGGER.info('Background threads already running, skipping start') + return + + THREAD_STOP_EVENT.clear() + THREADS[ThreadType.REQUEST_PROCESSING] = Thread( + target=request_processing_thread, + args=(app_config, app_enabled), + name='RequestProcessingThread', + ) + THREADS[ThreadType.REQUEST_PROCESSING].start() def wait_for_bg_threads(): - match APP_ROLE: - case AppRole.INDEXING | AppRole.NORMAL: - if (ThreadType.FILES_INDEXING not in THREADS or ThreadType.UPDATES_PROCESSING not in THREADS): - return + if APP_ROLE == AppRole.INDEXING or APP_ROLE == AppRole.NORMAL: + if (ThreadType.FILES_INDEXING not in THREADS or ThreadType.UPDATES_PROCESSING not in THREADS): + return - THREAD_STOP_EVENT.set() - THREADS[ThreadType.FILES_INDEXING].join() - THREADS[ThreadType.UPDATES_PROCESSING].join() - THREADS.pop(ThreadType.FILES_INDEXING) - THREADS.pop(ThreadType.UPDATES_PROCESSING) + THREAD_STOP_EVENT.set() + THREADS[ThreadType.FILES_INDEXING].join() + THREADS[ThreadType.UPDATES_PROCESSING].join() + THREADS.pop(ThreadType.FILES_INDEXING) + THREADS.pop(ThreadType.UPDATES_PROCESSING) - case AppRole.RP | AppRole.NORMAL: - if (ThreadType.REQUEST_PROCESSING not in THREADS): - return + if APP_ROLE == AppRole.RP or APP_ROLE == AppRole.NORMAL: + if (ThreadType.REQUEST_PROCESSING not in THREADS): + return - THREAD_STOP_EVENT.set() - THREADS[ThreadType.REQUEST_PROCESSING].join() - THREADS.pop(ThreadType.REQUEST_PROCESSING) + THREAD_STOP_EVENT.set() + THREADS[ThreadType.REQUEST_PROCESSING].join() + 
THREADS.pop(ThreadType.REQUEST_PROCESSING) def query_vector_database( @@ -673,18 +684,12 @@ def return_normal_result_to_nextcloud(task_id: int, userId: str, result: LLMOutp return True -def enrich_sources(results: list[str], userId: str) -> list[EnrichedSource]: - nc = NextcloudApp() - # todo: refactor to include title here - data = nc.ocs('POST', f'/ocs/v2.php/apps/context_chat/enrich_sources', json={'sources': [{'source_id': id} for id in results], 'userId': userId}) - sources = EnrichedSourceList.model_validate(data).sources - return sources -def enrich_search_sources(results: list[SearchResult], userId: str) -> list[EnrichedSource]: +def enrich_sources(results: list[SearchResult], userId: str) -> list[str]: nc = NextcloudApp() - data = nc.ocs('POST', f'/ocs/v2.php/apps/context_chat/enrich_sources', json={'sources': results, 'userId': userId}) + data = nc.ocs('POST', '/ocs/v2.php/apps/context_chat/enrich_sources', json={'sources': results, 'userId': userId}) sources = EnrichedSourceList.model_validate(data).sources - return sources + return [s.model_dump_json() for s in sources] def return_search_result_to_nextcloud(task_id: int, userId: str, result: list[SearchResult]) -> bool: @@ -706,10 +711,8 @@ def return_search_result_to_nextcloud(task_id: int, userId: str, result: list[Se nc = NextcloudApp() try: - sources = [json.dumps(source) for source in enrich_search_sources(result, userId)] - nc.providers.task_processing.report_result(task_id, { - 'sources': sources, + 'sources': enrich_sources(result, userId), }) except (NextcloudException, RequestException, JSONDecodeError) as e: LOGGER.error(f"Network error reporting search task result {e}", exc_info=e) @@ -769,8 +772,10 @@ def process_normal_task( Raises: Various exceptions from query execution """ - user_id = task['user_id'] + user_id = task['userId'] task_input = task['input'] + if task_input.get('scopeType') == 'none': + task_input['scopeType'] = None return exec_in_proc(target=process_context_query, args=( 
@@ -802,8 +807,11 @@ def process_search_task( Raises: Various exceptions from query execution """ - user_id = task['user_id'] + user_id = task['userId'] task_input = task['input'] + if task_input.get('scopeType') == 'none': + task_input['scopeType'] = None + return exec_in_proc(target=do_doc_search, args=( user_id, @@ -813,4 +821,4 @@ def process_search_task( task_input.get('scopeType'), task_input.get('scopeList'), ) - ) \ No newline at end of file + ) From 36b5f0211ee2da2123d220a312521afe204a559b Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 26 Mar 2026 23:01:56 +0530 Subject: [PATCH 23/56] chore: drop commented code Signed-off-by: Anupam Kumar --- context_chat_backend/controller.py | 292 +---------------------------- 1 file changed, 1 insertion(+), 291 deletions(-) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 33e3cad4..49d1d737 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -24,7 +24,6 @@ from collections.abc import Callable from contextlib import asynccontextmanager from functools import wraps -from threading import Event, Thread from fastapi import FastAPI, Request from nc_py_api import AsyncNextcloudApp, NextcloudApp @@ -59,7 +58,7 @@ 'revision': '607a30d783dfa663caf39e06633721c8d4cfcd7e', } } if __download_models_from_hf else {} -app_enabled = Event() +app_enabled = threading.Event() def enabled_handler(enabled: bool, nc: NextcloudApp | AsyncNextcloudApp) -> str: try: @@ -99,8 +98,6 @@ async def lifespan(app: FastAPI): app_enabled.set() start_bg_threads(app_config, app_enabled) logger.info(f'App enable state at startup: {app_enabled.is_set()}') - t = Thread(target=background_thread_task, args=()) - t.start() yield vectordb_loader.offload() wait_for_bg_threads() @@ -134,15 +131,6 @@ async def lifespan(app: FastAPI): if not app_config.disable_aaa: app.add_middleware(AppAPIAuthMiddleware) -# logger background thread - -def background_thread_task(): - # todo - # 
while(True): - # logger.info(f'Currently indexing {len(_indexing)} documents (filename, size): ', extra={'_indexing': _indexing}) - # sleep(10) - ... - # exception handlers @app.exception_handler(DbException) @@ -240,281 +228,3 @@ def download_logs() -> FileResponse: if os.path.isfile(file_path): # Might be a folder (just skip it then) zip_file.write(file_path) return FileResponse(tmp.name, media_type='application/zip', filename='docker_logs.zip') - - -# @app.post('/updateAccessDeclarative') -# @enabled_guard(app) -# def _( -# userIds: Annotated[list[str], Body()], -# sourceId: Annotated[str, Body()], -# ): -# logger.debug('Update access declarative request:', extra={ -# 'user_ids': userIds, -# 'source_id': sourceId, -# }) - -# if len(userIds) == 0: -# return JSONResponse('Empty list of user ids', 400) - -# if not is_valid_source_id(sourceId): -# return JSONResponse('Invalid source id', 400) - -# exec_in_proc(target=decl_update_access, args=(vectordb_loader, userIds, sourceId)) - -# return JSONResponse('Access updated') - - -# @app.post('/updateAccess') -# @enabled_guard(app) -# def _( -# op: Annotated[UpdateAccessOp, Body()], -# userIds: Annotated[list[str], Body()], -# sourceId: Annotated[str, Body()], -# ): -# logger.debug('Update access request', extra={ -# 'op': op, -# 'user_ids': userIds, -# 'source_id': sourceId, -# }) - -# if len(userIds) == 0: -# return JSONResponse('Empty list of user ids', 400) - -# if not is_valid_source_id(sourceId): -# return JSONResponse('Invalid source id', 400) - -# exec_in_proc(target=update_access, args=(vectordb_loader, op, userIds, sourceId)) - -# return JSONResponse('Access updated') - - -# @app.post('/updateAccessProvider') -# @enabled_guard(app) -# def _( -# op: Annotated[UpdateAccessOp, Body()], -# userIds: Annotated[list[str], Body()], -# providerId: Annotated[str, Body()], -# ): -# logger.debug('Update access by provider request', extra={ -# 'op': op, -# 'user_ids': userIds, -# 'provider_id': providerId, -# }) - -# if 
len(userIds) == 0: -# return JSONResponse('Empty list of user ids', 400) - -# if not is_valid_provider_id(providerId): -# return JSONResponse('Invalid provider id', 400) - -# exec_in_proc(target=update_access_provider, args=(vectordb_loader, op, userIds, providerId)) - -# return JSONResponse('Access updated') - - -# @app.post('/deleteSources') -# @enabled_guard(app) -# def _(sourceIds: Annotated[list[str], Body(embed=True)]): -# logger.debug('Delete sources request', extra={ -# 'source_ids': sourceIds, -# }) - -# sourceIds = [source.strip() for source in sourceIds if source.strip() != ''] - -# if len(sourceIds) == 0: -# return JSONResponse('No sources provided', 400) - -# res = exec_in_proc(target=delete_by_source, args=(vectordb_loader, sourceIds)) -# if res is False: -# return JSONResponse('Error: VectorDB delete failed, check vectordb logs for more info.', 400) - -# return JSONResponse('All valid sources deleted') - - -# @app.post('/deleteProvider') -# @enabled_guard(app) -# def _(providerKey: str = Body(embed=True)): -# logger.debug('Delete sources by provider for all users request', extra={ 'provider_key': providerKey }) - -# if value_of(providerKey) is None: -# return JSONResponse('Invalid provider key provided', 400) - -# exec_in_proc(target=delete_by_provider, args=(vectordb_loader, providerKey)) - -# return JSONResponse('All valid sources deleted') - - -# @app.post('/deleteUser') -# @enabled_guard(app) -# def _(userId: str = Body(embed=True)): -# logger.debug('Remove access list for user, and orphaned sources', extra={ 'user_id': userId }) - -# if value_of(userId) is None: -# return JSONResponse('Invalid userId provided', 400) - -# exec_in_proc(target=delete_user, args=(vectordb_loader, userId)) - -# return JSONResponse('User deleted') - - -# @app.put('/loadSources') -# @enabled_guard(app) -# def _(sources: list[UploadFile]): -# global _indexing - -# if len(sources) == 0: -# return JSONResponse('No sources provided', 400) - -# for source in sources: -# if 
not value_of(source.filename): -# return JSONResponse(f'Invalid source filename for: {source.headers.get("title")}', 400) - -# with index_lock: -# if source.filename in _indexing: -# # this request will be retried by the client -# return JSONResponse( -# f'This source ({source.filename}) is already being processed in another request, try again later', -# 503, -# headers={'cc-retry': 'true'}, -# ) - -# if not ( -# value_of(source.headers.get('userIds')) -# and source.headers.get('title', None) is not None -# and value_of(source.headers.get('type')) -# and value_of(source.headers.get('modified')) -# and source.headers['modified'].isdigit() -# and value_of(source.headers.get('provider')) -# ): -# logger.error('Invalid/missing headers received', extra={ -# 'source_id': source.filename, -# 'title': source.headers.get('title'), -# 'headers': source.headers, -# }) -# return JSONResponse(f'Invaild/missing headers for:provider_ids {source.filename}', 400) - -# # wait for 10 minutes before failing the request -# semres = doc_parse_semaphore.acquire(block=True, timeout=10*60) -# if not semres: -# return JSONResponse( -# 'Document parser worker limit reached, try again in some time or consider increasing the limit', -# 503, -# headers={'cc-retry': 'true'} -# ) - -# with index_lock: -# for source in sources: -# _indexing[source.filename] = source.size - -# try: -# loaded_sources, not_added_sources = exec_in_proc( -# target=embed_sources, -# args=(vectordb_loader, app.extra['CONFIG'], sources) -# ) -# except (DbException, EmbeddingException): -# raise -# except Exception as e: -# raise DbException('Error: failed to load sources') from e -# finally: -# with index_lock: -# for source in sources: -# _indexing.pop(source.filename, None) -# doc_parse_semaphore.release() - -# if len(loaded_sources) != len(sources): -# logger.debug('Some sources were not loaded', extra={ -# 'Count of loaded sources': f'{len(loaded_sources)}/{len(sources)}', -# 'source_ids': loaded_sources, -# }) - -# # 
loaded sources include the existing sources that may only have their access updated -# return JSONResponse({'loaded_sources': loaded_sources, 'sources_to_retry': not_added_sources}) - - -# class Query(BaseModel): -# userId: str -# query: str -# useContext: bool = True -# scopeType: ScopeType | None = None -# scopeList: list[str] | None = None -# ctxLimit: int = 20 - -# @field_validator('userId', 'query', 'ctxLimit') -# @classmethod -# def check_empty_values(cls, value: Any, info: ValidationInfo): -# if value_of(value) is None: -# raise ValueError('Empty value for field', info.field_name) - -# return value - -# @field_validator('ctxLimit') -# @classmethod -# def at_least_one_context(cls, value: int): -# if value < 1: -# raise ValueError('Invalid context chunk limit') - -# return value - - -# def execute_query(query: Query, in_proc: bool = True) -> LLMOutput: -# llm: LLM = llm_loader.load() -# template = app.extra.get('LLM_TEMPLATE') -# no_ctx_template = app.extra['LLM_NO_CTX_TEMPLATE'] -# # todo: array -# end_separator = app.extra.get('LLM_END_SEPARATOR', '') - -# if query.useContext: -# target = process_context_query -# args=( -# query.userId, -# vectordb_loader, -# llm, -# app_config, -# query.query, -# query.ctxLimit, -# query.scopeType, -# query.scopeList, -# template, -# end_separator, -# ) -# else: -# target=process_query -# args=( -# query.userId, -# llm, -# app_config, -# query.query, -# no_ctx_template, -# end_separator, -# ) - -# if in_proc: -# return exec_in_proc(target=target, args=args) - -# return target(*args) # pyright: ignore - - -# @app.post('/query') -# @enabled_guard(app) -# def _(query: Query) -> LLMOutput: -# logger.debug('received query request', extra={ 'query': query.dict() }) - -# if app_config.llm[0] == 'nc_texttotext': -# return execute_query(query) - -# with llm_lock: -# return execute_query(query, in_proc=False) - - -# @app.post('/docSearch') -# @enabled_guard(app) -# def _(query: Query) -> list[SearchResult]: -# # useContext from Query 
is not used here -# return exec_in_proc(target=do_doc_search, args=( -# query.userId, -# query.query, -# vectordb_loader, -# query.ctxLimit, -# query.scopeType, -# query.scopeList, -# )) From 85d29f1640eb2ff5daa89016ecbae8ee9d484d27 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 27 Mar 2026 01:06:34 +0530 Subject: [PATCH 24/56] fix(ci): parse json output from the stats command Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 31 +++++++------------------- 1 file changed, 8 insertions(+), 23 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 58f9f50c..589f8852 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -224,7 +224,7 @@ jobs: echo "Checking stats, attempt $i..." stats_err=$(mktemp) - stats=$(timeout 5 ./occ context_chat:stats 2>"$stats_err") + stats=$(timeout 5 ./occ context_chat:stats --json 2>"$stats_err") stats_exit=$? echo "Stats output:" echo "$stats" @@ -243,41 +243,25 @@ jobs: fi # Extract Total eligible files - total_files=$(echo "$stats" | grep -oP 'Total eligible files:\s*\K\d+' || echo "") + total_files=$(echo "$stats" | jq '.eligible_files_count' || echo "") # Extract Indexed documents count (files__default) - indexed_count=$(echo "$stats" | grep -oP "'files__default'\s*=>\s*\K\d+" || echo "") - - # Validate parsed values - if [ -z "$total_files" ] || [ -z "$indexed_count" ]; then - echo "Error: Could not parse stats output properly" - if echo "$stats" | grep -q "Indexed documents:"; then - echo " Indexed documents section found but could not extract count" - fi - sleep 10 - continue - fi + indexed_count=$(echo "$stats" | jq '.queued_documents_counts.files__default' || echo "") echo "Total eligible files: $total_files" echo "Indexed documents (files__default): $indexed_count" - # Calculate absolute difference diff=$((total_files - indexed_count)) - if [ $diff -lt 0 ]; then - diff=$((-diff)) - fi - - # 
Calculate 2% threshold using bc for floating point support - threshold=$(echo "scale=4; $total_files * 0.02" | bc) + threshold=$((total_files * 2 / 100)) # Check if difference is within tolerance - if (( $(echo "$diff <= $threshold" | bc -l) )); then + if [ $diff -le $threshold ]; then echo "Indexing within 2% tolerance (diff=$diff, threshold=$threshold)" success=1 break else - pct=$(echo "scale=2; ($diff / $total_files) * 100" | bc) - echo "Outside 2% tolerance: diff=$diff (${pct}%), threshold=$threshold" + progress=$((diff * 100 / total_files)) + echo "Outside 2% tolerance: diff=$diff (${progress}%), threshold=$threshold" fi # Check if backend is still alive @@ -293,6 +277,7 @@ jobs: echo "::endgroup::" ./occ context_chat:stats + ./occ context_chat:stats --json if [ $success -ne 1 ]; then echo "Max attempts reached" From 4c6d01b9e913de0a931345aeab7169b3029a5c9a Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 27 Mar 2026 02:57:22 +0530 Subject: [PATCH 25/56] fix: seek to 0 to read the full buffer Signed-off-by: Anupam Kumar --- context_chat_backend/chain/ingest/injest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/context_chat_backend/chain/ingest/injest.py b/context_chat_backend/chain/ingest/injest.py index 18a37b4b..0196f5d9 100644 --- a/context_chat_backend/chain/ingest/injest.py +++ b/context_chat_backend/chain/ingest/injest.py @@ -50,6 +50,7 @@ async def __fetch_file_content( dav=False, params={ 'userId': user_id }, ) + fp.seek(0) return fp except niquests.exceptions.RequestException as e: if e.response is None: From 51774ff771944c5dffd46b3f33ed2c4a0d7f5bb6 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 27 Mar 2026 02:59:46 +0530 Subject: [PATCH 26/56] fix(ci): 3% tolerance Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 589f8852..73418e93 100644 --- 
a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -252,16 +252,16 @@ jobs: echo "Indexed documents (files__default): $indexed_count" diff=$((total_files - indexed_count)) - threshold=$((total_files * 2 / 100)) + threshold=$((total_files * 3 / 100)) # Check if difference is within tolerance if [ $diff -le $threshold ]; then - echo "Indexing within 2% tolerance (diff=$diff, threshold=$threshold)" + echo "Indexing within 3% tolerance (diff=$diff, threshold=$threshold)" success=1 break else progress=$((diff * 100 / total_files)) - echo "Outside 2% tolerance: diff=$diff (${progress}%), threshold=$threshold" + echo "Outside 3% tolerance: diff=$diff (${progress}%), threshold=$threshold" fi # Check if backend is still alive From c81b6758600eae2f049deb7ec578ef5c7eeca41b Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 27 Mar 2026 04:38:36 +0530 Subject: [PATCH 27/56] fix(ci): wait longer for EM server Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 73418e93..5c505483 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -201,7 +201,7 @@ jobs: timeout 10 ./occ app_api:daemon:register --net host manual_install "Manual Install" manual-install http localhost http://localhost:8080 timeout 120 ./occ app_api:app:register context_chat_backend manual_install --json-info "{\"appid\":\"context_chat_backend\",\"name\":\"Context Chat Backend\",\"daemon_config_name\":\"manual_install\",\"version\":\"${{ fromJson(steps.appinfo.outputs.result).version }}\",\"secret\":\"12345\",\"port\":10034,\"scopes\":[],\"system_app\":0}" --force-scopes --wait-finish ls -la context_chat_backend/persistent_storage/* - sleep 30 # Wait for the em server to get ready + sleep 60 # Wait for the em server to get ready - name: Initial 
memory usage check run: | @@ -242,13 +242,13 @@ jobs: continue fi - # Extract Total eligible files - total_files=$(echo "$stats" | jq '.eligible_files_count' || echo "") + # Extract total queued files + total_files=$(echo "$stats" | jq '.queued_documents_counts.files__default' || echo "") - # Extract Indexed documents count (files__default) - indexed_count=$(echo "$stats" | jq '.queued_documents_counts.files__default' || echo "") + # Extract indexed documents count (files__default) + indexed_count=$(echo "$stats" | jq '.vectordb_document_counts.files__default' || echo "") - echo "Total eligible files: $total_files" + echo "Total queued files: $total_files" echo "Indexed documents (files__default): $indexed_count" diff=$((total_files - indexed_count)) From 6817f897e4ae14fdfeab0ad7b40a9a2de78cfe4b Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Mon, 30 Mar 2026 15:57:44 +0530 Subject: [PATCH 28/56] fix: don't process files or requests until the EM server is healthy Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 1 - context_chat_backend/network_em.py | 14 +++++++++++--- context_chat_backend/task_fetcher.py | 14 ++++++++++++++ 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 5c505483..8e6ca7d8 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -201,7 +201,6 @@ jobs: timeout 10 ./occ app_api:daemon:register --net host manual_install "Manual Install" manual-install http localhost http://localhost:8080 timeout 120 ./occ app_api:app:register context_chat_backend manual_install --json-info "{\"appid\":\"context_chat_backend\",\"name\":\"Context Chat Backend\",\"daemon_config_name\":\"manual_install\",\"version\":\"${{ fromJson(steps.appinfo.outputs.result).version }}\",\"secret\":\"12345\",\"port\":10034,\"scopes\":[],\"system_app\":0}" --force-scopes --wait-finish ls -la 
context_chat_backend/persistent_storage/* - sleep 60 # Wait for the em server to get ready - name: Initial memory usage check run: | diff --git a/context_chat_backend/network_em.py b/context_chat_backend/network_em.py index d39ea56a..43ced6cc 100644 --- a/context_chat_backend/network_em.py +++ b/context_chat_backend/network_em.py @@ -8,7 +8,6 @@ import niquests from langchain_core.embeddings import Embeddings -from pydantic import BaseModel from .types import ( EmbeddingException, @@ -41,8 +40,17 @@ class CreateEmbeddingResponse(TypedDict): usage: EmbeddingUsage -class NetworkEmbeddings(Embeddings, BaseModel): - app_config: TConfig +class NetworkEmbeddings(Embeddings): + def __init__(self, app_config: TConfig): + self.app_config = app_config + + def check_connection(self) -> bool: + try: + self.embed_query('hello') + return True + except EmbeddingException as e: + logger.warning('Embedding server connection failed', exc_info=e) + return False def _get_embedding(self, input_: str | list[str], try_: int = 3) -> list[float] | list[list[float]]: emconf = self.app_config.embedding diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 634b51cd..92d2719e 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -26,6 +26,7 @@ from .chain.query_proc import get_pruned_query from .chain.types import ContextException, EnrichedSourceList, LLMOutput, ScopeList, ScopeType, SearchResult from .dyn_loader import LLMModelLoader, VectorDBLoader +from .network_em import NetworkEmbeddings from .types import ( ActionsQueueItems, ActionType, @@ -102,6 +103,10 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> return try: + if not __check_em_server(app_config): + sleep(POLLING_COOLDOWN) + continue + nc = NextcloudApp() q_items_res = nc.ocs( 'GET', @@ -415,6 +420,10 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: llm: LLM = llm_loader.load() 
while True: + if not __check_em_server(app_config): + sleep(POLLING_COOLDOWN) + continue + if THREAD_STOP_EVENT.is_set(): LOGGER.info('Updates processing thread is stopping due to stop event being set') return @@ -822,3 +831,8 @@ def process_search_task( task_input.get('scopeList'), ) ) + + +def __check_em_server(app_config: TConfig) -> bool: + embedding_model = NetworkEmbeddings(app_config=app_config) + return embedding_model.check_connection() From 104a37a8a1b28878b98da5ce7b0eb520ebe73716 Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Wed, 1 Apr 2026 12:38:38 +0200 Subject: [PATCH 29/56] tests: Increase testing time to allow backend to injest more sources --- .github/workflows/integration-test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 8e6ca7d8..b937a147 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -218,8 +218,8 @@ jobs: - name: Periodically check context_chat stats for 15 minutes to allow the backend to index the files run: | success=0 - echo "::group::Checking stats periodically for 15 minutes to allow the backend to index the files" - for i in {1..90}; do + echo "::group::Checking stats periodically for 30 minutes to allow the backend to index the files" + for i in {1..180}; do echo "Checking stats, attempt $i..." 
stats_err=$(mktemp) From b3b461a2b3a88f2fd815be11c132a7174772aa3c Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Wed, 1 Apr 2026 13:17:13 +0200 Subject: [PATCH 30/56] fix: More log statements --- .../chain/ingest/doc_loader.py | 20 +++++++++-- context_chat_backend/chain/ingest/injest.py | 35 +++++++++++++++++++ context_chat_backend/task_fetcher.py | 29 +++++++++++++-- context_chat_backend/utils.py | 12 +++++++ context_chat_backend/vectordb/pgvector.py | 20 ++++++++++- 5 files changed, 110 insertions(+), 6 deletions(-) diff --git a/context_chat_backend/chain/ingest/doc_loader.py b/context_chat_backend/chain/ingest/doc_loader.py index 832c8331..04c611d2 100644 --- a/context_chat_backend/chain/ingest/doc_loader.py +++ b/context_chat_backend/chain/ingest/doc_loader.py @@ -7,6 +7,8 @@ import tempfile from collections.abc import Callable from io import BytesIO +import logging +from time import perf_counter_ns import docx2txt from epub2txt import epub2txt @@ -19,6 +21,8 @@ from ...types import IndexingException, SourceItem +logger = logging.getLogger('ccb.doc_loader') + def _temp_file_wrapper(file: BytesIO, loader: Callable, sep: str = '\n') -> str: raw_bytes = file.read() @@ -133,10 +137,22 @@ def decode_source(source: SourceItem) -> str: else: io_obj = source.content - if _loader_map.get(source.type): - result = _loader_map[source.type](io_obj) + loader_fn = _loader_map.get(source.type) + if loader_fn: + logger.debug( + 'Decoding source %r with loader %s (mime: %s) — may be slow or block', + source.title, loader_fn.__name__, source.type, + ) + t0 = perf_counter_ns() + result = loader_fn(io_obj) + elapsed_ms = (perf_counter_ns() - t0) / 1e6 + logger.debug( + 'Loader %s for %r finished in %.2f ms (%d chars)', + loader_fn.__name__, source.title, elapsed_ms, len(result), + ) return result.encode('utf-8', 'ignore').decode('utf-8', 'ignore').strip() + logger.debug('No specific loader for mime type %s, reading as plain text for %r', source.type, source.title) return 
io_obj.read().decode('utf-8', 'ignore').strip() except IndexingException: raise diff --git a/context_chat_backend/chain/ingest/injest.py b/context_chat_backend/chain/ingest/injest.py index 0196f5d9..7ede94a6 100644 --- a/context_chat_backend/chain/ingest/injest.py +++ b/context_chat_backend/chain/ingest/injest.py @@ -7,6 +7,7 @@ import re from collections.abc import Mapping from io import BytesIO +from time import perf_counter_ns import niquests from langchain.schema import Document @@ -42,6 +43,8 @@ async def __fetch_file_content( async with semaphore: nc = AsyncNextcloudApp() try: + logger.debug('Downloading file id %d for user %s', file_id, user_id) + t0 = perf_counter_ns() # a file pointer for storing the stream in memory until it is consumed fp = BytesIO() await nc._session.download2fp( @@ -51,6 +54,8 @@ async def __fetch_file_content( params={ 'userId': user_id }, ) fp.seek(0) + elapsed_ms = (perf_counter_ns() - t0) / 1e6 + logger.debug('Downloaded file id %d for user %s in %.2f ms (%d bytes)', file_id, user_id, elapsed_ms, fp.getbuffer().nbytes) return fp except niquests.exceptions.RequestException as e: if e.response is None: @@ -89,6 +94,9 @@ async def __fetch_files_content( semaphore = asyncio.Semaphore(CONCURRENT_FILE_FETCHES) tasks = [] + file_count = sum(1 for s in sources.values() if isinstance(s, ReceivedFileItem)) + logger.debug('Fetching content for %d file(s) (max %d concurrent)', file_count, CONCURRENT_FILE_FETCHES) + for db_id, file in sources.items(): if isinstance(file, SourceItem): continue @@ -123,7 +131,11 @@ async def __fetch_files_content( # any user id from the list should have read access to the file tasks.append(asyncio.ensure_future(__fetch_file_content(semaphore, file.file_id, file.userIds[0]))) + logger.debug('Gathering %d file download task(s) — this blocks until all downloads complete or fail', len(tasks)) + t0 = perf_counter_ns() results = await asyncio.gather(*tasks, return_exceptions=True) + elapsed_ms = (perf_counter_ns() - 
t0) / 1e6 + logger.debug('All %d file download task(s) completed in %.2f ms', len(tasks), elapsed_ms) for (db_id, file), result in zip(sources.items(), results, strict=True): if isinstance(file, SourceItem): continue @@ -215,7 +227,14 @@ def _sources_to_indocuments( # transform the source to have text data try: + logger.debug( + 'Decoding source %s (type: %s, title: %r) — may be slow for complex file types', + source.reference, source.type, source.title, + ) + t0 = perf_counter_ns() content = decode_source(source) + elapsed_ms = (perf_counter_ns() - t0) / 1e6 + logger.debug('Decoded source %s in %.2f ms (%d chars)', source.reference, elapsed_ms, len(content)) except IndexingException as e: logger.error(f'Error decoding source ({source.reference}): {e}', exc_info=e) errored_docs[db_id] = IndexingError( @@ -333,7 +352,17 @@ def _process_sources( source_proc_results = _increase_access_for_existing_sources(vectordb, existing_sources) + logger.debug( + 'Fetching file contents for %d source(s) — this blocks on network I/O to Nextcloud', + len(to_embed_sources), + ) + t0 = perf_counter_ns() populated_to_embed_sources, errored_sources = asyncio.run(__fetch_files_content(to_embed_sources)) + elapsed_ms = (perf_counter_ns() - t0) / 1e6 + logger.debug( + 'File content fetch complete in %.2f ms: %d fetched, %d errored', + elapsed_ms, len(populated_to_embed_sources), len(errored_sources), + ) source_proc_results.update(errored_sources) # pyright: ignore[reportAttributeAccessIssue] if len(populated_to_embed_sources) == 0: @@ -359,7 +388,13 @@ def _process_sources( 'source_ids': [indoc.source_id for indoc in indocuments.values()] }) + t0 = perf_counter_ns() doc_add_results = vectordb.add_indocuments(indocuments) + elapsed_ms = (perf_counter_ns() - t0) / 1e6 + logger.info( + 'vectordb.add_indocuments completed in %.2f ms for %d document(s)', + elapsed_ms, len(indocuments), + ) source_proc_results.update(doc_add_results) # pyright: ignore[reportAttributeAccessIssue] 
logger.debug('Added documents to vectordb') diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 92d2719e..32673c85 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -82,11 +82,22 @@ def files_indexing_thread(app_config: TConfig, app_enabled: Event) -> None: return def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> Mapping[int, IndexingError | None]: + source_refs = [s.reference for s in source_items.values()] + LOGGER.info('Starting embed_sources subprocess for %d source(s): %s', len(source_items), source_refs) try: - return exec_in_proc( + result = exec_in_proc( target=embed_sources, args=(vectordb_loader, app_config, source_items), ) + errors = {k: v for k, v in result.items() if isinstance(v, IndexingError)} + LOGGER.info( + 'embed_sources subprocess finished for %d source(s): %d succeeded, %d errored', + len(source_items), + len(result) - len(errors), + len(errors), + extra={'errors': errors} if errors else {}, + ) + return result except Exception as e: err_name = {DbException: "DB", EmbeddingException: "Embedding"}.get(type(e), "Unknown") source_ids = (s.reference for s in source_items.values()) @@ -94,6 +105,10 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> error=f'{err_name} Error occurred, the sources {source_ids} will be retried: {e}', retryable=True, ) + LOGGER.error( + 'embed_sources subprocess raised a %s error for sources %s, marking all as retryable', + err_name, source_refs, exc_info=e, + ) return dict.fromkeys(source_items, err) @@ -146,13 +161,21 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> max_workers=PARALLEL_FILE_PARSING_COUNT, thread_name_prefix='IndexingPool', ) as executor: + LOGGER.info( + 'Dispatching %d file chunk(s) and %d provider chunk(s) to %d IndexingPool worker(s)', + len(file_chunks), len(provider_chunks), PARALLEL_FILE_PARSING_COUNT, + ) 
file_futures = [executor.submit(_load_sources, chunk) for chunk in file_chunks] provider_futures = [executor.submit(_load_sources, chunk) for chunk in provider_chunks] - for future in file_futures: + for i, future in enumerate(file_futures): + LOGGER.debug('Waiting for file chunk %d/%d future to complete', i + 1, len(file_futures)) files_result.update(future.result()) - for future in provider_futures: + LOGGER.debug('File chunk %d/%d future completed', i + 1, len(file_futures)) + for i, future in enumerate(provider_futures): + LOGGER.debug('Waiting for provider chunk %d/%d future to complete', i + 1, len(provider_futures)) providers_result.update(future.result()) + LOGGER.debug('Provider chunk %d/%d future completed', i + 1, len(provider_futures)) if ( any(isinstance(res, IndexingError) for res in files_result.values()) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index c7e588b3..d28fc582 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -90,8 +90,20 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem kwargs=kwargs, daemon=daemon, ) + target_name = getattr(target, '__name__', str(target)) + _logger.debug('Starting subprocess for %s', target_name) + start = perf_counter_ns() p.start() + _logger.debug('Subprocess PID %d started for %s, waiting for it to finish (no timeout)', p.pid, target_name) p.join() + elapsed_ms = (perf_counter_ns() - start) / 1e6 + _logger.debug('Subprocess PID %d for %s finished in %.2f ms (exit code: %s)', p.pid, target_name, elapsed_ms, p.exitcode) + if p.exitcode != 0: + _logger.warning( + 'Subprocess PID %d for %s exited with non-zero exit code %d after %.2f ms' + ' — possible OOM kill or unhandled signal', + p.pid, target_name, p.exitcode, elapsed_ms, + ) result = pconn.recv() if result['error'] is not None: diff --git a/context_chat_backend/vectordb/pgvector.py b/context_chat_backend/vectordb/pgvector.py index 86f636be..33dfb039 100644 --- 
a/context_chat_backend/vectordb/pgvector.py +++ b/context_chat_backend/vectordb/pgvector.py @@ -6,6 +6,7 @@ import os from collections.abc import Mapping from datetime import datetime +from time import perf_counter_ns import psycopg import sqlalchemy as sa @@ -152,8 +153,25 @@ def add_indocuments(self, indocuments: Mapping[int, InDocument]) -> Mapping[int, # so we chunk the documents into (5 values * 10k) chunks # change the chunk size when there are more inserted values per document chunk_ids = [] - for i in range(0, len(indoc.documents), batch_size): + total_chunks = len(indoc.documents) + num_batches = max(1, -(-total_chunks // batch_size)) # ceiling division + logger.debug( + 'Embedding source %s: %d chunk(s) in %d batch(es) — blocks on embedding model', + indoc.source_id, total_chunks, num_batches, + ) + for i in range(0, total_chunks, batch_size): + batch_num = i // batch_size + 1 + logger.debug( + 'Sending embedding batch %d/%d (%d chunk(s)) for source %s', + batch_num, num_batches, len(indoc.documents[i:i+batch_size]), indoc.source_id, + ) + t0 = perf_counter_ns() chunk_ids.extend(self.client.add_documents(indoc.documents[i:i+batch_size])) + elapsed_ms = (perf_counter_ns() - t0) / 1e6 + logger.debug( + 'Embedding batch %d/%d for source %s completed in %.2f ms', + batch_num, num_batches, indoc.source_id, elapsed_ms, + ) doc = DocumentsStore( source_id=indoc.source_id, From a4a88dae5f231732e448cefb9c0ea3e0da03aee5 Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Wed, 1 Apr 2026 13:18:24 +0200 Subject: [PATCH 31/56] tests: Set wait time back to 90 --- .github/workflows/integration-test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index b937a147..8e6ca7d8 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -218,8 +218,8 @@ jobs: - name: Periodically check context_chat stats for 15 minutes to allow the 
backend to index the files run: | success=0 - echo "::group::Checking stats periodically for 30 minutes to allow the backend to index the files" - for i in {1..180}; do + echo "::group::Checking stats periodically for 15 minutes to allow the backend to index the files" + for i in {1..90}; do echo "Checking stats, attempt $i..." stats_err=$(mktemp) From 0c52747375355e6e0338fd68599338f8bd644dc4 Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Wed, 1 Apr 2026 14:04:57 +0200 Subject: [PATCH 32/56] fix: Reduce worker count on github actions to prevent oom --- context_chat_backend/task_fetcher.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 32673c85..91d1991a 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -59,6 +59,10 @@ MIN_FILES_PER_CPU = 4 # divides the batch into these many chunks PARALLEL_FILE_PARSING_COUNT = max(1, (os.cpu_count() or 2) - 1) # todo: config? +if os.getenv('GITHUB_ACTIONS'): + # Keep CI memory usage predictable and avoid OOM-killed workers. + PARALLEL_FILE_PARSING_COUNT = max(1, min(PARALLEL_FILE_PARSING_COUNT, 2)) +LOGGER.info(f'Using {PARALLEL_FILE_PARSING_COUNT} parallel file parsing workers') ACTIONS_BATCH_SIZE = 512 # todo: config? 
POLLING_COOLDOWN = 30 TRIGGER = Event() From e676c329ca5a0c147ef0bfadbf5c372f4e25dd99 Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Wed, 1 Apr 2026 14:14:58 +0200 Subject: [PATCH 33/56] fix(exec_in_proc): Raise RuntimeError if exitcode is non-zero --- context_chat_backend/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index d28fc582..024e71c8 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -104,6 +104,7 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem ' — possible OOM kill or unhandled signal', p.pid, target_name, p.exitcode, elapsed_ms, ) + raise RuntimeError(f'Subprocess PID {p.pid} for {target_name} exited with non-zero exit code {p.exitcode}') result = pconn.recv() if result['error'] is not None: From b027ff3234a50cf8eb5a1447bafbef8f147212b5 Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Wed, 1 Apr 2026 14:46:42 +0200 Subject: [PATCH 34/56] fix(indexing): Reduce memory pressure on gh actions --- context_chat_backend/task_fetcher.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 91d1991a..2a7e84fd 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -56,7 +56,11 @@ THREAD_STOP_EVENT = Event() LOGGER = logging.getLogger('ccb.task_fetcher') FILES_INDEXING_BATCH_SIZE = 16 # theoretical max RAM usage: 16 * 100 MiB, todo: config? +if os.getenv('GITHUB_ACTIONS'): + FILES_INDEXING_BATCH_SIZE = 4 MIN_FILES_PER_CPU = 4 +if os.getenv('GITHUB_ACTIONS'): + MIN_FILES_PER_CPU = 2 # divides the batch into these many chunks PARALLEL_FILE_PARSING_COUNT = max(1, (os.cpu_count() or 2) - 1) # todo: config? 
if os.getenv('GITHUB_ACTIONS'): From 19b773fac97d3cf76fb581224df76d63e3c9a34d Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Wed, 1 Apr 2026 15:19:06 +0200 Subject: [PATCH 35/56] fix(indexing): Fallback to batch_size=1 if embed_sources is killed and do not retry afterward if one of these single item batches gets killed --- context_chat_backend/task_fetcher.py | 51 +++++++++++++++++++++++++--- context_chat_backend/utils.py | 13 ++++++- 2 files changed, 59 insertions(+), 5 deletions(-) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 2a7e84fd..edeabc12 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -39,7 +39,7 @@ SourceItem, TConfig, ) -from .utils import exec_in_proc, get_app_role +from .utils import SubprocessKilledError, exec_in_proc, get_app_role from .vectordb.base import BaseVectorDB from .vectordb.service import ( decl_update_access, @@ -89,6 +89,29 @@ def files_indexing_thread(app_config: TConfig, app_enabled: Event) -> None: LOGGER.error('Error initializing vector DB loader, files indexing thread will not start:', exc_info=e) return + def _embed_one(db_id: int, item: SourceItem | ReceivedFileItem) -> tuple[int, IndexingError | None]: + """Run embed_sources for a single item in its own subprocess.
Returns (db_id, error_or_None).""" + try: + result = exec_in_proc( + target=embed_sources, + args=(vectordb_loader, app_config, {db_id: item}), + ) + return db_id, result.get(db_id) + except SubprocessKilledError as e: + LOGGER.error( + 'embed_sources subprocess killed for individual source %s — marking as non-retryable' + ' to prevent infinite OOM retry loop', + item.reference, exc_info=e, + ) + return db_id, IndexingError(error=f'Subprocess killed (OOM?): {e}', retryable=False) + except Exception as e: + err_name = {DbException: 'DB', EmbeddingException: 'Embedding'}.get(type(e), 'Unknown') + LOGGER.error( + 'embed_sources raised a %s error for individual source %s, marking as retryable', + err_name, item.reference, exc_info=e, + ) + return db_id, IndexingError(error=str(e), retryable=True) + def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> Mapping[int, IndexingError | None]: source_refs = [s.reference for s in source_items.values()] LOGGER.info('Starting embed_sources subprocess for %d source(s): %s', len(source_items), source_refs) @@ -106,11 +129,31 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> extra={'errors': errors} if errors else {}, ) return result + except SubprocessKilledError as e: + LOGGER.error( + 'embed_sources subprocess was killed (likely OOM) for %d source(s): %s', + len(source_items), source_refs, exc_info=e, + ) + if len(source_items) == 1: + # Single-item subprocess was killed — mark non-retryable to break infinite OOM loop. + LOGGER.error( + 'Single-item subprocess killed for %s — marking as non-retryable', + source_refs, + ) + return {db_id: IndexingError(error=f'Subprocess killed (OOM?): {e}', retryable=False) + for db_id in source_items} + + # Multi-item batch: fall back to one subprocess per source to pinpoint the problematic file. 
+ LOGGER.warning( + 'Falling back to individual processing for %d sources to isolate any OOM-causing file(s)', + len(source_items), + ) + return dict(_embed_one(db_id, item) for db_id, item in source_items.items()) + except Exception as e: - err_name = {DbException: "DB", EmbeddingException: "Embedding"}.get(type(e), "Unknown") - source_ids = (s.reference for s in source_items.values()) + err_name = {DbException: 'DB', EmbeddingException: 'Embedding'}.get(type(e), 'Unknown') err = IndexingError( - error=f'{err_name} Error occurred, the sources {source_ids} will be retried: {e}', + error=f'{err_name} Error: {e}', retryable=True, ) LOGGER.error( diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index 024e71c8..4b9fad51 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -69,6 +69,17 @@ def JSONResponse( return FastAPIJSONResponse(content, status_code, **kwargs) +class SubprocessKilledError(RuntimeError): + """Raised when a subprocess exits with a non-zero exit code (likely OOM kill or unhandled signal).""" + + def __init__(self, pid: int, target_name: str, exitcode: int): + super().__init__( + f'Subprocess PID {pid} for {target_name} exited with non-zero exit code {exitcode}' + ' — possible OOM kill or unhandled signal' + ) + self.exitcode = exitcode + + def exception_wrap(fun: Callable | None, *args, resconn: Connection, **kwargs): try: if fun is None: @@ -104,7 +115,7 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem ' — possible OOM kill or unhandled signal', p.pid, target_name, p.exitcode, elapsed_ms, ) - raise RuntimeError(f'Subprocess PID {p.pid} for {target_name} exited with non-zero exit code {p.exitcode}') + raise SubprocessKilledError(p.pid, target_name, p.exitcode) result = pconn.recv() if result['error'] is not None: From bde0bc54e2dde254b37fe426418abbca295a27a0 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 2 Apr 2026 14:18:47 +0530 Subject: [PATCH 36/56] 
fix: log stdout and stderr from subprocesses Signed-off-by: Anupam Kumar --- context_chat_backend/utils.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index 4b9fad51..068ffa83 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -2,9 +2,11 @@ # SPDX-FileCopyrightText: 2023 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # +import io import logging import multiprocessing as mp import os +import sys import traceback from collections.abc import Callable from functools import partial, wraps @@ -80,7 +82,12 @@ def __init__(self, pid: int, target_name: str, exitcode: int): self.exitcode = exitcode -def exception_wrap(fun: Callable | None, *args, resconn: Connection, **kwargs): +def exception_wrap(fun: Callable | None, *args, resconn: Connection, stdconn: Connection, **kwargs): + stdout_capture = io.StringIO() + stderr_capture = io.StringIO() + sys.stdout = stdout_capture + sys.stderr = stderr_capture + try: if fun is None: return resconn.send({ 'value': None, 'error': None }) @@ -88,11 +95,15 @@ def exception_wrap(fun: Callable | None, *args, resconn: Connection, **kwargs): except Exception as e: tb = traceback.format_exc() resconn.send({ 'value': None, 'error': e, 'traceback': tb }) + finally: + stdconn.send({'stdout': stdout_capture.getvalue(), 'stderr': stderr_capture.getvalue()}) def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daemon=None): # noqa: B006 pconn, cconn = mp.Pipe() + std_pconn, std_cconn = mp.Pipe() kwargs['resconn'] = cconn + kwargs['stdconn'] = std_cconn p = mp.Process( group=group, target=partial(exception_wrap, target), @@ -108,20 +119,28 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem _logger.debug('Subprocess PID %d started for %s, waiting for it to finish (no timeout)', p.pid, target_name) p.join() elapsed_ms 
= (perf_counter_ns() - start) / 1e6 - _logger.debug('Subprocess PID %d for %s finished in %.2f ms (exit code: %s)', p.pid, target_name, elapsed_ms, p.exitcode) + _logger.debug( + 'Subprocess PID %d for %s finished in %.2f ms (exit code: %s)', + p.pid, target_name, elapsed_ms, p.exitcode, + ) if p.exitcode != 0: _logger.warning( 'Subprocess PID %d for %s exited with non-zero exit code %d after %.2f ms' ' — possible OOM kill or unhandled signal', p.pid, target_name, p.exitcode, elapsed_ms, ) - raise SubprocessKilledError(p.pid, target_name, p.exitcode) + raise SubprocessKilledError(p.pid or 0, target_name, p.exitcode or -1) result = pconn.recv() if result['error'] is not None: _logger.error('original traceback: %s', result['traceback']) raise result['error'] + stdobj = std_pconn.recv() + _logger.info(f'std info for {target_name}', extra={ + 'stdout': stdobj['stdout'], + 'stderr': stdobj['stderr'], + }) return result['value'] From 4de591f79b29746c220cd0a268b9254a18fc424c Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 2 Apr 2026 14:57:16 +0530 Subject: [PATCH 37/56] fix: don't raise before std* is captured Signed-off-by: Anupam Kumar --- context_chat_backend/utils.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index 068ffa83..3122a417 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -123,6 +123,17 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem 'Subprocess PID %d for %s finished in %.2f ms (exit code: %s)', p.pid, target_name, elapsed_ms, p.exitcode, ) + stdobj = std_pconn.recv() + _logger.info(f'std info for {target_name}', extra={ + 'stdout': stdobj['stdout'], + 'stderr': stdobj['stderr'], + }) + + result = pconn.recv() + if result['error'] is not None: + _logger.error('original traceback: %s', result['traceback']) + raise result['error'] + if p.exitcode != 0: _logger.warning( 
'Subprocess PID %d for %s exited with non-zero exit code %d after %.2f ms' @@ -131,16 +142,6 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem ) raise SubprocessKilledError(p.pid or 0, target_name, p.exitcode or -1) - result = pconn.recv() - if result['error'] is not None: - _logger.error('original traceback: %s', result['traceback']) - raise result['error'] - - stdobj = std_pconn.recv() - _logger.info(f'std info for {target_name}', extra={ - 'stdout': stdobj['stdout'], - 'stderr': stdobj['stderr'], - }) return result['value'] From 4deda845f40dd3e3419253ec647d156a4c76e218 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 2 Apr 2026 15:01:10 +0530 Subject: [PATCH 38/56] feat: log cpu count and memory info of the system Signed-off-by: Anupam Kumar --- main.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index c4ffa1fd..8d838d80 100755 --- a/main.py +++ b/main.py @@ -4,8 +4,9 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # import logging -from os import getenv +from os import cpu_count, getenv +import psutil import uvicorn from nc_py_api.ex_app import run_app @@ -48,6 +49,7 @@ def _setup_log_levels(debug: bool): app_config: TConfig = app.extra['CONFIG'] _setup_log_levels(app_config.debug) + print(f'CPU count: {cpu_count()}, Memory: {psutil.virtual_memory()}') print('App config:\n' + redact_config(app_config).model_dump_json(indent=2), flush=True) uv_log_config = uvicorn.config.LOGGING_CONFIG # pyright: ignore[reportAttributeAccessIssue] From ad0eac70712600964f45e2401bed411945e148a7 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 2 Apr 2026 17:41:39 +0530 Subject: [PATCH 39/56] fix: catch BaseException in subprocess Signed-off-by: Anupam Kumar --- context_chat_backend/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index 3122a417..02545d9f 100644 --- a/context_chat_backend/utils.py +++ 
b/context_chat_backend/utils.py @@ -92,7 +92,7 @@ def exception_wrap(fun: Callable | None, *args, resconn: Connection, stdconn: Co if fun is None: return resconn.send({ 'value': None, 'error': None }) resconn.send({ 'value': fun(*args, **kwargs), 'error': None }) - except Exception as e: + except BaseException as e: tb = traceback.format_exc() resconn.send({ 'value': None, 'error': e, 'traceback': tb }) finally: From 36bcfb721364912bcca24c37bc30e357cebfe275 Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Thu, 2 Apr 2026 14:19:49 +0200 Subject: [PATCH 40/56] fix(utils): Improve exec_in_proc to handle more failure modes --- context_chat_backend/utils.py | 170 +++++++++++++++++++++++++++++----- 1 file changed, 149 insertions(+), 21 deletions(-) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index 02545d9f..e994a3f2 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -9,6 +9,7 @@ import sys import traceback from collections.abc import Callable +from contextlib import suppress from functools import partial, wraps from multiprocessing.connection import Connection from time import perf_counter_ns @@ -72,31 +73,95 @@ def JSONResponse( class SubprocessKilledError(RuntimeError): - """Raised when a subprocess exits with a non-zero exit code (likely OOM kill or unhandled signal).""" + """Raised when a subprocess is terminated by a signal (for example SIGKILL).""" def __init__(self, pid: int, target_name: str, exitcode: int): super().__init__( - f'Subprocess PID {pid} for {target_name} exited with non-zero exit code {exitcode}' - ' — possible OOM kill or unhandled signal' + f'Subprocess PID {pid} for {target_name} exited with signal {abs(exitcode)} ' + f'(raw exit code: {exitcode})' ) self.exitcode = exitcode +class SubprocessExecutionError(RuntimeError): + """Raised when a subprocess exits non-zero without a recoverable Python exception payload.""" + + def __init__(self, pid: int, target_name: str, exitcode: int, 
details: str = ''): + msg = f'Subprocess PID {pid} for {target_name} exited with non-zero exit code {exitcode}' + if details: + msg = f'{msg}: {details}' + super().__init__(msg) + self.exitcode = exitcode + + +_MAX_STD_CAPTURE_CHARS = 64 * 1024 + + +def _truncate_capture(text: str) -> tuple[str, bool]: + if len(text) <= _MAX_STD_CAPTURE_CHARS: + return text, False + + head = _MAX_STD_CAPTURE_CHARS // 2 + tail = _MAX_STD_CAPTURE_CHARS - head + omitted = len(text) - _MAX_STD_CAPTURE_CHARS + truncated = ( + f'[truncated {omitted} chars]\n' + f'{text[:head]}\n' + '[...snip...]\n' + f'{text[-tail:]}' + ) + return truncated, True + + def exception_wrap(fun: Callable | None, *args, resconn: Connection, stdconn: Connection, **kwargs): stdout_capture = io.StringIO() stderr_capture = io.StringIO() + orig_stdout = sys.stdout + orig_stderr = sys.stderr sys.stdout = stdout_capture sys.stderr = stderr_capture try: if fun is None: - return resconn.send({ 'value': None, 'error': None }) - resconn.send({ 'value': fun(*args, **kwargs), 'error': None }) + resconn.send({ 'value': None, 'error': None }) + else: + resconn.send({ 'value': fun(*args, **kwargs), 'error': None }) except BaseException as e: tb = traceback.format_exc() - resconn.send({ 'value': None, 'error': e, 'traceback': tb }) + payload = { + 'value': None, + 'error': e, + 'traceback': tb, + 'error_type': type(e).__name__, + 'error_module': type(e).__module__, + 'error_message': str(e), + } + try: + resconn.send(payload) + except Exception as send_err: + # Fallback for unpicklable exceptions. 
+ with suppress(Exception): + resconn.send({ + 'value': None, + 'error': None, + 'traceback': tb, + 'error_type': type(e).__name__, + 'error_module': type(e).__module__, + 'error_message': str(e), + 'send_error': str(send_err), + }) finally: - stdconn.send({'stdout': stdout_capture.getvalue(), 'stderr': stderr_capture.getvalue()}) + sys.stdout = orig_stdout + sys.stderr = orig_stderr + stdout_text, stdout_truncated = _truncate_capture(stdout_capture.getvalue()) + stderr_text, stderr_truncated = _truncate_capture(stderr_capture.getvalue()) + with suppress(Exception): + stdconn.send({ + 'stdout': stdout_text, + 'stderr': stderr_text, + 'stdout_truncated': stdout_truncated, + 'stderr_truncated': stderr_truncated, + }) def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daemon=None): # noqa: B006 @@ -117,30 +182,93 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem start = perf_counter_ns() p.start() _logger.debug('Subprocess PID %d started for %s, waiting for it to finish (no timeout)', p.pid, target_name) + + result = None + stdobj = { + 'stdout': '', + 'stderr': '', + 'stdout_truncated': False, + 'stderr_truncated': False, + } + got_result = False + got_std = False + + # Drain result/std pipes while child is still alive to avoid deadlock on full pipe buffers. 
+ while p.is_alive() and (not got_result or not got_std): + if not got_result and pconn.poll(0.1): + with suppress(EOFError, OSError, BrokenPipeError): + result = pconn.recv() + got_result = True + if not got_std and std_pconn.poll(): + with suppress(EOFError, OSError, BrokenPipeError): + stdobj = std_pconn.recv() + got_std = True + p.join() elapsed_ms = (perf_counter_ns() - start) / 1e6 _logger.debug( 'Subprocess PID %d for %s finished in %.2f ms (exit code: %s)', p.pid, target_name, elapsed_ms, p.exitcode, ) - stdobj = std_pconn.recv() - _logger.info(f'std info for {target_name}', extra={ - 'stdout': stdobj['stdout'], - 'stderr': stdobj['stderr'], - }) - - result = pconn.recv() - if result['error'] is not None: - _logger.error('original traceback: %s', result['traceback']) + + if not got_std: + with suppress(EOFError, OSError, BrokenPipeError): + if std_pconn.poll(): + stdobj = std_pconn.recv() + got_std = True + if stdobj['stdout'] or stdobj['stderr']: + extra = { + 'stdout': stdobj['stdout'], + 'stderr': stdobj['stderr'], + } + if stdobj.get('stdout_truncated') or stdobj.get('stderr_truncated'): + extra['stdio_truncated'] = { + 'stdout': bool(stdobj.get('stdout_truncated')), + 'stderr': bool(stdobj.get('stderr_truncated')), + } + _logger.info('std info for %s', target_name, extra=extra) + + if not got_result: + with suppress(EOFError, OSError, BrokenPipeError): + if pconn.poll(): + result = pconn.recv() + got_result = True + + if result is not None and result.get('error') is not None: + _logger.error('original traceback: %s', result.get('traceback', '')) raise result['error'] - if p.exitcode != 0: + if result is not None and result.get('error_type'): + details = ( + f"{result.get('error_module', '')}.{result.get('error_type', '')}: " + f"{result.get('error_message', '')}" + ) + if result.get('traceback'): + _logger.error('remote traceback: %s', result['traceback']) + raise SubprocessExecutionError(p.pid or 0, target_name, p.exitcode or 1, details) + + if 
p.exitcode and p.exitcode < 0: _logger.warning( - 'Subprocess PID %d for %s exited with non-zero exit code %d after %.2f ms' - ' — possible OOM kill or unhandled signal', - p.pid, target_name, p.exitcode, elapsed_ms, + 'Subprocess PID %d for %s exited due to signal %d after %.2f ms', + p.pid, target_name, abs(p.exitcode), elapsed_ms, + ) + raise SubprocessKilledError(p.pid or 0, target_name, p.exitcode) + + if p.exitcode not in (None, 0): + raise SubprocessExecutionError( + p.pid or 0, + target_name, + p.exitcode, + 'No structured exception payload received from child process', + ) + + if result is None: + raise SubprocessExecutionError( + p.pid or 0, + target_name, + 0, + 'Subprocess exited successfully but returned no result payload', ) - raise SubprocessKilledError(p.pid or 0, target_name, p.exitcode or -1) return result['value'] From 47eaf72daec83faec6d9a4a4ce9e23b231cfba31 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 11:08:34 +0530 Subject: [PATCH 41/56] one more stab at a fix Signed-off-by: Anupam Kumar --- context_chat_backend/utils.py | 37 ++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index e994a3f2..b4e93c79 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -2,6 +2,8 @@ # SPDX-FileCopyrightText: 2023 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # +import atexit +import faulthandler import io import logging import multiprocessing as mp @@ -114,6 +116,28 @@ def _truncate_capture(text: str) -> tuple[str, bool]: def exception_wrap(fun: Callable | None, *args, resconn: Connection, stdconn: Connection, **kwargs): + # --- diagnostic probes: write directly to the real stderr FD so they survive + # Python's stdout/stderr redirection below and even os._exit() won't hide them + # from the parent process's stderr stream. 
+ _diag_fd = os.dup(2) # dup before we capture sys.stderr + + def _raw_diag(msg: str) -> None: + with suppress(Exception): + os.write(_diag_fd, (msg + '\n').encode()) + + # Enable faulthandler on the real FD so crash tracebacks (SIGSEGV etc.) appear. + with suppress(Exception): + faulthandler.enable(file=os.fdopen(os.dup(_diag_fd), 'w', closefd=True), all_threads=True) + + # Atexit probe: if this message NEVER appears, it means os._exit() (C-level) + # was called with Python's cleanup phase entirely skipped. + _fun_name = getattr(fun, '__name__', str(fun)) + atexit.register( + _raw_diag, + f'[exception_wrap/atexit] pid={os.getpid()} target={_fun_name}' + ': Python atexit reached (normal Python exit)', + ) + stdout_capture = io.StringIO() stderr_capture = io.StringIO() orig_stdout = sys.stdout @@ -124,10 +148,18 @@ def exception_wrap(fun: Callable | None, *args, resconn: Connection, stdconn: Co try: if fun is None: resconn.send({ 'value': None, 'error': None }) + _raw_diag(f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}: result sent (fun=None)') else: - resconn.send({ 'value': fun(*args, **kwargs), 'error': None }) + result_value = fun(*args, **kwargs) + _raw_diag(f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}: fun() returned, sending result') + resconn.send({ 'value': result_value, 'error': None }) + _raw_diag(f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}: result pipe send complete') except BaseException as e: tb = traceback.format_exc() + _raw_diag( + f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}' + f': caught {type(e).__name__}: {e}' + ) payload = { 'value': None, 'error': e, @@ -162,6 +194,9 @@ def exception_wrap(fun: Callable | None, *args, resconn: Connection, stdconn: Co 'stdout_truncated': stdout_truncated, 'stderr_truncated': stderr_truncated, }) + _raw_diag(f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}: finally block complete') + with suppress(Exception): + os.close(_diag_fd) def 
exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daemon=None): # noqa: B006 From 309ab2bf19a54fb89c01f61550b07a9daf9d45d1 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 11:43:38 +0530 Subject: [PATCH 42/56] do not throw away the valid result even with exitcode 1 Signed-off-by: Anupam Kumar --- context_chat_backend/utils.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index b4e93c79..fe4ee96c 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -282,6 +282,23 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem _logger.error('remote traceback: %s', result['traceback']) raise SubprocessExecutionError(p.pid or 0, target_name, p.exitcode or 1, details) + # If we received a valid result payload, return it even if the exit + # code is non-zero. The non-zero code typically comes from + # multiprocessing/C-extension cleanup (e.g. util._exit_function or + # a native atexit handler) that runs *after* exception_wrap has + # already sent the result over the pipe. + if result is not None and 'value' in result: + if p.exitcode not in (None, 0): + _logger.warning( + 'Subprocess PID %d for %s exited with code %s after %.2f ms' + ' but returned a valid result — accepting the result.' 
+ ' The non-zero exit likely originates from process' + ' cleanup (multiprocessing finalizers, C-extension' + ' atexit, etc.).', + p.pid, target_name, p.exitcode, elapsed_ms, + ) + return result['value'] + if p.exitcode and p.exitcode < 0: _logger.warning( 'Subprocess PID %d for %s exited due to signal %d after %.2f ms', @@ -297,15 +314,12 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem 'No structured exception payload received from child process', ) - if result is None: - raise SubprocessExecutionError( - p.pid or 0, - target_name, - 0, - 'Subprocess exited successfully but returned no result payload', - ) - - return result['value'] + raise SubprocessExecutionError( + p.pid or 0, + target_name, + 0, + 'Subprocess exited successfully but returned no result payload', + ) def timed(func: Callable): From e1763acdcdfa590cee3c74f6ba1acadf1d9c6f9c Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 12:19:09 +0530 Subject: [PATCH 43/56] fix: use forkserver as process start method Signed-off-by: Anupam Kumar --- context_chat_backend/controller.py | 4 ---- main.py | 13 +++++++++++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 49d1d737..3a8e15a9 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -16,7 +16,6 @@ # ruff: noqa: E402 import logging -import multiprocessing as mp import os import tempfile import threading @@ -122,9 +121,6 @@ async def lifespan(app: FastAPI): index_lock = threading.Lock() _indexing = {} -# limit the number of concurrent document parsing -doc_parse_semaphore = mp.Semaphore(app_config.doc_parser_worker_limit) - # middlewares diff --git a/main.py b/main.py index 8d838d80..4e88ee9f 100755 --- a/main.py +++ b/main.py @@ -3,6 +3,7 @@ # SPDX-FileCopyrightText: 2023 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # + import logging 
from os import cpu_count, getenv @@ -44,6 +45,18 @@ def _setup_log_levels(debug: bool): if __name__ == '__main__': + import multiprocessing as mp + + # do forks from a clean process that doesn't have any threads or locks + mp.set_start_method('forkserver') + mp.set_forkserver_preload([ + 'langchain', + 'sqlalchemy', + 'numpy', + 'context_chat_backend.chain.ingest.injest', + 'context_chat_backend.vectordb.pgvector', + ]) + logging_config = get_logging_config(LOGGER_CONFIG_NAME) setup_logging(logging_config) app_config: TConfig = app.extra['CONFIG'] From 330165205127524780038280854dacc19f552e9c Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 13:16:49 +0530 Subject: [PATCH 44/56] fix(ci): consider eligible files as the total files count Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 8e6ca7d8..8ec8eabe 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -241,17 +241,17 @@ jobs: continue fi - # Extract total queued files - total_files=$(echo "$stats" | jq '.queued_documents_counts.files__default' || echo "") + # Extract total eligible files + total_eligible_files=$(echo "$stats" | jq '.eligible_files_count' || echo "") # Extract indexed documents count (files__default) indexed_count=$(echo "$stats" | jq '.vectordb_document_counts.files__default' || echo "") - echo "Total queued files: $total_files" + echo "Total eligible files: $total_eligible_files" echo "Indexed documents (files__default): $indexed_count" - diff=$((total_files - indexed_count)) - threshold=$((total_files * 3 / 100)) + diff=$((total_eligible_files - indexed_count)) + threshold=$((total_eligible_files * 3 / 100)) # Check if difference is within tolerance if [ $diff -le $threshold ]; then @@ -259,7 +259,7 @@ jobs: success=1 break else - progress=$((diff 
* 100 / total_files)) + progress=$((diff * 100 / total_eligible_files)) echo "Outside 3% tolerance: diff=$diff (${progress}%), threshold=$threshold" fi From 32aa37474547c3f3e7993cf638171ef309c1e1df Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 15:13:14 +0530 Subject: [PATCH 45/56] fix: use logging config in forkserver and other fixes Signed-off-by: Anupam Kumar --- context_chat_backend/utils.py | 12 ++++++++---- main.py | 17 +++++++++-------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index fe4ee96c..5f12d0c5 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -86,10 +86,10 @@ def __init__(self, pid: int, target_name: str, exitcode: int): class SubprocessExecutionError(RuntimeError): - """Raised when a subprocess exits non-zero without a recoverable Python exception payload.""" + """Raised when a subprocess exits without a recoverable Python exception payload.""" def __init__(self, pid: int, target_name: str, exitcode: int, details: str = ''): - msg = f'Subprocess PID {pid} for {target_name} exited with non-zero exit code {exitcode}' + msg = f'Subprocess PID {pid} for {target_name} exited with exit code {exitcode}' if details: msg = f'{msg}: {details}' super().__init__(msg) @@ -199,7 +199,11 @@ def _raw_diag(msg: str) -> None: os.close(_diag_fd) -def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daemon=None): # noqa: B006 +def exec_in_proc(group=None, target=None, name=None, args=(), kwargs=None, *, daemon=None): + if not kwargs: + kwargs = {} + + # parent, child pconn, cconn = mp.Pipe() std_pconn, std_cconn = mp.Pipe() kwargs['resconn'] = cconn @@ -318,7 +322,7 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem p.pid or 0, target_name, 0, - 'Subprocess exited successfully but returned no result payload', + f'Subprocess exited successfully but returned no result payload: 
{result}', ) diff --git a/main.py b/main.py index 4e88ee9f..c2614515 100755 --- a/main.py +++ b/main.py @@ -47,21 +47,22 @@ def _setup_log_levels(debug: bool): if __name__ == '__main__': import multiprocessing as mp + logging_config = get_logging_config(LOGGER_CONFIG_NAME) + setup_logging(logging_config) + app_config: TConfig = app.extra['CONFIG'] + _setup_log_levels(app_config.debug) + # do forks from a clean process that doesn't have any threads or locks mp.set_start_method('forkserver') mp.set_forkserver_preload([ - 'langchain', - 'sqlalchemy', - 'numpy', 'context_chat_backend.chain.ingest.injest', 'context_chat_backend.vectordb.pgvector', + 'langchain', + 'logging', + 'numpy', + 'sqlalchemy', ]) - logging_config = get_logging_config(LOGGER_CONFIG_NAME) - setup_logging(logging_config) - app_config: TConfig = app.extra['CONFIG'] - _setup_log_levels(app_config.debug) - print(f'CPU count: {cpu_count()}, Memory: {psutil.virtual_memory()}') print('App config:\n' + redact_config(app_config).model_dump_json(indent=2), flush=True) From 33ee38ab24d9567f2a0152b7d55870a28ca2bbe1 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 15:23:40 +0530 Subject: [PATCH 46/56] fix: remove extra diagnostics Signed-off-by: Anupam Kumar --- .../chain/ingest/doc_loader.py | 20 +-- context_chat_backend/chain/ingest/injest.py | 15 +- context_chat_backend/task_fetcher.py | 60 ++----- context_chat_backend/utils.py | 146 +++++------------- context_chat_backend/vectordb/pgvector.py | 2 +- 5 files changed, 62 insertions(+), 181 deletions(-) diff --git a/context_chat_backend/chain/ingest/doc_loader.py b/context_chat_backend/chain/ingest/doc_loader.py index 04c611d2..832c8331 100644 --- a/context_chat_backend/chain/ingest/doc_loader.py +++ b/context_chat_backend/chain/ingest/doc_loader.py @@ -7,8 +7,6 @@ import tempfile from collections.abc import Callable from io import BytesIO -import logging -from time import perf_counter_ns import docx2txt from epub2txt import epub2txt @@ -21,8 
+19,6 @@ from ...types import IndexingException, SourceItem -logger = logging.getLogger('ccb.doc_loader') - def _temp_file_wrapper(file: BytesIO, loader: Callable, sep: str = '\n') -> str: raw_bytes = file.read() @@ -137,22 +133,10 @@ def decode_source(source: SourceItem) -> str: else: io_obj = source.content - loader_fn = _loader_map.get(source.type) - if loader_fn: - logger.debug( - 'Decoding source %r with loader %s (mime: %s) — may be slow or block', - source.title, loader_fn.__name__, source.type, - ) - t0 = perf_counter_ns() - result = loader_fn(io_obj) - elapsed_ms = (perf_counter_ns() - t0) / 1e6 - logger.debug( - 'Loader %s for %r finished in %.2f ms (%d chars)', - loader_fn.__name__, source.title, elapsed_ms, len(result), - ) + if _loader_map.get(source.type): + result = _loader_map[source.type](io_obj) return result.encode('utf-8', 'ignore').decode('utf-8', 'ignore').strip() - logger.debug('No specific loader for mime type %s, reading as plain text for %r', source.type, source.title) return io_obj.read().decode('utf-8', 'ignore').strip() except IndexingException: raise diff --git a/context_chat_backend/chain/ingest/injest.py b/context_chat_backend/chain/ingest/injest.py index 7ede94a6..8e321088 100644 --- a/context_chat_backend/chain/ingest/injest.py +++ b/context_chat_backend/chain/ingest/injest.py @@ -43,8 +43,6 @@ async def __fetch_file_content( async with semaphore: nc = AsyncNextcloudApp() try: - logger.debug('Downloading file id %d for user %s', file_id, user_id) - t0 = perf_counter_ns() # a file pointer for storing the stream in memory until it is consumed fp = BytesIO() await nc._session.download2fp( @@ -54,8 +52,6 @@ async def __fetch_file_content( params={ 'userId': user_id }, ) fp.seek(0) - elapsed_ms = (perf_counter_ns() - t0) / 1e6 - logger.debug('Downloaded file id %d for user %s in %.2f ms (%d bytes)', file_id, user_id, elapsed_ms, fp.getbuffer().nbytes) return fp except niquests.exceptions.RequestException as e: if e.response is None: @@ 
-131,11 +127,7 @@ async def __fetch_files_content( # any user id from the list should have read access to the file tasks.append(asyncio.ensure_future(__fetch_file_content(semaphore, file.file_id, file.userIds[0]))) - logger.debug('Gathering %d file download task(s) — this blocks until all downloads complete or fail', len(tasks)) - t0 = perf_counter_ns() results = await asyncio.gather(*tasks, return_exceptions=True) - elapsed_ms = (perf_counter_ns() - t0) / 1e6 - logger.debug('All %d file download task(s) completed in %.2f ms', len(tasks), elapsed_ms) for (db_id, file), result in zip(sources.items(), results, strict=True): if isinstance(file, SourceItem): continue @@ -227,10 +219,7 @@ def _sources_to_indocuments( # transform the source to have text data try: - logger.debug( - 'Decoding source %s (type: %s, title: %r) — may be slow for complex file types', - source.reference, source.type, source.title, - ) + logger.debug('Decoding source %s (type: %s)', source.reference, source.type) t0 = perf_counter_ns() content = decode_source(source) elapsed_ms = (perf_counter_ns() - t0) / 1e6 @@ -353,7 +342,7 @@ def _process_sources( source_proc_results = _increase_access_for_existing_sources(vectordb, existing_sources) logger.debug( - 'Fetching file contents for %d source(s) — this blocks on network I/O to Nextcloud', + 'Fetching file contents for %d source(s) from Nextcloud', len(to_embed_sources), ) t0 = perf_counter_ns() diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index edeabc12..c75cec0d 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -31,7 +31,6 @@ ActionsQueueItems, ActionType, AppRole, - EmbeddingException, FilesQueueItems, IndexingError, LoaderException, @@ -89,29 +88,6 @@ def files_indexing_thread(app_config: TConfig, app_enabled: Event) -> None: LOGGER.error('Error initializing vector DB loader, files indexing thread will not start:', exc_info=e) return - def _embed_one(db_id: 
int, item: SourceItem | ReceivedFileItem) -> tuple[int, IndexingError | None]: - """Run embed_sources for a single item in its own subprocess. Returns (db_id, error_or_None).""" - try: - result = exec_in_proc( - target=embed_sources, - args=(vectordb_loader, app_config, {db_id: item}), - ) - return db_id, result.get(db_id) - except SubprocessKilledError as e: - LOGGER.error( - 'embed_sources subprocess killed for individual source %s — marking as non-retryable' - ' to prevent infinite OOM retry loop', - item.reference, exc_info=e, - ) - return db_id, IndexingError(error=f'Subprocess killed (OOM?): {e}', retryable=False) - except Exception as e: - err_name = {DbException: 'DB', EmbeddingException: 'Embedding'}.get(type(e), 'Unknown') - LOGGER.error( - 'embed_sources raised a %s error for individual source %s, marking as retryable', - err_name, item.reference, exc_info=e, - ) - return db_id, IndexingError(error=str(e), retryable=True) - def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> Mapping[int, IndexingError | None]: source_refs = [s.reference for s in source_items.values()] LOGGER.info('Starting embed_sources subprocess for %d source(s): %s', len(source_items), source_refs) @@ -122,43 +98,39 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> ) errors = {k: v for k, v in result.items() if isinstance(v, IndexingError)} LOGGER.info( - 'embed_sources subprocess finished for %d source(s): %d succeeded, %d errored', - len(source_items), - len(result) - len(errors), - len(errors), - extra={'errors': errors} if errors else {}, + 'embed_sources finished for %d source(s): %d succeeded, %d errored', + len(source_items), len(result) - len(errors), len(errors), + extra={'errors': errors}, ) return result except SubprocessKilledError as e: LOGGER.error( - 'embed_sources subprocess was killed (likely OOM) for %d source(s): %s', - len(source_items), source_refs, exc_info=e, + 'embed_sources subprocess was killed for 
%d source(s) with exitcode %s: %s', + len(source_items), e.exitcode, source_refs, exc_info=e, ) if len(source_items) == 1: - # Single-item subprocess was killed — mark non-retryable to break infinite OOM loop. - LOGGER.error( - 'Single-item subprocess killed for %s — marking as non-retryable', - source_refs, + return dict.fromkeys( + source_items, + IndexingError(error=f'Subprocess killed with exitcode {e.exitcode}: {e}', retryable=False), ) - return {db_id: IndexingError(error=f'Subprocess killed (OOM?): {e}', retryable=False) - for db_id in source_items} - # Multi-item batch: fall back to one subprocess per source to pinpoint the problematic file. + # Fall back to one-by-one to isolate the problematic file. LOGGER.warning( - 'Falling back to individual processing for %d sources to isolate any OOM-causing file(s)', + 'Falling back to individual processing for %d sources', len(source_items), ) - return dict(_embed_one(db_id, item) for db_id, item in source_items.items()) - + fallback: dict[int, IndexingError | None] = {} + for db_id, item in source_items.items(): + fallback.update(_load_sources({db_id: item})) + return fallback except Exception as e: - err_name = {DbException: 'DB', EmbeddingException: 'Embedding'}.get(type(e), 'Unknown') err = IndexingError( - error=f'{err_name} Error: {e}', + error=f'{e.__class__.__name__}: {e}', retryable=True, ) LOGGER.error( 'embed_sources subprocess raised a %s error for sources %s, marking all as retryable', - err_name, source_refs, exc_info=e, + e.__class__.__name__, source_refs, exc_info=e, ) return dict.fromkeys(source_items, err) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index 5f12d0c5..4552e320 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: 2023 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # -import atexit import faulthandler import io import logging @@ -23,6 +22,7 @@ T = 
TypeVar('T') _logger = logging.getLogger('ccb.utils') +_MAX_STD_CAPTURE_CHARS = 64 * 1024 def not_none(value: T | None) -> TypeGuard[T]: @@ -77,7 +77,7 @@ def JSONResponse( class SubprocessKilledError(RuntimeError): """Raised when a subprocess is terminated by a signal (for example SIGKILL).""" - def __init__(self, pid: int, target_name: str, exitcode: int): + def __init__(self, pid: int | None, target_name: str, exitcode: int): super().__init__( f'Subprocess PID {pid} for {target_name} exited with signal {abs(exitcode)} ' f'(raw exit code: {exitcode})' @@ -88,7 +88,7 @@ def __init__(self, pid: int, target_name: str, exitcode: int): class SubprocessExecutionError(RuntimeError): """Raised when a subprocess exits without a recoverable Python exception payload.""" - def __init__(self, pid: int, target_name: str, exitcode: int, details: str = ''): + def __init__(self, pid: int | None, target_name: str, exitcode: int, details: str = ''): msg = f'Subprocess PID {pid} for {target_name} exited with exit code {exitcode}' if details: msg = f'{msg}: {details}' @@ -96,47 +96,29 @@ def __init__(self, pid: int, target_name: str, exitcode: int, details: str = '') self.exitcode = exitcode -_MAX_STD_CAPTURE_CHARS = 64 * 1024 - - -def _truncate_capture(text: str) -> tuple[str, bool]: +def _truncate_capture(text: str) -> str: if len(text) <= _MAX_STD_CAPTURE_CHARS: - return text, False + return text head = _MAX_STD_CAPTURE_CHARS // 2 tail = _MAX_STD_CAPTURE_CHARS - head omitted = len(text) - _MAX_STD_CAPTURE_CHARS - truncated = ( + return ( f'[truncated {omitted} chars]\n' f'{text[:head]}\n' '[...snip...]\n' f'{text[-tail:]}' ) - return truncated, True def exception_wrap(fun: Callable | None, *args, resconn: Connection, stdconn: Connection, **kwargs): - # --- diagnostic probes: write directly to the real stderr FD so they survive - # Python's stdout/stderr redirection below and even os._exit() won't hide them - # from the parent process's stderr stream. 
- _diag_fd = os.dup(2) # dup before we capture sys.stderr - - def _raw_diag(msg: str) -> None: - with suppress(Exception): - os.write(_diag_fd, (msg + '\n').encode()) - - # Enable faulthandler on the real FD so crash tracebacks (SIGSEGV etc.) appear. + # Preserve real stderr FD for faulthandler before we redirect sys.stderr. + _faulthandler_fd = os.dup(2) with suppress(Exception): - faulthandler.enable(file=os.fdopen(os.dup(_diag_fd), 'w', closefd=True), all_threads=True) - - # Atexit probe: if this message NEVER appears, it means os._exit() (C-level) - # was called with Python's cleanup phase entirely skipped. - _fun_name = getattr(fun, '__name__', str(fun)) - atexit.register( - _raw_diag, - f'[exception_wrap/atexit] pid={os.getpid()} target={_fun_name}' - ': Python atexit reached (normal Python exit)', - ) + faulthandler.enable( + file=os.fdopen(_faulthandler_fd, 'w', closefd=False), + all_threads=True, + ) stdout_capture = io.StringIO() stderr_capture = io.StringIO() @@ -148,55 +130,31 @@ def _raw_diag(msg: str) -> None: try: if fun is None: resconn.send({ 'value': None, 'error': None }) - _raw_diag(f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}: result sent (fun=None)') else: - result_value = fun(*args, **kwargs) - _raw_diag(f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}: fun() returned, sending result') - resconn.send({ 'value': result_value, 'error': None }) - _raw_diag(f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}: result pipe send complete') + resconn.send({ 'value': fun(*args, **kwargs), 'error': None }) except BaseException as e: tb = traceback.format_exc() - _raw_diag( - f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}' - f': caught {type(e).__name__}: {e}' - ) payload = { 'value': None, 'error': e, 'traceback': tb, - 'error_type': type(e).__name__, - 'error_module': type(e).__module__, - 'error_message': str(e), } try: resconn.send(payload) except Exception as send_err: - # Fallback for 
unpicklable exceptions. - with suppress(Exception): - resconn.send({ - 'value': None, - 'error': None, - 'traceback': tb, - 'error_type': type(e).__name__, - 'error_module': type(e).__module__, - 'error_message': str(e), - 'send_error': str(send_err), - }) + stderr_capture.write(f'Original error: {e}, pipe send error: {send_err}') finally: sys.stdout = orig_stdout sys.stderr = orig_stderr - stdout_text, stdout_truncated = _truncate_capture(stdout_capture.getvalue()) - stderr_text, stderr_truncated = _truncate_capture(stderr_capture.getvalue()) + stdout_text = _truncate_capture(stdout_capture.getvalue()) + stderr_text = _truncate_capture(stderr_capture.getvalue()) with suppress(Exception): stdconn.send({ 'stdout': stdout_text, 'stderr': stderr_text, - 'stdout_truncated': stdout_truncated, - 'stderr_truncated': stderr_truncated, }) - _raw_diag(f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}: finally block complete') with suppress(Exception): - os.close(_diag_fd) + os.close(_faulthandler_fd) def exec_in_proc(group=None, target=None, name=None, args=(), kwargs=None, *, daemon=None): @@ -217,22 +175,17 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs=None, *, da daemon=daemon, ) target_name = getattr(target, '__name__', str(target)) - _logger.debug('Starting subprocess for %s', target_name) start = perf_counter_ns() p.start() - _logger.debug('Subprocess PID %d started for %s, waiting for it to finish (no timeout)', p.pid, target_name) + _logger.debug('Subprocess PID %d started for %s', p.pid, target_name) result = None - stdobj = { - 'stdout': '', - 'stderr': '', - 'stdout_truncated': False, - 'stderr_truncated': False, - } + stdobj = { 'stdout': '', 'stderr': '' } got_result = False got_std = False # Drain result/std pipes while child is still alive to avoid deadlock on full pipe buffers. 
+ # Pipe's buffer size is 64 KiB while p.is_alive() and (not got_result or not got_std): if not got_result and pconn.poll(0.1): with suppress(EOFError, OSError, BrokenPipeError): @@ -254,72 +207,55 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs=None, *, da with suppress(EOFError, OSError, BrokenPipeError): if std_pconn.poll(): stdobj = std_pconn.recv() - got_std = True - if stdobj['stdout'] or stdobj['stderr']: - extra = { - 'stdout': stdobj['stdout'], - 'stderr': stdobj['stderr'], - } - if stdobj.get('stdout_truncated') or stdobj.get('stderr_truncated'): - extra['stdio_truncated'] = { - 'stdout': bool(stdobj.get('stdout_truncated')), - 'stderr': bool(stdobj.get('stderr_truncated')), - } - _logger.info('std info for %s', target_name, extra=extra) + # no need to update got_std here + if stdobj.get('stdout') or stdobj.get('stderr'): + _logger.info('std info for %s', target_name, extra={ + 'stdout': stdobj.get('stdout', ''), + 'stderr': stdobj.get('stderr', ''), + }) if not got_result: with suppress(EOFError, OSError, BrokenPipeError): if pconn.poll(): result = pconn.recv() - got_result = True + # no need to update got_result here if result is not None and result.get('error') is not None: - _logger.error('original traceback: %s', result.get('traceback', '')) + _logger.error( + 'original traceback of %s (PID %d, exitcode: %s): %s', + target_name, + p.pid, + p.exitcode, + result.get('traceback', ''), + ) raise result['error'] - if result is not None and result.get('error_type'): - details = ( - f"{result.get('error_module', '')}.{result.get('error_type', '')}: " - f"{result.get('error_message', '')}" - ) - if result.get('traceback'): - _logger.error('remote traceback: %s', result['traceback']) - raise SubprocessExecutionError(p.pid or 0, target_name, p.exitcode or 1, details) - - # If we received a valid result payload, return it even if the exit - # code is non-zero. 
The non-zero code typically comes from - # multiprocessing/C-extension cleanup (e.g. util._exit_function or - # a native atexit handler) that runs *after* exception_wrap has - # already sent the result over the pipe. if result is not None and 'value' in result: if p.exitcode not in (None, 0): _logger.warning( 'Subprocess PID %d for %s exited with code %s after %.2f ms' - ' but returned a valid result — accepting the result.' - ' The non-zero exit likely originates from process' - ' cleanup (multiprocessing finalizers, C-extension' - ' atexit, etc.).', + ' but returned a valid result', p.pid, target_name, p.exitcode, elapsed_ms, ) return result['value'] if p.exitcode and p.exitcode < 0: _logger.warning( - 'Subprocess PID %d for %s exited due to signal %d after %.2f ms', - p.pid, target_name, abs(p.exitcode), elapsed_ms, + 'Subprocess PID %d for %s exited due to signal %d, exitcode %d after %.2f ms', + p.pid, target_name, abs(p.exitcode), p.exitcode, elapsed_ms, ) - raise SubprocessKilledError(p.pid or 0, target_name, p.exitcode) + raise SubprocessKilledError(p.pid, target_name, p.exitcode) if p.exitcode not in (None, 0): raise SubprocessExecutionError( - p.pid or 0, + p.pid, target_name, p.exitcode, - 'No structured exception payload received from child process', + f'No structured exception payload received from child process: {result}', ) raise SubprocessExecutionError( - p.pid or 0, + p.pid, target_name, 0, f'Subprocess exited successfully but returned no result payload: {result}', diff --git a/context_chat_backend/vectordb/pgvector.py b/context_chat_backend/vectordb/pgvector.py index 33dfb039..41d7f0db 100644 --- a/context_chat_backend/vectordb/pgvector.py +++ b/context_chat_backend/vectordb/pgvector.py @@ -156,7 +156,7 @@ def add_indocuments(self, indocuments: Mapping[int, InDocument]) -> Mapping[int, total_chunks = len(indoc.documents) num_batches = max(1, -(-total_chunks // batch_size)) # ceiling division logger.debug( - 'Embedding source %s: %d chunk(s) in %d 
batch(es) — blocks on embedding model', + 'Embedding source %s: %d chunk(s) in %d batch(es)', indoc.source_id, total_chunks, num_batches, ) for i in range(0, total_chunks, batch_size): From d9ebdac85772930b556f02ea501d3c73160d567b Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 17:54:44 +0530 Subject: [PATCH 47/56] fix: use zip on the subset of filtered sources Signed-off-by: Anupam Kumar --- context_chat_backend/chain/ingest/injest.py | 23 ++++++++++----------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/context_chat_backend/chain/ingest/injest.py b/context_chat_backend/chain/ingest/injest.py index 8e321088..190eebd4 100644 --- a/context_chat_backend/chain/ingest/injest.py +++ b/context_chat_backend/chain/ingest/injest.py @@ -89,6 +89,7 @@ async def __fetch_files_content( error_items = {} semaphore = asyncio.Semaphore(CONCURRENT_FILE_FETCHES) tasks = [] + task_sources = {} file_count = sum(1 for s in sources.values() if isinstance(s, ReceivedFileItem)) logger.debug('Fetching content for %d file(s) (max %d concurrent)', file_count, CONCURRENT_FILE_FETCHES) @@ -126,13 +127,18 @@ async def __fetch_files_content( continue # any user id from the list should have read access to the file tasks.append(asyncio.ensure_future(__fetch_file_content(semaphore, file.file_id, file.userIds[0]))) + task_sources[db_id] = file results = await asyncio.gather(*tasks, return_exceptions=True) - for (db_id, file), result in zip(sources.items(), results, strict=True): - if isinstance(file, SourceItem): - continue - - if isinstance(result, IndexingException): + for (db_id, file), result in zip(task_sources.items(), results, strict=True): + if isinstance(result, str) or isinstance(result, BytesIO): + source_items[db_id] = SourceItem( + **{ + **file.model_dump(), + 'content': result, + } + ) + elif isinstance(result, IndexingException): logger.error( f'Error fetching content for db id {db_id}, file id {file.file_id}, reference {file.reference}' f': 
{result}', @@ -142,13 +148,6 @@ async def __fetch_files_content( error=str(result), retryable=result.retryable, ) - elif isinstance(result, str) or isinstance(result, BytesIO): - source_items[db_id] = SourceItem( - **{ - **file.model_dump(), - 'content': result, - } - ) elif isinstance(result, BaseException): logger.error( f'Unexpected error fetching content for db id {db_id}, file id {file.file_id},' From ea77480df7060a21cb556d7dfe13f8d5da21337f Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 18:41:30 +0530 Subject: [PATCH 48/56] fix(em): use tcp socket connection check Signed-off-by: Anupam Kumar --- context_chat_backend/network_em.py | 29 ++++++++++++++++++++++++---- context_chat_backend/task_fetcher.py | 17 +++++++--------- 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/context_chat_backend/network_em.py b/context_chat_backend/network_em.py index 43ced6cc..ba1edc9e 100644 --- a/context_chat_backend/network_em.py +++ b/context_chat_backend/network_em.py @@ -3,8 +3,10 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # import logging +import socket from time import sleep from typing import Literal, TypedDict +from urllib.parse import urlparse import niquests from langchain_core.embeddings import Embeddings @@ -19,6 +21,7 @@ ) logger = logging.getLogger('ccb.nextwork_em') +TCP_CONNECT_TIMEOUT = 2.0 # seconds # Copied from llama_cpp/llama_types.py @@ -44,12 +47,30 @@ class NetworkEmbeddings(Embeddings): def __init__(self, app_config: TConfig): self.app_config = app_config - def check_connection(self) -> bool: + def _get_host_and_port(self) -> tuple[str, int]: + parsed = urlparse(self.app_config.embedding.base_url) + host = parsed.hostname + + if not host: + raise ValueError("Invalid URL: Missing hostname") + + if parsed.port: + port = parsed.port + else: + port = 443 if parsed.scheme == "https" else 80 + + return host, port + + def check_connection(self, check_origin: str) -> bool: try: - self.embed_query('hello') + host, port = 
self._get_host_and_port() + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.settimeout(TCP_CONNECT_TIMEOUT) + sock.connect((host, port)) + sock.close() return True - except EmbeddingException as e: - logger.warning('Embedding server connection failed', exc_info=e) + except (ValueError, TimeoutError, ConnectionRefusedError, socket.gaierror) as e: + logger.warning(f'[{check_origin}] Embedding server is not reachable, retrying after some time: {e}') return False def _get_embedding(self, input_: str | list[str], try_: int = 3) -> list[float] | list[list[float]]: diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index c75cec0d..c931e7df 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -83,6 +83,7 @@ class ThreadType(Enum): def files_indexing_thread(app_config: TConfig, app_enabled: Event) -> None: try: + network_em = NetworkEmbeddings(app_config) vectordb_loader = VectorDBLoader(app_config) except LoaderException as e: LOGGER.error('Error initializing vector DB loader, files indexing thread will not start:', exc_info=e) @@ -141,7 +142,7 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> return try: - if not __check_em_server(app_config): + if not network_em.check_connection(ThreadType.FILES_INDEXING.value): sleep(POLLING_COOLDOWN) continue @@ -456,6 +457,7 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: LOGGER.info('Starting task fetcher loop') try: + network_em = NetworkEmbeddings(app_config) vectordb_loader = VectorDBLoader(app_config) llm_loader = LLMModelLoader(app_config) except LoaderException as e: @@ -466,14 +468,14 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: llm: LLM = llm_loader.load() while True: - if not __check_em_server(app_config): - sleep(POLLING_COOLDOWN) - continue - if THREAD_STOP_EVENT.is_set(): LOGGER.info('Updates processing thread is stopping 
due to stop event being set') return + if not network_em.check_connection(ThreadType.REQUEST_PROCESSING.value): + sleep(POLLING_COOLDOWN) + continue + try: # Fetch pending task try: @@ -877,8 +879,3 @@ def process_search_task( task_input.get('scopeList'), ) ) - - -def __check_em_server(app_config: TConfig) -> bool: - embedding_model = NetworkEmbeddings(app_config=app_config) - return embedding_model.check_connection() From 1ce237a36addb872e3affc790faeae5583e80b28 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 18:42:59 +0530 Subject: [PATCH 49/56] fix(ci): remove github CI restrictions Signed-off-by: Anupam Kumar --- context_chat_backend/task_fetcher.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index c931e7df..004104f8 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -55,16 +55,9 @@ THREAD_STOP_EVENT = Event() LOGGER = logging.getLogger('ccb.task_fetcher') FILES_INDEXING_BATCH_SIZE = 16 # theoretical max RAM usage: 16 * 100 MiB, todo: config? -if os.getenv('GITHUB_ACTIONS'): - FILES_INDEXING_BATCH_SIZE = 4 MIN_FILES_PER_CPU = 4 -if os.getenv('GITHUB_ACTIONS'): - MIN_FILES_PER_CPU = 2 # divides the batch into these many chunks PARALLEL_FILE_PARSING_COUNT = max(1, (os.cpu_count() or 2) - 1) # todo: config? -if os.getenv('GITHUB_ACTIONS'): - # Keep CI memory usage predictable and avoid OOM-killed workers. - PARALLEL_FILE_PARSING_COUNT = max(1, min(PARALLEL_FILE_PARSING_COUNT, 2)) LOGGER.info(f'Using {PARALLEL_FILE_PARSING_COUNT} parallel file parsing workers') ACTIONS_BATCH_SIZE = 512 # todo: config? 
POLLING_COOLDOWN = 30 From d82e01b6555e4a362ba58fda1414cba83dc00023 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 18:54:20 +0530 Subject: [PATCH 50/56] fix: remove unused code and some de-duplication Signed-off-by: Anupam Kumar --- context_chat_backend/task_fetcher.py | 286 +++++++-------------------- 1 file changed, 75 insertions(+), 211 deletions(-) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 004104f8..1e456465 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -15,16 +15,14 @@ import niquests from langchain.llms.base import LLM -from langchain.schema import Document from nc_py_api import NextcloudApp, NextcloudException from niquests import JSONDecodeError, RequestException from pydantic import ValidationError -from .chain.context import do_doc_search, get_context_chunks, get_context_docs +from .chain.context import do_doc_search from .chain.ingest.injest import embed_sources from .chain.one_shot import process_context_query -from .chain.query_proc import get_pruned_query -from .chain.types import ContextException, EnrichedSourceList, LLMOutput, ScopeList, ScopeType, SearchResult +from .chain.types import ContextException, EnrichedSourceList, LLMOutput, ScopeList, SearchResult from .dyn_loader import LLMModelLoader, VectorDBLoader from .network_em import NetworkEmbeddings from .types import ( @@ -39,7 +37,6 @@ TConfig, ) from .utils import SubprocessKilledError, exec_in_proc, get_app_role -from .vectordb.base import BaseVectorDB from .vectordb.service import ( decl_update_access, delete_by_provider, @@ -498,11 +495,16 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: if task['type'] == 'context_chat:context_chat': result: LLMOutput = process_normal_task(task, vectordb_loader, llm, app_config) # Return result to Nextcloud - success = return_normal_result_to_nextcloud(task['id'], userId, result) + success = 
return_result_to_nextcloud(task['id'], userId, { + 'output': result['output'], + 'sources': enrich_sources(result['sources'], userId), + }) elif task['type'] == 'context_chat:context_chat_search': search_result: list[SearchResult] = process_search_task(task, vectordb_loader) # Return result to Nextcloud - success = return_search_result_to_nextcloud(task['id'], userId, search_result) + success = return_result_to_nextcloud(task['id'], userId, { + 'sources': enrich_sources(search_result, userId), + }) else: LOGGER.error(f'Unknown task type {task["type"]}') success = return_error_to_nextcloud(task['id'], Exception(f'Unknown task type {task["type"]}')) @@ -541,200 +543,6 @@ def wait_for_tasks(interval = None): TRIGGER.clear() - -def start_bg_threads(app_config: TConfig, app_enabled: Event): - if APP_ROLE == AppRole.INDEXING or APP_ROLE == AppRole.NORMAL: - if ( - ThreadType.FILES_INDEXING in THREADS - or ThreadType.UPDATES_PROCESSING in THREADS - ): - LOGGER.info('Background threads already running, skipping start') - return - - THREAD_STOP_EVENT.clear() - THREADS[ThreadType.FILES_INDEXING] = Thread( - target=files_indexing_thread, - args=(app_config, app_enabled), - name='FilesIndexingThread', - ) - THREADS[ThreadType.UPDATES_PROCESSING] = Thread( - target=updates_processing_thread, - args=(app_config, app_enabled), - name='UpdatesProcessingThread', - ) - THREADS[ThreadType.FILES_INDEXING].start() - THREADS[ThreadType.UPDATES_PROCESSING].start() - - if APP_ROLE == AppRole.RP or APP_ROLE == AppRole.NORMAL: - if ThreadType.REQUEST_PROCESSING in THREADS: - LOGGER.info('Background threads already running, skipping start') - return - - THREAD_STOP_EVENT.clear() - THREADS[ThreadType.REQUEST_PROCESSING] = Thread( - target=request_processing_thread, - args=(app_config, app_enabled), - name='RequestProcessingThread', - ) - THREADS[ThreadType.REQUEST_PROCESSING].start() - - -def wait_for_bg_threads(): - if APP_ROLE == AppRole.INDEXING or APP_ROLE == AppRole.NORMAL: - if 
(ThreadType.FILES_INDEXING not in THREADS or ThreadType.UPDATES_PROCESSING not in THREADS): - return - - THREAD_STOP_EVENT.set() - THREADS[ThreadType.FILES_INDEXING].join() - THREADS[ThreadType.UPDATES_PROCESSING].join() - THREADS.pop(ThreadType.FILES_INDEXING) - THREADS.pop(ThreadType.UPDATES_PROCESSING) - - if APP_ROLE == AppRole.RP or APP_ROLE == AppRole.NORMAL: - if (ThreadType.REQUEST_PROCESSING not in THREADS): - return - - THREAD_STOP_EVENT.set() - THREADS[ThreadType.REQUEST_PROCESSING].join() - THREADS.pop(ThreadType.REQUEST_PROCESSING) - - -def query_vector_database( - user_id: str, - query: str, - vectordb: BaseVectorDB, - ctx_limit: int, - scope_type: ScopeType | None = None, - scope_list: list[str] | None = None, -) -> list[Document]: - """ - Query the vector database to retrieve relevant documents. - - Args: - user_id: User ID for scoping the search - query: The search query text - vectordb: Vector database instance - ctx_limit: Maximum number of documents to return - scope_type: Optional scope type (PROVIDER or SOURCE) - scope_list: Optional list of scope identifiers - - Returns: - List of relevant Document objects - - Raises: - ContextException: If scope type is provided without scope list - """ - context_docs = get_context_docs(user_id, query, vectordb, ctx_limit, scope_type, scope_list) - LOGGER.debug('Retrieved context documents', extra={ - 'user_id': user_id, - 'num_docs': len(context_docs), - 'ctx_limit': ctx_limit, - }) - return context_docs - - -def prepare_context_chunks(context_docs: list[Document]) -> list[str]: - """ - Extract and format text chunks from documents for LLM context. 
- - Args: - context_docs: List of Document objects from vector DB - - Returns: - List of formatted text chunks including titles and content - """ - return get_context_chunks(context_docs) - - -def generate_llm_response( - llm: LLM, - app_config: TConfig, - user_id: str, - query: str, - template: str, - context_chunks: list[str], - end_separator: str = '', -) -> str: - """ - Generate LLM response using the pruned query and context. - - Args: - llm: Language model instance - app_config: Application configuration - user_id: User ID for the request - query: The original query text - template: Template for formatting the prompt - context_chunks: Context chunks to include in the prompt - end_separator: Optional separator to stop generation - - Returns: - Generated LLM output text - - Raises: - ValueError: If context length is too small to fit the query - """ - pruned_query_text = get_pruned_query(llm, app_config, query, template, context_chunks) - - stop = [end_separator] if end_separator else None - output = llm.invoke( - pruned_query_text, - stop=stop, - userid=user_id, - ).strip() - - LOGGER.debug('Generated LLM response', extra={ - 'user_id': user_id, - 'output_length': len(output), - }) - return output - - -def extract_unique_sources(context_docs: list[Document]) -> list[str]: - """ - Extract unique source IDs from context documents. - - Args: - context_docs: List of Document objects - - Returns: - List of unique source IDs - """ - unique_sources: list[str] = list({ - source for d in context_docs if (source := d.metadata.get('source')) - }) - return unique_sources - -def return_normal_result_to_nextcloud(task_id: int, userId: str, result: LLMOutput) -> bool: - """ - Return query result back to Nextcloud. 
- - Args: - task_id: Unique task identifier - result: The LLMOutput result to return - - Returns: - True if successful, False otherwise - """ - LOGGER.debug('Returning result to Nextcloud', extra={ - 'task_id': task_id, - 'output_length': len(result['output']), - 'num_sources': len(result['sources']), - }) - - nc = NextcloudApp() - - try: - nc.providers.task_processing.report_result(task_id, { - 'output': result['output'], - 'sources': enrich_sources(result['sources'], userId), - }) - except (NextcloudException, RequestException, JSONDecodeError) as e: - LOGGER.error(f"Network error reporting task result {e}", exc_info=e) - return False - - return True - - def enrich_sources(results: list[SearchResult], userId: str) -> list[str]: nc = NextcloudApp() data = nc.ocs('POST', '/ocs/v2.php/apps/context_chat/enrich_sources', json={'sources': results, 'userId': userId}) @@ -742,34 +550,32 @@ def enrich_sources(results: list[SearchResult], userId: str) -> list[str]: return [s.model_dump_json() for s in sources] -def return_search_result_to_nextcloud(task_id: int, userId: str, result: list[SearchResult]) -> bool: +def return_result_to_nextcloud(task_id: int, userId: str, result: dict[str, Any]) -> bool: """ - Return search result back to Nextcloud. + Return query result back to Nextcloud. 
Args: - task_id: Unique task identifier - result: The list of search results to return + result: dict[str, Any] Returns: True if successful, False otherwise """ - LOGGER.debug('Returning search result to Nextcloud', extra={ + LOGGER.debug('Returning result to Nextcloud', extra={ 'task_id': task_id, - 'num_sources': len(result), + 'result': result, }) nc = NextcloudApp() try: - nc.providers.task_processing.report_result(task_id, { - 'sources': enrich_sources(result, userId), - }) + nc.providers.task_processing.report_result(task_id, result) except (NextcloudException, RequestException, JSONDecodeError) as e: - LOGGER.error(f"Network error reporting search task result {e}", exc_info=e) + LOGGER.error(f"Network error reporting task result {e}", exc_info=e) return False return True + def return_error_to_nextcloud(task_id: int, e: Exception) -> bool: """ Return error result back to Nextcloud. @@ -827,6 +633,7 @@ def process_normal_task( if task_input.get('scopeType') == 'none': task_input['scopeType'] = None + # todo: document no template support return exec_in_proc(target=process_context_query, args=( user_id, @@ -872,3 +679,60 @@ def process_search_task( task_input.get('scopeList'), ) ) + + +def start_bg_threads(app_config: TConfig, app_enabled: Event): + if APP_ROLE == AppRole.INDEXING or APP_ROLE == AppRole.NORMAL: + if ( + ThreadType.FILES_INDEXING in THREADS + or ThreadType.UPDATES_PROCESSING in THREADS + ): + LOGGER.info('Background threads already running, skipping start') + return + + THREAD_STOP_EVENT.clear() + THREADS[ThreadType.FILES_INDEXING] = Thread( + target=files_indexing_thread, + args=(app_config, app_enabled), + name='FilesIndexingThread', + ) + THREADS[ThreadType.UPDATES_PROCESSING] = Thread( + target=updates_processing_thread, + args=(app_config, app_enabled), + name='UpdatesProcessingThread', + ) + THREADS[ThreadType.FILES_INDEXING].start() + THREADS[ThreadType.UPDATES_PROCESSING].start() + + if APP_ROLE == AppRole.RP or APP_ROLE == 
AppRole.NORMAL: + if ThreadType.REQUEST_PROCESSING in THREADS: + LOGGER.info('Background threads already running, skipping start') + return + + THREAD_STOP_EVENT.clear() + THREADS[ThreadType.REQUEST_PROCESSING] = Thread( + target=request_processing_thread, + args=(app_config, app_enabled), + name='RequestProcessingThread', + ) + THREADS[ThreadType.REQUEST_PROCESSING].start() + + +def wait_for_bg_threads(): + if APP_ROLE == AppRole.INDEXING or APP_ROLE == AppRole.NORMAL: + if (ThreadType.FILES_INDEXING not in THREADS or ThreadType.UPDATES_PROCESSING not in THREADS): + return + + THREAD_STOP_EVENT.set() + THREADS[ThreadType.FILES_INDEXING].join() + THREADS[ThreadType.UPDATES_PROCESSING].join() + THREADS.pop(ThreadType.FILES_INDEXING) + THREADS.pop(ThreadType.UPDATES_PROCESSING) + + if APP_ROLE == AppRole.RP or APP_ROLE == AppRole.NORMAL: + if (ThreadType.REQUEST_PROCESSING not in THREADS): + return + + THREAD_STOP_EVENT.set() + THREADS[ThreadType.REQUEST_PROCESSING].join() + THREADS.pop(ThreadType.REQUEST_PROCESSING) From 286db22e8cb664f600ddfa3b759ce8e83963ff2b Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 19:32:28 +0530 Subject: [PATCH 51/56] fix(mp): run repairs and config file check only in MainProcess Signed-off-by: Anupam Kumar --- context_chat_backend/controller.py | 8 ++++++-- main.py | 3 +-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 3a8e15a9..9c3812e9 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -16,6 +16,7 @@ # ruff: noqa: E402 import logging +import multiprocessing as mp import os import tempfile import threading @@ -39,8 +40,11 @@ # setup -repair_run() -ensure_config_file() +# only run once +if mp.current_process().name == 'MainProcess': + repair_run() + ensure_config_file() + logger = logging.getLogger('ccb.controller') app_config = get_config(os.environ['CC_CONFIG_PATH']) 
__download_models_from_hf = os.environ.get('CC_DOWNLOAD_MODELS_FROM_HF', 'true').lower() in ('1', 'true', 'yes') diff --git a/main.py b/main.py index c2614515..076b7db0 100755 --- a/main.py +++ b/main.py @@ -5,6 +5,7 @@ # import logging +import multiprocessing as mp from os import cpu_count, getenv import psutil @@ -45,8 +46,6 @@ def _setup_log_levels(debug: bool): if __name__ == '__main__': - import multiprocessing as mp - logging_config = get_logging_config(LOGGER_CONFIG_NAME) setup_logging(logging_config) app_config: TConfig = app.extra['CONFIG'] From 726eb64f5624eb9a2262aa6c6b17641e04b33973 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 7 Apr 2026 16:43:07 +0530 Subject: [PATCH 52/56] fix: attach source_ids as keys in json logs Signed-off-by: Anupam Kumar --- context_chat_backend/task_fetcher.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 1e456465..be74b316 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -81,7 +81,9 @@ def files_indexing_thread(app_config: TConfig, app_enabled: Event) -> None: def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> Mapping[int, IndexingError | None]: source_refs = [s.reference for s in source_items.values()] - LOGGER.info('Starting embed_sources subprocess for %d source(s): %s', len(source_items), source_refs) + LOGGER.info('Starting embed_sources subprocess for %d source(s)', len(source_items), extra={ + 'source_ids': source_refs, + }) try: result = exec_in_proc( target=embed_sources, @@ -96,8 +98,10 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> return result except SubprocessKilledError as e: LOGGER.error( - 'embed_sources subprocess was killed for %d source(s) with exitcode %s: %s', - len(source_items), e.exitcode, source_refs, exc_info=e, + 'embed_sources subprocess was killed for %d 
source(s) with exitcode %s', + len(source_items), e.exitcode, exc_info=e, extra={ + 'source_ids': source_refs, + }, ) if len(source_items) == 1: return dict.fromkeys( @@ -120,8 +124,10 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> retryable=True, ) LOGGER.error( - 'embed_sources subprocess raised a %s error for sources %s, marking all as retryable', - e.__class__.__name__, source_refs, exc_info=e, + 'embed_sources subprocess raised a %s error for %d sources, marking all as retryable', + e.__class__.__name__, len(source_refs), exc_info=e, extra={ + 'source_ids': source_refs, + } ) return dict.fromkeys(source_items, err) From 073f9d0e4a2f7fd52c1ef0df3410ea390c70c683 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 7 Apr 2026 16:43:26 +0530 Subject: [PATCH 53/56] fix(ci): upload db dump artifacts Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 8ec8eabe..9c664838 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -89,7 +89,7 @@ jobs: POSTGRES_USER: root POSTGRES_PASSWORD: rootpassword POSTGRES_DB: nextcloud - options: --health-cmd pg_isready --health-interval 5s --health-timeout 2s --health-retries 5 + options: --health-cmd pg_isready --health-interval 5s --health-timeout 2s --health-retries 5 --name postgres --hostname postgres steps: - name: Checkout server @@ -214,6 +214,13 @@ jobs: php cron.php sleep 10 done & + sleep 30 + # list all the bg jobs + ./occ background-job:list + + - name: Initial dump of DB with context_chat_queue populated + run: | + docker exec postgres pg_dump nextcloud > /tmp/0_pgdump_nextcloud - name: Periodically check context_chat stats for 15 minutes to allow the backend to index the files run: | @@ -315,6 +322,10 @@ jobs: echo "Memory usage during scan is 
stable. No memory leak detected." fi + - name: Final dump of DB with vectordb populated + run: | + docker exec postgres pg_dump nextcloud > /tmp/1_pgdump_nextcloud + - name: Show server logs if: always() run: | @@ -350,6 +361,14 @@ jobs: run: | tail -v -n +1 context_chat_backend/persistent_storage/logs/em_server.log* || echo "No logs in logs directory" + - name: Upload database dumps + uses: actions/upload-artifact@v4 + with: + name: database-dumps + path: | + /tmp/0_pgdump_nextcloud + /tmp/1_pgdump_nextcloud + summary: permissions: contents: none From 13ea740d94841069b1c72398440dab9a2a30cd31 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 7 Apr 2026 18:01:47 +0530 Subject: [PATCH 54/56] fix: retry PGVector object creation if table already exists Signed-off-by: Anupam Kumar --- context_chat_backend/vectordb/pgvector.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/context_chat_backend/vectordb/pgvector.py b/context_chat_backend/vectordb/pgvector.py index 41d7f0db..d7b718dc 100644 --- a/context_chat_backend/vectordb/pgvector.py +++ b/context_chat_backend/vectordb/pgvector.py @@ -120,7 +120,15 @@ def __init__(self, embedding: Embeddings | None = None, **kwargs): kwargs['connection'] = os.environ['CCB_DB_URL'] # setup langchain db + our access list table - self.client = PGVector(embedding, collection_name=COLLECTION_NAME, **kwargs) + try: + self.client = PGVector(embedding, collection_name=COLLECTION_NAME, **kwargs) + except sa.exc.IntegrityError as ie: # pyright: ignore[reportAttributeAccessIssue] + if not isinstance(ie.orig, psycopg.errors.UniqueViolation): + raise + + # tried to create the tables but it was already created in another process + # init the client again to detect it already exists, and continue from there + self.client = PGVector(embedding, collection_name=COLLECTION_NAME, **kwargs) def get_instance(self) -> VectorStore: return self.client From dcb04e7209558ea9185f902637474e301d70f1b9 Mon Sep 17 00:00:00 2001 From: 
Anupam Kumar Date: Tue, 7 Apr 2026 20:11:24 +0530 Subject: [PATCH 55/56] fix: unique db dump artifact id Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 9c664838..384e3520 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -364,7 +364,7 @@ jobs: - name: Upload database dumps uses: actions/upload-artifact@v4 with: - name: database-dumps + name: database-dumps-${{ matrix.server-versions }}-php@${{ matrix.php-versions }} path: | /tmp/0_pgdump_nextcloud /tmp/1_pgdump_nextcloud From dc1d57b15161ff13ffa56208bc4a21bb4e13b10b Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 7 Apr 2026 20:12:51 +0530 Subject: [PATCH 56/56] fix(ci): log stats before exit Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 384e3520..d30073ab 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -282,9 +282,6 @@ jobs: echo "::endgroup::" - ./occ context_chat:stats - ./occ context_chat:stats --json - if [ $success -ne 1 ]; then echo "Max attempts reached" exit 1 @@ -369,6 +366,11 @@ jobs: /tmp/0_pgdump_nextcloud /tmp/1_pgdump_nextcloud + - name: Final stats log + run: | + ./occ context_chat:stats + ./occ context_chat:stats --json + summary: permissions: contents: none