From 5dad71474a5757bcc3769379e9de7ca35803033d Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 5 Mar 2026 14:14:19 +0530 Subject: [PATCH 01/56] feat: add kubernetes app role selection Signed-off-by: Anupam Kumar --- appinfo/info.xml | 14 ++++++++++++++ context_chat_backend/controller.py | 15 ++++++++------- context_chat_backend/task_fetcher.py | 4 ++++ context_chat_backend/types.py | 8 ++++++++ context_chat_backend/utils.py | 13 ++++++++++++- 5 files changed, 46 insertions(+), 8 deletions(-) create mode 100644 context_chat_backend/task_fetcher.py diff --git a/appinfo/info.xml b/appinfo/info.xml index 9760cd29..30194baa 100644 --- a/appinfo/info.xml +++ b/appinfo/info.xml @@ -82,5 +82,19 @@ Setup background job workers as described here: https://docs.nextcloud.com/serve Password to be used for authenticating requests to the OpenAI-compatible endpoint set in CC_EM_BASE_URL. + + + rp + Request Processing Mode + APP_ROLE=rp + true + + + indexing + Indexing Mode + APP_ROLE=indexing + false + + diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index c26b930a..0b6b53dc 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -75,6 +75,7 @@ def enabled_handler(enabled: bool, _: NextcloudApp | AsyncNextcloudApp) -> str: if enabled: app_enabled.set() + # todo: start bg threads to fetch docs, updates and requests to process else: app_enabled.clear() @@ -213,6 +214,13 @@ def _(): return JSONResponse(content={'enabled': app_enabled.is_set()}, status_code=200) +@app.post('/countIndexedDocuments') +@enabled_guard(app) +def _(): + counts = exec_in_proc(target=count_documents_by_provider, args=(vectordb_loader,)) + return JSONResponse(counts) + + @app.post('/updateAccessDeclarative') @enabled_guard(app) def _( @@ -328,13 +336,6 @@ def _(userId: str = Body(embed=True)): return JSONResponse('User deleted') -@app.post('/countIndexedDocuments') -@enabled_guard(app) -def _(): - counts = 
exec_in_proc(target=count_documents_by_provider, args=(vectordb_loader,)) - return JSONResponse(counts) - - @app.put('/loadSources') @enabled_guard(app) def _(sources: list[UploadFile]): diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py new file mode 100644 index 00000000..5e2f317f --- /dev/null +++ b/context_chat_backend/task_fetcher.py @@ -0,0 +1,4 @@ +# +# SPDX-FileCopyrightText: 2026 Nextcloud GmbH and Nextcloud contributors +# SPDX-License-Identifier: AGPL-3.0-or-later +# diff --git a/context_chat_backend/types.py b/context_chat_backend/types.py index 500a97d0..78680866 100644 --- a/context_chat_backend/types.py +++ b/context_chat_backend/types.py @@ -2,6 +2,8 @@ # SPDX-FileCopyrightText: 2024 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # +from enum import Enum + from pydantic import BaseModel __all__ = [ @@ -71,3 +73,9 @@ class FatalEmbeddingException(EmbeddingException): Either malformed request, authentication error, or other non-retryable error. 
""" + + +class AppRole(str, Enum): + NORMAL = 'normal' + INDEXING = 'indexing' + RP = 'rp' diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index f6d6e672..224f466e 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -4,6 +4,7 @@ # import logging import multiprocessing as mp +import os import re import traceback from collections.abc import Callable @@ -14,7 +15,7 @@ from fastapi.responses import JSONResponse as FastAPIJSONResponse -from .types import TConfig, TEmbeddingAuthApiKey, TEmbeddingAuthBasic, TEmbeddingConfig +from .types import AppRole, TConfig, TEmbeddingAuthApiKey, TEmbeddingAuthBasic, TEmbeddingConfig T = TypeVar('T') _logger = logging.getLogger('ccb.utils') @@ -144,3 +145,13 @@ def redact_config(config: TConfig | TEmbeddingConfig) -> TConfig | TEmbeddingCon em_conf.auth.password = '***REDACTED***' # noqa: S105 return config_copy + + +def get_app_role() -> AppRole: + role = os.getenv('APP_ROLE', '').lower() + if role == '': + return AppRole.NORMAL + if role not in ['indexing', 'rp']: + _logger.warning(f'Invalid app role: {role}, defaulting to all roles') + return AppRole.NORMAL + return AppRole(role) From 089d27a41643c165d0474258c840ba6e048279a9 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 5 Mar 2026 16:42:41 +0530 Subject: [PATCH 02/56] feat: add thread start and stop logic Signed-off-by: Anupam Kumar --- context_chat_backend/controller.py | 17 ++++-- context_chat_backend/task_fetcher.py | 82 ++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+), 5 deletions(-) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 0b6b53dc..fadc5f83 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -42,6 +42,7 @@ from .models.types import LlmException from nc_py_api.ex_app import AppAPIAuthMiddleware from .utils import JSONResponse, exec_in_proc, is_valid_provider_id, is_valid_source_id, value_of +from .task_fetcher 
import start_bg_threads, stop_bg_threads from .vectordb.service import ( count_documents_by_provider, decl_update_access, @@ -73,11 +74,16 @@ app_enabled = Event() def enabled_handler(enabled: bool, _: NextcloudApp | AsyncNextcloudApp) -> str: - if enabled: - app_enabled.set() - # todo: start bg threads to fetch docs, updates and requests to process - else: - app_enabled.clear() + try: + if enabled: + app_enabled.set() + start_bg_threads() + else: + app_enabled.clear() + stop_bg_threads() + except Exception as e: + logger.exception('Error in enabled handler:', exc_info=e) + return f'Error in enabled handler: {e}' logger.info(f'App {("disabled", "enabled")[enabled]}') return '' @@ -95,6 +101,7 @@ async def lifespan(app: FastAPI): yield vectordb_loader.offload() llm_loader.offload() + stop_bg_threads() app_config = get_config(os.environ['CC_CONFIG_PATH']) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 5e2f317f..9660b44c 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -2,3 +2,85 @@ # SPDX-FileCopyrightText: 2026 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # + +from enum import Enum +from threading import Thread + +from .types import AppRole +from .utils import get_app_role + +APP_ROLE = get_app_role() +THREADS = {} +THREADS_STOP_EVENTS = {} + + +class ThreadType(Enum): + FILES_INDEXING = 'files_indexing' + UPDATES_PROCESSING = 'updates_processing' + REQUEST_PROCESSING = 'request_processing' + + +def files_indexing_thread(): + ... + + +def updates_processing_thread(): + ... + + +def request_processing_thread(): + ... 
def start_bg_threads():
	"""Start the background worker threads appropriate for this app role.

	NORMAL runs every thread; INDEXING only the files-indexing and
	updates-processing threads; RP only the request-processing thread.

	Fixes over the previous revision:
	- `match APP_ROLE` with `case AppRole.INDEXING | AppRole.NORMAL`
	  followed by `case AppRole.RP | AppRole.NORMAL` runs only the FIRST
	  matching arm, so NORMAL never started the request-processing
	  thread.  Two independent `if` checks apply both branches.
	- A stop Event is now registered in THREADS_STOP_EVENTS for every
	  started thread; previously nothing ever populated that dict, so
	  stop_bg_threads() always hit its early return and never joined.
	"""
	from threading import Event  # not imported at module level in this revision

	thread_specs: list[tuple[ThreadType, Callable[[], None], str]] = []
	if APP_ROLE in (AppRole.INDEXING, AppRole.NORMAL):
		thread_specs += [
			(ThreadType.FILES_INDEXING, files_indexing_thread, 'FilesIndexingThread'),
			(ThreadType.UPDATES_PROCESSING, updates_processing_thread, 'UpdatesProcessingThread'),
		]
	if APP_ROLE in (AppRole.RP, AppRole.NORMAL):
		thread_specs.append(
			(ThreadType.REQUEST_PROCESSING, request_processing_thread, 'RequestProcessingThread')
		)

	for thread_type, target, name in thread_specs:
		THREADS_STOP_EVENTS[thread_type] = Event()
		THREADS[thread_type] = Thread(target=target, name=name, daemon=True)
		THREADS[thread_type].start()


def stop_bg_threads():
	"""Signal all running background threads to stop and join them.

	Safe to call when no threads are running (e.g. a disable request
	before any enable).  All stop events are set before any join so the
	threads can shut down in parallel; afterwards each thread's
	bookkeeping entries are removed so a later start_bg_threads() begins
	from a clean slate.

	Note: stopping is driven by what is actually registered rather than
	by APP_ROLE — the previous role-based match/case skipped the
	request-processing thread for NORMAL (first-match-only semantics)
	and leaked it.
	"""
	stoppable = [t for t in ThreadType if t in THREADS and t in THREADS_STOP_EVENTS]
	for thread_type in stoppable:
		THREADS_STOP_EVENTS[thread_type].set()
	for thread_type in stoppable:
		THREADS[thread_type].join()
		THREADS.pop(thread_type)
		THREADS_STOP_EVENTS.pop(thread_type)
.../chain/ingest/doc_loader.py | 53 +-- context_chat_backend/chain/ingest/injest.py | 201 ++++++----- context_chat_backend/controller.py | 165 +++++----- .../{chain/ingest => }/mimetype_list.py | 0 context_chat_backend/task_fetcher.py | 311 ++++++++++++++++-- context_chat_backend/types.py | 121 ++++++- context_chat_backend/vectordb/base.py | 9 +- context_chat_backend/vectordb/pgvector.py | 61 ++-- 8 files changed, 659 insertions(+), 262 deletions(-) rename context_chat_backend/{chain/ingest => }/mimetype_list.py (100%) diff --git a/context_chat_backend/chain/ingest/doc_loader.py b/context_chat_backend/chain/ingest/doc_loader.py index efb81b6d..d26f74b1 100644 --- a/context_chat_backend/chain/ingest/doc_loader.py +++ b/context_chat_backend/chain/ingest/doc_loader.py @@ -7,11 +7,10 @@ import re import tempfile from collections.abc import Callable -from typing import BinaryIO +from io import BytesIO import docx2txt from epub2txt import epub2txt -from fastapi import UploadFile from langchain_unstructured import UnstructuredLoader from odfdo import Document from pandas import read_csv, read_excel @@ -19,9 +18,11 @@ from pypdf.errors import FileNotDecryptedError as PdfFileNotDecryptedError from striprtf import striprtf +from ...types import SourceItem + logger = logging.getLogger('ccb.doc_loader') -def _temp_file_wrapper(file: BinaryIO, loader: Callable, sep: str = '\n') -> str: +def _temp_file_wrapper(file: BytesIO, loader: Callable, sep: str = '\n') -> str: raw_bytes = file.read() with tempfile.NamedTemporaryFile(mode='wb') as tmp: tmp.write(raw_bytes) @@ -35,46 +36,46 @@ def _temp_file_wrapper(file: BinaryIO, loader: Callable, sep: str = '\n') -> str # -- LOADERS -- # -def _load_pdf(file: BinaryIO) -> str: +def _load_pdf(file: BytesIO) -> str: pdf_reader = PdfReader(file) return '\n\n'.join([page.extract_text().strip() for page in pdf_reader.pages]) -def _load_csv(file: BinaryIO) -> str: +def _load_csv(file: BytesIO) -> str: return 
read_csv(file).to_string(header=False, na_rep='') -def _load_epub(file: BinaryIO) -> str: +def _load_epub(file: BytesIO) -> str: return _temp_file_wrapper(file, epub2txt).strip() -def _load_docx(file: BinaryIO) -> str: +def _load_docx(file: BytesIO) -> str: return docx2txt.process(file).strip() -def _load_odt(file: BinaryIO) -> str: +def _load_odt(file: BytesIO) -> str: return _temp_file_wrapper(file, lambda fp: Document(fp).get_formatted_text()).strip() -def _load_ppt_x(file: BinaryIO) -> str: +def _load_ppt_x(file: BytesIO) -> str: return _temp_file_wrapper(file, lambda fp: UnstructuredLoader(fp).load()).strip() -def _load_rtf(file: BinaryIO) -> str: +def _load_rtf(file: BytesIO) -> str: return striprtf.rtf_to_text(file.read().decode('utf-8', 'ignore')).strip() -def _load_xml(file: BinaryIO) -> str: +def _load_xml(file: BytesIO) -> str: data = file.read().decode('utf-8', 'ignore') data = re.sub(r'', '', data) return data.strip() -def _load_xlsx(file: BinaryIO) -> str: +def _load_xlsx(file: BytesIO) -> str: return read_excel(file, na_filter=False).to_string(header=False, na_rep='') -def _load_email(file: BinaryIO, ext: str = 'eml') -> str | None: +def _load_email(file: BytesIO, ext: str = 'eml') -> str | None: # NOTE: msg format is not tested if ext not in ['eml', 'msg']: return None @@ -115,30 +116,34 @@ def attachment_partitioner( } -def decode_source(source: UploadFile) -> str | None: +def decode_source(source: SourceItem) -> str | None: + io_obj: BytesIO | None = None try: # .pot files are powerpoint templates but also plain text files, # so we skip them to prevent decoding errors - if source.headers['title'].endswith('.pot'): + if source.title.endswith('.pot'): return None - mimetype = source.headers['type'] + mimetype = source.type if mimetype is None: return None + if isinstance(source.content, str): + io_obj = BytesIO(source.content.encode('utf-8', 'ignore')) + else: + io_obj = source.content + if _loader_map.get(mimetype): - result = 
_loader_map[mimetype](source.file) - source.file.close() + result = _loader_map[mimetype](io_obj) return result.encode('utf-8', 'ignore').decode('utf-8', 'ignore') - result = source.file.read().decode('utf-8', 'ignore') - source.file.close() - return result + return io_obj.read().decode('utf-8', 'ignore') except PdfFileNotDecryptedError: - logger.warning(f'PDF file ({source.filename}) is encrypted and cannot be read') + logger.warning(f'PDF file ({source.reference}) is encrypted and cannot be read') return None except Exception: - logger.exception(f'Error decoding source file ({source.filename})', stack_info=True) + logger.exception(f'Error decoding source file ({source.reference})', stack_info=True) return None finally: - source.file.close() # Ensure file is closed after processing + if io_obj is not None: + io_obj.close() diff --git a/context_chat_backend/chain/ingest/injest.py b/context_chat_backend/chain/ingest/injest.py index 5871ebb8..0eb70e0b 100644 --- a/context_chat_backend/chain/ingest/injest.py +++ b/context_chat_backend/chain/ingest/injest.py @@ -5,29 +5,23 @@ import logging import re -from fastapi.datastructures import UploadFile from langchain.schema import Document from ...dyn_loader import VectorDBLoader -from ...types import TConfig -from ...utils import is_valid_source_id, to_int +from ...types import IndexingError, SourceItem, TConfig from ...vectordb.base import BaseVectorDB from ...vectordb.types import DbException, SafeDbException, UpdateAccessOp from ..types import InDocument from .doc_loader import decode_source from .doc_splitter import get_splitter_for -from .mimetype_list import SUPPORTED_MIMETYPES logger = logging.getLogger('ccb.injest') -def _allowed_file(file: UploadFile) -> bool: - return file.headers['type'] in SUPPORTED_MIMETYPES - def _filter_sources( vectordb: BaseVectorDB, - sources: list[UploadFile] -) -> tuple[list[UploadFile], list[UploadFile]]: + sources: dict[int, SourceItem] +) -> tuple[dict[int, SourceItem], dict[int, 
SourceItem]]: ''' Returns ------- @@ -37,30 +31,42 @@ def _filter_sources( ''' try: - existing_sources, new_sources = vectordb.check_sources(sources) + existing_source_ids, to_embed_source_ids = vectordb.check_sources(sources) except Exception as e: - raise DbException('Error: Vectordb sources_to_embed error') from e + raise DbException('Error: Vectordb error while checking existing sources in indexing') from e + + existing_sources = {} + to_embed_sources = {} - return ([ - source for source in sources - if source.filename in existing_sources - ], [ - source for source in sources - if source.filename in new_sources - ]) + for db_id, source in sources.items(): + if source.reference in existing_source_ids: + existing_sources[db_id] = source + elif source.reference in to_embed_source_ids: + to_embed_sources[db_id] = source + return existing_sources, to_embed_sources -def _sources_to_indocuments(config: TConfig, sources: list[UploadFile]) -> list[InDocument]: - indocuments = [] - for source in sources: - logger.debug('processing source', extra={ 'source_id': source.filename }) +def _sources_to_indocuments( + config: TConfig, + sources: dict[int, SourceItem] +) -> tuple[dict[int, InDocument], dict[int, IndexingError]]: + indocuments = {} + errored_docs = {} + for db_id, source in sources.items(): + logger.debug('processing source', extra={ 'source_id': source.reference }) + + # todo: maybe fetch the content of the files here # transform the source to have text data content = decode_source(source) if content is None or (content := content.strip()) == '': - logger.debug('decoded empty source', extra={ 'source_id': source.filename }) + logger.debug('decoded empty source', extra={ 'source_id': source.reference }) + errored_docs[db_id] = IndexingError( + error='Decoded content is empty', + retryable=False, + ) continue # replace more than two newlines with two newlines (also blank spaces, more than 4) @@ -71,94 +77,123 @@ def _sources_to_indocuments(config: TConfig, sources: 
list[UploadFile]) -> list[ content = content.replace('\0', '') if content is None or content == '': - logger.debug('decoded empty source after cleanup', extra={ 'source_id': source.filename }) + logger.debug('decoded empty source after cleanup', extra={ 'source_id': source.reference }) + errored_docs[db_id] = IndexingError( + error='Decoded content is empty', + retryable=False, + ) continue - logger.debug('decoded non empty source', extra={ 'source_id': source.filename }) + logger.debug('decoded non empty source', extra={ 'source_id': source.reference }) metadata = { - 'source': source.filename, - 'title': _decode_latin_1(source.headers['title']), - 'type': source.headers['type'], + 'source': source.reference, + 'title': _decode_latin_1(source.title), + 'type': source.type, } doc = Document(page_content=content, metadata=metadata) - splitter = get_splitter_for(config.embedding_chunk_size, source.headers['type']) + splitter = get_splitter_for(config.embedding_chunk_size, source.type) split_docs = splitter.split_documents([doc]) logger.debug('split document into chunks', extra={ - 'source_id': source.filename, + 'source_id': source.reference, 'len(split_docs)': len(split_docs), }) - indocuments.append(InDocument( + indocuments[db_id] = InDocument( documents=split_docs, - userIds=list(map(_decode_latin_1, source.headers['userIds'].split(','))), - source_id=source.filename, # pyright: ignore[reportArgumentType] - provider=source.headers['provider'], - modified=to_int(source.headers['modified']), - )) + userIds=list(map(_decode_latin_1, source.userIds)), + source_id=source.reference, + provider=source.provider, + modified=source.modified, # pyright: ignore[reportArgumentType] + ) + + return indocuments, errored_docs + + +def _increase_access_for_existing_sources( + vectordb: BaseVectorDB, + existing_sources: dict[int, SourceItem] +) -> dict[int, IndexingError | None]: + ''' + update userIds for existing sources + allow the userIds as additional users, not as the only 
users + ''' + if len(existing_sources) == 0: + return {} - return indocuments + results = {} + logger.debug('Increasing access for existing sources', extra={ + 'source_ids': [source.reference for source in existing_sources.values()] + }) + for db_id, source in existing_sources.items(): + try: + vectordb.update_access( + UpdateAccessOp.allow, + list(map(_decode_latin_1, source.userIds)), + source.reference, + ) + results[db_id] = None + except SafeDbException as e: + logger.error(f'Failed to update access for source ({source.reference}): {e.args[0]}') + results[db_id] = IndexingError( + error=str(e), + retryable=False, + ) + continue + except Exception as e: + logger.error(f'Unexpected error while updating access for source ({source.reference}): {e}') + results[db_id] = IndexingError( + error='Unexpected error while updating access', + retryable=True, + ) + continue + return results def _process_sources( vectordb: BaseVectorDB, config: TConfig, - sources: list[UploadFile], -) -> tuple[list[str],list[str]]: + sources: dict[int, SourceItem] +) -> dict[int, IndexingError | None]: ''' Processes the sources and adds them to the vectordb. Returns the list of source ids that were successfully added and those that need to be retried. 
''' - existing_sources, filtered_sources = _filter_sources(vectordb, sources) + existing_sources, to_embed_sources = _filter_sources(vectordb, sources) logger.debug('db filter source results', extra={ 'len(existing_sources)': len(existing_sources), 'existing_sources': existing_sources, - 'len(filtered_sources)': len(filtered_sources), - 'filtered_sources': filtered_sources, + 'len(to_embed_sources)': len(to_embed_sources), + 'to_embed_sources': to_embed_sources, }) - loaded_source_ids = [source.filename for source in existing_sources] - # update userIds for existing sources - # allow the userIds as additional users, not as the only users - if len(existing_sources) > 0: - logger.debug('Increasing access for existing sources', extra={ - 'source_ids': [source.filename for source in existing_sources] - }) - for source in existing_sources: - try: - vectordb.update_access( - UpdateAccessOp.allow, - list(map(_decode_latin_1, source.headers['userIds'].split(','))), - source.filename, # pyright: ignore[reportArgumentType] - ) - except SafeDbException as e: - logger.error(f'Failed to update access for source ({source.filename}): {e.args[0]}') - continue - - if len(filtered_sources) == 0: + source_proc_results = _increase_access_for_existing_sources(vectordb, existing_sources) + + if len(to_embed_sources) == 0: # no new sources to embed logger.debug('Filtered all sources, nothing to embed') - return loaded_source_ids, [] # pyright: ignore[reportReturnType] + return source_proc_results logger.debug('Filtered sources:', extra={ - 'source_ids': [source.filename for source in filtered_sources] + 'source_ids': [source.reference for source in to_embed_sources.values()] }) # invalid/empty sources are filtered out here and not counted in loaded/retryable - indocuments = _sources_to_indocuments(config, filtered_sources) + indocuments, errored_docs = _sources_to_indocuments(config, to_embed_sources) - logger.debug('Converted all sources to documents') + 
source_proc_results.update(errored_docs) + logger.debug('Converted sources to documents') if len(indocuments) == 0: # filtered document(s) were invalid/empty, not an error logger.debug('All documents were found empty after being processed') - return loaded_source_ids, [] # pyright: ignore[reportReturnType] + return source_proc_results - added_source_ids, retry_source_ids = vectordb.add_indocuments(indocuments) - loaded_source_ids.extend(added_source_ids) + doc_add_results = vectordb.add_indocuments(indocuments) + source_proc_results.update(doc_add_results) logger.debug('Added documents to vectordb') - return loaded_source_ids, retry_source_ids # pyright: ignore[reportReturnType] + return source_proc_results def _decode_latin_1(s: str) -> str: @@ -172,31 +207,15 @@ def _decode_latin_1(s: str) -> str: def embed_sources( vectordb_loader: VectorDBLoader, config: TConfig, - sources: list[UploadFile], -) -> tuple[list[str],list[str]]: - # either not a file or a file that is allowed - sources_filtered = [ - source for source in sources - if is_valid_source_id(source.filename) # pyright: ignore[reportArgumentType] - or _allowed_file(source) - ] - + sources: dict[int, SourceItem] +) -> dict[int, IndexingError | None]: logger.debug('Embedding sources:', extra={ 'source_ids': [ - f'{source.filename} ({_decode_latin_1(source.headers["title"])})' - for source in sources_filtered - ], - 'invalid_source_ids': [ - source.filename for source in sources - if not is_valid_source_id(source.filename) # pyright: ignore[reportArgumentType] - ], - 'not_allowed_file_ids': [ - source.filename for source in sources - if not _allowed_file(source) + f'{source.reference} ({_decode_latin_1(source.title)})' + for source in sources.values() ], - 'len(source_ids)': len(sources_filtered), - 'len(total_source_ids)': len(sources), + 'len(source_ids)': len(sources), }) vectordb = vectordb_loader.load() - return _process_sources(vectordb, config, sources_filtered) + return _process_sources(vectordb, 
config, sources) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index fadc5f83..3e70ee1b 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -27,7 +27,7 @@ from time import sleep from typing import Annotated, Any -from fastapi import Body, FastAPI, Request, UploadFile +from fastapi import Body, FastAPI, Request from langchain.llms.base import LLM from nc_py_api import AsyncNextcloudApp, NextcloudApp from nc_py_api.ex_app import persistent_storage, set_handlers @@ -35,14 +35,13 @@ from starlette.responses import FileResponse from .chain.context import do_doc_search -from .chain.ingest.injest import embed_sources from .chain.one_shot import process_context_query, process_query from .config_parser import get_config from .dyn_loader import LLMModelLoader, VectorDBLoader from .models.types import LlmException from nc_py_api.ex_app import AppAPIAuthMiddleware from .utils import JSONResponse, exec_in_proc, is_valid_provider_id, is_valid_source_id, value_of -from .task_fetcher import start_bg_threads, stop_bg_threads +from .task_fetcher import start_bg_threads, wait_for_bg_threads from .vectordb.service import ( count_documents_by_provider, decl_update_access, @@ -57,6 +56,7 @@ repair_run() ensure_config_file() logger = logging.getLogger('ccb.controller') +app_config = get_config(os.environ['CC_CONFIG_PATH']) __download_models_from_hf = os.environ.get('CC_DOWNLOAD_MODELS_FROM_HF', 'true').lower() in ('1', 'true', 'yes') models_to_fetch = { @@ -77,10 +77,10 @@ def enabled_handler(enabled: bool, _: NextcloudApp | AsyncNextcloudApp) -> str: try: if enabled: app_enabled.set() - start_bg_threads() + start_bg_threads(app_config, app_enabled) else: app_enabled.clear() - stop_bg_threads() + wait_for_bg_threads() except Exception as e: logger.exception('Error in enabled handler:', exc_info=e) return f'Error in enabled handler: {e}' @@ -101,10 +101,9 @@ async def lifespan(app: FastAPI): yield 
vectordb_loader.offload() llm_loader.offload() - stop_bg_threads() + wait_for_bg_threads() -app_config = get_config(os.environ['CC_CONFIG_PATH']) app = FastAPI(debug=app_config.debug, lifespan=lifespan) # pyright: ignore[reportArgumentType] app.extra['CONFIG'] = app_config @@ -343,86 +342,78 @@ def _(userId: str = Body(embed=True)): return JSONResponse('User deleted') -@app.put('/loadSources') -@enabled_guard(app) -def _(sources: list[UploadFile]): - global _indexing - - if len(sources) == 0: - return JSONResponse('No sources provided', 400) - - filtered_sources = [] - - for source in sources: - if not value_of(source.filename): - logger.warning('Skipping source with invalid source_id', extra={ - 'source_id': source.filename, - 'title': source.headers.get('title'), - }) - continue - - with index_lock: - if source.filename in _indexing: - # this request will be retried by the client - return JSONResponse( - f'This source ({source.filename}) is already being processed in another request, try again later', - 503, - headers={'cc-retry': 'true'}, - ) - - if not ( - value_of(source.headers.get('userIds')) - and source.headers.get('title', None) is not None - and value_of(source.headers.get('type')) - and value_of(source.headers.get('modified')) - and source.headers['modified'].isdigit() - and value_of(source.headers.get('provider')) - ): - logger.warning('Skipping source with invalid/missing headers', extra={ - 'source_id': source.filename, - 'title': source.headers.get('title'), - 'headers': source.headers, - }) - continue - - filtered_sources.append(source) - - # wait for 10 minutes before failing the request - semres = doc_parse_semaphore.acquire(block=True, timeout=10*60) - if not semres: - return JSONResponse( - 'Document parser worker limit reached, try again in some time or consider increasing the limit', - 503, - headers={'cc-retry': 'true'} - ) - - with index_lock: - for source in filtered_sources: - _indexing[source.filename] = source.size - - try: - 
loaded_sources, not_added_sources = exec_in_proc( - target=embed_sources, - args=(vectordb_loader, app.extra['CONFIG'], filtered_sources) - ) - except (DbException, EmbeddingException): - raise - except Exception as e: - raise DbException('Error: failed to load sources') from e - finally: - with index_lock: - for source in filtered_sources: - _indexing.pop(source.filename, None) - doc_parse_semaphore.release() - - if len(loaded_sources) != len(filtered_sources): - logger.debug('Some sources were not loaded', extra={ - 'Count of loaded sources': f'{len(loaded_sources)}/{len(filtered_sources)}', - 'source_ids': loaded_sources, - }) - - # loaded sources include the existing sources that may only have their access updated - return JSONResponse({'loaded_sources': loaded_sources, 'sources_to_retry': not_added_sources}) +# @app.put('/loadSources') +# @enabled_guard(app) +# def _(sources: list[UploadFile]): +# global _indexing + +# if len(sources) == 0: +# return JSONResponse('No sources provided', 400) + +# for source in sources: +# if not value_of(source.filename): +# return JSONResponse(f'Invalid source filename for: {source.headers.get("title")}', 400) + +# with index_lock: +# if source.filename in _indexing: +# # this request will be retried by the client +# return JSONResponse( +# f'This source ({source.filename}) is already being processed in another request, try again later', +# 503, +# headers={'cc-retry': 'true'}, +# ) + +# if not ( +# value_of(source.headers.get('userIds')) +# and source.headers.get('title', None) is not None +# and value_of(source.headers.get('type')) +# and value_of(source.headers.get('modified')) +# and source.headers['modified'].isdigit() +# and value_of(source.headers.get('provider')) +# ): +# logger.error('Invalid/missing headers received', extra={ +# 'source_id': source.filename, +# 'title': source.headers.get('title'), +# 'headers': source.headers, +# }) +# return JSONResponse(f'Invaild/missing headers for: {source.filename}', 400) + +# 
# wait for 10 minutes before failing the request +# semres = doc_parse_semaphore.acquire(block=True, timeout=10*60) +# if not semres: +# return JSONResponse( +# 'Document parser worker limit reached, try again in some time or consider increasing the limit', +# 503, +# headers={'cc-retry': 'true'} +# ) + +# with index_lock: +# for source in sources: +# _indexing[source.filename] = source.size + +# try: +# loaded_sources, not_added_sources = exec_in_proc( +# target=embed_sources, +# args=(vectordb_loader, app.extra['CONFIG'], sources) +# ) +# except (DbException, EmbeddingException): +# raise +# except Exception as e: +# raise DbException('Error: failed to load sources') from e +# finally: +# with index_lock: +# for source in sources: +# _indexing.pop(source.filename, None) +# doc_parse_semaphore.release() + +# if len(loaded_sources) != len(sources): +# logger.debug('Some sources were not loaded', extra={ +# 'Count of loaded sources': f'{len(loaded_sources)}/{len(sources)}', +# 'source_ids': loaded_sources, +# }) + +# # loaded sources include the existing sources that may only have their access updated +# return JSONResponse({'loaded_sources': loaded_sources, 'sources_to_retry': not_added_sources}) class Query(BaseModel): diff --git a/context_chat_backend/chain/ingest/mimetype_list.py b/context_chat_backend/mimetype_list.py similarity index 100% rename from context_chat_backend/chain/ingest/mimetype_list.py rename to context_chat_backend/mimetype_list.py diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 9660b44c..a548bcfd 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -3,15 +3,41 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # +import asyncio +import logging +from contextlib import suppress from enum import Enum -from threading import Thread +from io import BytesIO +from threading import Event, Thread +from time import sleep -from .types import AppRole -from .utils import 
get_app_role +import niquests +from nc_py_api import AsyncNextcloudApp, NextcloudApp +from pydantic import ValidationError + +from .chain.ingest.injest import embed_sources +from .dyn_loader import VectorDBLoader +from .types import ( + AppRole, + EmbeddingException, + FilesQueueItem, + IndexingError, + IndexingException, + LoaderException, + ReceivedFileItem, + SourceItem, + TConfig, +) +from .utils import exec_in_proc, get_app_role +from .vectordb.types import DbException APP_ROLE = get_app_role() THREADS = {} -THREADS_STOP_EVENTS = {} +LOGGER = logging.getLogger('ccb.task_fetcher') +FILES_INDEXING_BATCH_SIZE = 64 # todo: config? +# max concurrent fetches to avoid overloading the NC server or hitting rate limits +CONCURRENT_FILE_FETCHES = 10 # todo: config? +MAX_FILE_SIZE = 100 * 1024 * 1024 # 100 MB, todo: config? class ThreadType(Enum): @@ -20,67 +46,294 @@ class ThreadType(Enum): REQUEST_PROCESSING = 'request_processing' -def files_indexing_thread(): - ... +async def __fetch_file_content( + semaphore: asyncio.Semaphore, + file_id: int, + user_id: str, + _rlimit = 3, +) -> BytesIO: + ''' + Raises + ------ + IndexingException + ''' + + async with semaphore: + nc = AsyncNextcloudApp() + try: + # a file pointer for storing the stream in memory until it is consumed + fp = BytesIO() + await nc._session.download2fp( + url_path=f'/apps/context_chat/files/{file_id}', + fp=fp, + dav=False, + params={ 'userId': user_id }, + ) + return fp + except niquests.exceptions.RequestException as e: + # todo: raise IndexingException with retryable=True for rate limit errors, + # todo: and handle it in the caller to not delete the source from the queue and retry later through + # todo: the normal lock expiry mechanism + if e.response is None: + raise + + if e.response.status_code == niquests.codes.too_many_requests: # pyright: ignore[reportAttributeAccessIssue] + # todo: implement rate limits in php CC? 
+ wait_for = int(e.response.headers.get('Retry-After', '30')) + if _rlimit <= 0: + raise IndexingException( + f'Rate limited when fetching content for file id {file_id}, user id {user_id},' + ' max retries exceeded', + retryable=True, + ) from e + LOGGER.warning( + f'Rate limited when fetching content for file id {file_id}, user id {user_id},' + f' waiting {wait_for} before retrying', + exc_info=e, + ) + await asyncio.sleep(wait_for) + return await __fetch_file_content(semaphore, file_id, user_id, _rlimit - 1) + + raise + except IndexingException: + raise + except Exception as e: + LOGGER.error(f'Error fetching content for file id {file_id}, user id {user_id}: {e}', exc_info=e) + raise IndexingException(f'Error fetching content for file id {file_id}, user id {user_id}: {e}') from e + + +async def __fetch_files_content( + files: dict[int, ReceivedFileItem] +) -> dict[int, SourceItem | IndexingError]: + source_items = {} + semaphore = asyncio.Semaphore(CONCURRENT_FILE_FETCHES) + tasks = [] + + for file_id, file_item in files.items(): + if file_item.size > MAX_FILE_SIZE: + LOGGER.info( + f'Skipping file id {file_id}, source id {file_item.reference} due to size' + f' {(file_item.size/(1024*1024)):.2f} MiB exceeding the limit {(MAX_FILE_SIZE/(1024*1024)):.2f} MiB', + ) + source_items[file_id] = IndexingError( + error=( + f'File size {(file_item.size/(1024*1024)):.2f} MiB' + f' exceeds the limit {(MAX_FILE_SIZE/(1024*1024)):.2f} MiB' + ), + retryable=False, + ) + continue + # todo: perform the existing file check before fetching the content to avoid unnecessary fetches + # any user id from the list should have read access to the file + tasks.append(asyncio.ensure_future(__fetch_file_content(semaphore, file_id, file_item.userIds[0]))) + results = await asyncio.gather(*tasks, return_exceptions=True) + for (file_id, file_item), result in zip(files.items(), results, strict=True): + if isinstance(result, IndexingException): + LOGGER.error( + f'Error fetching content for file 
id {file_id}, reference {file_item.reference}: {result}', + exc_info=result, + ) + source_items[file_id] = IndexingError( + error=str(result), + retryable=result.retryable, + ) + elif isinstance(result, str) or isinstance(result, BytesIO): + source_items[file_id] = SourceItem( + **file_item.model_dump(), + content=result, + ) + elif isinstance(result, BaseException): + LOGGER.error( + f'Unexpected error fetching content for file id {file_id}, reference {file_item.reference}: {result}', + exc_info=result, + ) + source_items[file_id] = IndexingError( + error=f'Unexpected error: {result}', + retryable=True, + ) + else: + LOGGER.error( + f'Unknown error fetching content for file id {file_id}, reference {file_item.reference}: {result}', + exc_info=True, + ) + source_items[file_id] = IndexingError( + error='Unknown error', + retryable=True, + ) + return source_items + + +def files_indexing_thread(app_config: TConfig, app_enabled: Event) -> None: + try: + vectordb_loader = VectorDBLoader(app_config) + except LoaderException as e: + LOGGER.error('Error initializing vector DB loader, files indexing thread will not start:', exc_info=e) + return + + def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingError | None]: + try: + return exec_in_proc( + target=embed_sources, + args=(vectordb_loader, app_config, source_items), + ) + except (DbException, EmbeddingException): + raise + except Exception as e: + raise DbException('Error: failed to load sources') from e -def updates_processing_thread(): + + while True: + if not app_enabled.is_set(): + LOGGER.info('Files indexing thread is stopping as the app is disabled') + return + + try: + nc = NextcloudApp() + # todo: add the 'size' param to the return of this call. 
+ q_items_res = nc.ocs( + 'GET', + '/apps/context_chat/queues/documents', + params={ 'n': FILES_INDEXING_BATCH_SIZE } + ) + + try: + q_items = FilesQueueItem.model_validate(q_items_res) + except ValidationError as e: + raise Exception(f'Error validating queue items response: {e}\nResponse content: {q_items_res}') from e + + # populate files content and convert to source items + fetched_files = {} + source_files = {} + # unified error structure for files and content providers + source_errors = {} + + if q_items.files: + fetched_files = asyncio.run(__fetch_files_content(q_items.files)) + + for file_id, result in fetched_files.items(): + if isinstance(result, SourceItem): + source_files[file_id] = result + else: + source_errors[file_id] = result + + files_result = _load_sources(source_files) + providers_result = _load_sources(q_items.content_providers) + + if ( + any(isinstance(res, IndexingError) for res in files_result.values()) + or any(isinstance(res, IndexingError) for res in providers_result.values()) + ): + LOGGER.error('Some sources failed to index', extra={ + 'file_errors': { + file_id: error + for file_id, error in files_result.items() + if isinstance(error, IndexingError) + }, + 'provider_errors': { + provider_id: error + for provider_id, error in providers_result.items() + if isinstance(error, IndexingError) + }, + }) + except ( + niquests.exceptions.ConnectionError, + niquests.exceptions.Timeout, + ) as e: + LOGGER.info('Temporary error fetching documents to index, will retry:', exc_info=e) + sleep(5) + continue + except Exception as e: + LOGGER.exception('Error fetching documents to index:', exc_info=e) + sleep(5) + continue + + # delete the entries from the PHP side queue where indexing succeeded or the error is not retryable + to_delete_file_ids = [ + file_id for file_id, result in files_result.items() + if result is None or (isinstance(result, IndexingError) and not result.retryable) + ] + to_delete_provider_ids = [ + provider_id for provider_id, 
result in providers_result.items() + if result is None or (isinstance(result, IndexingError) and not result.retryable) + ] + + try: + nc.ocs( + 'DELETE', + '/apps/context_chat/queues/documents/', + json={ + 'files': to_delete_file_ids, + 'content_providers': to_delete_provider_ids, + }, + ) + except ( + niquests.exceptions.ConnectionError, + niquests.exceptions.Timeout, + ) as e: + LOGGER.info('Temporary error reporting indexing results, will retry:', exc_info=e) + sleep(5) + with suppress(Exception): + nc = NextcloudApp() + nc.ocs( + 'DELETE', + '/apps/context_chat/queues/documents/', + json={ + 'files': to_delete_file_ids, + 'content_providers': to_delete_provider_ids, + }, + ) + continue + except Exception as e: + LOGGER.exception('Error reporting indexing results:', exc_info=e) + sleep(5) + continue + + + +def updates_processing_thread(app_config: TConfig): ... -def request_processing_thread(): +def request_processing_thread(app_config: TConfig): ... -def start_bg_threads(): +def start_bg_threads(app_config: TConfig, app_enabled: Event): match APP_ROLE: case AppRole.INDEXING | AppRole.NORMAL: THREADS[ThreadType.FILES_INDEXING] = Thread( target=files_indexing_thread, + args=(app_config, Event), name='FilesIndexingThread', - daemon=True, ) THREADS[ThreadType.UPDATES_PROCESSING] = Thread( target=updates_processing_thread, + args=(app_config, Event), name='UpdatesProcessingThread', - daemon=True, ) THREADS[ThreadType.FILES_INDEXING].start() THREADS[ThreadType.UPDATES_PROCESSING].start() case AppRole.RP | AppRole.NORMAL: THREADS[ThreadType.REQUEST_PROCESSING] = Thread( target=request_processing_thread, + args=(app_config, Event), name='RequestProcessingThread', - daemon=True, ) THREADS[ThreadType.REQUEST_PROCESSING].start() -def stop_bg_threads(): +def wait_for_bg_threads(): match APP_ROLE: case AppRole.INDEXING | AppRole.NORMAL: - if ( - ThreadType.FILES_INDEXING not in THREADS - or ThreadType.UPDATES_PROCESSING not in THREADS - or ThreadType.FILES_INDEXING not in 
THREADS_STOP_EVENTS - or ThreadType.UPDATES_PROCESSING not in THREADS_STOP_EVENTS - ): + if (ThreadType.FILES_INDEXING not in THREADS or ThreadType.UPDATES_PROCESSING not in THREADS): return - THREADS_STOP_EVENTS[ThreadType.FILES_INDEXING].set() - THREADS_STOP_EVENTS[ThreadType.UPDATES_PROCESSING].set() THREADS[ThreadType.FILES_INDEXING].join() THREADS[ThreadType.UPDATES_PROCESSING].join() THREADS.pop(ThreadType.FILES_INDEXING) THREADS.pop(ThreadType.UPDATES_PROCESSING) - THREADS_STOP_EVENTS.pop(ThreadType.FILES_INDEXING) - THREADS_STOP_EVENTS.pop(ThreadType.UPDATES_PROCESSING) case AppRole.RP | AppRole.NORMAL: - if ( - ThreadType.REQUEST_PROCESSING not in THREADS - or ThreadType.REQUEST_PROCESSING not in THREADS_STOP_EVENTS - ): + if (ThreadType.REQUEST_PROCESSING not in THREADS): return - THREADS_STOP_EVENTS[ThreadType.REQUEST_PROCESSING].set() THREADS[ThreadType.REQUEST_PROCESSING].join() THREADS.pop(ThreadType.REQUEST_PROCESSING) - THREADS_STOP_EVENTS.pop(ThreadType.REQUEST_PROCESSING) diff --git a/context_chat_backend/types.py b/context_chat_backend/types.py index 78680866..97d48ce6 100644 --- a/context_chat_backend/types.py +++ b/context_chat_backend/types.py @@ -3,8 +3,13 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # from enum import Enum +from io import BytesIO +from typing import Self -from pydantic import BaseModel +from pydantic import BaseModel, field_validator + +from .mimetype_list import SUPPORTED_MIMETYPES +from .utils import is_valid_provider_id, is_valid_source_id __all__ = [ 'DEFAULT_EM_MODEL_ALIAS', @@ -17,6 +22,7 @@ ] DEFAULT_EM_MODEL_ALIAS = 'em_model' +FILES_PROVIDER_ID = 'files__default' class TEmbeddingAuthApiKey(BaseModel): @@ -79,3 +85,116 @@ class AppRole(str, Enum): NORMAL = 'normal' INDEXING = 'indexing' RP = 'rp' + + +class CommonSourceItem(BaseModel): + userIds: list[str] + reference: str # source_id of the form "appId__providerId: itemId" + title: str + modified: int | str # todo: int/string? 
+ type: str + provider: str + size: int + + @field_validator('modified', mode='before') + @classmethod + def validate_modified(cls, v): + if isinstance(v, int): + return v + if isinstance(v, str): + try: + return int(v) + except ValueError as e: + raise ValueError(f'Invalid modified value: {v}') from e + raise ValueError(f'Invalid modified type: {type(v)}') + + @field_validator('reference', 'title', 'type', 'provider') + @classmethod + def validate_strings_non_empty(cls, v): + if not isinstance(v, str) or v.strip() == '': + raise ValueError('Must be a non-empty string') + return v.strip() + + @field_validator('userIds', mode='after') + def validate_user_ids(self) -> Self: + if ( + not isinstance(self.userIds, list) + or not all( + isinstance(uid, str) + and uid.strip() != '' + for uid in self.userIds + ) + or len(self.userIds) == 0 + ): + raise ValueError('userIds must be a non-empty list of non-empty strings') + self.userIds = [uid.strip() for uid in self.userIds] + return self + + @field_validator('reference', mode='after') + def validate_reference_format(self) -> Self: + # validate reference format: "appId__providerId: itemId" + if not is_valid_source_id(self.reference): + raise ValueError('Invalid reference format, must be "appId__providerId: itemId"') + return self + + @field_validator('provider', mode='after') + def validate_provider_format(self) -> Self: + # validate provider format: "appId__providerId" + if not is_valid_provider_id(self.provider): + raise ValueError('Invalid provider format, must be "appId__providerId"') + return self + + @field_validator('type', mode='after') + def validate_type(self) -> Self: + if self.reference.startswith(FILES_PROVIDER_ID) and self.type not in SUPPORTED_MIMETYPES: + raise ValueError(f'Unsupported file type: {self.type} for reference {self.reference}') + return self + + @field_validator('size', mode='after') + def validate_size(self) -> Self: + if not isinstance(self.size, int) or self.size < 0: + raise 
ValueError(f'Invalid size value: {self.size}, must be a non-negative integer') + return self + + +class ReceivedFileItem(CommonSourceItem): + content: None + + +class SourceItem(CommonSourceItem): + ''' + Used for the unified queue of items to process, after fetching the content for files + and for directly fetched content providers. + ''' + content: str | BytesIO + + @field_validator('content') + @classmethod + def validate_content(cls, v): + if isinstance(v, str): + if v.strip() == '': + raise ValueError('Content must be a non-empty string') + return v.strip() + if isinstance(v, BytesIO): + if v.getbuffer().nbytes == 0: + raise ValueError('Content must be a non-empty BytesIO') + return v + raise ValueError('Content must be either a non-empty string or a non-empty BytesIO') + + +class FilesQueueItem(BaseModel): + files: dict[int, ReceivedFileItem] # [db id]: FileItem + content_providers: dict[int, SourceItem] # [db id]: SourceItem + + +class IndexingException(Exception): + retryable: bool = False + + def __init__(self, message: str, retryable: bool = False): + super().__init__(message) + self.retryable = retryable + + +class IndexingError(BaseModel): + error: str + retryable: bool = False diff --git a/context_chat_backend/vectordb/base.py b/context_chat_backend/vectordb/base.py index 0bf10200..ebd54075 100644 --- a/context_chat_backend/vectordb/base.py +++ b/context_chat_backend/vectordb/base.py @@ -5,12 +5,12 @@ from abc import ABC, abstractmethod from typing import Any -from fastapi import UploadFile from langchain.schema import Document from langchain.schema.embeddings import Embeddings from langchain.schema.vectorstore import VectorStore from ..chain.types import InDocument, ScopeType +from ..types import IndexingError, SourceItem from ..utils import timed from .types import UpdateAccessOp @@ -62,7 +62,7 @@ def get_instance(self) -> VectorStore: ''' @abstractmethod - def add_indocuments(self, indocuments: list[InDocument]) -> tuple[list[str],list[str]]: + def 
add_indocuments(self, indocuments: dict[int, InDocument]) -> dict[int, IndexingError | None]: ''' Adds the given indocuments to the vectordb and updates the docs + access tables. @@ -79,10 +79,7 @@ def add_indocuments(self, indocuments: list[InDocument]) -> tuple[list[str],list @timed @abstractmethod - def check_sources( - self, - sources: list[UploadFile], - ) -> tuple[list[str], list[str]]: + def check_sources(self, sources: dict[int, SourceItem]) -> tuple[list[str], list[str]]: ''' Checks the sources in the vectordb if they are already embedded and are up to date. diff --git a/context_chat_backend/vectordb/pgvector.py b/context_chat_backend/vectordb/pgvector.py index 2b7fc060..f5879feb 100644 --- a/context_chat_backend/vectordb/pgvector.py +++ b/context_chat_backend/vectordb/pgvector.py @@ -11,14 +11,13 @@ import sqlalchemy.dialects.postgresql as postgresql_dialects import sqlalchemy.orm as orm from dotenv import load_dotenv -from fastapi import UploadFile from langchain.schema import Document from langchain.vectorstores import VectorStore from langchain_core.embeddings import Embeddings from langchain_postgres.vectorstores import Base, PGVector from ..chain.types import InDocument, ScopeType -from ..types import EmbeddingException, RetryableEmbeddingException +from ..types import EmbeddingException, IndexingError, RetryableEmbeddingException, SourceItem from ..utils import timed from .base import BaseVectorDB from .types import DbException, SafeDbException, UpdateAccessOp @@ -130,17 +129,16 @@ def get_users(self) -> list[str]: except Exception as e: raise DbException('Error: getting a list of all users from access list') from e - def add_indocuments(self, indocuments: list[InDocument]) -> tuple[list[str], list[str]]: + def add_indocuments(self, indocuments: dict[int, InDocument]) -> dict[int, IndexingError | None]: """ Raises EmbeddingException: if the embedding request definitively fails """ - added_sources = [] - retry_sources = [] + results = {} batch_size = 
PG_BATCH_SIZE // 5 with self.session_maker() as session: - for indoc in indocuments: + for php_db_id, indoc in indocuments.items(): try: # query paramerters limitation in postgres is 65535 (https://www.postgresql.org/docs/current/limits.html) # so we chunk the documents into (5 values * 10k) chunks @@ -170,7 +168,7 @@ def add_indocuments(self, indocuments: list[InDocument]) -> tuple[list[str], lis ) self.decl_update_access(indoc.userIds, indoc.source_id, session) - added_sources.append(indoc.source_id) + results[php_db_id] = None session.commit() except SafeDbException as e: # for when the source_id is not found. This here can be an error in the DB @@ -178,51 +176,67 @@ def add_indocuments(self, indocuments: list[InDocument]) -> tuple[list[str], lis logger.exception('Error adding documents to vectordb', exc_info=e, extra={ 'source_id': indoc.source_id, }) - retry_sources.append(indoc.source_id) + results[php_db_id] = IndexingError( + error=str(e), + retryable=True, + ) continue except RetryableEmbeddingException as e: # temporary error, continue with the next document logger.exception('Error adding documents to vectordb, should be retried later.', exc_info=e, extra={ 'source_id': indoc.source_id, }) - retry_sources.append(indoc.source_id) + results[php_db_id] = IndexingError( + error=str(e), + retryable=True, + ) continue except EmbeddingException as e: logger.exception('Error adding documents to vectordb', exc_info=e, extra={ 'source_id': indoc.source_id, }) - raise + results[php_db_id] = IndexingError( + error=str(e), + retryable=False, + ) + continue except Exception as e: logger.exception('Error adding documents to vectordb', exc_info=e, extra={ 'source_id': indoc.source_id, }) - retry_sources.append(indoc.source_id) + results[php_db_id] = IndexingError( + error='An unexpected error occurred while adding documents to the database.', + retryable=True, + ) continue - return added_sources, retry_sources + return results @timed - def check_sources(self, sources: 
list[UploadFile]) -> tuple[list[str], list[str]]: + def check_sources(self, sources: dict[int, SourceItem]) -> tuple[list[str], list[str]]: + ''' + returns a tuple of (existing_source_ids, to_embed_source_ids) + ''' with self.session_maker() as session: try: stmt = ( sa.select(DocumentsStore.source_id) - .filter(DocumentsStore.source_id.in_([source.filename for source in sources])) + .filter(DocumentsStore.source_id.in_([source.reference for source in sources.values()])) .with_for_update() ) results = session.execute(stmt).fetchall() existing_sources = {r.source_id for r in results} - to_embed = [source.filename for source in sources if source.filename not in existing_sources] + to_embed = [source.reference for source in sources.values() if source.reference not in existing_sources] to_delete = [] - for source in sources: + for source in sources.values(): stmt = ( sa.select(DocumentsStore.source_id) - .filter(DocumentsStore.source_id == source.filename) + .filter(DocumentsStore.source_id == source.reference) .filter(DocumentsStore.modified < sa.cast( - datetime.fromtimestamp(int(source.headers['modified'])), + datetime.fromtimestamp(int(source.modified)), sa.DateTime, )) ) @@ -239,14 +253,13 @@ def check_sources(self, sources: list[UploadFile]) -> tuple[list[str], list[str] session.rollback() raise DbException('Error: checking sources in vectordb') from e - still_existing_sources = [ - source - for source in existing_sources - if source not in to_delete + still_existing_source_ids = [ + source_id + for source_id in existing_sources + if source_id not in to_delete ] - # the pyright issue stems from source.filename, which has already been validated - return list(still_existing_sources), to_embed # pyright: ignore[reportReturnType] + return list(still_existing_source_ids), to_embed def decl_update_access(self, user_ids: list[str], source_id: str, session_: orm.Session | None = None): session = session_ or self.session_maker() From 
03a3f433caccdf7121c3171538828c8f6fefa5af Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Mon, 9 Mar 2026 19:42:21 +0530 Subject: [PATCH 04/56] wip: parallelize file parsing and processing based on cpu count Signed-off-by: Anupam Kumar --- context_chat_backend/task_fetcher.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index a548bcfd..853a68c8 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -5,6 +5,7 @@ import asyncio import logging +import os from contextlib import suppress from enum import Enum from io import BytesIO @@ -35,6 +36,8 @@ THREADS = {} LOGGER = logging.getLogger('ccb.task_fetcher') FILES_INDEXING_BATCH_SIZE = 64 # todo: config? +# divides the batch into these many chunks +PARALLEL_FILE_PARSING = max(1, (os.cpu_count() or 2) - 1) # todo: config? # max concurrent fetches to avoid overloading the NC server or hitting rate limits CONCURRENT_FILE_FETCHES = 10 # todo: config? MAX_FILE_SIZE = 100 * 1024 * 1024 # 100 MB, todo: config? 
@@ -217,8 +220,18 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro else: source_errors[file_id] = result - files_result = _load_sources(source_files) - providers_result = _load_sources(q_items.content_providers) + files_result = {} + providers_result = {} + chunk_size = FILES_INDEXING_BATCH_SIZE // PARALLEL_FILE_PARSING + + # chunk file parsing for better file operation parallelism + for i in range(0, len(source_files), chunk_size): + chunk = dict(list(source_files.items())[i:i+chunk_size]) + files_result.update(_load_sources(chunk)) + + for i in range(0, len(q_items.content_providers), chunk_size): + chunk = dict(list(q_items.content_providers.items())[i:i+chunk_size]) + providers_result.update(_load_sources(chunk)) if ( any(isinstance(res, IndexingError) for res in files_result.values()) From 0dc404bf48cff0e358b723bcb12775956d0c2eac Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 10 Mar 2026 17:36:03 +0530 Subject: [PATCH 05/56] ci: use the kubernetes branch of context_chat Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 10e2d61b..fb06bafa 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -113,6 +113,8 @@ jobs: repository: nextcloud/context_chat path: apps/context_chat persist-credentials: false + # todo: remove later + ref: feat/reverse-content-flow - name: Checkout backend uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 From c7339828818ff49e8a2c44aa7896b4b2fdf495fb Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 10 Mar 2026 17:43:27 +0530 Subject: [PATCH 06/56] fix typo Signed-off-by: Anupam Kumar --- context_chat_backend/task_fetcher.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/context_chat_backend/task_fetcher.py 
b/context_chat_backend/task_fetcher.py index 853a68c8..cfa9293c 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -304,11 +304,11 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro -def updates_processing_thread(app_config: TConfig): +def updates_processing_thread(app_config: TConfig, app_enabled: Event) -> None: ... -def request_processing_thread(app_config: TConfig): +def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: ... @@ -317,12 +317,12 @@ def start_bg_threads(app_config: TConfig, app_enabled: Event): case AppRole.INDEXING | AppRole.NORMAL: THREADS[ThreadType.FILES_INDEXING] = Thread( target=files_indexing_thread, - args=(app_config, Event), + args=(app_config, app_enabled), name='FilesIndexingThread', ) THREADS[ThreadType.UPDATES_PROCESSING] = Thread( target=updates_processing_thread, - args=(app_config, Event), + args=(app_config, app_enabled), name='UpdatesProcessingThread', ) THREADS[ThreadType.FILES_INDEXING].start() @@ -330,7 +330,7 @@ def start_bg_threads(app_config: TConfig, app_enabled: Event): case AppRole.RP | AppRole.NORMAL: THREADS[ThreadType.REQUEST_PROCESSING] = Thread( target=request_processing_thread, - args=(app_config, Event), + args=(app_config, app_enabled), name='RequestProcessingThread', ) THREADS[ThreadType.REQUEST_PROCESSING].start() From dda312f21f74955d70e6f5f74840a31b26bb3f9d Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Wed, 11 Mar 2026 11:58:50 +0530 Subject: [PATCH 07/56] migrate the update process to be thread based Signed-off-by: Anupam Kumar --- context_chat_backend/chain/ingest/injest.py | 2 +- context_chat_backend/controller.py | 203 ++++++++++---------- context_chat_backend/task_fetcher.py | 183 +++++++++++++++++- context_chat_backend/types.py | 183 +++++++++++++++++- context_chat_backend/vectordb/pgvector.py | 27 ++- context_chat_backend/vectordb/service.py | 54 +++++- context_chat_backend/vectordb/types.py | 4 
+- 7 files changed, 531 insertions(+), 125 deletions(-) diff --git a/context_chat_backend/chain/ingest/injest.py b/context_chat_backend/chain/ingest/injest.py index 0eb70e0b..7369f452 100644 --- a/context_chat_backend/chain/ingest/injest.py +++ b/context_chat_backend/chain/ingest/injest.py @@ -129,7 +129,7 @@ def _increase_access_for_existing_sources( for db_id, source in existing_sources.items(): try: vectordb.update_access( - UpdateAccessOp.allow, + UpdateAccessOp.ALLOW, list(map(_decode_latin_1, source.userIds)), source.reference, ) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 3e70ee1b..580416f7 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -6,7 +6,7 @@ # isort: off from .chain.types import ContextException, LLMOutput, ScopeType, SearchResult from .types import LoaderException, EmbeddingException -from .vectordb.types import DbException, SafeDbException, UpdateAccessOp +from .vectordb.types import DbException, SafeDbException from .setup_functions import ensure_config_file, repair_run, setup_env_vars # setup env vars before importing other modules @@ -25,9 +25,9 @@ from functools import wraps from threading import Event, Thread from time import sleep -from typing import Annotated, Any +from typing import Any -from fastapi import Body, FastAPI, Request +from fastapi import FastAPI, Request from langchain.llms.base import LLM from nc_py_api import AsyncNextcloudApp, NextcloudApp from nc_py_api.ex_app import persistent_storage, set_handlers @@ -40,16 +40,9 @@ from .dyn_loader import LLMModelLoader, VectorDBLoader from .models.types import LlmException from nc_py_api.ex_app import AppAPIAuthMiddleware -from .utils import JSONResponse, exec_in_proc, is_valid_provider_id, is_valid_source_id, value_of +from .utils import JSONResponse, exec_in_proc, value_of from .task_fetcher import start_bg_threads, wait_for_bg_threads -from .vectordb.service import ( - 
count_documents_by_provider, - decl_update_access, - delete_by_provider, - delete_by_source, - delete_user, - update_access, -) +from .vectordb.service import count_documents_by_provider # setup @@ -227,119 +220,131 @@ def _(): return JSONResponse(counts) -@app.post('/updateAccessDeclarative') -@enabled_guard(app) -def _( - userIds: Annotated[list[str], Body()], - sourceId: Annotated[str, Body()], -): - logger.debug('Update access declarative request:', extra={ - 'user_ids': userIds, - 'source_id': sourceId, - }) +@app.get('/downloadLogs') +def download_logs() -> FileResponse: + with tempfile.NamedTemporaryFile('wb', delete=False) as tmp: + with zipfile.ZipFile(tmp, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file: + files = os.listdir(os.path.join(persistent_storage(), 'logs')) + for file in files: + file_path = os.path.join(persistent_storage(), 'logs', file) + if os.path.isfile(file_path): # Might be a folder (just skip it then) + zip_file.write(file_path) + return FileResponse(tmp.name, media_type='application/zip', filename='docker_logs.zip') - if len(userIds) == 0: - return JSONResponse('Empty list of user ids', 400) - if not is_valid_source_id(sourceId): - return JSONResponse('Invalid source id', 400) +# @app.post('/updateAccessDeclarative') +# @enabled_guard(app) +# def _( +# userIds: Annotated[list[str], Body()], +# sourceId: Annotated[str, Body()], +# ): +# logger.debug('Update access declarative request:', extra={ +# 'user_ids': userIds, +# 'source_id': sourceId, +# }) - exec_in_proc(target=decl_update_access, args=(vectordb_loader, userIds, sourceId)) +# if len(userIds) == 0: +# return JSONResponse('Empty list of user ids', 400) - return JSONResponse('Access updated') +# if not is_valid_source_id(sourceId): +# return JSONResponse('Invalid source id', 400) +# exec_in_proc(target=decl_update_access, args=(vectordb_loader, userIds, sourceId)) -@app.post('/updateAccess') -@enabled_guard(app) -def _( - op: Annotated[UpdateAccessOp, Body()], - userIds: 
Annotated[list[str], Body()], - sourceId: Annotated[str, Body()], -): - logger.debug('Update access request', extra={ - 'op': op, - 'user_ids': userIds, - 'source_id': sourceId, - }) +# return JSONResponse('Access updated') - if len(userIds) == 0: - return JSONResponse('Empty list of user ids', 400) - if not is_valid_source_id(sourceId): - return JSONResponse('Invalid source id', 400) +# @app.post('/updateAccess') +# @enabled_guard(app) +# def _( +# op: Annotated[UpdateAccessOp, Body()], +# userIds: Annotated[list[str], Body()], +# sourceId: Annotated[str, Body()], +# ): +# logger.debug('Update access request', extra={ +# 'op': op, +# 'user_ids': userIds, +# 'source_id': sourceId, +# }) - exec_in_proc(target=update_access, args=(vectordb_loader, op, userIds, sourceId)) +# if len(userIds) == 0: +# return JSONResponse('Empty list of user ids', 400) - return JSONResponse('Access updated') +# if not is_valid_source_id(sourceId): +# return JSONResponse('Invalid source id', 400) +# exec_in_proc(target=update_access, args=(vectordb_loader, op, userIds, sourceId)) -@app.post('/updateAccessProvider') -@enabled_guard(app) -def _( - op: Annotated[UpdateAccessOp, Body()], - userIds: Annotated[list[str], Body()], - providerId: Annotated[str, Body()], -): - logger.debug('Update access by provider request', extra={ - 'op': op, - 'user_ids': userIds, - 'provider_id': providerId, - }) +# return JSONResponse('Access updated') - if len(userIds) == 0: - return JSONResponse('Empty list of user ids', 400) - if not is_valid_provider_id(providerId): - return JSONResponse('Invalid provider id', 400) +# @app.post('/updateAccessProvider') +# @enabled_guard(app) +# def _( +# op: Annotated[UpdateAccessOp, Body()], +# userIds: Annotated[list[str], Body()], +# providerId: Annotated[str, Body()], +# ): +# logger.debug('Update access by provider request', extra={ +# 'op': op, +# 'user_ids': userIds, +# 'provider_id': providerId, +# }) - exec_in_proc(target=update_access, args=(vectordb_loader, op, 
userIds, providerId)) +# if len(userIds) == 0: +# return JSONResponse('Empty list of user ids', 400) - return JSONResponse('Access updated') +# if not is_valid_provider_id(providerId): +# return JSONResponse('Invalid provider id', 400) +# exec_in_proc(target=update_access_provider, args=(vectordb_loader, op, userIds, providerId)) -@app.post('/deleteSources') -@enabled_guard(app) -def _(sourceIds: Annotated[list[str], Body(embed=True)]): - logger.debug('Delete sources request', extra={ - 'source_ids': sourceIds, - }) +# return JSONResponse('Access updated') - sourceIds = [source.strip() for source in sourceIds if source.strip() != ''] - if len(sourceIds) == 0: - return JSONResponse('No sources provided', 400) +# @app.post('/deleteSources') +# @enabled_guard(app) +# def _(sourceIds: Annotated[list[str], Body(embed=True)]): +# logger.debug('Delete sources request', extra={ +# 'source_ids': sourceIds, +# }) - res = exec_in_proc(target=delete_by_source, args=(vectordb_loader, sourceIds)) - if res is False: - return JSONResponse('Error: VectorDB delete failed, check vectordb logs for more info.', 400) +# sourceIds = [source.strip() for source in sourceIds if source.strip() != ''] - return JSONResponse('All valid sources deleted') +# if len(sourceIds) == 0: +# return JSONResponse('No sources provided', 400) +# res = exec_in_proc(target=delete_by_source, args=(vectordb_loader, sourceIds)) +# if res is False: +# return JSONResponse('Error: VectorDB delete failed, check vectordb logs for more info.', 400) -@app.post('/deleteProvider') -@enabled_guard(app) -def _(providerKey: str = Body(embed=True)): - logger.debug('Delete sources by provider for all users request', extra={ 'provider_key': providerKey }) +# return JSONResponse('All valid sources deleted') - if value_of(providerKey) is None: - return JSONResponse('Invalid provider key provided', 400) - exec_in_proc(target=delete_by_provider, args=(vectordb_loader, providerKey)) +# @app.post('/deleteProvider') +# 
@enabled_guard(app) +# def _(providerKey: str = Body(embed=True)): +# logger.debug('Delete sources by provider for all users request', extra={ 'provider_key': providerKey }) - return JSONResponse('All valid sources deleted') +# if value_of(providerKey) is None: +# return JSONResponse('Invalid provider key provided', 400) +# exec_in_proc(target=delete_by_provider, args=(vectordb_loader, providerKey)) -@app.post('/deleteUser') -@enabled_guard(app) -def _(userId: str = Body(embed=True)): - logger.debug('Remove access list for user, and orphaned sources', extra={ 'user_id': userId }) +# return JSONResponse('All valid sources deleted') - if value_of(userId) is None: - return JSONResponse('Invalid userId provided', 400) - exec_in_proc(target=delete_user, args=(vectordb_loader, userId)) +# @app.post('/deleteUser') +# @enabled_guard(app) +# def _(userId: str = Body(embed=True)): +# logger.debug('Remove access list for user, and orphaned sources', extra={ 'user_id': userId }) + +# if value_of(userId) is None: +# return JSONResponse('Invalid userId provided', 400) - return JSONResponse('User deleted') +# exec_in_proc(target=delete_user, args=(vectordb_loader, userId)) + +# return JSONResponse('User deleted') # @app.put('/loadSources') @@ -503,15 +508,3 @@ def _(query: Query) -> list[SearchResult]: query.scopeType, query.scopeList, )) - - -@app.get('/downloadLogs') -def download_logs() -> FileResponse: - with tempfile.NamedTemporaryFile('wb', delete=False) as tmp: - with zipfile.ZipFile(tmp, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file: - files = os.listdir(os.path.join(persistent_storage(), 'logs')) - for file in files: - file_path = os.path.join(persistent_storage(), 'logs', file) - if os.path.isfile(file_path): # Might be a folder (just skip it then) - zip_file.write(file_path) - return FileResponse(tmp.name, media_type='application/zip', filename='docker_logs.zip') diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 
cfa9293c..84b974b2 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -19,9 +19,11 @@ from .chain.ingest.injest import embed_sources from .dyn_loader import VectorDBLoader from .types import ( + ActionsQueueItems, + ActionType, AppRole, EmbeddingException, - FilesQueueItem, + FilesQueueItems, IndexingError, IndexingException, LoaderException, @@ -30,7 +32,15 @@ TConfig, ) from .utils import exec_in_proc, get_app_role -from .vectordb.types import DbException +from .vectordb.service import ( + decl_update_access, + delete_by_provider, + delete_by_source, + delete_user, + update_access, + update_access_provider, +) +from .vectordb.types import DbException, SafeDbException APP_ROLE = get_app_role() THREADS = {} @@ -41,6 +51,8 @@ # max concurrent fetches to avoid overloading the NC server or hitting rate limits CONCURRENT_FILE_FETCHES = 10 # todo: config? MAX_FILE_SIZE = 100 * 1024 * 1024 # 100 MB, todo: config? +ACTIONS_BATCH_SIZE = 512 # todo: config? +POLLING_COOLDOWN = 30 class ThreadType(Enum): @@ -201,10 +213,15 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro ) try: - q_items = FilesQueueItem.model_validate(q_items_res) + q_items: FilesQueueItems = FilesQueueItems.model_validate(q_items_res) except ValidationError as e: raise Exception(f'Error validating queue items response: {e}\nResponse content: {q_items_res}') from e + if not q_items.files and not q_items.content_providers: + LOGGER.debug('No documents to index') + sleep(POLLING_COOLDOWN) + continue + # populate files content and convert to source items fetched_files = {} source_files = {} @@ -305,7 +322,165 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro def updates_processing_thread(app_config: TConfig, app_enabled: Event) -> None: - ... 
+ try: + vectordb_loader = VectorDBLoader(app_config) + except LoaderException as e: + LOGGER.error('Error initializing vector DB loader, files indexing thread will not start:', exc_info=e) + return + + while True: + if not app_enabled.is_set(): + LOGGER.info('Files indexing thread is stopping as the app is disabled') + return + + try: + nc = NextcloudApp() + q_items_res = nc.ocs( + 'GET', + '/apps/context_chat/queues/actions', + params={ 'n': ACTIONS_BATCH_SIZE } + ) + + try: + q_items: ActionsQueueItems = ActionsQueueItems.model_validate(q_items_res) + except ValidationError as e: + raise Exception(f'Error validating queue items response: {e}\nResponse content: {q_items_res}') from e + except ( + niquests.exceptions.ConnectionError, + niquests.exceptions.Timeout, + ) as e: + LOGGER.info('Temporary error fetching updates to process, will retry:', exc_info=e) + sleep(5) + continue + except Exception as e: + LOGGER.exception('Error fetching updates to process:', exc_info=e) + sleep(5) + continue + + if not q_items.actions: + LOGGER.debug('No updates to process') + sleep(POLLING_COOLDOWN) + continue + + processed_event_ids = [] + errored_events = {} + for i, (db_id, action_item) in enumerate(q_items.actions.items()): + try: + match action_item.type: + case ActionType.DELETE_SOURCE_IDS: + exec_in_proc(target=delete_by_source, args=(vectordb_loader, action_item.payload.sourceIds)) + + case ActionType.DELETE_PROVIDER_ID: + exec_in_proc(target=delete_by_provider, args=(vectordb_loader, action_item.payload.providerId)) + + case ActionType.DELETE_USER_ID: + exec_in_proc(target=delete_user, args=(vectordb_loader, action_item.payload.userId)) + + case ActionType.UPDATE_ACCESS_SOURCE_ID: + exec_in_proc( + target=update_access, + args=( + vectordb_loader, + action_item.payload.op, + action_item.payload.userIds, + action_item.payload.sourceId, + ), + ) + + case ActionType.UPDATE_ACCESS_PROVIDER_ID: + exec_in_proc( + target=update_access_provider, + args=( + vectordb_loader, + 
action_item.payload.op, + action_item.payload.userIds, + action_item.payload.providerId, + ), + ) + + case ActionType.UPDATE_ACCESS_DECL_SOURCE_ID: + exec_in_proc( + target=decl_update_access, + args=( + vectordb_loader, + action_item.payload.userIds, + action_item.payload.sourceId, + ), + ) + + case _: + LOGGER.warning( + f'Unknown action type {action_item.type} for action id {db_id},' + f' type {action_item.type}, skipping and marking as processed', + extra={ 'action_item': action_item }, + ) + continue + + processed_event_ids.append(db_id) + except SafeDbException as e: + LOGGER.debug( + f'Safe DB error thrown while processing action id {db_id}, type {action_item.type},' + " it's safe to ignore and mark as processed.", + exc_info=e, + extra={ 'action_item': action_item }, + ) + processed_event_ids.append(db_id) + continue + + except (LoaderException, DbException) as e: + LOGGER.error( + f'Error deleting source for action id {db_id}, type {action_item.type}: {e}', + exc_info=e, + extra={ 'action_item': action_item }, + ) + errored_events[db_id] = str(e) + continue + + except Exception as e: + LOGGER.error( + f'Unexpected error processing action id {db_id}, type {action_item.type}: {e}', + exc_info=e, + extra={ 'action_item': action_item }, + ) + errored_events[db_id] = f'Unexpected error: {e}' + continue + + if (i + 1) % 20 == 0: + LOGGER.debug(f'Processed {i + 1} updates, sleeping for a bit to allow other operations to proceed') + sleep(2) + + LOGGER.info(f'Processed {len(processed_event_ids)} updates with {len(errored_events)} errors', extra={ + 'errored_events': errored_events, + }) + + if len(processed_event_ids) == 0: + LOGGER.debug('No updates processed, skipping reporting to the server') + continue + + try: + nc.ocs( + 'DELETE', + '/apps/context_chat/queues/actions/', + json={ 'actions': processed_event_ids }, + ) + except ( + niquests.exceptions.ConnectionError, + niquests.exceptions.Timeout, + ) as e: + LOGGER.info('Temporary error reporting processed 
updates, will retry:', exc_info=e) + sleep(5) + with suppress(Exception): + nc = NextcloudApp() + nc.ocs( + 'DELETE', + '/apps/context_chat/queues/actions/', + json={ 'ids': processed_event_ids }, + ) + continue + except Exception as e: + LOGGER.exception('Error reporting processed updates:', exc_info=e) + sleep(5) + continue def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: diff --git a/context_chat_backend/types.py b/context_chat_backend/types.py index 97d48ce6..849c2e31 100644 --- a/context_chat_backend/types.py +++ b/context_chat_backend/types.py @@ -4,12 +4,13 @@ # from enum import Enum from io import BytesIO -from typing import Self +from typing import Annotated, Literal, Self -from pydantic import BaseModel, field_validator +from pydantic import BaseModel, Discriminator, field_validator from .mimetype_list import SUPPORTED_MIMETYPES from .utils import is_valid_provider_id, is_valid_source_id +from .vectordb.types import UpdateAccessOp __all__ = [ 'DEFAULT_EM_MODEL_ALIAS', @@ -182,7 +183,7 @@ def validate_content(cls, v): raise ValueError('Content must be either a non-empty string or a non-empty BytesIO') -class FilesQueueItem(BaseModel): +class FilesQueueItems(BaseModel): files: dict[int, ReceivedFileItem] # [db id]: FileItem content_providers: dict[int, SourceItem] # [db id]: SourceItem @@ -198,3 +199,179 @@ def __init__(self, message: str, retryable: bool = False): class IndexingError(BaseModel): error: str retryable: bool = False + + +# PHP equivalent for reference: + +# class ActionType { +# // { sourceIds: array } +# public const DELETE_SOURCE_IDS = 'delete_source_ids'; +# // { providerId: string } +# public const DELETE_PROVIDER_ID = 'delete_provider_id'; +# // { userId: string } +# public const DELETE_USER_ID = 'delete_user_id'; +# // { op: string, userIds: array, sourceId: string } +# public const UPDATE_ACCESS_SOURCE_ID = 'update_access_source_id'; +# // { op: string, userIds: array, providerId: string } +# public const 
UPDATE_ACCESS_PROVIDER_ID = 'update_access_provider_id'; +# // { userIds: array, sourceId: string } +# public const UPDATE_ACCESS_DECL_SOURCE_ID = 'update_access_decl_source_id'; +# } + + +def _validate_source_ids(source_ids: list[str]) -> list[str]: + if ( + not isinstance(source_ids, list) + or not all(isinstance(sid, str) and sid.strip() != '' for sid in source_ids) + or len(source_ids) == 0 + ): + raise ValueError('sourceIds must be a non-empty list of non-empty strings') + return [sid.strip() for sid in source_ids] + + +def _validate_provider_id(provider_id: str) -> str: + if not isinstance(provider_id, str) or not is_valid_provider_id(provider_id): + raise ValueError('providerId must be a valid provider ID string') + return provider_id + + +def _validate_user_ids(user_ids: list[str]) -> list[str]: + if ( + not isinstance(user_ids, list) + or not all(isinstance(uid, str) and uid.strip() != '' for uid in user_ids) + or len(user_ids) == 0 + ): + raise ValueError('userIds must be a non-empty list of non-empty strings') + return [uid.strip() for uid in user_ids] + + +class ActionPayloadDeleteSourceIds(BaseModel): + sourceIds: list[str] + + @field_validator('sourceIds', mode='after') + def validate_source_ids(self) -> Self: + self.sourceIds = _validate_source_ids(self.sourceIds) + return self + + +class ActionPayloadDeleteProviderId(BaseModel): + providerId: str + + @field_validator('providerId') + def validate_provider_id(self) -> Self: + self.providerId = _validate_provider_id(self.providerId) + return self + + +class ActionPayloadDeleteUserId(BaseModel): + userId: str + + @field_validator('userId') + def validate_user_id(self) -> Self: + self.userId = _validate_user_ids([self.userId])[0] + return self + + +class ActionPayloadUpdateAccessSourceId(BaseModel): + op: UpdateAccessOp + userIds: list[str] + sourceId: str + + @field_validator('userIds', mode='after') + def validate_user_ids(self) -> Self: + self.userIds = _validate_user_ids(self.userIds) + return self + 
+ @field_validator('sourceId') + def validate_source_id(self) -> Self: + self.sourceId = _validate_source_ids([self.sourceId])[0] + return self + + +class ActionPayloadUpdateAccessProviderId(BaseModel): + op: UpdateAccessOp + userIds: list[str] + providerId: str + + @field_validator('userIds', mode='after') + def validate_user_ids(self) -> Self: + self.userIds = _validate_user_ids(self.userIds) + return self + + @field_validator('providerId') + def validate_provider_id(self) -> Self: + self.providerId = _validate_provider_id(self.providerId) + return self + + +class ActionPayloadUpdateAccessDeclSourceId(BaseModel): + userIds: list[str] + sourceId: str + + @field_validator('userIds', mode='after') + def validate_user_ids(self) -> Self: + self.userIds = _validate_user_ids(self.userIds) + return self + + @field_validator('sourceId') + def validate_source_id(self) -> Self: + self.sourceId = _validate_source_ids([self.sourceId])[0] + return self + + +class ActionType(str, Enum): + DELETE_SOURCE_IDS = 'delete_source_ids' + DELETE_PROVIDER_ID = 'delete_provider_id' + DELETE_USER_ID = 'delete_user_id' + UPDATE_ACCESS_SOURCE_ID = 'update_access_source_id' + UPDATE_ACCESS_PROVIDER_ID = 'update_access_provider_id' + UPDATE_ACCESS_DECL_SOURCE_ID = 'update_access_decl_source_id' + + +class CommonActionsQueueItem(BaseModel): + id: int + + +class ActionsQueueItemDeleteSourceIds(CommonActionsQueueItem): + type: Literal[ActionType.DELETE_SOURCE_IDS] + payload: ActionPayloadDeleteSourceIds + + +class ActionsQueueItemDeleteProviderId(CommonActionsQueueItem): + type: Literal[ActionType.DELETE_PROVIDER_ID] + payload: ActionPayloadDeleteProviderId + + +class ActionsQueueItemDeleteUserId(CommonActionsQueueItem): + type: Literal[ActionType.DELETE_USER_ID] + payload: ActionPayloadDeleteUserId + + +class ActionsQueueItemUpdateAccessSourceId(CommonActionsQueueItem): + type: Literal[ActionType.UPDATE_ACCESS_SOURCE_ID] + payload: ActionPayloadUpdateAccessSourceId + + +class 
ActionsQueueItemUpdateAccessProviderId(CommonActionsQueueItem): + type: Literal[ActionType.UPDATE_ACCESS_PROVIDER_ID] + payload: ActionPayloadUpdateAccessProviderId + + +class ActionsQueueItemUpdateAccessDeclSourceId(CommonActionsQueueItem): + type: Literal[ActionType.UPDATE_ACCESS_DECL_SOURCE_ID] + payload: ActionPayloadUpdateAccessDeclSourceId + + +ActionsQueueItem = Annotated[ + ActionsQueueItemDeleteSourceIds + | ActionsQueueItemDeleteProviderId + | ActionsQueueItemDeleteUserId + | ActionsQueueItemUpdateAccessSourceId + | ActionsQueueItemUpdateAccessProviderId + | ActionsQueueItemUpdateAccessDeclSourceId, + Discriminator('type'), +] + + +class ActionsQueueItems(BaseModel): + actions: dict[int, ActionsQueueItem] diff --git a/context_chat_backend/vectordb/pgvector.py b/context_chat_backend/vectordb/pgvector.py index f5879feb..8bcc6f4c 100644 --- a/context_chat_backend/vectordb/pgvector.py +++ b/context_chat_backend/vectordb/pgvector.py @@ -338,7 +338,7 @@ def update_access( ) match op: - case UpdateAccessOp.allow: + case UpdateAccessOp.ALLOW: for i in range(0, len(user_ids), PG_BATCH_SIZE): batched_uids = user_ids[i:i+PG_BATCH_SIZE] stmt = ( @@ -355,7 +355,7 @@ def update_access( session.execute(stmt) session.commit() - case UpdateAccessOp.deny: + case UpdateAccessOp.DENY: for i in range(0, len(user_ids), PG_BATCH_SIZE): batched_uids = user_ids[i:i+PG_BATCH_SIZE] stmt = ( @@ -448,15 +448,17 @@ def delete_source_ids(self, source_ids: list[str], session_: orm.Session | None # entry from "AccessListStore" is deleted automatically due to the foreign key constraint # batch the deletion to avoid hitting the query parameter limit chunks_to_delete = [] + deleted_source_ids = [] for i in range(0, len(source_ids), PG_BATCH_SIZE): batched_ids = source_ids[i:i+PG_BATCH_SIZE] stmt_doc = ( sa.delete(DocumentsStore) .filter(DocumentsStore.source_id.in_(batched_ids)) - .returning(DocumentsStore.chunks) + .returning(DocumentsStore.chunks, DocumentsStore.source_id) ) doc_result = 
session.execute(stmt_doc) chunks_to_delete.extend(str(c) for res in doc_result for c in res.chunks) + deleted_source_ids.extend(str(res.source_id) for res in doc_result) for i in range(0, len(chunks_to_delete), PG_BATCH_SIZE): batched_chunks = chunks_to_delete[i:i+PG_BATCH_SIZE] @@ -476,6 +478,14 @@ def delete_source_ids(self, source_ids: list[str], session_: orm.Session | None if session_ is None: session.close() + undeleted_source_ids = set(source_ids) - set(deleted_source_ids) + if len(undeleted_source_ids) > 0: + logger.info( + f'Source ids {undeleted_source_ids} were not deleted from documents store.' + ' This can be due to the source ids not existing in the documents store due to' + ' already being deleted or not having been added yet.' + ) + def delete_provider(self, provider_key: str): with self.session_maker() as session: try: @@ -519,7 +529,16 @@ def delete_user(self, user_id: str): session.rollback() raise DbException('Error: deleting user from access list') from e - self._cleanup_if_orphaned(list(source_ids), session) + try: + self._cleanup_if_orphaned(list(source_ids), session) + except Exception as e: + session.rollback() + logger.error( + 'Error cleaning up orphaned source ids after deleting user, manual cleanup might be required', + exc_info=e, + extra={ 'source_ids': list(source_ids) }, + ) + raise DbException('Error: cleaning up orphaned source ids after deleting user') from e def count_documents_by_provider(self) -> dict[str, int]: try: diff --git a/context_chat_backend/vectordb/service.py b/context_chat_backend/vectordb/service.py index 620a0b39..06a8e19e 100644 --- a/context_chat_backend/vectordb/service.py +++ b/context_chat_backend/vectordb/service.py @@ -6,27 +6,42 @@ from ..dyn_loader import VectorDBLoader from .base import BaseVectorDB -from .types import DbException, UpdateAccessOp +from .types import UpdateAccessOp logger = logging.getLogger('ccb.vectordb') -# todo: return source ids that were successfully deleted + def 
delete_by_source(vectordb_loader: VectorDBLoader, source_ids: list[str]): + ''' + Raises + ------ + DbException + LoaderException + ''' db: BaseVectorDB = vectordb_loader.load() logger.debug('deleting sources by id', extra={ 'source_ids': source_ids }) - try: - db.delete_source_ids(source_ids) - except Exception as e: - raise DbException('Error: Vectordb delete_source_ids error') from e + db.delete_source_ids(source_ids) def delete_by_provider(vectordb_loader: VectorDBLoader, provider_key: str): + ''' + Raises + ------ + DbException + LoaderException + ''' db: BaseVectorDB = vectordb_loader.load() logger.debug(f'deleting sources by provider: {provider_key}') db.delete_provider(provider_key) def delete_user(vectordb_loader: VectorDBLoader, user_id: str): + ''' + Raises + ------ + DbException + LoaderException + ''' db: BaseVectorDB = vectordb_loader.load() logger.debug(f'deleting user from db: {user_id}') db.delete_user(user_id) @@ -38,6 +53,13 @@ def update_access( user_ids: list[str], source_id: str, ): + ''' + Raises + ------ + DbException + LoaderException + SafeDbException + ''' db: BaseVectorDB = vectordb_loader.load() logger.debug('updating access', extra={ 'op': op, 'user_ids': user_ids, 'source_id': source_id }) db.update_access(op, user_ids, source_id) @@ -49,6 +71,13 @@ def update_access_provider( user_ids: list[str], provider_id: str, ): + ''' + Raises + ------ + DbException + LoaderException + SafeDbException + ''' db: BaseVectorDB = vectordb_loader.load() logger.debug('updating access by provider', extra={ 'op': op, 'user_ids': user_ids, 'provider_id': provider_id }) db.update_access_provider(op, user_ids, provider_id) @@ -59,11 +88,24 @@ def decl_update_access( user_ids: list[str], source_id: str, ): + ''' + Raises + ------ + DbException + LoaderException + SafeDbException + ''' db: BaseVectorDB = vectordb_loader.load() logger.debug('decl update access', extra={ 'user_ids': user_ids, 'source_id': source_id }) db.decl_update_access(user_ids, source_id) 
def count_documents_by_provider(vectordb_loader: VectorDBLoader): + ''' + Raises + ------ + DbException + LoaderException + ''' db: BaseVectorDB = vectordb_loader.load() logger.debug('counting documents by provider') return db.count_documents_by_provider() diff --git a/context_chat_backend/vectordb/types.py b/context_chat_backend/vectordb/types.py index df5c6dd7..30811797 100644 --- a/context_chat_backend/vectordb/types.py +++ b/context_chat_backend/vectordb/types.py @@ -14,5 +14,5 @@ class SafeDbException(Exception): class UpdateAccessOp(Enum): - allow = 'allow' - deny = 'deny' + ALLOW = 'allow' + DENY = 'deny' From b09a93cafda6726b706f11c8e7815b4a91acfc43 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Wed, 11 Mar 2026 14:33:39 +0530 Subject: [PATCH 08/56] fix pydantic types Signed-off-by: Anupam Kumar --- context_chat_backend/types.py | 180 ++++++++++++---------------------- context_chat_backend/utils.py | 10 -- 2 files changed, 64 insertions(+), 126 deletions(-) diff --git a/context_chat_backend/types.py b/context_chat_backend/types.py index 849c2e31..8577c931 100644 --- a/context_chat_backend/types.py +++ b/context_chat_backend/types.py @@ -2,14 +2,14 @@ # SPDX-FileCopyrightText: 2024 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # +import re from enum import Enum from io import BytesIO from typing import Annotated, Literal, Self -from pydantic import BaseModel, Discriminator, field_validator +from pydantic import AfterValidator, BaseModel, Discriminator, field_validator, model_validator from .mimetype_list import SUPPORTED_MIMETYPES -from .utils import is_valid_provider_id, is_valid_source_id from .vectordb.types import UpdateAccessOp __all__ = [ @@ -26,6 +26,49 @@ FILES_PROVIDER_ID = 'files__default' +def is_valid_source_id(source_id: str) -> bool: + # note the ":" in the item id part + return re.match(r'^[a-zA-Z0-9_-]+__[a-zA-Z0-9_-]+: [a-zA-Z0-9:-]+$', source_id) is not None + + +def 
is_valid_provider_id(provider_id: str) -> bool: + return re.match(r'^[a-zA-Z0-9_-]+__[a-zA-Z0-9_-]+$', provider_id) is not None + + +def _validate_source_ids(source_ids: list[str]) -> list[str]: + if ( + not isinstance(source_ids, list) + or not all(isinstance(sid, str) and sid.strip() != '' for sid in source_ids) + or len(source_ids) == 0 + ): + raise ValueError('sourceIds must be a non-empty list of non-empty strings') + return [sid.strip() for sid in source_ids] + + +def _validate_source_id(source_id: str) -> str: + return _validate_source_ids([source_id])[0] + + +def _validate_provider_id(provider_id: str) -> str: + if not isinstance(provider_id, str) or not is_valid_provider_id(provider_id): + raise ValueError('providerId must be a valid provider ID string') + return provider_id + + +def _validate_user_ids(user_ids: list[str]) -> list[str]: + if ( + not isinstance(user_ids, list) + or not all(isinstance(uid, str) and uid.strip() != '' for uid in user_ids) + or len(user_ids) == 0 + ): + raise ValueError('userIds must be a non-empty list of non-empty strings') + return [uid.strip() for uid in user_ids] + + +def _validate_user_id(user_id: str) -> str: + return _validate_user_ids([user_id])[0] + + class TEmbeddingAuthApiKey(BaseModel): apikey: str @@ -89,12 +132,13 @@ class AppRole(str, Enum): class CommonSourceItem(BaseModel): - userIds: list[str] - reference: str # source_id of the form "appId__providerId: itemId" + userIds: Annotated[list[str], AfterValidator(_validate_user_ids)] + # source_id of the form "appId__providerId: itemId" + reference: Annotated[str, AfterValidator(_validate_source_id)] title: str modified: int | str # todo: int/string? 
type: str - provider: str + provider: Annotated[str, AfterValidator(_validate_provider_id)] size: int @field_validator('modified', mode='before') @@ -116,42 +160,13 @@ def validate_strings_non_empty(cls, v): raise ValueError('Must be a non-empty string') return v.strip() - @field_validator('userIds', mode='after') - def validate_user_ids(self) -> Self: - if ( - not isinstance(self.userIds, list) - or not all( - isinstance(uid, str) - and uid.strip() != '' - for uid in self.userIds - ) - or len(self.userIds) == 0 - ): - raise ValueError('userIds must be a non-empty list of non-empty strings') - self.userIds = [uid.strip() for uid in self.userIds] - return self - - @field_validator('reference', mode='after') - def validate_reference_format(self) -> Self: - # validate reference format: "appId__providerId: itemId" - if not is_valid_source_id(self.reference): - raise ValueError('Invalid reference format, must be "appId__providerId: itemId"') - return self - - @field_validator('provider', mode='after') - def validate_provider_format(self) -> Self: - # validate provider format: "appId__providerId" - if not is_valid_provider_id(self.provider): - raise ValueError('Invalid provider format, must be "appId__providerId"') - return self - - @field_validator('type', mode='after') + @model_validator(mode='after') def validate_type(self) -> Self: if self.reference.startswith(FILES_PROVIDER_ID) and self.type not in SUPPORTED_MIMETYPES: raise ValueError(f'Unsupported file type: {self.type} for reference {self.reference}') return self - @field_validator('size', mode='after') + @model_validator(mode='after') def validate_size(self) -> Self: if not isinstance(self.size, int) or self.size < 0: raise ValueError(f'Invalid size value: {self.size}, must be a non-negative integer') @@ -182,6 +197,10 @@ def validate_content(cls, v): return v raise ValueError('Content must be either a non-empty string or a non-empty BytesIO') + class Config: + # to allow BytesIO in content field + 
arbitrary_types_allowed = True + class FilesQueueItems(BaseModel): files: dict[int, ReceivedFileItem] # [db id]: FileItem @@ -219,104 +238,33 @@ class IndexingError(BaseModel): # } -def _validate_source_ids(source_ids: list[str]) -> list[str]: - if ( - not isinstance(source_ids, list) - or not all(isinstance(sid, str) and sid.strip() != '' for sid in source_ids) - or len(source_ids) == 0 - ): - raise ValueError('sourceIds must be a non-empty list of non-empty strings') - return [sid.strip() for sid in source_ids] - - -def _validate_provider_id(provider_id: str) -> str: - if not isinstance(provider_id, str) or not is_valid_provider_id(provider_id): - raise ValueError('providerId must be a valid provider ID string') - return provider_id - - -def _validate_user_ids(user_ids: list[str]) -> list[str]: - if ( - not isinstance(user_ids, list) - or not all(isinstance(uid, str) and uid.strip() != '' for uid in user_ids) - or len(user_ids) == 0 - ): - raise ValueError('userIds must be a non-empty list of non-empty strings') - return [uid.strip() for uid in user_ids] - - class ActionPayloadDeleteSourceIds(BaseModel): - sourceIds: list[str] - - @field_validator('sourceIds', mode='after') - def validate_source_ids(self) -> Self: - self.sourceIds = _validate_source_ids(self.sourceIds) - return self + sourceIds: Annotated[list[str], AfterValidator(_validate_source_ids)] class ActionPayloadDeleteProviderId(BaseModel): - providerId: str - - @field_validator('providerId') - def validate_provider_id(self) -> Self: - self.providerId = _validate_provider_id(self.providerId) - return self + providerId: Annotated[str, AfterValidator(_validate_provider_id)] class ActionPayloadDeleteUserId(BaseModel): - userId: str - - @field_validator('userId') - def validate_user_id(self) -> Self: - self.userId = _validate_user_ids([self.userId])[0] - return self + userId: Annotated[str, AfterValidator(_validate_user_id)] class ActionPayloadUpdateAccessSourceId(BaseModel): op: UpdateAccessOp - userIds: 
list[str] - sourceId: str - - @field_validator('userIds', mode='after') - def validate_user_ids(self) -> Self: - self.userIds = _validate_user_ids(self.userIds) - return self - - @field_validator('sourceId') - def validate_source_id(self) -> Self: - self.sourceId = _validate_source_ids([self.sourceId])[0] - return self + userIds: Annotated[list[str], AfterValidator(_validate_user_ids)] + sourceId: Annotated[str, AfterValidator(_validate_source_id)] class ActionPayloadUpdateAccessProviderId(BaseModel): op: UpdateAccessOp - userIds: list[str] - providerId: str - - @field_validator('userIds', mode='after') - def validate_user_ids(self) -> Self: - self.userIds = _validate_user_ids(self.userIds) - return self - - @field_validator('providerId') - def validate_provider_id(self) -> Self: - self.providerId = _validate_provider_id(self.providerId) - return self + userIds: Annotated[list[str], AfterValidator(_validate_user_ids)] + providerId: Annotated[str, AfterValidator(_validate_provider_id)] class ActionPayloadUpdateAccessDeclSourceId(BaseModel): - userIds: list[str] - sourceId: str - - @field_validator('userIds', mode='after') - def validate_user_ids(self) -> Self: - self.userIds = _validate_user_ids(self.userIds) - return self - - @field_validator('sourceId') - def validate_source_id(self) -> Self: - self.sourceId = _validate_source_ids([self.sourceId])[0] - return self + userIds: Annotated[list[str], AfterValidator(_validate_user_ids)] + sourceId: Annotated[str, AfterValidator(_validate_source_id)] class ActionType(str, Enum): diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index 224f466e..c7e588b3 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -5,7 +5,6 @@ import logging import multiprocessing as mp import os -import re import traceback from collections.abc import Callable from functools import partial, wraps @@ -102,15 +101,6 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, 
daem return result['value'] -def is_valid_source_id(source_id: str) -> bool: - # note the ":" in the item id part - return re.match(r'^[a-zA-Z0-9_-]+__[a-zA-Z0-9_-]+: [a-zA-Z0-9:-]+$', source_id) is not None - - -def is_valid_provider_id(provider_id: str) -> bool: - return re.match(r'^[a-zA-Z0-9_-]+__[a-zA-Z0-9_-]+$', provider_id) is not None - - def timed(func: Callable): ''' Decorator to time a function From 11b436c8ce43778dbf6beda8a7e3978626e7aee5 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Wed, 11 Mar 2026 14:34:55 +0530 Subject: [PATCH 09/56] fix: use a dedicated event to allow app halt without app being disabled Signed-off-by: Anupam Kumar --- context_chat_backend/controller.py | 1 + context_chat_backend/task_fetcher.py | 28 ++++++++++++++++++++++++---- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 580416f7..55206ca0 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -88,6 +88,7 @@ async def lifespan(app: FastAPI): nc = NextcloudApp() if nc.enabled_state: app_enabled.set() + start_bg_threads(app_config, app_enabled) logger.info(f'App enable state at startup: {app_enabled.is_set()}') t = Thread(target=background_thread_task, args=()) t.start() diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 84b974b2..e93eac34 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -44,6 +44,7 @@ APP_ROLE = get_app_role() THREADS = {} +THREAD_STOP_EVENT = Event() LOGGER = logging.getLogger('ccb.task_fetcher') FILES_INDEXING_BATCH_SIZE = 64 # todo: config? 
# divides the batch into these many chunks @@ -199,8 +200,8 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro while True: - if not app_enabled.is_set(): - LOGGER.info('Files indexing thread is stopping as the app is disabled') + if THREAD_STOP_EVENT.is_set(): + LOGGER.info('Files indexing thread is stopping due to stop event being set') return try: @@ -329,8 +330,8 @@ def updates_processing_thread(app_config: TConfig, app_enabled: Event) -> None: return while True: - if not app_enabled.is_set(): - LOGGER.info('Files indexing thread is stopping as the app is disabled') + if THREAD_STOP_EVENT.is_set(): + LOGGER.info('Updates processing thread is stopping due to stop event being set') return try: @@ -490,6 +491,14 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: def start_bg_threads(app_config: TConfig, app_enabled: Event): match APP_ROLE: case AppRole.INDEXING | AppRole.NORMAL: + if ( + ThreadType.FILES_INDEXING in THREADS + or ThreadType.UPDATES_PROCESSING in THREADS + ): + LOGGER.info('Background threads already running, skipping start') + return + + THREAD_STOP_EVENT.clear() THREADS[ThreadType.FILES_INDEXING] = Thread( target=files_indexing_thread, args=(app_config, app_enabled), @@ -502,7 +511,13 @@ def start_bg_threads(app_config: TConfig, app_enabled: Event): ) THREADS[ThreadType.FILES_INDEXING].start() THREADS[ThreadType.UPDATES_PROCESSING].start() + case AppRole.RP | AppRole.NORMAL: + if ThreadType.REQUEST_PROCESSING in THREADS: + LOGGER.info('Background threads already running, skipping start') + return + + THREAD_STOP_EVENT.clear() THREADS[ThreadType.REQUEST_PROCESSING] = Thread( target=request_processing_thread, args=(app_config, app_enabled), @@ -516,12 +531,17 @@ def wait_for_bg_threads(): case AppRole.INDEXING | AppRole.NORMAL: if (ThreadType.FILES_INDEXING not in THREADS or ThreadType.UPDATES_PROCESSING not in THREADS): return + + THREAD_STOP_EVENT.set() 
THREADS[ThreadType.FILES_INDEXING].join() THREADS[ThreadType.UPDATES_PROCESSING].join() THREADS.pop(ThreadType.FILES_INDEXING) THREADS.pop(ThreadType.UPDATES_PROCESSING) + case AppRole.RP | AppRole.NORMAL: if (ThreadType.REQUEST_PROCESSING not in THREADS): return + + THREAD_STOP_EVENT.set() THREADS[ThreadType.REQUEST_PROCESSING].join() THREADS.pop(ThreadType.REQUEST_PROCESSING) From c88e15364d53764257f7fddaca76505cf27c80d9 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Wed, 11 Mar 2026 17:54:48 +0530 Subject: [PATCH 10/56] fix fetch url and pydantic types Signed-off-by: Anupam Kumar --- context_chat_backend/task_fetcher.py | 14 +++++++------- context_chat_backend/types.py | 17 +++++++++-------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index e93eac34..5784d12b 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -80,7 +80,7 @@ async def __fetch_file_content( # a file pointer for storing the stream in memory until it is consumed fp = BytesIO() await nc._session.download2fp( - url_path=f'/apps/context_chat/files/{file_id}', + url_path=f'/ocs/v2.php/apps/context_chat/files/{file_id}', fp=fp, dav=False, params={ 'userId': user_id }, @@ -209,7 +209,7 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro # todo: add the 'size' param to the return of this call. 
q_items_res = nc.ocs( 'GET', - '/apps/context_chat/queues/documents', + '/ocs/v2.php/apps/context_chat/queues/documents', params={ 'n': FILES_INDEXING_BATCH_SIZE } ) @@ -292,7 +292,7 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro try: nc.ocs( 'DELETE', - '/apps/context_chat/queues/documents/', + '/ocs/v2.php/apps/context_chat/queues/documents/', json={ 'files': to_delete_file_ids, 'content_providers': to_delete_provider_ids, @@ -308,7 +308,7 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro nc = NextcloudApp() nc.ocs( 'DELETE', - '/apps/context_chat/queues/documents/', + '/ocs/v2.php/apps/context_chat/queues/documents/', json={ 'files': to_delete_file_ids, 'content_providers': to_delete_provider_ids, @@ -338,7 +338,7 @@ def updates_processing_thread(app_config: TConfig, app_enabled: Event) -> None: nc = NextcloudApp() q_items_res = nc.ocs( 'GET', - '/apps/context_chat/queues/actions', + '/ocs/v2.php/apps/context_chat/queues/actions', params={ 'n': ACTIONS_BATCH_SIZE } ) @@ -461,7 +461,7 @@ def updates_processing_thread(app_config: TConfig, app_enabled: Event) -> None: try: nc.ocs( 'DELETE', - '/apps/context_chat/queues/actions/', + '/ocs/v2.php/apps/context_chat/queues/actions/', json={ 'actions': processed_event_ids }, ) except ( @@ -474,7 +474,7 @@ def updates_processing_thread(app_config: TConfig, app_enabled: Event) -> None: nc = NextcloudApp() nc.ocs( 'DELETE', - '/apps/context_chat/queues/actions/', + '/ocs/v2.php/apps/context_chat/queues/actions/', json={ 'ids': processed_event_ids }, ) continue diff --git a/context_chat_backend/types.py b/context_chat_backend/types.py index 8577c931..972756fa 100644 --- a/context_chat_backend/types.py +++ b/context_chat_backend/types.py @@ -136,10 +136,10 @@ class CommonSourceItem(BaseModel): # source_id of the form "appId__providerId: itemId" reference: Annotated[str, AfterValidator(_validate_source_id)] title: str - modified: int | str # todo: 
int/string? + modified: int type: str provider: Annotated[str, AfterValidator(_validate_provider_id)] - size: int + size: float @field_validator('modified', mode='before') @classmethod @@ -160,18 +160,19 @@ def validate_strings_non_empty(cls, v): raise ValueError('Must be a non-empty string') return v.strip() + @field_validator('size') + @classmethod + def validate_size(cls, v): + if isinstance(v, int | float) and v >= 0: + return float(v) + raise ValueError(f'Invalid size value: {v}, must be a non-negative number') + @model_validator(mode='after') def validate_type(self) -> Self: if self.reference.startswith(FILES_PROVIDER_ID) and self.type not in SUPPORTED_MIMETYPES: raise ValueError(f'Unsupported file type: {self.type} for reference {self.reference}') return self - @model_validator(mode='after') - def validate_size(self) -> Self: - if not isinstance(self.size, int) or self.size < 0: - raise ValueError(f'Invalid size value: {self.size}, must be a non-negative integer') - return self - class ReceivedFileItem(CommonSourceItem): content: None From cd5241e199a2ae2316d4f8f3841aa27bb7c12842 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Wed, 11 Mar 2026 18:52:35 +0530 Subject: [PATCH 11/56] fix: use the correct file id Signed-off-by: Anupam Kumar --- context_chat_backend/controller.py | 9 ++-- context_chat_backend/task_fetcher.py | 79 +++++++++++++++++----------- context_chat_backend/types.py | 22 +++++++- 3 files changed, 75 insertions(+), 35 deletions(-) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 55206ca0..797ba201 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -24,7 +24,6 @@ from contextlib import asynccontextmanager from functools import wraps from threading import Event, Thread -from time import sleep from typing import Any from fastapi import FastAPI, Request @@ -130,9 +129,11 @@ async def lifespan(app: FastAPI): # logger background thread def background_thread_task(): 
- while(True): - logger.info(f'Currently indexing {len(_indexing)} documents (filename, size): ', extra={'_indexing': _indexing}) - sleep(10) + # todo + # while(True): + # logger.info(f'Currently indexing {len(_indexing)} documents (filename, size): ', extra={'_indexing': _indexing}) + # sleep(10) + ... # exception handlers diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 5784d12b..0442cd53 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -125,15 +125,29 @@ async def __fetch_files_content( semaphore = asyncio.Semaphore(CONCURRENT_FILE_FETCHES) tasks = [] - for file_id, file_item in files.items(): - if file_item.size > MAX_FILE_SIZE: + for db_id, file in files.items(): + try: + # to detect any validation errors but it should not happen since file.reference is validated + file.file_id # noqa: B018 + except ValueError as e: + LOGGER.error( + f'Invalid file reference format for db id {db_id}, file reference {file.reference}: {e}', + exc_info=e, + ) + source_items[db_id] = IndexingError( + error=f'Invalid file reference format: {file.reference}', + retryable=False, + ) + continue + + if file.size > MAX_FILE_SIZE: LOGGER.info( - f'Skipping file id {file_id}, source id {file_item.reference} due to size' - f' {(file_item.size/(1024*1024)):.2f} MiB exceeding the limit {(MAX_FILE_SIZE/(1024*1024)):.2f} MiB', + f'Skipping db id {db_id}, file id {file.file_id}, source id {file.reference} due to size' + f' {(file.size/(1024*1024)):.2f} MiB exceeding the limit {(MAX_FILE_SIZE/(1024*1024)):.2f} MiB', ) - source_items[file_id] = IndexingError( + source_items[db_id] = IndexingError( error=( - f'File size {(file_item.size/(1024*1024)):.2f} MiB' + f'File size {(file.size/(1024*1024)):.2f} MiB' f' exceeds the limit {(MAX_FILE_SIZE/(1024*1024)):.2f} MiB' ), retryable=False, @@ -141,39 +155,44 @@ async def __fetch_files_content( continue # todo: perform the existing file check before fetching 
the content to avoid unnecessary fetches # any user id from the list should have read access to the file - tasks.append(asyncio.ensure_future(__fetch_file_content(semaphore, file_id, file_item.userIds[0]))) + tasks.append(asyncio.ensure_future(__fetch_file_content(semaphore, file.file_id, file.userIds[0]))) results = await asyncio.gather(*tasks, return_exceptions=True) - for (file_id, file_item), result in zip(files.items(), results, strict=True): + for (db_id, file), result in zip(files.items(), results, strict=True): if isinstance(result, IndexingException): LOGGER.error( - f'Error fetching content for file id {file_id}, reference {file_item.reference}: {result}', + f'Error fetching content for db id {db_id}, file id {file.file_id}, reference {file.reference}' + f': {result}', exc_info=result, ) - source_items[file_id] = IndexingError( + source_items[db_id] = IndexingError( error=str(result), retryable=result.retryable, ) elif isinstance(result, str) or isinstance(result, BytesIO): - source_items[file_id] = SourceItem( - **file_item.model_dump(), - content=result, + source_items[db_id] = SourceItem( + **{ + **file.model_dump(), + 'content': result, + } ) elif isinstance(result, BaseException): LOGGER.error( - f'Unexpected error fetching content for file id {file_id}, reference {file_item.reference}: {result}', + f'Unexpected error fetching content for db id {db_id}, file id {file.file_id},' + f' reference {file.reference}: {result}', exc_info=result, ) - source_items[file_id] = IndexingError( + source_items[db_id] = IndexingError( error=f'Unexpected error: {result}', retryable=True, ) else: LOGGER.error( - f'Unknown error fetching content for file id {file_id}, reference {file_item.reference}: {result}', + f'Unknown error fetching content for db id {db_id}, file id {file.file_id}, reference {file.reference}' + f': {result}', exc_info=True, ) - source_items[file_id] = IndexingError( + source_items[db_id] = IndexingError( error='Unknown error', retryable=True, ) @@ 
-232,11 +251,11 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro if q_items.files: fetched_files = asyncio.run(__fetch_files_content(q_items.files)) - for file_id, result in fetched_files.items(): + for db_id, result in fetched_files.items(): if isinstance(result, SourceItem): - source_files[file_id] = result + source_files[db_id] = result else: - source_errors[file_id] = result + source_errors[db_id] = result files_result = {} providers_result = {} @@ -257,8 +276,8 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro ): LOGGER.error('Some sources failed to index', extra={ 'file_errors': { - file_id: error - for file_id, error in files_result.items() + db_id: error + for db_id, error in files_result.items() if isinstance(error, IndexingError) }, 'provider_errors': { @@ -280,12 +299,12 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro continue # delete the entries from the PHP side queue where indexing succeeded or the error is not retryable - to_delete_file_ids = [ - file_id for file_id, result in files_result.items() + to_delete_files_db_ids = [ + db_id for db_id, result in files_result.items() if result is None or (isinstance(result, IndexingError) and not result.retryable) ] - to_delete_provider_ids = [ - provider_id for provider_id, result in providers_result.items() + to_delete_provider_db_ids = [ + db_id for db_id, result in providers_result.items() if result is None or (isinstance(result, IndexingError) and not result.retryable) ] @@ -294,8 +313,8 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro 'DELETE', '/ocs/v2.php/apps/context_chat/queues/documents/', json={ - 'files': to_delete_file_ids, - 'content_providers': to_delete_provider_ids, + 'files': to_delete_files_db_ids, + 'content_providers': to_delete_provider_db_ids, }, ) except ( @@ -310,8 +329,8 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, 
IndexingErro 'DELETE', '/ocs/v2.php/apps/context_chat/queues/documents/', json={ - 'files': to_delete_file_ids, - 'content_providers': to_delete_provider_ids, + 'files': to_delete_files_db_ids, + 'content_providers': to_delete_provider_db_ids, }, ) continue diff --git a/context_chat_backend/types.py b/context_chat_backend/types.py index 972756fa..9f23e14f 100644 --- a/context_chat_backend/types.py +++ b/context_chat_backend/types.py @@ -7,7 +7,7 @@ from io import BytesIO from typing import Annotated, Literal, Self -from pydantic import AfterValidator, BaseModel, Discriminator, field_validator, model_validator +from pydantic import AfterValidator, BaseModel, Discriminator, computed_field, field_validator, model_validator from .mimetype_list import SUPPORTED_MIMETYPES from .vectordb.types import UpdateAccessOp @@ -69,6 +69,21 @@ def _validate_user_id(user_id: str) -> str: return _validate_user_ids([user_id])[0] +def _get_file_id_from_source_ref(source_ref: str) -> int: + ''' + source reference is in the format "FILES_PROVIDER_ID: ". 
+ ''' + if not source_ref.startswith(f'{FILES_PROVIDER_ID}: '): + raise ValueError(f'Source reference does not start with expected prefix: {source_ref}') + + try: + return int(source_ref[len(f'{FILES_PROVIDER_ID}: '):]) + except ValueError as e: + raise ValueError( + f'Invalid source reference format for extracting file_id: {source_ref}' + ) from e + + class TEmbeddingAuthApiKey(BaseModel): apikey: str @@ -177,6 +192,11 @@ def validate_type(self) -> Self: class ReceivedFileItem(CommonSourceItem): content: None + @computed_field + @property + def file_id(self) -> int: + return _get_file_id_from_source_ref(self.reference) + class SourceItem(CommonSourceItem): ''' From 4958d1d980b0d0741762ffc9c3eac3ff91e5c2b0 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Wed, 11 Mar 2026 19:24:51 +0530 Subject: [PATCH 12/56] fix: wip: improve embeddings exception handling Signed-off-by: Anupam Kumar --- context_chat_backend/network_em.py | 13 +++++++++---- context_chat_backend/task_fetcher.py | 1 + context_chat_backend/vectordb/pgvector.py | 17 ++++++----------- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/context_chat_backend/network_em.py b/context_chat_backend/network_em.py index 18bb11f4..d39ea56a 100644 --- a/context_chat_backend/network_em.py +++ b/context_chat_backend/network_em.py @@ -79,6 +79,7 @@ def _get_embedding(self, input_: str | list[str], try_: int = 3) -> list[float] raise FatalEmbeddingException(response.text) if response.status_code // 100 != 2: raise EmbeddingException(response.text) + # todo: rework exception handling and their downstream interpretation except FatalEmbeddingException as e: logger.error('Fatal error while getting embeddings: %s', str(e), exc_info=e) raise e @@ -108,10 +109,14 @@ def _get_embedding(self, input_: str | list[str], try_: int = 3) -> list[float] logger.error('Unexpected error while getting embeddings', exc_info=e) raise EmbeddingException('Error: unexpected error while getting embeddings') from e - # converts 
TypedDict to a pydantic model - resp = CreateEmbeddingResponse(**response.json()) - if isinstance(input_, str): - return resp['data'][0]['embedding'] + try: + # converts TypedDict to a pydantic model + resp = CreateEmbeddingResponse(**response.json()) + if isinstance(input_, str): + return resp['data'][0]['embedding'] + except Exception as e: + logger.error('Error parsing embedding response', exc_info=e) + raise EmbeddingException('Error: failed to parse embedding response') from e # only one embedding in d['embedding'] since truncate is True return [d['embedding'] for d in resp['data']] # pyright: ignore[reportReturnType] diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 0442cd53..51f98e7d 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -261,6 +261,7 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro providers_result = {} chunk_size = FILES_INDEXING_BATCH_SIZE // PARALLEL_FILE_PARSING + # todo: do it in asyncio, it's not truly parallel yet # chunk file parsing for better file operation parallelism for i in range(0, len(source_files), chunk_size): chunk = dict(list(source_files.items())[i:i+chunk_size]) diff --git a/context_chat_backend/vectordb/pgvector.py b/context_chat_backend/vectordb/pgvector.py index 8bcc6f4c..bfca0bb6 100644 --- a/context_chat_backend/vectordb/pgvector.py +++ b/context_chat_backend/vectordb/pgvector.py @@ -17,7 +17,7 @@ from langchain_postgres.vectorstores import Base, PGVector from ..chain.types import InDocument, ScopeType -from ..types import EmbeddingException, IndexingError, RetryableEmbeddingException, SourceItem +from ..types import EmbeddingException, FatalEmbeddingException, IndexingError, RetryableEmbeddingException, SourceItem from ..utils import timed from .base import BaseVectorDB from .types import DbException, SafeDbException, UpdateAccessOp @@ -181,7 +181,11 @@ def add_indocuments(self, indocuments: 
dict[int, InDocument]) -> dict[int, Index retryable=True, ) continue - except RetryableEmbeddingException as e: + except FatalEmbeddingException as e: + raise EmbeddingException( + f'Fatal error while embedding documents for source {indoc.source_id}: {e}' + ) from e + except (RetryableEmbeddingException, EmbeddingException) as e: # temporary error, continue with the next document logger.exception('Error adding documents to vectordb, should be retried later.', exc_info=e, extra={ 'source_id': indoc.source_id, @@ -191,15 +195,6 @@ def add_indocuments(self, indocuments: dict[int, InDocument]) -> dict[int, Index retryable=True, ) continue - except EmbeddingException as e: - logger.exception('Error adding documents to vectordb', exc_info=e, extra={ - 'source_id': indoc.source_id, - }) - results[php_db_id] = IndexingError( - error=str(e), - retryable=False, - ) - continue except Exception as e: logger.exception('Error adding documents to vectordb', exc_info=e, extra={ 'source_id': indoc.source_id, From a04912120965d8ff9a285eac559794b716a595ce Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Wed, 11 Mar 2026 19:44:06 +0530 Subject: [PATCH 13/56] fix(ci): update to the latest changes Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 104 ++++++++++++++++++------- 1 file changed, 76 insertions(+), 28 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index fb06bafa..9563bcdd 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -199,26 +199,87 @@ jobs: ls -la context_chat_backend/persistent_storage/* sleep 30 # Wait for the em server to get ready - - name: Scan files, baseline - run: | - ./occ files:scan admin - ./occ context_chat:scan admin -m text/plain - - - name: Check python memory usage + - name: Initial memory usage check run: | ps -p $(cat pid.txt) -o pid,cmd,%mem,rss --sort=-%mem ps -p $(cat pid.txt) -o %mem --no-headers > initial_mem.txt - - 
name: Scan files - run: | - ./occ files:scan admin - ./occ context_chat:scan admin -m text/markdown & - ./occ context_chat:scan admin -m text/x-rst - - - name: Check python memory usage + - name: Periodically check context_chat stats for 15 minutes to allow the backend to index the files run: | - ps -p $(cat pid.txt) -o pid,cmd,%mem,rss --sort=-%mem - ps -p $(cat pid.txt) -o %mem --no-headers > after_scan_mem.txt + success=0 + for i in {1..90}; do + echo "Checking stats, attempt $i..." + + mkfifo error_pipe + stats=$(timeout 5 ./occ context_chat:stats 2>error_pipe) + echo "Stats output:" + echo "$stats" + echo "---" + + # Check for critical errors in output + if echo "$stats" | grep -q "Error during request"; then + echo "Backend connection error detected, retrying..." + rm -f error_pipe + sleep 10 + continue + fi + + # Extract Total eligible files + total_files=$(echo "$stats" | grep -oP 'Total eligible files:\s*\K\d+' || echo "") + + # Extract Indexed documents count (files__default) + indexed_count=$(echo "$stats" | grep -oP "'files__default'\s*=>\s*\K\d+" || echo "") + + # Validate parsed values + if [ -z "$total_files" ] || [ -z "$indexed_count" ]; then + echo "Error: Could not parse stats output properly" + if echo "$stats" | grep -q "Indexed documents:"; then + echo " Indexed documents section found but could not extract count" + fi + rm -f error_pipe + sleep 10 + continue + fi + + echo "Total eligible files: $total_files" + echo "Indexed documents (files__default): $indexed_count" + + # Calculate absolute difference + diff=$((total_files - indexed_count)) + if [ $diff -lt 0 ]; then + diff=$((-diff)) + fi + + # Calculate 2% threshold using bc for floating point support + threshold=$(echo "scale=4; $total_files * 0.02" | bc) + + # Check if difference is within tolerance + if (( $(echo "$diff <= $threshold" | bc -l) )); then + echo "Indexing within 2% tolerance (diff=$diff, threshold=$threshold)" + rm -f error_pipe + success=1 + break + else + pct=$(echo 
"scale=2; ($diff / $total_files) * 100" | bc) + echo "Outside 2% tolerance: diff=$diff (${pct}%), threshold=$threshold" + fi + + # Check if backend is still alive + ccb_alive=$(ps -p $(cat pid.txt) -o cmd= | grep -c "main.py" || echo "0") + if [ "$ccb_alive" -eq 0 ]; then + echo "Error: Context Chat Backend process is not running. Exiting." + rm -f error_pipe + exit 1 + fi + + rm -f error_pipe + sleep 10 + done + + if [ $success -ne 1 ]; then + echo "Max attempts reached" + exit 1 + fi - name: Run the prompts run: | @@ -252,19 +313,6 @@ jobs: echo "Memory usage during scan is stable. No memory leak detected." fi - - name: Compare memory usage and detect leak - run: | - initial_mem=$(cat after_scan_mem.txt | tr -d ' ') - final_mem=$(cat after_prompt_mem.txt | tr -d ' ') - echo "Initial Memory Usage: $initial_mem%" - echo "Memory Usage after prompt: $final_mem%" - - if (( $(echo "$final_mem > $initial_mem" | bc -l) )); then - echo "Memory usage has increased during prompt. Possible memory leak detected!" - else - echo "Memory usage during prompt is stable. No memory leak detected." - fi - - name: Show server logs if: always() run: | From 795380c7c62ce5f60f80aa16ffa1e7568133f03e Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 12 Mar 2026 16:10:58 +0530 Subject: [PATCH 14/56] fix(ci): use file to store stderr Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 9563bcdd..de0f4659 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -210,16 +210,21 @@ jobs: for i in {1..90}; do echo "Checking stats, attempt $i..." - mkfifo error_pipe - stats=$(timeout 5 ./occ context_chat:stats 2>error_pipe) + stats_err=$(mktemp) + stats=$(timeout 5 ./occ context_chat:stats 2>"$stats_err") + stats_exit=$? 
echo "Stats output:" echo "$stats" + if [ -s "$stats_err" ]; then + echo "Stderr:" + cat "$stats_err" + fi echo "---" + rm -f "$stats_err" # Check for critical errors in output - if echo "$stats" | grep -q "Error during request"; then - echo "Backend connection error detected, retrying..." - rm -f error_pipe + if [ $stats_exit -ne 0 ] || echo "$stats" | grep -q "Error during request"; then + echo "Backend connection error detected (exit=$stats_exit), retrying..." sleep 10 continue fi @@ -236,7 +241,6 @@ jobs: if echo "$stats" | grep -q "Indexed documents:"; then echo " Indexed documents section found but could not extract count" fi - rm -f error_pipe sleep 10 continue fi @@ -256,7 +260,6 @@ jobs: # Check if difference is within tolerance if (( $(echo "$diff <= $threshold" | bc -l) )); then echo "Indexing within 2% tolerance (diff=$diff, threshold=$threshold)" - rm -f error_pipe success=1 break else @@ -268,11 +271,9 @@ jobs: ccb_alive=$(ps -p $(cat pid.txt) -o cmd= | grep -c "main.py" || echo "0") if [ "$ccb_alive" -eq 0 ]; then echo "Error: Context Chat Backend process is not running. Exiting." 
- rm -f error_pipe exit 1 fi - rm -f error_pipe sleep 10 done From 7bc0ed7c3c535f930f03cc38c4dd884b5370696c Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 12 Mar 2026 17:17:38 +0530 Subject: [PATCH 15/56] fix(ci): add cron jobs Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index de0f4659..0d8e4229 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -204,9 +204,18 @@ jobs: ps -p $(cat pid.txt) -o pid,cmd,%mem,rss --sort=-%mem ps -p $(cat pid.txt) -o %mem --no-headers > initial_mem.txt + - name: Run cron jobs + run: | + # every 10 seconds indefinitely + while true; do + php cron.php + sleep 10 + done & + - name: Periodically check context_chat stats for 15 minutes to allow the backend to index the files run: | success=0 + echo "::group::Checking stats periodically for 15 minutes to allow the backend to index the files" for i in {1..90}; do echo "Checking stats, attempt $i..." @@ -277,6 +286,10 @@ jobs: sleep 10 done + echo "::endgroup::" + + ./occ context_chat:stats + if [ $success -ne 1 ]; then echo "Max attempts reached" exit 1 From d94c687e057a7049e6b0f1f32b580f326692acd3 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 12 Mar 2026 17:35:47 +0530 Subject: [PATCH 16/56] fix(ci): do a occ files scan before cron jobs Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 0d8e4229..58f9f50c 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -169,6 +169,10 @@ jobs: cd .. 
rm -rf documentation + - name: Run files scan + run: | + ./occ files:scan --all + - name: Setup python 3.11 uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5 with: From dadc8fa7d193f40ddacffecf6266d8a2b37a6817 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Mon, 16 Mar 2026 20:09:30 +0530 Subject: [PATCH 17/56] feat: record indexing errors in content decode function Signed-off-by: Anupam Kumar --- .../chain/ingest/doc_loader.py | 44 +++++++++---------- context_chat_backend/chain/ingest/injest.py | 20 ++++++--- 2 files changed, 36 insertions(+), 28 deletions(-) diff --git a/context_chat_backend/chain/ingest/doc_loader.py b/context_chat_backend/chain/ingest/doc_loader.py index d26f74b1..832c8331 100644 --- a/context_chat_backend/chain/ingest/doc_loader.py +++ b/context_chat_backend/chain/ingest/doc_loader.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # -import logging import re import tempfile from collections.abc import Callable @@ -18,9 +17,8 @@ from pypdf.errors import FileNotDecryptedError as PdfFileNotDecryptedError from striprtf import striprtf -from ...types import SourceItem +from ...types import IndexingException, SourceItem -logger = logging.getLogger('ccb.doc_loader') def _temp_file_wrapper(file: BytesIO, loader: Callable, sep: str = '\n') -> str: raw_bytes = file.read() @@ -75,10 +73,10 @@ def _load_xlsx(file: BytesIO) -> str: return read_excel(file, na_filter=False).to_string(header=False, na_rep='') -def _load_email(file: BytesIO, ext: str = 'eml') -> str | None: +def _load_email(file: BytesIO, ext: str = 'eml') -> str: # NOTE: msg format is not tested if ext not in ['eml', 'msg']: - return None + raise IndexingException(f'Unsupported email format: {ext}') # TODO: implement attachment partitioner using unstructured.partition.partition_{email,msg} # since langchain does not pass through the attachment_partitioner kwarg @@ -116,34 +114,36 @@ def attachment_partitioner( } -def decode_source(source: SourceItem) -> 
str | None: +def decode_source(source: SourceItem) -> str: + ''' + Raises + ------ + IndexingException + ''' + io_obj: BytesIO | None = None try: # .pot files are powerpoint templates but also plain text files, # so we skip them to prevent decoding errors if source.title.endswith('.pot'): - return None - - mimetype = source.type - if mimetype is None: - return None + raise IndexingException('PowerPoint template files (.pot) are not supported') if isinstance(source.content, str): io_obj = BytesIO(source.content.encode('utf-8', 'ignore')) else: io_obj = source.content - if _loader_map.get(mimetype): - result = _loader_map[mimetype](io_obj) - return result.encode('utf-8', 'ignore').decode('utf-8', 'ignore') - - return io_obj.read().decode('utf-8', 'ignore') - except PdfFileNotDecryptedError: - logger.warning(f'PDF file ({source.reference}) is encrypted and cannot be read') - return None - except Exception: - logger.exception(f'Error decoding source file ({source.reference})', stack_info=True) - return None + if _loader_map.get(source.type): + result = _loader_map[source.type](io_obj) + return result.encode('utf-8', 'ignore').decode('utf-8', 'ignore').strip() + + return io_obj.read().decode('utf-8', 'ignore').strip() + except IndexingException: + raise + except PdfFileNotDecryptedError as e: + raise IndexingException('PDF file is encrypted and cannot be read') from e + except Exception as e: + raise IndexingException(f'Error decoding source file: {e}') from e finally: if io_obj is not None: io_obj.close() diff --git a/context_chat_backend/chain/ingest/injest.py b/context_chat_backend/chain/ingest/injest.py index 7369f452..d9ea5433 100644 --- a/context_chat_backend/chain/ingest/injest.py +++ b/context_chat_backend/chain/ingest/injest.py @@ -8,7 +8,7 @@ from langchain.schema import Document from ...dyn_loader import VectorDBLoader -from ...types import IndexingError, SourceItem, TConfig +from ...types import IndexingError, IndexingException, SourceItem, TConfig from 
...vectordb.base import BaseVectorDB from ...vectordb.types import DbException, SafeDbException, UpdateAccessOp from ..types import InDocument @@ -59,9 +59,17 @@ def _sources_to_indocuments( # todo: maybe fetch the content of the files here # transform the source to have text data - content = decode_source(source) + try: + content = decode_source(source) + except IndexingException as e: + logger.error(f'Error decoding source ({source.reference}): {e}', exc_info=e) + errored_docs[db_id] = IndexingError( + error=str(e), + retryable=False, + ) + continue - if content is None or (content := content.strip()) == '': + if content == '': logger.debug('decoded empty source', extra={ 'source_id': source.reference }) errored_docs[db_id] = IndexingError( error='Decoded content is empty', @@ -74,12 +82,12 @@ def _sources_to_indocuments( # NOTE: do not use this with all docs when programming files are added content = re.sub(r'(\s){5,}', r'\g<1>', content) # filter out null bytes - content = content.replace('\0', '') + content = content.replace('\0', '').strip() - if content is None or content == '': + if content == '': logger.debug('decoded empty source after cleanup', extra={ 'source_id': source.reference }) errored_docs[db_id] = IndexingError( - error='Decoded content is empty', + error='Cleaned up content is empty', retryable=False, ) continue From f9d86dcf1ddac21e61edcc3698b79e0a69475a24 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 17 Mar 2026 20:27:10 +0530 Subject: [PATCH 18/56] chore: move file fetch inside injest Signed-off-by: Anupam Kumar --- context_chat_backend/chain/ingest/injest.py | 197 ++++++++++++++++++-- context_chat_backend/task_fetcher.py | 173 +---------------- context_chat_backend/types.py | 7 +- context_chat_backend/vectordb/base.py | 11 +- context_chat_backend/vectordb/pgvector.py | 14 +- 5 files changed, 208 insertions(+), 194 deletions(-) diff --git a/context_chat_backend/chain/ingest/injest.py b/context_chat_backend/chain/ingest/injest.py 
index d9ea5433..18a37b4b 100644 --- a/context_chat_backend/chain/ingest/injest.py +++ b/context_chat_backend/chain/ingest/injest.py @@ -2,13 +2,18 @@ # SPDX-FileCopyrightText: 2023 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # +import asyncio import logging import re +from collections.abc import Mapping +from io import BytesIO +import niquests from langchain.schema import Document +from nc_py_api import AsyncNextcloudApp from ...dyn_loader import VectorDBLoader -from ...types import IndexingError, IndexingException, SourceItem, TConfig +from ...types import IndexingError, IndexingException, ReceivedFileItem, SourceItem, TConfig from ...vectordb.base import BaseVectorDB from ...vectordb.types import DbException, SafeDbException, UpdateAccessOp from ..types import InDocument @@ -17,15 +22,165 @@ logger = logging.getLogger('ccb.injest') +# max concurrent fetches to avoid overloading the NC server or hitting rate limits +CONCURRENT_FILE_FETCHES = 10 # todo: config? +MAX_FILE_SIZE = 100 * 1024 * 1024 # 100 MB, all loaded in RAM at once, todo: config? + + +async def __fetch_file_content( + semaphore: asyncio.Semaphore, + file_id: int, + user_id: str, + _rlimit = 3, +) -> BytesIO: + ''' + Raises + ------ + IndexingException + ''' + + async with semaphore: + nc = AsyncNextcloudApp() + try: + # a file pointer for storing the stream in memory until it is consumed + fp = BytesIO() + await nc._session.download2fp( + url_path=f'/ocs/v2.php/apps/context_chat/files/{file_id}', + fp=fp, + dav=False, + params={ 'userId': user_id }, + ) + return fp + except niquests.exceptions.RequestException as e: + if e.response is None: + raise + + if e.response.status_code == niquests.codes.too_many_requests: # pyright: ignore[reportAttributeAccessIssue] + # todo: implement rate limits in php CC? 
+ wait_for = int(e.response.headers.get('Retry-After', '30')) + if _rlimit <= 0: + raise IndexingException( + f'Rate limited when fetching content for file id {file_id}, user id {user_id},' + ' max retries exceeded', + retryable=True, + ) from e + logger.warning( + f'Rate limited when fetching content for file id {file_id}, user id {user_id},' + f' waiting {wait_for} before retrying', + exc_info=e, + ) + await asyncio.sleep(wait_for) + return await __fetch_file_content(semaphore, file_id, user_id, _rlimit - 1) + + raise + except IndexingException: + raise + except Exception as e: + logger.error(f'Error fetching content for file id {file_id}, user id {user_id}: {e}', exc_info=e) + raise IndexingException(f'Error fetching content for file id {file_id}, user id {user_id}: {e}') from e + + +async def __fetch_files_content( + sources: Mapping[int, SourceItem | ReceivedFileItem] +) -> tuple[Mapping[int, SourceItem], Mapping[int, IndexingError]]: + source_items = {} + error_items = {} + semaphore = asyncio.Semaphore(CONCURRENT_FILE_FETCHES) + tasks = [] + + for db_id, file in sources.items(): + if isinstance(file, SourceItem): + continue + + try: + # to detect any validation errors but it should not happen since file.reference is validated + file.file_id # noqa: B018 + except ValueError as e: + logger.error( + f'Invalid file reference format for db id {db_id}, file reference {file.reference}: {e}', + exc_info=e, + ) + error_items[db_id] = IndexingError( + error=f'Invalid file reference format: {file.reference}', + retryable=False, + ) + continue + + if file.size > MAX_FILE_SIZE: + logger.info( + f'Skipping db id {db_id}, file id {file.file_id}, source id {file.reference} due to size' + f' {(file.size/(1024*1024)):.2f} MiB exceeding the limit {(MAX_FILE_SIZE/(1024*1024)):.2f} MiB', + ) + error_items[db_id] = IndexingError( + error=( + f'File size {(file.size/(1024*1024)):.2f} MiB' + f' exceeds the limit {(MAX_FILE_SIZE/(1024*1024)):.2f} MiB' + ), + retryable=False, + ) + 
continue + # any user id from the list should have read access to the file + tasks.append(asyncio.ensure_future(__fetch_file_content(semaphore, file.file_id, file.userIds[0]))) + + results = await asyncio.gather(*tasks, return_exceptions=True) + for (db_id, file), result in zip(sources.items(), results, strict=True): + if isinstance(file, SourceItem): + continue + + if isinstance(result, IndexingException): + logger.error( + f'Error fetching content for db id {db_id}, file id {file.file_id}, reference {file.reference}' + f': {result}', + exc_info=result, + ) + error_items[db_id] = IndexingError( + error=str(result), + retryable=result.retryable, + ) + elif isinstance(result, str) or isinstance(result, BytesIO): + source_items[db_id] = SourceItem( + **{ + **file.model_dump(), + 'content': result, + } + ) + elif isinstance(result, BaseException): + logger.error( + f'Unexpected error fetching content for db id {db_id}, file id {file.file_id},' + f' reference {file.reference}: {result}', + exc_info=result, + ) + error_items[db_id] = IndexingError( + error=f'Unexpected error: {result}', + retryable=True, + ) + else: + logger.error( + f'Unknown error fetching content for db id {db_id}, file id {file.file_id}, reference {file.reference}' + f': {result}', + exc_info=True, + ) + error_items[db_id] = IndexingError( + error='Unknown error', + retryable=True, + ) + + # add the content providers from the orginal "sources" to the result unprocessed + for db_id, source in sources.items(): + if isinstance(source, SourceItem): + source_items[db_id] = source + + return source_items, error_items + def _filter_sources( vectordb: BaseVectorDB, - sources: dict[int, SourceItem] -) -> tuple[dict[int, SourceItem], dict[int, SourceItem]]: + sources: Mapping[int, SourceItem | ReceivedFileItem] +) -> tuple[Mapping[int, SourceItem | ReceivedFileItem], Mapping[int, SourceItem | ReceivedFileItem]]: ''' Returns ------- - tuple[list[str], list[UploadFile]] + tuple[Mapping[int, SourceItem | 
ReceivedFileItem], Mapping[int, SourceItem | ReceivedFileItem]]: First value is a list of sources that already exist in the vectordb. Second value is a list of sources that are new and should be embedded. ''' @@ -49,15 +204,14 @@ def _filter_sources( def _sources_to_indocuments( config: TConfig, - sources: dict[int, SourceItem] -) -> tuple[dict[int, InDocument], dict[int, IndexingError]]: + sources: Mapping[int, SourceItem] +) -> tuple[Mapping[int, InDocument], Mapping[int, IndexingError]]: indocuments = {} errored_docs = {} for db_id, source in sources.items(): logger.debug('processing source', extra={ 'source_id': source.reference }) - # todo: maybe fetch the content of the files here # transform the source to have text data try: content = decode_source(source) @@ -121,8 +275,8 @@ def _sources_to_indocuments( def _increase_access_for_existing_sources( vectordb: BaseVectorDB, - existing_sources: dict[int, SourceItem] -) -> dict[int, IndexingError | None]: + existing_sources: Mapping[int, SourceItem | ReceivedFileItem] +) -> Mapping[int, IndexingError | None]: ''' update userIds for existing sources allow the userIds as additional users, not as the only users @@ -162,8 +316,8 @@ def _increase_access_for_existing_sources( def _process_sources( vectordb: BaseVectorDB, config: TConfig, - sources: dict[int, SourceItem] -) -> dict[int, IndexingError | None]: + sources: Mapping[int, SourceItem | ReceivedFileItem] +) -> Mapping[int, IndexingError | None]: ''' Processes the sources and adds them to the vectordb. Returns the list of source ids that were successfully added and those that need to be retried. 
@@ -178,18 +332,21 @@ def _process_sources( source_proc_results = _increase_access_for_existing_sources(vectordb, existing_sources) - if len(to_embed_sources) == 0: + populated_to_embed_sources, errored_sources = asyncio.run(__fetch_files_content(to_embed_sources)) + source_proc_results.update(errored_sources) # pyright: ignore[reportAttributeAccessIssue] + + if len(populated_to_embed_sources) == 0: # no new sources to embed logger.debug('Filtered all sources, nothing to embed') return source_proc_results logger.debug('Filtered sources:', extra={ - 'source_ids': [source.reference for source in to_embed_sources.values()] + 'source_ids': [source.reference for source in populated_to_embed_sources.values()] }) # invalid/empty sources are filtered out here and not counted in loaded/retryable - indocuments, errored_docs = _sources_to_indocuments(config, to_embed_sources) + indocuments, errored_docs = _sources_to_indocuments(config, populated_to_embed_sources) - source_proc_results.update(errored_docs) + source_proc_results.update(errored_docs) # pyright: ignore[reportAttributeAccessIssue] logger.debug('Converted sources to documents') if len(indocuments) == 0: @@ -197,8 +354,12 @@ def _process_sources( logger.debug('All documents were found empty after being processed') return source_proc_results + logger.debug('Adding documents to vectordb', extra={ + 'source_ids': [indoc.source_id for indoc in indocuments.values()] + }) + doc_add_results = vectordb.add_indocuments(indocuments) - source_proc_results.update(doc_add_results) + source_proc_results.update(doc_add_results) # pyright: ignore[reportAttributeAccessIssue] logger.debug('Added documents to vectordb') return source_proc_results @@ -215,8 +376,8 @@ def _decode_latin_1(s: str) -> str: def embed_sources( vectordb_loader: VectorDBLoader, config: TConfig, - sources: dict[int, SourceItem] -) -> dict[int, IndexingError | None]: + sources: Mapping[int, SourceItem | ReceivedFileItem] +) -> Mapping[int, IndexingError | 
None]: logger.debug('Embedding sources:', extra={ 'source_ids': [ f'{source.reference} ({_decode_latin_1(source.title)})' diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 51f98e7d..28aff6a0 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -3,17 +3,16 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # -import asyncio import logging import os +from collections.abc import Mapping from contextlib import suppress from enum import Enum -from io import BytesIO from threading import Event, Thread from time import sleep import niquests -from nc_py_api import AsyncNextcloudApp, NextcloudApp +from nc_py_api import NextcloudApp from pydantic import ValidationError from .chain.ingest.injest import embed_sources @@ -25,7 +24,6 @@ EmbeddingException, FilesQueueItems, IndexingError, - IndexingException, LoaderException, ReceivedFileItem, SourceItem, @@ -46,12 +44,10 @@ THREADS = {} THREAD_STOP_EVENT = Event() LOGGER = logging.getLogger('ccb.task_fetcher') -FILES_INDEXING_BATCH_SIZE = 64 # todo: config? +FILES_INDEXING_BATCH_SIZE = 16 # theoretical max RAM usage: 16 * 100 MiB, todo: config? +MIN_FILES_PER_CPU = 4 # divides the batch into these many chunks PARALLEL_FILE_PARSING = max(1, (os.cpu_count() or 2) - 1) # todo: config? -# max concurrent fetches to avoid overloading the NC server or hitting rate limits -CONCURRENT_FILE_FETCHES = 10 # todo: config? -MAX_FILE_SIZE = 100 * 1024 * 1024 # 100 MB, todo: config? ACTIONS_BATCH_SIZE = 512 # todo: config? 
POLLING_COOLDOWN = 30 @@ -62,143 +58,6 @@ class ThreadType(Enum): REQUEST_PROCESSING = 'request_processing' -async def __fetch_file_content( - semaphore: asyncio.Semaphore, - file_id: int, - user_id: str, - _rlimit = 3, -) -> BytesIO: - ''' - Raises - ------ - IndexingException - ''' - - async with semaphore: - nc = AsyncNextcloudApp() - try: - # a file pointer for storing the stream in memory until it is consumed - fp = BytesIO() - await nc._session.download2fp( - url_path=f'/ocs/v2.php/apps/context_chat/files/{file_id}', - fp=fp, - dav=False, - params={ 'userId': user_id }, - ) - return fp - except niquests.exceptions.RequestException as e: - # todo: raise IndexingException with retryable=True for rate limit errors, - # todo: and handle it in the caller to not delete the source from the queue and retry later through - # todo: the normal lock expiry mechanism - if e.response is None: - raise - - if e.response.status_code == niquests.codes.too_many_requests: # pyright: ignore[reportAttributeAccessIssue] - # todo: implement rate limits in php CC? 
- wait_for = int(e.response.headers.get('Retry-After', '30')) - if _rlimit <= 0: - raise IndexingException( - f'Rate limited when fetching content for file id {file_id}, user id {user_id},' - ' max retries exceeded', - retryable=True, - ) from e - LOGGER.warning( - f'Rate limited when fetching content for file id {file_id}, user id {user_id},' - f' waiting {wait_for} before retrying', - exc_info=e, - ) - await asyncio.sleep(wait_for) - return await __fetch_file_content(semaphore, file_id, user_id, _rlimit - 1) - - raise - except IndexingException: - raise - except Exception as e: - LOGGER.error(f'Error fetching content for file id {file_id}, user id {user_id}: {e}', exc_info=e) - raise IndexingException(f'Error fetching content for file id {file_id}, user id {user_id}: {e}') from e - - -async def __fetch_files_content( - files: dict[int, ReceivedFileItem] -) -> dict[int, SourceItem | IndexingError]: - source_items = {} - semaphore = asyncio.Semaphore(CONCURRENT_FILE_FETCHES) - tasks = [] - - for db_id, file in files.items(): - try: - # to detect any validation errors but it should not happen since file.reference is validated - file.file_id # noqa: B018 - except ValueError as e: - LOGGER.error( - f'Invalid file reference format for db id {db_id}, file reference {file.reference}: {e}', - exc_info=e, - ) - source_items[db_id] = IndexingError( - error=f'Invalid file reference format: {file.reference}', - retryable=False, - ) - continue - - if file.size > MAX_FILE_SIZE: - LOGGER.info( - f'Skipping db id {db_id}, file id {file.file_id}, source id {file.reference} due to size' - f' {(file.size/(1024*1024)):.2f} MiB exceeding the limit {(MAX_FILE_SIZE/(1024*1024)):.2f} MiB', - ) - source_items[db_id] = IndexingError( - error=( - f'File size {(file.size/(1024*1024)):.2f} MiB' - f' exceeds the limit {(MAX_FILE_SIZE/(1024*1024)):.2f} MiB' - ), - retryable=False, - ) - continue - # todo: perform the existing file check before fetching the content to avoid unnecessary fetches - 
# any user id from the list should have read access to the file - tasks.append(asyncio.ensure_future(__fetch_file_content(semaphore, file.file_id, file.userIds[0]))) - - results = await asyncio.gather(*tasks, return_exceptions=True) - for (db_id, file), result in zip(files.items(), results, strict=True): - if isinstance(result, IndexingException): - LOGGER.error( - f'Error fetching content for db id {db_id}, file id {file.file_id}, reference {file.reference}' - f': {result}', - exc_info=result, - ) - source_items[db_id] = IndexingError( - error=str(result), - retryable=result.retryable, - ) - elif isinstance(result, str) or isinstance(result, BytesIO): - source_items[db_id] = SourceItem( - **{ - **file.model_dump(), - 'content': result, - } - ) - elif isinstance(result, BaseException): - LOGGER.error( - f'Unexpected error fetching content for db id {db_id}, file id {file.file_id},' - f' reference {file.reference}: {result}', - exc_info=result, - ) - source_items[db_id] = IndexingError( - error=f'Unexpected error: {result}', - retryable=True, - ) - else: - LOGGER.error( - f'Unknown error fetching content for db id {db_id}, file id {file.file_id}, reference {file.reference}' - f': {result}', - exc_info=True, - ) - source_items[db_id] = IndexingError( - error='Unknown error', - retryable=True, - ) - return source_items - - def files_indexing_thread(app_config: TConfig, app_enabled: Event) -> None: try: vectordb_loader = VectorDBLoader(app_config) @@ -206,7 +65,7 @@ def files_indexing_thread(app_config: TConfig, app_enabled: Event) -> None: LOGGER.error('Error initializing vector DB loader, files indexing thread will not start:', exc_info=e) return - def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingError | None]: + def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> Mapping[int, IndexingError | None]: try: return exec_in_proc( target=embed_sources, @@ -225,7 +84,6 @@ def _load_sources(source_items: dict[int, 
SourceItem]) -> dict[int, IndexingErro try: nc = NextcloudApp() - # todo: add the 'size' param to the return of this call. q_items_res = nc.ocs( 'GET', '/ocs/v2.php/apps/context_chat/queues/documents', @@ -242,29 +100,14 @@ def _load_sources(source_items: dict[int, SourceItem]) -> dict[int, IndexingErro sleep(POLLING_COOLDOWN) continue - # populate files content and convert to source items - fetched_files = {} - source_files = {} - # unified error structure for files and content providers - source_errors = {} - - if q_items.files: - fetched_files = asyncio.run(__fetch_files_content(q_items.files)) - - for db_id, result in fetched_files.items(): - if isinstance(result, SourceItem): - source_files[db_id] = result - else: - source_errors[db_id] = result - files_result = {} providers_result = {} - chunk_size = FILES_INDEXING_BATCH_SIZE // PARALLEL_FILE_PARSING + chunk_size = max(MIN_FILES_PER_CPU, FILES_INDEXING_BATCH_SIZE // PARALLEL_FILE_PARSING) # todo: do it in asyncio, it's not truly parallel yet # chunk file parsing for better file operation parallelism - for i in range(0, len(source_files), chunk_size): - chunk = dict(list(source_files.items())[i:i+chunk_size]) + for i in range(0, len(q_items.files), chunk_size): + chunk = dict(list(q_items.files.items())[i:i+chunk_size]) files_result.update(_load_sources(chunk)) for i in range(0, len(q_items.content_providers), chunk_size): diff --git a/context_chat_backend/types.py b/context_chat_backend/types.py index 9f23e14f..59d2568f 100644 --- a/context_chat_backend/types.py +++ b/context_chat_backend/types.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # import re +from collections.abc import Mapping from enum import Enum from io import BytesIO from typing import Annotated, Literal, Self @@ -224,8 +225,8 @@ class Config: class FilesQueueItems(BaseModel): - files: dict[int, ReceivedFileItem] # [db id]: FileItem - content_providers: dict[int, SourceItem] # [db id]: SourceItem + files: Mapping[int, 
ReceivedFileItem] # [db id]: FileItem + content_providers: Mapping[int, SourceItem] # [db id]: SourceItem class IndexingException(Exception): @@ -343,4 +344,4 @@ class ActionsQueueItemUpdateAccessDeclSourceId(CommonActionsQueueItem): class ActionsQueueItems(BaseModel): - actions: dict[int, ActionsQueueItem] + actions: Mapping[int, ActionsQueueItem] diff --git a/context_chat_backend/vectordb/base.py b/context_chat_backend/vectordb/base.py index ebd54075..2b4aa35e 100644 --- a/context_chat_backend/vectordb/base.py +++ b/context_chat_backend/vectordb/base.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # from abc import ABC, abstractmethod +from collections.abc import Mapping from typing import Any from langchain.schema import Document @@ -10,7 +11,7 @@ from langchain.schema.vectorstore import VectorStore from ..chain.types import InDocument, ScopeType -from ..types import IndexingError, SourceItem +from ..types import IndexingError, ReceivedFileItem, SourceItem from ..utils import timed from .types import UpdateAccessOp @@ -62,7 +63,7 @@ def get_instance(self) -> VectorStore: ''' @abstractmethod - def add_indocuments(self, indocuments: dict[int, InDocument]) -> dict[int, IndexingError | None]: + def add_indocuments(self, indocuments: Mapping[int, InDocument]) -> Mapping[int, IndexingError | None]: ''' Adds the given indocuments to the vectordb and updates the docs + access tables. @@ -79,7 +80,7 @@ def add_indocuments(self, indocuments: dict[int, InDocument]) -> dict[int, Index @timed @abstractmethod - def check_sources(self, sources: dict[int, SourceItem]) -> tuple[list[str], list[str]]: + def check_sources(self, sources: Mapping[int, SourceItem | ReceivedFileItem]) -> tuple[list[str], list[str]]: ''' Checks the sources in the vectordb if they are already embedded and are up to date. @@ -88,8 +89,8 @@ def check_sources(self, sources: dict[int, SourceItem]) -> tuple[list[str], list Args ---- - sources: list[UploadFile] - List of source ids to check. 
+ sources: Mapping[int, SourceItem | ReceivedFileItem] + Dict of sources to check. Returns ------- diff --git a/context_chat_backend/vectordb/pgvector.py b/context_chat_backend/vectordb/pgvector.py index bfca0bb6..86f636be 100644 --- a/context_chat_backend/vectordb/pgvector.py +++ b/context_chat_backend/vectordb/pgvector.py @@ -4,6 +4,7 @@ # import logging import os +from collections.abc import Mapping from datetime import datetime import psycopg @@ -17,7 +18,14 @@ from langchain_postgres.vectorstores import Base, PGVector from ..chain.types import InDocument, ScopeType -from ..types import EmbeddingException, FatalEmbeddingException, IndexingError, RetryableEmbeddingException, SourceItem +from ..types import ( + EmbeddingException, + FatalEmbeddingException, + IndexingError, + ReceivedFileItem, + RetryableEmbeddingException, + SourceItem, +) from ..utils import timed from .base import BaseVectorDB from .types import DbException, SafeDbException, UpdateAccessOp @@ -129,7 +137,7 @@ def get_users(self) -> list[str]: except Exception as e: raise DbException('Error: getting a list of all users from access list') from e - def add_indocuments(self, indocuments: dict[int, InDocument]) -> dict[int, IndexingError | None]: + def add_indocuments(self, indocuments: Mapping[int, InDocument]) -> Mapping[int, IndexingError | None]: """ Raises EmbeddingException: if the embedding request definitively fails @@ -208,7 +216,7 @@ def add_indocuments(self, indocuments: dict[int, InDocument]) -> dict[int, Index return results @timed - def check_sources(self, sources: dict[int, SourceItem]) -> tuple[list[str], list[str]]: + def check_sources(self, sources: Mapping[int, SourceItem | ReceivedFileItem]) -> tuple[list[str], list[str]]: ''' returns a tuple of (existing_source_ids, to_embed_source_ids) ''' From 1ade19186593193a5005d2aadc97a83b25f601b8 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Wed, 18 Mar 2026 16:49:09 +0530 Subject: [PATCH 19/56] fix: truly parallel file parsing and 
indexing Signed-off-by: Anupam Kumar --- context_chat_backend/task_fetcher.py | 48 ++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 28aff6a0..f07f5012 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -4,8 +4,10 @@ # import logging +import math import os from collections.abc import Mapping +from concurrent.futures import ThreadPoolExecutor from contextlib import suppress from enum import Enum from threading import Event, Thread @@ -47,7 +49,7 @@ FILES_INDEXING_BATCH_SIZE = 16 # theoretical max RAM usage: 16 * 100 MiB, todo: config? MIN_FILES_PER_CPU = 4 # divides the batch into these many chunks -PARALLEL_FILE_PARSING = max(1, (os.cpu_count() or 2) - 1) # todo: config? +PARALLEL_FILE_PARSING_COUNT = max(1, (os.cpu_count() or 2) - 1) # todo: config? ACTIONS_BATCH_SIZE = 512 # todo: config? POLLING_COOLDOWN = 30 @@ -71,10 +73,14 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> target=embed_sources, args=(vectordb_loader, app_config, source_items), ) - except (DbException, EmbeddingException): - raise except Exception as e: - raise DbException('Error: failed to load sources') from e + err_name = {DbException: "DB", EmbeddingException: "Embedding"}.get(type(e), "Unknown") + source_ids = (s.reference for s in source_items.values()) + err = IndexingError( + error=f'{err_name} Error occurred, the sources {source_ids} will be retried: {e}', + retryable=True, + ) + return dict.fromkeys(source_items, err) while True: @@ -102,17 +108,33 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> files_result = {} providers_result = {} - chunk_size = max(MIN_FILES_PER_CPU, FILES_INDEXING_BATCH_SIZE // PARALLEL_FILE_PARSING) - # todo: do it in asyncio, it's not truly parallel yet # chunk file parsing for better file operation parallelism - for i in 
range(0, len(q_items.files), chunk_size): - chunk = dict(list(q_items.files.items())[i:i+chunk_size]) - files_result.update(_load_sources(chunk)) - - for i in range(0, len(q_items.content_providers), chunk_size): - chunk = dict(list(q_items.content_providers.items())[i:i+chunk_size]) - providers_result.update(_load_sources(chunk)) + file_chunk_size = max(MIN_FILES_PER_CPU, math.ceil(len(q_items.files) / PARALLEL_FILE_PARSING_COUNT)) + file_chunks = [ + dict(list(q_items.files.items())[i:i+file_chunk_size]) + for i in range(0, len(q_items.files), file_chunk_size) + ] + provider_chunk_size = max( + MIN_FILES_PER_CPU, + math.ceil(len(q_items.content_providers) / PARALLEL_FILE_PARSING_COUNT), + ) + provider_chunks = [ + dict(list(q_items.content_providers.items())[i:i+provider_chunk_size]) + for i in range(0, len(q_items.content_providers), provider_chunk_size) + ] + + with ThreadPoolExecutor( + max_workers=PARALLEL_FILE_PARSING_COUNT, + thread_name_prefix='IndexingPool', + ) as executor: + file_futures = [executor.submit(_load_sources, chunk) for chunk in file_chunks] + provider_futures = [executor.submit(_load_sources, chunk) for chunk in provider_chunks] + + for future in file_futures: + files_result.update(future.result()) + for future in provider_futures: + providers_result.update(future.result()) if ( any(isinstance(res, IndexingError) for res in files_result.values()) From 12fd1ca00fc6d3fab6e91b8bb4dbc6c11488ca74 Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Tue, 24 Mar 2026 10:36:04 +0100 Subject: [PATCH 20/56] initial pass at request processing --- context_chat_backend/controller.py | 4 +- context_chat_backend/task_fetcher.py | 362 +++++++++++++++++++++++++-- 2 files changed, 350 insertions(+), 16 deletions(-) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 797ba201..3ebdc8ae 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -40,7 +40,7 @@ from .models.types import 
LlmException from nc_py_api.ex_app import AppAPIAuthMiddleware from .utils import JSONResponse, exec_in_proc, value_of -from .task_fetcher import start_bg_threads, wait_for_bg_threads +from .task_fetcher import start_bg_threads, trigger_handler, wait_for_bg_threads from .vectordb.service import count_documents_by_provider # setup @@ -83,7 +83,7 @@ def enabled_handler(enabled: bool, _: NextcloudApp | AsyncNextcloudApp) -> str: @asynccontextmanager async def lifespan(app: FastAPI): - set_handlers(app, enabled_handler, models_to_fetch=models_to_fetch) + set_handlers(app, enabled_handler, models_to_fetch=models_to_fetch, trigger_handler=trigger_handler) nc = NextcloudApp() if nc.enabled_state: app_enabled.set() diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index f07f5012..a5028029 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -12,26 +12,25 @@ from enum import Enum from threading import Event, Thread from time import sleep +from typing import Any import niquests -from nc_py_api import NextcloudApp +from langchain.llms.base import LLM +from langchain.schema import Document +from nc_py_api import NextcloudApp, NextcloudException +from niquests import JSONDecodeError, RequestException from pydantic import ValidationError +from .chain.context import get_context_chunks, get_context_docs from .chain.ingest.injest import embed_sources +from .chain.query_proc import get_pruned_query +from .chain.types import ContextException, LLMOutput, ScopeType +from .controller import llm_loader from .dyn_loader import VectorDBLoader -from .types import ( - ActionsQueueItems, - ActionType, - AppRole, - EmbeddingException, - FilesQueueItems, - IndexingError, - LoaderException, - ReceivedFileItem, - SourceItem, - TConfig, -) +from .types import ActionType, ActionsQueueItems, AppRole, EmbeddingException, FilesQueueItems, IndexingError, \ + LoaderException, ReceivedFileItem, SourceItem, TConfig from 
.utils import exec_in_proc, get_app_role +from .vectordb.base import BaseVectorDB from .vectordb.service import ( decl_update_access, delete_by_provider, @@ -52,6 +51,10 @@ PARALLEL_FILE_PARSING_COUNT = max(1, (os.cpu_count() or 2) - 1) # todo: config? ACTIONS_BATCH_SIZE = 512 # todo: config? POLLING_COOLDOWN = 30 +TRIGGER = Event() +CHECK_INTERVAL = 5 +CHECK_INTERVAL_WITH_TRIGGER = 5 * 60 +CHECK_INTERVAL_ON_ERROR = 15 class ThreadType(Enum): @@ -370,7 +373,78 @@ def updates_processing_thread(app_config: TConfig, app_enabled: Event) -> None: def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: - ... + logger.info('Starting task fetcher loop') + + try: + vectordb_loader = VectorDBLoader(app_config) + except LoaderException as e: + LOGGER.error('Error initializing vector DB loader, files indexing thread will not start:', exc_info=e) + return + + nc = NextcloudApp() + llm: LLM = llm_loader.load() + + while True: + if THREAD_STOP_EVENT.is_set(): + LOGGER.info('Updates processing thread is stopping due to stop event being set') + return + + try: + # Fetch pending task + try: + response = nc.providers.task_processing.next_task(list(provider_ids), list(task_type_ids)) + if not response: + wait_for_tasks() + continue + except (NextcloudException, RequestException, JSONDecodeError) as e: + LOGGER.error(f"Network error fetching the next task {e}", exc_info=e) + wait_for_tasks(CHECK_INTERVAL_ON_ERROR) + continue + + # Process task + task = response["task"] + provider = response["provider"] + + try: + logger.debug(f'Processing task {task["id"]}') + result = process_task(task, vectordb_loader, llm, app_config) + + # Return result to Nextcloud + success = return_result_to_nextcloud(task_id, result) + + if success: + LOGGER.info(f'Task {task["id"]} completed successfully') + else: + LOGGER.error(f'Failed to return result for task {task["id"]}') + + except ContextException as e: + LOGGER.warning(f'Context error for task {task["id"]}: {e}') + # TODO: 
Return error to Nextcloud + except ValueError as e: + LOGGER.warning(f'Validation error for task {task["id"]}: {e}') + # TODO: Return error to Nextcloud + except Exception as e: + LOGGER.exception(f'Unexpected error processing task {task["id"]}', exc_info=e) + # TODO: Return error to Nextcloud + + except Exception as e: + logger.exception('Error in task fetcher loop', exc_info=e) + # TODO: Add appropriate error handling and backoff + +def trigger_handler(providerId: str): + global TRIGGER + print('TRIGGER called') + TRIGGER.set() + +def wait_for_tasks(interval = None): + global TRIGGER + global CHECK_INTERVAL + global CHECK_INTERVAL_WITH_TRIGGER + actual_interval = CHECK_INTERVAL if interval is None else interval + if TRIGGER.wait(timeout=actual_interval): + CHECK_INTERVAL = CHECK_INTERVAL_WITH_TRIGGER + TRIGGER.clear() + def start_bg_threads(app_config: TConfig, app_enabled: Event): @@ -430,3 +504,263 @@ def wait_for_bg_threads(): THREAD_STOP_EVENT.set() THREADS[ThreadType.REQUEST_PROCESSING].join() THREADS.pop(ThreadType.REQUEST_PROCESSING) + + +# Default LLM template for context-based queries +_LLM_TEMPLATE = '''Answer based only on this context and do not add any imaginative details. Make sure to use the same language as the question in your answer. +{context} + +{question} +''' + +def query_vector_database( + user_id: str, + query: str, + vectordb: BaseVectorDB, + ctx_limit: int, + scope_type: ScopeType | None = None, + scope_list: list[str] | None = None, +) -> list[Document]: + """ + Query the vector database to retrieve relevant documents. 
+ + Args: + user_id: User ID for scoping the search + query: The search query text + vectordb: Vector database instance + ctx_limit: Maximum number of documents to return + scope_type: Optional scope type (PROVIDER or SOURCE) + scope_list: Optional list of scope identifiers + + Returns: + List of relevant Document objects + + Raises: + ContextException: If scope type is provided without scope list + """ + context_docs = get_context_docs(user_id, query, vectordb, ctx_limit, scope_type, scope_list) + logger.debug('Retrieved context documents', extra={ + 'user_id': user_id, + 'num_docs': len(context_docs), + 'ctx_limit': ctx_limit, + }) + return context_docs + + +def prepare_context_chunks(context_docs: list[Document]) -> list[str]: + """ + Extract and format text chunks from documents for LLM context. + + Args: + context_docs: List of Document objects from vector DB + + Returns: + List of formatted text chunks including titles and content + """ + return get_context_chunks(context_docs) + + +def generate_llm_response( + llm: LLM, + app_config: TConfig, + user_id: str, + query: str, + template: str, + context_chunks: list[str], + end_separator: str = '', +) -> str: + """ + Generate LLM response using the pruned query and context. 
+ + Args: + llm: Language model instance + app_config: Application configuration + user_id: User ID for the request + query: The original query text + template: Template for formatting the prompt + context_chunks: Context chunks to include in the prompt + end_separator: Optional separator to stop generation + + Returns: + Generated LLM output text + + Raises: + ValueError: If context length is too small to fit the query + """ + pruned_query_text = get_pruned_query(llm, app_config, query, template, context_chunks) + + stop = [end_separator] if end_separator else None + output = llm.invoke( + pruned_query_text, + stop=stop, + userid=user_id, + ).strip() + + logger.debug('Generated LLM response', extra={ + 'user_id': user_id, + 'output_length': len(output), + }) + return output + + +def extract_unique_sources(context_docs: list[Document]) -> list[str]: + """ + Extract unique source IDs from context documents. + + Args: + context_docs: List of Document objects + + Returns: + List of unique source IDs + """ + unique_sources: list[str] = list({ + source for d in context_docs if (source := d.metadata.get('source')) + }) + return unique_sources + +def execute_context_query( + user_id: str, + vectordb_loader: VectorDBLoader, + llm: LLM, + app_config: TConfig, + query: str, + ctx_limit: int = 20, + scope_type: ScopeType | None = None, + scope_list: list[str] | None = None, + template: str | None = None, + end_separator: str = '', +) -> LLMOutput: + """ + Execute a RAG query with context retrieval from vector database. + + This is the main function for processing queries that require context + from the vector database. It orchestrates the entire RAG pipeline: + 1. Query vector database for relevant documents + 2. Extract and format context chunks + 3. Generate LLM response with context + 4. 
Return output with source references + + Args: + user_id: User ID for the request + vectordb_loader: Vector database loader instance + llm: Language model instance + app_config: Application configuration + query: The query text + ctx_limit: Maximum number of context documents (default: 20) + scope_type: Optional scope type for filtering + scope_list: Optional list of scope identifiers + template: Optional custom prompt template + end_separator: Optional separator to stop generation + + Returns: + LLMOutput with generated text and source references + + Raises: + ContextException: If no documents are retrieved + ValueError: If context length is too small to fit the query + """ + logger.info('Executing context query', extra={ + 'user_id': user_id, + 'query_length': len(query), + 'ctx_limit': ctx_limit, + }) + + # Step 1: Load vector database and retrieve relevant documents + db = vectordb_loader.load() + context_docs = query_vector_database(user_id, query, db, ctx_limit, scope_type, scope_list) + + if len(context_docs) == 0: + raise ContextException('No documents retrieved, please index a few documents first') + + # Step 2: Prepare context chunks for LLM + context_chunks = prepare_context_chunks(context_docs) + logger.debug('Prepared context chunks', extra={ + 'num_docs': len(context_docs), + 'num_chunks': len(context_chunks), + }) + + # Step 3: Generate LLM response + output = generate_llm_response( + llm, + app_config, + user_id, + query, + template or _LLM_TEMPLATE, + context_chunks, + end_separator, + ) + + # Step 4: Extract unique sources for citation + unique_sources = extract_unique_sources(context_docs) + + logger.info('Context query completed', extra={ + 'user_id': user_id, + 'num_sources': len(unique_sources), + }) + + return LLMOutput(output=output, sources=unique_sources) + +# ============================================================================ +# Task Queue Processing +# ============================================================================ 
+ + +def return_result_to_nextcloud(task_id: str, result: LLMOutput) -> bool: + """ + Return query result back to Nextcloud. + + STUB: This function should be implemented to send results back + to Nextcloud's task queue or API endpoint. + + Args: + task_id: Unique task identifier + result: The LLMOutput result to return + + Returns: + True if successful, False otherwise + """ + logger.debug('Returning result to Nextcloud (STUB)', extra={ + 'task_id': task_id, + 'output_length': len(result['output']), + 'num_sources': len(result['sources']), + }) + # TODO: Implement actual Nextcloud result submission + return True + + +def process_task( + task: dict[str, Any], + vectordb_loader: VectorDBLoader, + llm: LLM, + app_config: TConfig, +) -> LLMOutput: + """ + Process a single query task. + + Args: + task: Task dictionary from fetch_query_tasks_from_nextcloud + vectordb_loader: Vector database loader instance + llm: Language model instance + app_config: Application configuration + + Returns: + LLMOutput with generated text and sources + + Raises: + Various exceptions from query execution + """ + user_id = task['user_id'] + query = task['query'] + + return execute_context_query( + user_id=user_id, + vectordb_loader=vectordb_loader, + llm=llm, + app_config=app_config, + query=query, + ctx_limit=task.get('ctx_limit', 20), + scope_type=task.get('scope_type'), + scope_list=task.get('scope_list'), + template=task.get('template'), # TODO: Somehow get the real template, tasks don't have it + end_separator=task.get('end_separator', ''), # TODO: same here + ) \ No newline at end of file From 8aa2471080c10ea7b0a97a9d2dac4023e005464c Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Wed, 25 Mar 2026 10:42:40 +0100 Subject: [PATCH 21/56] implement request processing --- context_chat_backend/chain/one_shot.py | 1 + context_chat_backend/chain/types.py | 12 + context_chat_backend/controller.py | 19 +- context_chat_backend/task_fetcher.py | 292 +++++++++++++++---------- 4 files changed, 
201 insertions(+), 123 deletions(-) diff --git a/context_chat_backend/chain/one_shot.py b/context_chat_backend/chain/one_shot.py index 1c0521bf..d0f5bbed 100644 --- a/context_chat_backend/chain/one_shot.py +++ b/context_chat_backend/chain/one_shot.py @@ -20,6 +20,7 @@ logger = logging.getLogger('ccb.chain') +# todo: remove this maybe def process_query( user_id: str, llm: LLM, diff --git a/context_chat_backend/chain/types.py b/context_chat_backend/chain/types.py index b006ad1a..c5277563 100644 --- a/context_chat_backend/chain/types.py +++ b/context_chat_backend/chain/types.py @@ -42,3 +42,15 @@ class LLMOutput(TypedDict): class SearchResult(TypedDict): source_id: str title: str + +class EnrichedSource(BaseModel): + id: str + label: str + icon: str + url: str + +class EnrichedSourceList(BaseModel): + sources: list[EnrichedSource] + +class ScopeList(BaseModel): + source_ids: list[str] \ No newline at end of file diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 3ebdc8ae..1e0d2773 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: 2023 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # +from nc_py_api.ex_app.providers.task_processing import TaskProcessingProvider # isort: off from .chain.types import ContextException, LLMOutput, ScopeType, SearchResult @@ -65,9 +66,23 @@ } if __download_models_from_hf else {} app_enabled = Event() -def enabled_handler(enabled: bool, _: NextcloudApp | AsyncNextcloudApp) -> str: +def enabled_handler(enabled: bool, nc: NextcloudApp | AsyncNextcloudApp) -> str: try: if enabled: + provider = TaskProcessingProvider( + id="context_chat-context_chat_search", + name="Context Chat", + task_type="context_chat:context_chat_search", + expected_runtime=30, + ) + nc.providers.task_processing.register(provider) + provider = TaskProcessingProvider( + id="context_chat-context_chat", + 
name="Context Chat", + task_type="context_chat:context_chat", + expected_runtime=30, + ) + nc.providers.task_processing.register(provider) app_enabled.set() start_bg_threads(app_config, app_enabled) else: @@ -383,7 +398,7 @@ def download_logs() -> FileResponse: # 'title': source.headers.get('title'), # 'headers': source.headers, # }) -# return JSONResponse(f'Invaild/missing headers for: {source.filename}', 400) +# return JSONResponse(f'Invaild/missing headers for:provider_ids {source.filename}', 400) # # wait for 10 minutes before failing the request # semres = doc_parse_semaphore.acquire(block=True, timeout=10*60) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index a5028029..7951f067 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: 2026 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # - +import json import logging import math import os @@ -21,11 +21,13 @@ from niquests import JSONDecodeError, RequestException from pydantic import ValidationError -from .chain.context import get_context_chunks, get_context_docs +from .chain.context import do_doc_search, get_context_chunks, get_context_docs from .chain.ingest.injest import embed_sources +from .chain.one_shot import process_context_query from .chain.query_proc import get_pruned_query -from .chain.types import ContextException, LLMOutput, ScopeType -from .controller import llm_loader +from .chain.types import ContextException, EnrichedSource, EnrichedSourceList, LLMOutput, ScopeList, ScopeType, \ + SearchResult +from .controller import Query, execute_query, llm_loader from .dyn_loader import VectorDBLoader from .types import ActionType, ActionsQueueItems, AppRole, EmbeddingException, FilesQueueItems, IndexingError, \ LoaderException, ReceivedFileItem, SourceItem, TConfig @@ -55,6 +57,7 @@ CHECK_INTERVAL = 5 CHECK_INTERVAL_WITH_TRIGGER = 5 * 60 
CHECK_INTERVAL_ON_ERROR = 15 +CONTEXT_LIMIT=20 class ThreadType(Enum): @@ -372,8 +375,25 @@ def updates_processing_thread(app_config: TConfig, app_enabled: Event) -> None: continue +def resolve_scope_list(source_ids: list[str], userId: str) -> list[str]: + """ + + Parameters + ---------- + source_ids + + Returns + ------- + source_ids with only files, no folders (or source_ids in case of non-file provider) + """ + nc = NextcloudApp() + data = nc.ocs('POST', f'/ocs/v2.php/apps/context_chat/resolve_scope_list', json={'source_ids': source_ids, 'userId': userId}) + sources = ScopeList.model_validate(data).source_ids + return sources + + def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: - logger.info('Starting task fetcher loop') + LOGGER.info('Starting task fetcher loop') try: vectordb_loader = VectorDBLoader(app_config) @@ -392,7 +412,7 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: try: # Fetch pending task try: - response = nc.providers.task_processing.next_task(list(provider_ids), list(task_type_ids)) + response = nc.providers.task_processing.next_task(['context_chat-context_chat', 'context_chat-context_chat_search'], ['context_chat:context_chat', 'context_chat:context_chat_search']) if not response: wait_for_tasks() continue @@ -403,14 +423,26 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: # Process task task = response["task"] - provider = response["provider"] + userId = task['userId'] try: - logger.debug(f'Processing task {task["id"]}') - result = process_task(task, vectordb_loader, llm, app_config) - - # Return result to Nextcloud - success = return_result_to_nextcloud(task_id, result) + LOGGER.debug(f'Processing task {task["id"]}') + + if task['input'].get('scopeType') == 'source': + # Resolve scope list to only files, no folders + task['input']['scopeList'] = resolve_scope_list(task['input'].get('scopeList'), userId) + + if task['type'] == 
'context_chat:context_chat': + result: LLMOutput = process_normal_task(task, vectordb_loader, llm, app_config) + # Return result to Nextcloud + success = return_normal_result_to_nextcloud(task['id'], userId, result) + elif task['type'] == 'context_chat:context_chat_search': + result: list[SearchResult] = process_search_task(task, vectordb_loader) + # Return result to Nextcloud + success = return_search_result_to_nextcloud(task['id'], userId, result) + else: + LOGGER.error(f'Unknown task type {task["type"]}') + success = return_error_to_nextcloud(task['id'], Exception(f'Unknown task type {task["type"]}')) if success: LOGGER.info(f'Task {task["id"]} completed successfully') @@ -419,17 +451,17 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: except ContextException as e: LOGGER.warning(f'Context error for task {task["id"]}: {e}') - # TODO: Return error to Nextcloud + return_error_to_nextcloud(task['id'], e) except ValueError as e: LOGGER.warning(f'Validation error for task {task["id"]}: {e}') - # TODO: Return error to Nextcloud + return_error_to_nextcloud(task['id'], e) except Exception as e: LOGGER.exception(f'Unexpected error processing task {task["id"]}', exc_info=e) - # TODO: Return error to Nextcloud + return_error_to_nextcloud(task['id'], e) except Exception as e: - logger.exception('Error in task fetcher loop', exc_info=e) - # TODO: Add appropriate error handling and backoff + LOGGER.exception('Error in task fetcher loop', exc_info=e) + wait_for_tasks(CHECK_INTERVAL_ON_ERROR) def trigger_handler(providerId: str): global TRIGGER @@ -506,13 +538,6 @@ def wait_for_bg_threads(): THREADS.pop(ThreadType.REQUEST_PROCESSING) -# Default LLM template for context-based queries -_LLM_TEMPLATE = '''Answer based only on this context and do not add any imaginative details. Make sure to use the same language as the question in your answer. 
-{context} - -{question} -''' - def query_vector_database( user_id: str, query: str, @@ -539,7 +564,7 @@ def query_vector_database( ContextException: If scope type is provided without scope list """ context_docs = get_context_docs(user_id, query, vectordb, ctx_limit, scope_type, scope_list) - logger.debug('Retrieved context documents', extra={ + LOGGER.debug('Retrieved context documents', extra={ 'user_id': user_id, 'num_docs': len(context_docs), 'ctx_limit': ctx_limit, @@ -596,7 +621,7 @@ def generate_llm_response( userid=user_id, ).strip() - logger.debug('Generated LLM response', extra={ + LOGGER.debug('Generated LLM response', extra={ 'user_id': user_id, 'output_length': len(output), }) @@ -618,117 +643,112 @@ def extract_unique_sources(context_docs: list[Document]) -> list[str]: }) return unique_sources -def execute_context_query( - user_id: str, - vectordb_loader: VectorDBLoader, - llm: LLM, - app_config: TConfig, - query: str, - ctx_limit: int = 20, - scope_type: ScopeType | None = None, - scope_list: list[str] | None = None, - template: str | None = None, - end_separator: str = '', -) -> LLMOutput: +def return_normal_result_to_nextcloud(task_id: int, userId: str, result: LLMOutput) -> bool: """ - Execute a RAG query with context retrieval from vector database. - - This is the main function for processing queries that require context - from the vector database. It orchestrates the entire RAG pipeline: - 1. Query vector database for relevant documents - 2. Extract and format context chunks - 3. Generate LLM response with context - 4. Return output with source references + Return query result back to Nextcloud. 
Args: - user_id: User ID for the request - vectordb_loader: Vector database loader instance - llm: Language model instance - app_config: Application configuration - query: The query text - ctx_limit: Maximum number of context documents (default: 20) - scope_type: Optional scope type for filtering - scope_list: Optional list of scope identifiers - template: Optional custom prompt template - end_separator: Optional separator to stop generation + task_id: Unique task identifier + result: The LLMOutput result to return Returns: - LLMOutput with generated text and source references - - Raises: - ContextException: If no documents are retrieved - ValueError: If context length is too small to fit the query + True if successful, False otherwise """ - logger.info('Executing context query', extra={ - 'user_id': user_id, - 'query_length': len(query), - 'ctx_limit': ctx_limit, + LOGGER.debug('Returning result to Nextcloud', extra={ + 'task_id': task_id, + 'output_length': len(result['output']), + 'num_sources': len(result['sources']), }) - # Step 1: Load vector database and retrieve relevant documents - db = vectordb_loader.load() - context_docs = query_vector_database(user_id, query, db, ctx_limit, scope_type, scope_list) + nc = NextcloudApp() - if len(context_docs) == 0: - raise ContextException('No documents retrieved, please index a few documents first') + try: + nc.providers.task_processing.report_result(task_id, { + 'output': result['output'], + 'sources': enrich_sources(result['sources'], userId), + }) + except (NextcloudException, RequestException, JSONDecodeError) as e: + LOGGER.error(f"Network error reporting task result {e}", exc_info=e) + return False - # Step 2: Prepare context chunks for LLM - context_chunks = prepare_context_chunks(context_docs) - logger.debug('Prepared context chunks', extra={ - 'num_docs': len(context_docs), - 'num_chunks': len(context_chunks), - }) + return True - # Step 3: Generate LLM response - output = generate_llm_response( - llm, - 
app_config, - user_id, - query, - template or _LLM_TEMPLATE, - context_chunks, - end_separator, - ) +def enrich_sources(results: list[str], userId: str) -> list[EnrichedSource]: + nc = NextcloudApp() + # todo: refactor to include title here + data = nc.ocs('POST', f'/ocs/v2.php/apps/context_chat/enrich_sources', json={'sources': [{'source_id': id} for id in results], 'userId': userId}) + sources = EnrichedSourceList.model_validate(data).sources + return sources - # Step 4: Extract unique sources for citation - unique_sources = extract_unique_sources(context_docs) +def enrich_search_sources(results: list[SearchResult], userId: str) -> list[EnrichedSource]: + nc = NextcloudApp() + data = nc.ocs('POST', f'/ocs/v2.php/apps/context_chat/enrich_sources', json={'sources': results, 'userId': userId}) + sources = EnrichedSourceList.model_validate(data).sources + return sources - logger.info('Context query completed', extra={ - 'user_id': user_id, - 'num_sources': len(unique_sources), + +def return_search_result_to_nextcloud(task_id: int, userId: str, result: list[SearchResult]) -> bool: + """ + Return search result back to Nextcloud. 
+ + Args: + task_id: Unique task identifier + result: The list of search results to return + + Returns: + True if successful, False otherwise + """ + LOGGER.debug('Returning search result to Nextcloud', extra={ + 'task_id': task_id, + 'num_sources': len(result), }) - return LLMOutput(output=output, sources=unique_sources) + nc = NextcloudApp() -# ============================================================================ -# Task Queue Processing -# ============================================================================ + try: + sources = [json.dumps(source) for source in enrich_search_sources(result, userId)] + nc.providers.task_processing.report_result(task_id, { + 'sources': sources, + }) + except (NextcloudException, RequestException, JSONDecodeError) as e: + LOGGER.error(f"Network error reporting search task result {e}", exc_info=e) + return False -def return_result_to_nextcloud(task_id: str, result: LLMOutput) -> bool: - """ - Return query result back to Nextcloud. + return True - STUB: This function should be implemented to send results back - to Nextcloud's task queue or API endpoint. +def return_error_to_nextcloud(task_id: int, e: Exception) -> bool: + """ + Return error result back to Nextcloud. 
Args: task_id: Unique task identifier - result: The LLMOutput result to return + e: error object Returns: True if successful, False otherwise """ - logger.debug('Returning result to Nextcloud (STUB)', extra={ - 'task_id': task_id, - 'output_length': len(result['output']), - 'num_sources': len(result['sources']), - }) - # TODO: Implement actual Nextcloud result submission + LOGGER.debug('Returning error to Nextcloud', exc_info=e) + + nc = NextcloudApp() + + if isinstance(e, ValueError): + message = "Validation error: " + str(e) + elif isinstance(e, ContextException): + message = "Context error" + str(e) + else: + message = "Unexpected error" + str(e) + + try: + nc.providers.task_processing.report_result(task_id, None, message) + except (NextcloudException, RequestException, JSONDecodeError) as e: + LOGGER.error(f"Network error reporting task result {e}", exc_info=e) + return False + return True -def process_task( +def process_normal_task( task: dict[str, Any], vectordb_loader: VectorDBLoader, llm: LLM, @@ -750,17 +770,47 @@ def process_task( Various exceptions from query execution """ user_id = task['user_id'] - query = task['query'] - - return execute_context_query( - user_id=user_id, - vectordb_loader=vectordb_loader, - llm=llm, - app_config=app_config, - query=query, - ctx_limit=task.get('ctx_limit', 20), - scope_type=task.get('scope_type'), - scope_list=task.get('scope_list'), - template=task.get('template'), # TODO: Somehow get the real template, tasks don't have it - end_separator=task.get('end_separator', ''), # TODO: same here + task_input = task['input'] + + return exec_in_proc(target=process_context_query, + args=( + user_id, + vectordb_loader, + llm, + app_config, + task_input.get('prompt'), + CONTEXT_LIMIT, + task_input.get('scopeType'), + task_input.get('scopeList'), + ) + ) + +def process_search_task( + task: dict[str, Any], + vectordb_loader: VectorDBLoader, +) -> list[SearchResult]: + """ + Process a single search task. 
+ + Args: + task: Task dictionary from fetch_query_tasks_from_nextcloud + vectordb_loader: Vector database loader instance + + Returns: + list of Search results + + Raises: + Various exceptions from query execution + """ + user_id = task['user_id'] + task_input = task['input'] + return exec_in_proc(target=do_doc_search, + args=( + user_id, + task_input.get('prompt'), + vectordb_loader, + CONTEXT_LIMIT, + task_input.get('scopeType'), + task_input.get('scopeList'), + ) ) \ No newline at end of file From 2093936913c08e55c5aca01b559314df731b4bb4 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 26 Mar 2026 22:43:48 +0530 Subject: [PATCH 22/56] request processing fixes Signed-off-by: Anupam Kumar --- context_chat_backend/chain/one_shot.py | 7 +- context_chat_backend/chain/types.py | 14 +- context_chat_backend/controller.py | 179 ++++++++++++------------- context_chat_backend/dyn_loader.py | 16 +-- context_chat_backend/task_fetcher.py | 164 +++++++++++----------- 5 files changed, 185 insertions(+), 195 deletions(-) diff --git a/context_chat_backend/chain/one_shot.py b/context_chat_backend/chain/one_shot.py index d0f5bbed..c79f272e 100644 --- a/context_chat_backend/chain/one_shot.py +++ b/context_chat_backend/chain/one_shot.py @@ -10,7 +10,7 @@ from ..types import TConfig from .context import get_context_chunks, get_context_docs from .query_proc import get_pruned_query -from .types import ContextException, LLMOutput, ScopeType +from .types import ContextException, LLMOutput, ScopeType, SearchResult _LLM_TEMPLATE = '''Answer based only on this context and do not add any imaginative details. Make sure to use the same language as the question in your answer. 
{context} @@ -79,6 +79,9 @@ def process_context_query( stop=[end_separator], userid=user_id, ).strip() - unique_sources: list[str] = list({source for d in context_docs if (source := d.metadata.get('source'))}) + unique_sources = [SearchResult( + source_id=source, + title=d.metadata.get('title', ''), + ) for d in context_docs if (source := d.metadata.get('source'))] return LLMOutput(output=output, sources=unique_sources) diff --git a/context_chat_backend/chain/types.py b/context_chat_backend/chain/types.py index c5277563..3afdf297 100644 --- a/context_chat_backend/chain/types.py +++ b/context_chat_backend/chain/types.py @@ -33,16 +33,16 @@ class ContextException(Exception): ... -class LLMOutput(TypedDict): - output: str - sources: list[str] - # todo: add "titles" field - - class SearchResult(TypedDict): source_id: str title: str + +class LLMOutput(TypedDict): + output: str + sources: list[SearchResult] + + class EnrichedSource(BaseModel): id: str label: str @@ -53,4 +53,4 @@ class EnrichedSourceList(BaseModel): sources: list[EnrichedSource] class ScopeList(BaseModel): - source_ids: list[str] \ No newline at end of file + source_ids: list[str] diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 1e0d2773..33e3cad4 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -5,7 +5,7 @@ from nc_py_api.ex_app.providers.task_processing import TaskProcessingProvider # isort: off -from .chain.types import ContextException, LLMOutput, ScopeType, SearchResult +from .chain.types import ContextException from .types import LoaderException, EmbeddingException from .vectordb.types import DbException, SafeDbException from .setup_functions import ensure_config_file, repair_run, setup_env_vars @@ -25,22 +25,17 @@ from contextlib import asynccontextmanager from functools import wraps from threading import Event, Thread -from typing import Any from fastapi import FastAPI, Request -from langchain.llms.base import 
LLM from nc_py_api import AsyncNextcloudApp, NextcloudApp from nc_py_api.ex_app import persistent_storage, set_handlers -from pydantic import BaseModel, ValidationInfo, field_validator from starlette.responses import FileResponse -from .chain.context import do_doc_search -from .chain.one_shot import process_context_query, process_query from .config_parser import get_config -from .dyn_loader import LLMModelLoader, VectorDBLoader +from .dyn_loader import VectorDBLoader from .models.types import LlmException from nc_py_api.ex_app import AppAPIAuthMiddleware -from .utils import JSONResponse, exec_in_proc, value_of +from .utils import JSONResponse, exec_in_proc from .task_fetcher import start_bg_threads, trigger_handler, wait_for_bg_threads from .vectordb.service import count_documents_by_provider @@ -108,7 +103,6 @@ async def lifespan(app: FastAPI): t.start() yield vectordb_loader.offload() - llm_loader.offload() wait_for_bg_threads() @@ -120,7 +114,6 @@ async def lifespan(app: FastAPI): # loaders vectordb_loader = VectorDBLoader(app_config) -llm_loader = LLMModelLoader(app, app_config) # locks and semaphores @@ -438,90 +431,90 @@ def download_logs() -> FileResponse: # return JSONResponse({'loaded_sources': loaded_sources, 'sources_to_retry': not_added_sources}) -class Query(BaseModel): - userId: str - query: str - useContext: bool = True - scopeType: ScopeType | None = None - scopeList: list[str] | None = None - ctxLimit: int = 20 - - @field_validator('userId', 'query', 'ctxLimit') - @classmethod - def check_empty_values(cls, value: Any, info: ValidationInfo): - if value_of(value) is None: - raise ValueError('Empty value for field', info.field_name) - - return value - - @field_validator('ctxLimit') - @classmethod - def at_least_one_context(cls, value: int): - if value < 1: - raise ValueError('Invalid context chunk limit') - - return value - - -def execute_query(query: Query, in_proc: bool = True) -> LLMOutput: - llm: LLM = llm_loader.load() - template = 
app.extra.get('LLM_TEMPLATE') - no_ctx_template = app.extra['LLM_NO_CTX_TEMPLATE'] - # todo: array - end_separator = app.extra.get('LLM_END_SEPARATOR', '') - - if query.useContext: - target = process_context_query - args=( - query.userId, - vectordb_loader, - llm, - app_config, - query.query, - query.ctxLimit, - query.scopeType, - query.scopeList, - template, - end_separator, - ) - else: - target=process_query - args=( - query.userId, - llm, - app_config, - query.query, - no_ctx_template, - end_separator, - ) - - if in_proc: - return exec_in_proc(target=target, args=args) - - return target(*args) # pyright: ignore - - -@app.post('/query') -@enabled_guard(app) -def _(query: Query) -> LLMOutput: - logger.debug('received query request', extra={ 'query': query.dict() }) +# class Query(BaseModel): +# userId: str +# query: str +# useContext: bool = True +# scopeType: ScopeType | None = None +# scopeList: list[str] | None = None +# ctxLimit: int = 20 + +# @field_validator('userId', 'query', 'ctxLimit') +# @classmethod +# def check_empty_values(cls, value: Any, info: ValidationInfo): +# if value_of(value) is None: +# raise ValueError('Empty value for field', info.field_name) + +# return value + +# @field_validator('ctxLimit') +# @classmethod +# def at_least_one_context(cls, value: int): +# if value < 1: +# raise ValueError('Invalid context chunk limit') + +# return value + + +# def execute_query(query: Query, in_proc: bool = True) -> LLMOutput: +# llm: LLM = llm_loader.load() +# template = app.extra.get('LLM_TEMPLATE') +# no_ctx_template = app.extra['LLM_NO_CTX_TEMPLATE'] +# # todo: array +# end_separator = app.extra.get('LLM_END_SEPARATOR', '') + +# if query.useContext: +# target = process_context_query +# args=( +# query.userId, +# vectordb_loader, +# llm, +# app_config, +# query.query, +# query.ctxLimit, +# query.scopeType, +# query.scopeList, +# template, +# end_separator, +# ) +# else: +# target=process_query +# args=( +# query.userId, +# llm, +# app_config, +# 
query.query, +# no_ctx_template, +# end_separator, +# ) - if app_config.llm[0] == 'nc_texttotext': - return execute_query(query) +# if in_proc: +# return exec_in_proc(target=target, args=args) - with llm_lock: - return execute_query(query, in_proc=False) +# return target(*args) # pyright: ignore -@app.post('/docSearch') -@enabled_guard(app) -def _(query: Query) -> list[SearchResult]: - # useContext from Query is not used here - return exec_in_proc(target=do_doc_search, args=( - query.userId, - query.query, - vectordb_loader, - query.ctxLimit, - query.scopeType, - query.scopeList, - )) +# @app.post('/query') +# @enabled_guard(app) +# def _(query: Query) -> LLMOutput: +# logger.debug('received query request', extra={ 'query': query.dict() }) + +# if app_config.llm[0] == 'nc_texttotext': +# return execute_query(query) + +# with llm_lock: +# return execute_query(query, in_proc=False) + + +# @app.post('/docSearch') +# @enabled_guard(app) +# def _(query: Query) -> list[SearchResult]: +# # useContext from Query is not used here +# return exec_in_proc(target=do_doc_search, args=( +# query.userId, +# query.query, +# vectordb_loader, +# query.ctxLimit, +# query.scopeType, +# query.scopeList, +# )) diff --git a/context_chat_backend/dyn_loader.py b/context_chat_backend/dyn_loader.py index d67310ff..47b19575 100644 --- a/context_chat_backend/dyn_loader.py +++ b/context_chat_backend/dyn_loader.py @@ -7,11 +7,9 @@ import gc import logging from abc import ABC, abstractmethod -from time import time from typing import Any import torch -from fastapi import FastAPI from langchain.llms.base import LLM from .models.loader import init_model @@ -54,19 +52,11 @@ def offload(self) -> None: class LLMModelLoader(Loader): - def __init__(self, app: FastAPI, config: TConfig) -> None: + def __init__(self, config: TConfig) -> None: self.config = config - self.app = app def load(self) -> LLM: - if self.app.extra.get('LLM_MODEL') is not None: - self.app.extra['LLM_LAST_ACCESSED'] = time() - return 
self.app.extra['LLM_MODEL'] - llm_name, llm_config = self.config.llm - self.app.extra['LLM_TEMPLATE'] = llm_config.pop('template', '') - self.app.extra['LLM_NO_CTX_TEMPLATE'] = llm_config.pop('no_ctx_template', '') - self.app.extra['LLM_END_SEPARATOR'] = llm_config.pop('end_separator', '') try: model = init_model('llm', (llm_name, llm_config)) @@ -75,13 +65,9 @@ def load(self) -> LLM: if not isinstance(model, LLM): raise LoaderException(f'Error: {model} does not implement "llm" type or has returned an invalid object') - self.app.extra['LLM_MODEL'] = model - self.app.extra['LLM_LAST_ACCESSED'] = time() return model def offload(self) -> None: - if self.app.extra.get('LLM_MODEL') is not None: - del self.app.extra['LLM_MODEL'] clear_cache() diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 7951f067..634b51cd 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: 2026 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # -import json import logging import math import os @@ -25,12 +24,20 @@ from .chain.ingest.injest import embed_sources from .chain.one_shot import process_context_query from .chain.query_proc import get_pruned_query -from .chain.types import ContextException, EnrichedSource, EnrichedSourceList, LLMOutput, ScopeList, ScopeType, \ - SearchResult -from .controller import Query, execute_query, llm_loader -from .dyn_loader import VectorDBLoader -from .types import ActionType, ActionsQueueItems, AppRole, EmbeddingException, FilesQueueItems, IndexingError, \ - LoaderException, ReceivedFileItem, SourceItem, TConfig +from .chain.types import ContextException, EnrichedSourceList, LLMOutput, ScopeList, ScopeType, SearchResult +from .dyn_loader import LLMModelLoader, VectorDBLoader +from .types import ( + ActionsQueueItems, + ActionType, + AppRole, + EmbeddingException, + FilesQueueItems, + IndexingError, + 
LoaderException, + ReceivedFileItem, + SourceItem, + TConfig, +) from .utils import exec_in_proc, get_app_role from .vectordb.base import BaseVectorDB from .vectordb.service import ( @@ -387,9 +394,11 @@ def resolve_scope_list(source_ids: list[str], userId: str) -> list[str]: source_ids with only files, no folders (or source_ids in case of non-file provider) """ nc = NextcloudApp() - data = nc.ocs('POST', f'/ocs/v2.php/apps/context_chat/resolve_scope_list', json={'source_ids': source_ids, 'userId': userId}) - sources = ScopeList.model_validate(data).source_ids - return sources + data = nc.ocs('POST', '/ocs/v2.php/apps/context_chat/resolve_scope_list', json={ + 'source_ids': source_ids, + 'userId': userId, + }) + return ScopeList.model_validate(data).source_ids def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: @@ -397,6 +406,7 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: try: vectordb_loader = VectorDBLoader(app_config) + llm_loader = LLMModelLoader(app_config) except LoaderException as e: LOGGER.error('Error initializing vector DB loader, files indexing thread will not start:', exc_info=e) return @@ -412,7 +422,10 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: try: # Fetch pending task try: - response = nc.providers.task_processing.next_task(['context_chat-context_chat', 'context_chat-context_chat_search'], ['context_chat:context_chat', 'context_chat:context_chat_search']) + response = nc.providers.task_processing.next_task( + ['context_chat-context_chat', 'context_chat-context_chat_search'], + ['context_chat:context_chat', 'context_chat:context_chat_search'], + ) if not response: wait_for_tasks() continue @@ -437,9 +450,9 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: # Return result to Nextcloud success = return_normal_result_to_nextcloud(task['id'], userId, result) elif task['type'] == 'context_chat:context_chat_search': - 
result: list[SearchResult] = process_search_task(task, vectordb_loader) + search_result: list[SearchResult] = process_search_task(task, vectordb_loader) # Return result to Nextcloud - success = return_search_result_to_nextcloud(task['id'], userId, result) + success = return_search_result_to_nextcloud(task['id'], userId, search_result) else: LOGGER.error(f'Unknown task type {task["type"]}') success = return_error_to_nextcloud(task['id'], Exception(f'Unknown task type {task["type"]}')) @@ -480,62 +493,60 @@ def wait_for_tasks(interval = None): def start_bg_threads(app_config: TConfig, app_enabled: Event): - match APP_ROLE: - case AppRole.INDEXING | AppRole.NORMAL: - if ( - ThreadType.FILES_INDEXING in THREADS - or ThreadType.UPDATES_PROCESSING in THREADS - ): - LOGGER.info('Background threads already running, skipping start') - return - - THREAD_STOP_EVENT.clear() - THREADS[ThreadType.FILES_INDEXING] = Thread( - target=files_indexing_thread, - args=(app_config, app_enabled), - name='FilesIndexingThread', - ) - THREADS[ThreadType.UPDATES_PROCESSING] = Thread( - target=updates_processing_thread, - args=(app_config, app_enabled), - name='UpdatesProcessingThread', - ) - THREADS[ThreadType.FILES_INDEXING].start() - THREADS[ThreadType.UPDATES_PROCESSING].start() - - case AppRole.RP | AppRole.NORMAL: - if ThreadType.REQUEST_PROCESSING in THREADS: - LOGGER.info('Background threads already running, skipping start') - return - - THREAD_STOP_EVENT.clear() - THREADS[ThreadType.REQUEST_PROCESSING] = Thread( - target=request_processing_thread, - args=(app_config, app_enabled), - name='RequestProcessingThread', - ) - THREADS[ThreadType.REQUEST_PROCESSING].start() + if APP_ROLE == AppRole.INDEXING or APP_ROLE == AppRole.NORMAL: + if ( + ThreadType.FILES_INDEXING in THREADS + or ThreadType.UPDATES_PROCESSING in THREADS + ): + LOGGER.info('Background threads already running, skipping start') + return + + THREAD_STOP_EVENT.clear() + THREADS[ThreadType.FILES_INDEXING] = Thread( + 
target=files_indexing_thread, + args=(app_config, app_enabled), + name='FilesIndexingThread', + ) + THREADS[ThreadType.UPDATES_PROCESSING] = Thread( + target=updates_processing_thread, + args=(app_config, app_enabled), + name='UpdatesProcessingThread', + ) + THREADS[ThreadType.FILES_INDEXING].start() + THREADS[ThreadType.UPDATES_PROCESSING].start() + + if APP_ROLE == AppRole.RP or APP_ROLE == AppRole.NORMAL: + if ThreadType.REQUEST_PROCESSING in THREADS: + LOGGER.info('Background threads already running, skipping start') + return + + THREAD_STOP_EVENT.clear() + THREADS[ThreadType.REQUEST_PROCESSING] = Thread( + target=request_processing_thread, + args=(app_config, app_enabled), + name='RequestProcessingThread', + ) + THREADS[ThreadType.REQUEST_PROCESSING].start() def wait_for_bg_threads(): - match APP_ROLE: - case AppRole.INDEXING | AppRole.NORMAL: - if (ThreadType.FILES_INDEXING not in THREADS or ThreadType.UPDATES_PROCESSING not in THREADS): - return + if APP_ROLE == AppRole.INDEXING or APP_ROLE == AppRole.NORMAL: + if (ThreadType.FILES_INDEXING not in THREADS or ThreadType.UPDATES_PROCESSING not in THREADS): + return - THREAD_STOP_EVENT.set() - THREADS[ThreadType.FILES_INDEXING].join() - THREADS[ThreadType.UPDATES_PROCESSING].join() - THREADS.pop(ThreadType.FILES_INDEXING) - THREADS.pop(ThreadType.UPDATES_PROCESSING) + THREAD_STOP_EVENT.set() + THREADS[ThreadType.FILES_INDEXING].join() + THREADS[ThreadType.UPDATES_PROCESSING].join() + THREADS.pop(ThreadType.FILES_INDEXING) + THREADS.pop(ThreadType.UPDATES_PROCESSING) - case AppRole.RP | AppRole.NORMAL: - if (ThreadType.REQUEST_PROCESSING not in THREADS): - return + if APP_ROLE == AppRole.RP or APP_ROLE == AppRole.NORMAL: + if (ThreadType.REQUEST_PROCESSING not in THREADS): + return - THREAD_STOP_EVENT.set() - THREADS[ThreadType.REQUEST_PROCESSING].join() - THREADS.pop(ThreadType.REQUEST_PROCESSING) + THREAD_STOP_EVENT.set() + THREADS[ThreadType.REQUEST_PROCESSING].join() + 
THREADS.pop(ThreadType.REQUEST_PROCESSING) def query_vector_database( @@ -673,18 +684,12 @@ def return_normal_result_to_nextcloud(task_id: int, userId: str, result: LLMOutp return True -def enrich_sources(results: list[str], userId: str) -> list[EnrichedSource]: - nc = NextcloudApp() - # todo: refactor to include title here - data = nc.ocs('POST', f'/ocs/v2.php/apps/context_chat/enrich_sources', json={'sources': [{'source_id': id} for id in results], 'userId': userId}) - sources = EnrichedSourceList.model_validate(data).sources - return sources -def enrich_search_sources(results: list[SearchResult], userId: str) -> list[EnrichedSource]: +def enrich_sources(results: list[SearchResult], userId: str) -> list[str]: nc = NextcloudApp() - data = nc.ocs('POST', f'/ocs/v2.php/apps/context_chat/enrich_sources', json={'sources': results, 'userId': userId}) + data = nc.ocs('POST', '/ocs/v2.php/apps/context_chat/enrich_sources', json={'sources': results, 'userId': userId}) sources = EnrichedSourceList.model_validate(data).sources - return sources + return [s.model_dump_json() for s in sources] def return_search_result_to_nextcloud(task_id: int, userId: str, result: list[SearchResult]) -> bool: @@ -706,10 +711,8 @@ def return_search_result_to_nextcloud(task_id: int, userId: str, result: list[Se nc = NextcloudApp() try: - sources = [json.dumps(source) for source in enrich_search_sources(result, userId)] - nc.providers.task_processing.report_result(task_id, { - 'sources': sources, + 'sources': enrich_sources(result, userId), }) except (NextcloudException, RequestException, JSONDecodeError) as e: LOGGER.error(f"Network error reporting search task result {e}", exc_info=e) @@ -769,8 +772,10 @@ def process_normal_task( Raises: Various exceptions from query execution """ - user_id = task['user_id'] + user_id = task['userId'] task_input = task['input'] + if task_input.get('scopeType') == 'none': + task_input['scopeType'] = None return exec_in_proc(target=process_context_query, args=( 
@@ -802,8 +807,11 @@ def process_search_task( Raises: Various exceptions from query execution """ - user_id = task['user_id'] + user_id = task['userId'] task_input = task['input'] + if task_input.get('scopeType') == 'none': + task_input['scopeType'] = None + return exec_in_proc(target=do_doc_search, args=( user_id, @@ -813,4 +821,4 @@ def process_search_task( task_input.get('scopeType'), task_input.get('scopeList'), ) - ) \ No newline at end of file + ) From 36b5f0211ee2da2123d220a312521afe204a559b Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 26 Mar 2026 23:01:56 +0530 Subject: [PATCH 23/56] chore: drop commented code Signed-off-by: Anupam Kumar --- context_chat_backend/controller.py | 292 +---------------------------- 1 file changed, 1 insertion(+), 291 deletions(-) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 33e3cad4..49d1d737 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -24,7 +24,6 @@ from collections.abc import Callable from contextlib import asynccontextmanager from functools import wraps -from threading import Event, Thread from fastapi import FastAPI, Request from nc_py_api import AsyncNextcloudApp, NextcloudApp @@ -59,7 +58,7 @@ 'revision': '607a30d783dfa663caf39e06633721c8d4cfcd7e', } } if __download_models_from_hf else {} -app_enabled = Event() +app_enabled = threading.Event() def enabled_handler(enabled: bool, nc: NextcloudApp | AsyncNextcloudApp) -> str: try: @@ -99,8 +98,6 @@ async def lifespan(app: FastAPI): app_enabled.set() start_bg_threads(app_config, app_enabled) logger.info(f'App enable state at startup: {app_enabled.is_set()}') - t = Thread(target=background_thread_task, args=()) - t.start() yield vectordb_loader.offload() wait_for_bg_threads() @@ -134,15 +131,6 @@ async def lifespan(app: FastAPI): if not app_config.disable_aaa: app.add_middleware(AppAPIAuthMiddleware) -# logger background thread - -def background_thread_task(): - # todo - # 
while(True): - # logger.info(f'Currently indexing {len(_indexing)} documents (filename, size): ', extra={'_indexing': _indexing}) - # sleep(10) - ... - # exception handlers @app.exception_handler(DbException) @@ -240,281 +228,3 @@ def download_logs() -> FileResponse: if os.path.isfile(file_path): # Might be a folder (just skip it then) zip_file.write(file_path) return FileResponse(tmp.name, media_type='application/zip', filename='docker_logs.zip') - - -# @app.post('/updateAccessDeclarative') -# @enabled_guard(app) -# def _( -# userIds: Annotated[list[str], Body()], -# sourceId: Annotated[str, Body()], -# ): -# logger.debug('Update access declarative request:', extra={ -# 'user_ids': userIds, -# 'source_id': sourceId, -# }) - -# if len(userIds) == 0: -# return JSONResponse('Empty list of user ids', 400) - -# if not is_valid_source_id(sourceId): -# return JSONResponse('Invalid source id', 400) - -# exec_in_proc(target=decl_update_access, args=(vectordb_loader, userIds, sourceId)) - -# return JSONResponse('Access updated') - - -# @app.post('/updateAccess') -# @enabled_guard(app) -# def _( -# op: Annotated[UpdateAccessOp, Body()], -# userIds: Annotated[list[str], Body()], -# sourceId: Annotated[str, Body()], -# ): -# logger.debug('Update access request', extra={ -# 'op': op, -# 'user_ids': userIds, -# 'source_id': sourceId, -# }) - -# if len(userIds) == 0: -# return JSONResponse('Empty list of user ids', 400) - -# if not is_valid_source_id(sourceId): -# return JSONResponse('Invalid source id', 400) - -# exec_in_proc(target=update_access, args=(vectordb_loader, op, userIds, sourceId)) - -# return JSONResponse('Access updated') - - -# @app.post('/updateAccessProvider') -# @enabled_guard(app) -# def _( -# op: Annotated[UpdateAccessOp, Body()], -# userIds: Annotated[list[str], Body()], -# providerId: Annotated[str, Body()], -# ): -# logger.debug('Update access by provider request', extra={ -# 'op': op, -# 'user_ids': userIds, -# 'provider_id': providerId, -# }) - -# if 
len(userIds) == 0: -# return JSONResponse('Empty list of user ids', 400) - -# if not is_valid_provider_id(providerId): -# return JSONResponse('Invalid provider id', 400) - -# exec_in_proc(target=update_access_provider, args=(vectordb_loader, op, userIds, providerId)) - -# return JSONResponse('Access updated') - - -# @app.post('/deleteSources') -# @enabled_guard(app) -# def _(sourceIds: Annotated[list[str], Body(embed=True)]): -# logger.debug('Delete sources request', extra={ -# 'source_ids': sourceIds, -# }) - -# sourceIds = [source.strip() for source in sourceIds if source.strip() != ''] - -# if len(sourceIds) == 0: -# return JSONResponse('No sources provided', 400) - -# res = exec_in_proc(target=delete_by_source, args=(vectordb_loader, sourceIds)) -# if res is False: -# return JSONResponse('Error: VectorDB delete failed, check vectordb logs for more info.', 400) - -# return JSONResponse('All valid sources deleted') - - -# @app.post('/deleteProvider') -# @enabled_guard(app) -# def _(providerKey: str = Body(embed=True)): -# logger.debug('Delete sources by provider for all users request', extra={ 'provider_key': providerKey }) - -# if value_of(providerKey) is None: -# return JSONResponse('Invalid provider key provided', 400) - -# exec_in_proc(target=delete_by_provider, args=(vectordb_loader, providerKey)) - -# return JSONResponse('All valid sources deleted') - - -# @app.post('/deleteUser') -# @enabled_guard(app) -# def _(userId: str = Body(embed=True)): -# logger.debug('Remove access list for user, and orphaned sources', extra={ 'user_id': userId }) - -# if value_of(userId) is None: -# return JSONResponse('Invalid userId provided', 400) - -# exec_in_proc(target=delete_user, args=(vectordb_loader, userId)) - -# return JSONResponse('User deleted') - - -# @app.put('/loadSources') -# @enabled_guard(app) -# def _(sources: list[UploadFile]): -# global _indexing - -# if len(sources) == 0: -# return JSONResponse('No sources provided', 400) - -# for source in sources: -# if 
not value_of(source.filename): -# return JSONResponse(f'Invalid source filename for: {source.headers.get("title")}', 400) - -# with index_lock: -# if source.filename in _indexing: -# # this request will be retried by the client -# return JSONResponse( -# f'This source ({source.filename}) is already being processed in another request, try again later', -# 503, -# headers={'cc-retry': 'true'}, -# ) - -# if not ( -# value_of(source.headers.get('userIds')) -# and source.headers.get('title', None) is not None -# and value_of(source.headers.get('type')) -# and value_of(source.headers.get('modified')) -# and source.headers['modified'].isdigit() -# and value_of(source.headers.get('provider')) -# ): -# logger.error('Invalid/missing headers received', extra={ -# 'source_id': source.filename, -# 'title': source.headers.get('title'), -# 'headers': source.headers, -# }) -# return JSONResponse(f'Invaild/missing headers for:provider_ids {source.filename}', 400) - -# # wait for 10 minutes before failing the request -# semres = doc_parse_semaphore.acquire(block=True, timeout=10*60) -# if not semres: -# return JSONResponse( -# 'Document parser worker limit reached, try again in some time or consider increasing the limit', -# 503, -# headers={'cc-retry': 'true'} -# ) - -# with index_lock: -# for source in sources: -# _indexing[source.filename] = source.size - -# try: -# loaded_sources, not_added_sources = exec_in_proc( -# target=embed_sources, -# args=(vectordb_loader, app.extra['CONFIG'], sources) -# ) -# except (DbException, EmbeddingException): -# raise -# except Exception as e: -# raise DbException('Error: failed to load sources') from e -# finally: -# with index_lock: -# for source in sources: -# _indexing.pop(source.filename, None) -# doc_parse_semaphore.release() - -# if len(loaded_sources) != len(sources): -# logger.debug('Some sources were not loaded', extra={ -# 'Count of loaded sources': f'{len(loaded_sources)}/{len(sources)}', -# 'source_ids': loaded_sources, -# }) - -# # 
loaded sources include the existing sources that may only have their access updated -# return JSONResponse({'loaded_sources': loaded_sources, 'sources_to_retry': not_added_sources}) - - -# class Query(BaseModel): -# userId: str -# query: str -# useContext: bool = True -# scopeType: ScopeType | None = None -# scopeList: list[str] | None = None -# ctxLimit: int = 20 - -# @field_validator('userId', 'query', 'ctxLimit') -# @classmethod -# def check_empty_values(cls, value: Any, info: ValidationInfo): -# if value_of(value) is None: -# raise ValueError('Empty value for field', info.field_name) - -# return value - -# @field_validator('ctxLimit') -# @classmethod -# def at_least_one_context(cls, value: int): -# if value < 1: -# raise ValueError('Invalid context chunk limit') - -# return value - - -# def execute_query(query: Query, in_proc: bool = True) -> LLMOutput: -# llm: LLM = llm_loader.load() -# template = app.extra.get('LLM_TEMPLATE') -# no_ctx_template = app.extra['LLM_NO_CTX_TEMPLATE'] -# # todo: array -# end_separator = app.extra.get('LLM_END_SEPARATOR', '') - -# if query.useContext: -# target = process_context_query -# args=( -# query.userId, -# vectordb_loader, -# llm, -# app_config, -# query.query, -# query.ctxLimit, -# query.scopeType, -# query.scopeList, -# template, -# end_separator, -# ) -# else: -# target=process_query -# args=( -# query.userId, -# llm, -# app_config, -# query.query, -# no_ctx_template, -# end_separator, -# ) - -# if in_proc: -# return exec_in_proc(target=target, args=args) - -# return target(*args) # pyright: ignore - - -# @app.post('/query') -# @enabled_guard(app) -# def _(query: Query) -> LLMOutput: -# logger.debug('received query request', extra={ 'query': query.dict() }) - -# if app_config.llm[0] == 'nc_texttotext': -# return execute_query(query) - -# with llm_lock: -# return execute_query(query, in_proc=False) - - -# @app.post('/docSearch') -# @enabled_guard(app) -# def _(query: Query) -> list[SearchResult]: -# # useContext from Query 
is not used here -# return exec_in_proc(target=do_doc_search, args=( -# query.userId, -# query.query, -# vectordb_loader, -# query.ctxLimit, -# query.scopeType, -# query.scopeList, -# )) From 85d29f1640eb2ff5daa89016ecbae8ee9d484d27 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 27 Mar 2026 01:06:34 +0530 Subject: [PATCH 24/56] fix(ci): parse json output from the stats command Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 31 +++++++------------------- 1 file changed, 8 insertions(+), 23 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 58f9f50c..589f8852 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -224,7 +224,7 @@ jobs: echo "Checking stats, attempt $i..." stats_err=$(mktemp) - stats=$(timeout 5 ./occ context_chat:stats 2>"$stats_err") + stats=$(timeout 5 ./occ context_chat:stats --json 2>"$stats_err") stats_exit=$? echo "Stats output:" echo "$stats" @@ -243,41 +243,25 @@ jobs: fi # Extract Total eligible files - total_files=$(echo "$stats" | grep -oP 'Total eligible files:\s*\K\d+' || echo "") + total_files=$(echo "$stats" | jq '.eligible_files_count' || echo "") # Extract Indexed documents count (files__default) - indexed_count=$(echo "$stats" | grep -oP "'files__default'\s*=>\s*\K\d+" || echo "") - - # Validate parsed values - if [ -z "$total_files" ] || [ -z "$indexed_count" ]; then - echo "Error: Could not parse stats output properly" - if echo "$stats" | grep -q "Indexed documents:"; then - echo " Indexed documents section found but could not extract count" - fi - sleep 10 - continue - fi + indexed_count=$(echo "$stats" | jq '.queued_documents_counts.files__default' || echo "") echo "Total eligible files: $total_files" echo "Indexed documents (files__default): $indexed_count" - # Calculate absolute difference diff=$((total_files - indexed_count)) - if [ $diff -lt 0 ]; then - diff=$((-diff)) - fi - - # 
Calculate 2% threshold using bc for floating point support - threshold=$(echo "scale=4; $total_files * 0.02" | bc) + threshold=$((total_files * 2 / 100)) # Check if difference is within tolerance - if (( $(echo "$diff <= $threshold" | bc -l) )); then + if [ $diff -le $threshold ]; then echo "Indexing within 2% tolerance (diff=$diff, threshold=$threshold)" success=1 break else - pct=$(echo "scale=2; ($diff / $total_files) * 100" | bc) - echo "Outside 2% tolerance: diff=$diff (${pct}%), threshold=$threshold" + progress=$((diff * 100 / total_files)) + echo "Outside 2% tolerance: diff=$diff (${progress}%), threshold=$threshold" fi # Check if backend is still alive @@ -293,6 +277,7 @@ jobs: echo "::endgroup::" ./occ context_chat:stats + ./occ context_chat:stats --json if [ $success -ne 1 ]; then echo "Max attempts reached" From 4c6d01b9e913de0a931345aeab7169b3029a5c9a Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 27 Mar 2026 02:57:22 +0530 Subject: [PATCH 25/56] fix: seek to 0 to read the full buffer Signed-off-by: Anupam Kumar --- context_chat_backend/chain/ingest/injest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/context_chat_backend/chain/ingest/injest.py b/context_chat_backend/chain/ingest/injest.py index 18a37b4b..0196f5d9 100644 --- a/context_chat_backend/chain/ingest/injest.py +++ b/context_chat_backend/chain/ingest/injest.py @@ -50,6 +50,7 @@ async def __fetch_file_content( dav=False, params={ 'userId': user_id }, ) + fp.seek(0) return fp except niquests.exceptions.RequestException as e: if e.response is None: From 51774ff771944c5dffd46b3f33ed2c4a0d7f5bb6 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 27 Mar 2026 02:59:46 +0530 Subject: [PATCH 26/56] fix(ci): 3% tolerance Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 589f8852..73418e93 100644 --- 
a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -252,16 +252,16 @@ jobs: echo "Indexed documents (files__default): $indexed_count" diff=$((total_files - indexed_count)) - threshold=$((total_files * 2 / 100)) + threshold=$((total_files * 3 / 100)) # Check if difference is within tolerance if [ $diff -le $threshold ]; then - echo "Indexing within 2% tolerance (diff=$diff, threshold=$threshold)" + echo "Indexing within 3% tolerance (diff=$diff, threshold=$threshold)" success=1 break else progress=$((diff * 100 / total_files)) - echo "Outside 2% tolerance: diff=$diff (${progress}%), threshold=$threshold" + echo "Outside 3% tolerance: diff=$diff (${progress}%), threshold=$threshold" fi # Check if backend is still alive From c81b6758600eae2f049deb7ec578ef5c7eeca41b Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 27 Mar 2026 04:38:36 +0530 Subject: [PATCH 27/56] fix(ci): wait longer for EM server Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 73418e93..5c505483 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -201,7 +201,7 @@ jobs: timeout 10 ./occ app_api:daemon:register --net host manual_install "Manual Install" manual-install http localhost http://localhost:8080 timeout 120 ./occ app_api:app:register context_chat_backend manual_install --json-info "{\"appid\":\"context_chat_backend\",\"name\":\"Context Chat Backend\",\"daemon_config_name\":\"manual_install\",\"version\":\"${{ fromJson(steps.appinfo.outputs.result).version }}\",\"secret\":\"12345\",\"port\":10034,\"scopes\":[],\"system_app\":0}" --force-scopes --wait-finish ls -la context_chat_backend/persistent_storage/* - sleep 30 # Wait for the em server to get ready + sleep 60 # Wait for the em server to get ready - name: Initial 
memory usage check run: | @@ -242,13 +242,13 @@ jobs: continue fi - # Extract Total eligible files - total_files=$(echo "$stats" | jq '.eligible_files_count' || echo "") + # Extract total queued files + total_files=$(echo "$stats" | jq '.queued_documents_counts.files__default' || echo "") - # Extract Indexed documents count (files__default) - indexed_count=$(echo "$stats" | jq '.queued_documents_counts.files__default' || echo "") + # Extract indexed documents count (files__default) + indexed_count=$(echo "$stats" | jq '.vectordb_document_counts.files__default' || echo "") - echo "Total eligible files: $total_files" + echo "Total queued files: $total_files" echo "Indexed documents (files__default): $indexed_count" diff=$((total_files - indexed_count)) From 6817f897e4ae14fdfeab0ad7b40a9a2de78cfe4b Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Mon, 30 Mar 2026 15:57:44 +0530 Subject: [PATCH 28/56] fix: don't process files or requests until the EM server is healthy Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 1 - context_chat_backend/network_em.py | 14 +++++++++++--- context_chat_backend/task_fetcher.py | 14 ++++++++++++++ 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 5c505483..8e6ca7d8 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -201,7 +201,6 @@ jobs: timeout 10 ./occ app_api:daemon:register --net host manual_install "Manual Install" manual-install http localhost http://localhost:8080 timeout 120 ./occ app_api:app:register context_chat_backend manual_install --json-info "{\"appid\":\"context_chat_backend\",\"name\":\"Context Chat Backend\",\"daemon_config_name\":\"manual_install\",\"version\":\"${{ fromJson(steps.appinfo.outputs.result).version }}\",\"secret\":\"12345\",\"port\":10034,\"scopes\":[],\"system_app\":0}" --force-scopes --wait-finish ls -la 
context_chat_backend/persistent_storage/* - sleep 60 # Wait for the em server to get ready - name: Initial memory usage check run: | diff --git a/context_chat_backend/network_em.py b/context_chat_backend/network_em.py index d39ea56a..43ced6cc 100644 --- a/context_chat_backend/network_em.py +++ b/context_chat_backend/network_em.py @@ -8,7 +8,6 @@ import niquests from langchain_core.embeddings import Embeddings -from pydantic import BaseModel from .types import ( EmbeddingException, @@ -41,8 +40,17 @@ class CreateEmbeddingResponse(TypedDict): usage: EmbeddingUsage -class NetworkEmbeddings(Embeddings, BaseModel): - app_config: TConfig +class NetworkEmbeddings(Embeddings): + def __init__(self, app_config: TConfig): + self.app_config = app_config + + def check_connection(self) -> bool: + try: + self.embed_query('hello') + return True + except EmbeddingException as e: + logger.warning('Embedding server connection failed', exc_info=e) + return False def _get_embedding(self, input_: str | list[str], try_: int = 3) -> list[float] | list[list[float]]: emconf = self.app_config.embedding diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 634b51cd..92d2719e 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -26,6 +26,7 @@ from .chain.query_proc import get_pruned_query from .chain.types import ContextException, EnrichedSourceList, LLMOutput, ScopeList, ScopeType, SearchResult from .dyn_loader import LLMModelLoader, VectorDBLoader +from .network_em import NetworkEmbeddings from .types import ( ActionsQueueItems, ActionType, @@ -102,6 +103,10 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> return try: + if not __check_em_server(app_config): + sleep(POLLING_COOLDOWN) + continue + nc = NextcloudApp() q_items_res = nc.ocs( 'GET', @@ -415,6 +420,10 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: llm: LLM = llm_loader.load() 
while True: + if not __check_em_server(app_config): + sleep(POLLING_COOLDOWN) + continue + if THREAD_STOP_EVENT.is_set(): LOGGER.info('Updates processing thread is stopping due to stop event being set') return @@ -822,3 +831,8 @@ def process_search_task( task_input.get('scopeList'), ) ) + + +def __check_em_server(app_config: TConfig) -> bool: + embedding_model = NetworkEmbeddings(app_config=app_config) + return embedding_model.check_connection() From 104a37a8a1b28878b98da5ce7b0eb520ebe73716 Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Wed, 1 Apr 2026 12:38:38 +0200 Subject: [PATCH 29/56] tests: Increase testing time to allow backend to injest more sources --- .github/workflows/integration-test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 8e6ca7d8..b937a147 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -218,8 +218,8 @@ jobs: - name: Periodically check context_chat stats for 15 minutes to allow the backend to index the files run: | success=0 - echo "::group::Checking stats periodically for 15 minutes to allow the backend to index the files" - for i in {1..90}; do + echo "::group::Checking stats periodically for 30 minutes to allow the backend to index the files" + for i in {1..180}; do echo "Checking stats, attempt $i..." 
stats_err=$(mktemp) From b3b461a2b3a88f2fd815be11c132a7174772aa3c Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Wed, 1 Apr 2026 13:17:13 +0200 Subject: [PATCH 30/56] fix: More log statements --- .../chain/ingest/doc_loader.py | 20 +++++++++-- context_chat_backend/chain/ingest/injest.py | 35 +++++++++++++++++++ context_chat_backend/task_fetcher.py | 29 +++++++++++++-- context_chat_backend/utils.py | 12 +++++++ context_chat_backend/vectordb/pgvector.py | 20 ++++++++++- 5 files changed, 110 insertions(+), 6 deletions(-) diff --git a/context_chat_backend/chain/ingest/doc_loader.py b/context_chat_backend/chain/ingest/doc_loader.py index 832c8331..04c611d2 100644 --- a/context_chat_backend/chain/ingest/doc_loader.py +++ b/context_chat_backend/chain/ingest/doc_loader.py @@ -7,6 +7,8 @@ import tempfile from collections.abc import Callable from io import BytesIO +import logging +from time import perf_counter_ns import docx2txt from epub2txt import epub2txt @@ -19,6 +21,8 @@ from ...types import IndexingException, SourceItem +logger = logging.getLogger('ccb.doc_loader') + def _temp_file_wrapper(file: BytesIO, loader: Callable, sep: str = '\n') -> str: raw_bytes = file.read() @@ -133,10 +137,22 @@ def decode_source(source: SourceItem) -> str: else: io_obj = source.content - if _loader_map.get(source.type): - result = _loader_map[source.type](io_obj) + loader_fn = _loader_map.get(source.type) + if loader_fn: + logger.debug( + 'Decoding source %r with loader %s (mime: %s) — may be slow or block', + source.title, loader_fn.__name__, source.type, + ) + t0 = perf_counter_ns() + result = loader_fn(io_obj) + elapsed_ms = (perf_counter_ns() - t0) / 1e6 + logger.debug( + 'Loader %s for %r finished in %.2f ms (%d chars)', + loader_fn.__name__, source.title, elapsed_ms, len(result), + ) return result.encode('utf-8', 'ignore').decode('utf-8', 'ignore').strip() + logger.debug('No specific loader for mime type %s, reading as plain text for %r', source.type, source.title) return 
io_obj.read().decode('utf-8', 'ignore').strip() except IndexingException: raise diff --git a/context_chat_backend/chain/ingest/injest.py b/context_chat_backend/chain/ingest/injest.py index 0196f5d9..7ede94a6 100644 --- a/context_chat_backend/chain/ingest/injest.py +++ b/context_chat_backend/chain/ingest/injest.py @@ -7,6 +7,7 @@ import re from collections.abc import Mapping from io import BytesIO +from time import perf_counter_ns import niquests from langchain.schema import Document @@ -42,6 +43,8 @@ async def __fetch_file_content( async with semaphore: nc = AsyncNextcloudApp() try: + logger.debug('Downloading file id %d for user %s', file_id, user_id) + t0 = perf_counter_ns() # a file pointer for storing the stream in memory until it is consumed fp = BytesIO() await nc._session.download2fp( @@ -51,6 +54,8 @@ async def __fetch_file_content( params={ 'userId': user_id }, ) fp.seek(0) + elapsed_ms = (perf_counter_ns() - t0) / 1e6 + logger.debug('Downloaded file id %d for user %s in %.2f ms (%d bytes)', file_id, user_id, elapsed_ms, fp.getbuffer().nbytes) return fp except niquests.exceptions.RequestException as e: if e.response is None: @@ -89,6 +94,9 @@ async def __fetch_files_content( semaphore = asyncio.Semaphore(CONCURRENT_FILE_FETCHES) tasks = [] + file_count = sum(1 for s in sources.values() if isinstance(s, ReceivedFileItem)) + logger.debug('Fetching content for %d file(s) (max %d concurrent)', file_count, CONCURRENT_FILE_FETCHES) + for db_id, file in sources.items(): if isinstance(file, SourceItem): continue @@ -123,7 +131,11 @@ async def __fetch_files_content( # any user id from the list should have read access to the file tasks.append(asyncio.ensure_future(__fetch_file_content(semaphore, file.file_id, file.userIds[0]))) + logger.debug('Gathering %d file download task(s) — this blocks until all downloads complete or fail', len(tasks)) + t0 = perf_counter_ns() results = await asyncio.gather(*tasks, return_exceptions=True) + elapsed_ms = (perf_counter_ns() - 
t0) / 1e6 + logger.debug('All %d file download task(s) completed in %.2f ms', len(tasks), elapsed_ms) for (db_id, file), result in zip(sources.items(), results, strict=True): if isinstance(file, SourceItem): continue @@ -215,7 +227,14 @@ def _sources_to_indocuments( # transform the source to have text data try: + logger.debug( + 'Decoding source %s (type: %s, title: %r) — may be slow for complex file types', + source.reference, source.type, source.title, + ) + t0 = perf_counter_ns() content = decode_source(source) + elapsed_ms = (perf_counter_ns() - t0) / 1e6 + logger.debug('Decoded source %s in %.2f ms (%d chars)', source.reference, elapsed_ms, len(content)) except IndexingException as e: logger.error(f'Error decoding source ({source.reference}): {e}', exc_info=e) errored_docs[db_id] = IndexingError( @@ -333,7 +352,17 @@ def _process_sources( source_proc_results = _increase_access_for_existing_sources(vectordb, existing_sources) + logger.debug( + 'Fetching file contents for %d source(s) — this blocks on network I/O to Nextcloud', + len(to_embed_sources), + ) + t0 = perf_counter_ns() populated_to_embed_sources, errored_sources = asyncio.run(__fetch_files_content(to_embed_sources)) + elapsed_ms = (perf_counter_ns() - t0) / 1e6 + logger.debug( + 'File content fetch complete in %.2f ms: %d fetched, %d errored', + elapsed_ms, len(populated_to_embed_sources), len(errored_sources), + ) source_proc_results.update(errored_sources) # pyright: ignore[reportAttributeAccessIssue] if len(populated_to_embed_sources) == 0: @@ -359,7 +388,13 @@ def _process_sources( 'source_ids': [indoc.source_id for indoc in indocuments.values()] }) + t0 = perf_counter_ns() doc_add_results = vectordb.add_indocuments(indocuments) + elapsed_ms = (perf_counter_ns() - t0) / 1e6 + logger.info( + 'vectordb.add_indocuments completed in %.2f ms for %d document(s)', + elapsed_ms, len(indocuments), + ) source_proc_results.update(doc_add_results) # pyright: ignore[reportAttributeAccessIssue] 
logger.debug('Added documents to vectordb') diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 92d2719e..32673c85 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -82,11 +82,22 @@ def files_indexing_thread(app_config: TConfig, app_enabled: Event) -> None: return def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> Mapping[int, IndexingError | None]: + source_refs = [s.reference for s in source_items.values()] + LOGGER.info('Starting embed_sources subprocess for %d source(s): %s', len(source_items), source_refs) try: - return exec_in_proc( + result = exec_in_proc( target=embed_sources, args=(vectordb_loader, app_config, source_items), ) + errors = {k: v for k, v in result.items() if isinstance(v, IndexingError)} + LOGGER.info( + 'embed_sources subprocess finished for %d source(s): %d succeeded, %d errored', + len(source_items), + len(result) - len(errors), + len(errors), + extra={'errors': errors} if errors else {}, + ) + return result except Exception as e: err_name = {DbException: "DB", EmbeddingException: "Embedding"}.get(type(e), "Unknown") source_ids = (s.reference for s in source_items.values()) @@ -94,6 +105,10 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> error=f'{err_name} Error occurred, the sources {source_ids} will be retried: {e}', retryable=True, ) + LOGGER.error( + 'embed_sources subprocess raised a %s error for sources %s, marking all as retryable', + err_name, source_refs, exc_info=e, + ) return dict.fromkeys(source_items, err) @@ -146,13 +161,21 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> max_workers=PARALLEL_FILE_PARSING_COUNT, thread_name_prefix='IndexingPool', ) as executor: + LOGGER.info( + 'Dispatching %d file chunk(s) and %d provider chunk(s) to %d IndexingPool worker(s)', + len(file_chunks), len(provider_chunks), PARALLEL_FILE_PARSING_COUNT, + ) 
file_futures = [executor.submit(_load_sources, chunk) for chunk in file_chunks] provider_futures = [executor.submit(_load_sources, chunk) for chunk in provider_chunks] - for future in file_futures: + for i, future in enumerate(file_futures): + LOGGER.debug('Waiting for file chunk %d/%d future to complete', i + 1, len(file_futures)) files_result.update(future.result()) - for future in provider_futures: + LOGGER.debug('File chunk %d/%d future completed', i + 1, len(file_futures)) + for i, future in enumerate(provider_futures): + LOGGER.debug('Waiting for provider chunk %d/%d future to complete', i + 1, len(provider_futures)) providers_result.update(future.result()) + LOGGER.debug('Provider chunk %d/%d future completed', i + 1, len(provider_futures)) if ( any(isinstance(res, IndexingError) for res in files_result.values()) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index c7e588b3..d28fc582 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -90,8 +90,20 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem kwargs=kwargs, daemon=daemon, ) + target_name = getattr(target, '__name__', str(target)) + _logger.debug('Starting subprocess for %s', target_name) + start = perf_counter_ns() p.start() + _logger.debug('Subprocess PID %d started for %s, waiting for it to finish (no timeout)', p.pid, target_name) p.join() + elapsed_ms = (perf_counter_ns() - start) / 1e6 + _logger.debug('Subprocess PID %d for %s finished in %.2f ms (exit code: %s)', p.pid, target_name, elapsed_ms, p.exitcode) + if p.exitcode != 0: + _logger.warning( + 'Subprocess PID %d for %s exited with non-zero exit code %d after %.2f ms' + ' — possible OOM kill or unhandled signal', + p.pid, target_name, p.exitcode, elapsed_ms, + ) result = pconn.recv() if result['error'] is not None: diff --git a/context_chat_backend/vectordb/pgvector.py b/context_chat_backend/vectordb/pgvector.py index 86f636be..33dfb039 100644 --- 
a/context_chat_backend/vectordb/pgvector.py +++ b/context_chat_backend/vectordb/pgvector.py @@ -6,6 +6,7 @@ import os from collections.abc import Mapping from datetime import datetime +from time import perf_counter_ns import psycopg import sqlalchemy as sa @@ -152,8 +153,25 @@ def add_indocuments(self, indocuments: Mapping[int, InDocument]) -> Mapping[int, # so we chunk the documents into (5 values * 10k) chunks # change the chunk size when there are more inserted values per document chunk_ids = [] - for i in range(0, len(indoc.documents), batch_size): + total_chunks = len(indoc.documents) + num_batches = max(1, -(-total_chunks // batch_size)) # ceiling division + logger.debug( + 'Embedding source %s: %d chunk(s) in %d batch(es) — blocks on embedding model', + indoc.source_id, total_chunks, num_batches, + ) + for i in range(0, total_chunks, batch_size): + batch_num = i // batch_size + 1 + logger.debug( + 'Sending embedding batch %d/%d (%d chunk(s)) for source %s', + batch_num, num_batches, len(indoc.documents[i:i+batch_size]), indoc.source_id, + ) + t0 = perf_counter_ns() chunk_ids.extend(self.client.add_documents(indoc.documents[i:i+batch_size])) + elapsed_ms = (perf_counter_ns() - t0) / 1e6 + logger.debug( + 'Embedding batch %d/%d for source %s completed in %.2f ms', + batch_num, num_batches, indoc.source_id, elapsed_ms, + ) doc = DocumentsStore( source_id=indoc.source_id, From a4a88dae5f231732e448cefb9c0ea3e0da03aee5 Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Wed, 1 Apr 2026 13:18:24 +0200 Subject: [PATCH 31/56] tests: Set wait time back to 90 --- .github/workflows/integration-test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index b937a147..8e6ca7d8 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -218,8 +218,8 @@ jobs: - name: Periodically check context_chat stats for 15 minutes to allow the 
backend to index the files run: | success=0 - echo "::group::Checking stats periodically for 30 minutes to allow the backend to index the files" - for i in {1..180}; do + echo "::group::Checking stats periodically for 15 minutes to allow the backend to index the files" + for i in {1..90}; do echo "Checking stats, attempt $i..." stats_err=$(mktemp) From 0c52747375355e6e0338fd68599338f8bd644dc4 Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Wed, 1 Apr 2026 14:04:57 +0200 Subject: [PATCH 32/56] fix: Reduce worker count on github actions to prevent oom --- context_chat_backend/task_fetcher.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 32673c85..91d1991a 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -59,6 +59,10 @@ MIN_FILES_PER_CPU = 4 # divides the batch into these many chunks PARALLEL_FILE_PARSING_COUNT = max(1, (os.cpu_count() or 2) - 1) # todo: config? +if os.getenv('GITHUB_ACTIONS'): + # Keep CI memory usage predictable and avoid OOM-killed workers. + PARALLEL_FILE_PARSING_COUNT = max(1, min(PARALLEL_FILE_PARSING_COUNT, 2)) +LOGGER.info(f'Using {PARALLEL_FILE_PARSING_COUNT} parallel file parsing workers') ACTIONS_BATCH_SIZE = 512 # todo: config? 
POLLING_COOLDOWN = 30 TRIGGER = Event() From e676c329ca5a0c147ef0bfadbf5c372f4e25dd99 Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Wed, 1 Apr 2026 14:14:58 +0200 Subject: [PATCH 33/56] fix(exec_in_proc): Raise RuntimeError if exitcode is non-zero --- context_chat_backend/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index d28fc582..024e71c8 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -104,6 +104,7 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem ' — possible OOM kill or unhandled signal', p.pid, target_name, p.exitcode, elapsed_ms, ) + raise RuntimeError(f'Subprocess PID {p.pid} for {target_name} exited with non-zero exit code {p.exitcode}') result = pconn.recv() if result['error'] is not None: From b027ff3234a50cf8eb5a1447bafbef8f147212b5 Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Wed, 1 Apr 2026 14:46:42 +0200 Subject: [PATCH 34/56] fix(indexing): Reduce memory pressure on gh actions --- context_chat_backend/task_fetcher.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 91d1991a..2a7e84fd 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -56,7 +56,11 @@ THREAD_STOP_EVENT = Event() LOGGER = logging.getLogger('ccb.task_fetcher') FILES_INDEXING_BATCH_SIZE = 16 # theoretical max RAM usage: 16 * 100 MiB, todo: config? +if os.getenv('GITHUB_ACTIONS'): + FILES_INDEXING_BATCH_SIZE = 4 MIN_FILES_PER_CPU = 4 +if os.getenv('GITHUB_ACTIONS'): + MIN_FILES_PER_CPU = 2 # divides the batch into these many chunks PARALLEL_FILE_PARSING_COUNT = max(1, (os.cpu_count() or 2) - 1) # todo: config? 
if os.getenv('GITHUB_ACTIONS'): From 19b773fac97d3cf76fb581224df76d63e3c9a34d Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Wed, 1 Apr 2026 15:19:06 +0200 Subject: [PATCH 35/56] fix(indexing): Fallback to batch_size=1 if embed_sources is killed and do not retry afterward if one of these single item batches gets killed --- context_chat_backend/task_fetcher.py | 51 +++++++++++++++++++++++++--- context_chat_backend/utils.py | 13 ++++++- 2 files changed, 59 insertions(+), 5 deletions(-) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 2a7e84fd..edeabc12 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -39,7 +39,7 @@ SourceItem, TConfig, ) -from .utils import exec_in_proc, get_app_role +from .utils import SubprocessKilledError, exec_in_proc, get_app_role from .vectordb.base import BaseVectorDB from .vectordb.service import ( decl_update_access, @@ -89,6 +89,29 @@ def files_indexing_thread(app_config: TConfig, app_enabled: Event) -> None: LOGGER.error('Error initializing vector DB loader, files indexing thread will not start:', exc_info=e) return + def _embed_one(db_id: int, item: SourceItem | ReceivedFileItem) -> tuple[int, IndexingError | None]: + """Run embed_sources for a single item in its own subprocess.
Returns (db_id, error_or_None).""" + try: + result = exec_in_proc( + target=embed_sources, + args=(vectordb_loader, app_config, {db_id: item}), + ) + return db_id, result.get(db_id) + except SubprocessKilledError as e: + LOGGER.error( + 'embed_sources subprocess killed for individual source %s — marking as non-retryable' + ' to prevent infinite OOM retry loop', + item.reference, exc_info=e, + ) + return db_id, IndexingError(error=f'Subprocess killed (OOM?): {e}', retryable=False) + except Exception as e: + err_name = {DbException: 'DB', EmbeddingException: 'Embedding'}.get(type(e), 'Unknown') + LOGGER.error( + 'embed_sources raised a %s error for individual source %s, marking as retryable', + err_name, item.reference, exc_info=e, + ) + return db_id, IndexingError(error=str(e), retryable=True) + def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> Mapping[int, IndexingError | None]: source_refs = [s.reference for s in source_items.values()] LOGGER.info('Starting embed_sources subprocess for %d source(s): %s', len(source_items), source_refs) @@ -106,11 +129,31 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> extra={'errors': errors} if errors else {}, ) return result + except SubprocessKilledError as e: + LOGGER.error( + 'embed_sources subprocess was killed (likely OOM) for %d source(s): %s', + len(source_items), source_refs, exc_info=e, + ) + if len(source_items) == 1: + # Single-item subprocess was killed — mark non-retryable to break infinite OOM loop. + LOGGER.error( + 'Single-item subprocess killed for %s — marking as non-retryable', + source_refs, + ) + return {db_id: IndexingError(error=f'Subprocess killed (OOM?): {e}', retryable=False) + for db_id in source_items} + + # Multi-item batch: fall back to one subprocess per source to pinpoint the problematic file. 
+ LOGGER.warning( + 'Falling back to individual processing for %d sources to isolate any OOM-causing file(s)', + len(source_items), + ) + return dict(_embed_one(db_id, item) for db_id, item in source_items.items()) + except Exception as e: - err_name = {DbException: "DB", EmbeddingException: "Embedding"}.get(type(e), "Unknown") - source_ids = (s.reference for s in source_items.values()) + err_name = {DbException: 'DB', EmbeddingException: 'Embedding'}.get(type(e), 'Unknown') err = IndexingError( - error=f'{err_name} Error occurred, the sources {source_ids} will be retried: {e}', + error=f'{err_name} Error: {e}', retryable=True, ) LOGGER.error( diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index 024e71c8..4b9fad51 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -69,6 +69,17 @@ def JSONResponse( return FastAPIJSONResponse(content, status_code, **kwargs) +class SubprocessKilledError(RuntimeError): + """Raised when a subprocess exits with a non-zero exit code (likely OOM kill or unhandled signal).""" + + def __init__(self, pid: int, target_name: str, exitcode: int): + super().__init__( + f'Subprocess PID {pid} for {target_name} exited with non-zero exit code {exitcode}' + ' — possible OOM kill or unhandled signal' + ) + self.exitcode = exitcode + + def exception_wrap(fun: Callable | None, *args, resconn: Connection, **kwargs): try: if fun is None: @@ -104,7 +115,7 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem ' — possible OOM kill or unhandled signal', p.pid, target_name, p.exitcode, elapsed_ms, ) - raise RuntimeError(f'Subprocess PID {p.pid} for {target_name} exited with non-zero exit code {p.exitcode}') + raise SubprocessKilledError(p.pid, target_name, p.exitcode) result = pconn.recv() if result['error'] is not None: From bde0bc54e2dde254b37fe426418abbca295a27a0 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 2 Apr 2026 14:18:47 +0530 Subject: [PATCH 36/56] 
fix: log stdout and stderr from subprocesses Signed-off-by: Anupam Kumar --- context_chat_backend/utils.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index 4b9fad51..068ffa83 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -2,9 +2,11 @@ # SPDX-FileCopyrightText: 2023 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # +import io import logging import multiprocessing as mp import os +import sys import traceback from collections.abc import Callable from functools import partial, wraps @@ -80,7 +82,12 @@ def __init__(self, pid: int, target_name: str, exitcode: int): self.exitcode = exitcode -def exception_wrap(fun: Callable | None, *args, resconn: Connection, **kwargs): +def exception_wrap(fun: Callable | None, *args, resconn: Connection, stdconn: Connection, **kwargs): + stdout_capture = io.StringIO() + stderr_capture = io.StringIO() + sys.stdout = stdout_capture + sys.stderr = stderr_capture + try: if fun is None: return resconn.send({ 'value': None, 'error': None }) @@ -88,11 +95,15 @@ def exception_wrap(fun: Callable | None, *args, resconn: Connection, **kwargs): except Exception as e: tb = traceback.format_exc() resconn.send({ 'value': None, 'error': e, 'traceback': tb }) + finally: + stdconn.send({'stdout': stdout_capture.getvalue(), 'stderr': stderr_capture.getvalue()}) def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daemon=None): # noqa: B006 pconn, cconn = mp.Pipe() + std_pconn, std_cconn = mp.Pipe() kwargs['resconn'] = cconn + kwargs['stdconn'] = std_cconn p = mp.Process( group=group, target=partial(exception_wrap, target), @@ -108,20 +119,28 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem _logger.debug('Subprocess PID %d started for %s, waiting for it to finish (no timeout)', p.pid, target_name) p.join() elapsed_ms 
= (perf_counter_ns() - start) / 1e6 - _logger.debug('Subprocess PID %d for %s finished in %.2f ms (exit code: %s)', p.pid, target_name, elapsed_ms, p.exitcode) + _logger.debug( + 'Subprocess PID %d for %s finished in %.2f ms (exit code: %s)', + p.pid, target_name, elapsed_ms, p.exitcode, + ) if p.exitcode != 0: _logger.warning( 'Subprocess PID %d for %s exited with non-zero exit code %d after %.2f ms' ' — possible OOM kill or unhandled signal', p.pid, target_name, p.exitcode, elapsed_ms, ) - raise SubprocessKilledError(p.pid, target_name, p.exitcode) + raise SubprocessKilledError(p.pid or 0, target_name, p.exitcode or -1) result = pconn.recv() if result['error'] is not None: _logger.error('original traceback: %s', result['traceback']) raise result['error'] + stdobj = std_pconn.recv() + _logger.info(f'std info for {target_name}', extra={ + 'stdout': stdobj['stdout'], + 'stderr': stdobj['stderr'], + }) return result['value'] From 4de591f79b29746c220cd0a268b9254a18fc424c Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 2 Apr 2026 14:57:16 +0530 Subject: [PATCH 37/56] fix: don't raise before std* is captured Signed-off-by: Anupam Kumar --- context_chat_backend/utils.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index 068ffa83..3122a417 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -123,6 +123,17 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem 'Subprocess PID %d for %s finished in %.2f ms (exit code: %s)', p.pid, target_name, elapsed_ms, p.exitcode, ) + stdobj = std_pconn.recv() + _logger.info(f'std info for {target_name}', extra={ + 'stdout': stdobj['stdout'], + 'stderr': stdobj['stderr'], + }) + + result = pconn.recv() + if result['error'] is not None: + _logger.error('original traceback: %s', result['traceback']) + raise result['error'] + if p.exitcode != 0: _logger.warning( 
'Subprocess PID %d for %s exited with non-zero exit code %d after %.2f ms' @@ -131,16 +142,6 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem ) raise SubprocessKilledError(p.pid or 0, target_name, p.exitcode or -1) - result = pconn.recv() - if result['error'] is not None: - _logger.error('original traceback: %s', result['traceback']) - raise result['error'] - - stdobj = std_pconn.recv() - _logger.info(f'std info for {target_name}', extra={ - 'stdout': stdobj['stdout'], - 'stderr': stdobj['stderr'], - }) return result['value'] From 4deda845f40dd3e3419253ec647d156a4c76e218 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 2 Apr 2026 15:01:10 +0530 Subject: [PATCH 38/56] feat: log cpu count and memory info of the system Signed-off-by: Anupam Kumar --- main.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index c4ffa1fd..8d838d80 100755 --- a/main.py +++ b/main.py @@ -4,8 +4,9 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # import logging -from os import getenv +from os import cpu_count, getenv +import psutil import uvicorn from nc_py_api.ex_app import run_app @@ -48,6 +49,7 @@ def _setup_log_levels(debug: bool): app_config: TConfig = app.extra['CONFIG'] _setup_log_levels(app_config.debug) + print(f'CPU count: {cpu_count()}, Memory: {psutil.virtual_memory()}') print('App config:\n' + redact_config(app_config).model_dump_json(indent=2), flush=True) uv_log_config = uvicorn.config.LOGGING_CONFIG # pyright: ignore[reportAttributeAccessIssue] From ad0eac70712600964f45e2401bed411945e148a7 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Thu, 2 Apr 2026 17:41:39 +0530 Subject: [PATCH 39/56] fix: catch BaseException in subprocess Signed-off-by: Anupam Kumar --- context_chat_backend/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index 3122a417..02545d9f 100644 --- a/context_chat_backend/utils.py +++ 
b/context_chat_backend/utils.py @@ -92,7 +92,7 @@ def exception_wrap(fun: Callable | None, *args, resconn: Connection, stdconn: Co if fun is None: return resconn.send({ 'value': None, 'error': None }) resconn.send({ 'value': fun(*args, **kwargs), 'error': None }) - except Exception as e: + except BaseException as e: tb = traceback.format_exc() resconn.send({ 'value': None, 'error': e, 'traceback': tb }) finally: From 36bcfb721364912bcca24c37bc30e357cebfe275 Mon Sep 17 00:00:00 2001 From: Marcel Klehr Date: Thu, 2 Apr 2026 14:19:49 +0200 Subject: [PATCH 40/56] fix(utils): Improve exec_in_proc to handle more failure modes --- context_chat_backend/utils.py | 170 +++++++++++++++++++++++++++++----- 1 file changed, 149 insertions(+), 21 deletions(-) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index 02545d9f..e994a3f2 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -9,6 +9,7 @@ import sys import traceback from collections.abc import Callable +from contextlib import suppress from functools import partial, wraps from multiprocessing.connection import Connection from time import perf_counter_ns @@ -72,31 +73,95 @@ def JSONResponse( class SubprocessKilledError(RuntimeError): - """Raised when a subprocess exits with a non-zero exit code (likely OOM kill or unhandled signal).""" + """Raised when a subprocess is terminated by a signal (for example SIGKILL).""" def __init__(self, pid: int, target_name: str, exitcode: int): super().__init__( - f'Subprocess PID {pid} for {target_name} exited with non-zero exit code {exitcode}' - ' — possible OOM kill or unhandled signal' + f'Subprocess PID {pid} for {target_name} exited with signal {abs(exitcode)} ' + f'(raw exit code: {exitcode})' ) self.exitcode = exitcode +class SubprocessExecutionError(RuntimeError): + """Raised when a subprocess exits non-zero without a recoverable Python exception payload.""" + + def __init__(self, pid: int, target_name: str, exitcode: int, 
details: str = ''): + msg = f'Subprocess PID {pid} for {target_name} exited with non-zero exit code {exitcode}' + if details: + msg = f'{msg}: {details}' + super().__init__(msg) + self.exitcode = exitcode + + +_MAX_STD_CAPTURE_CHARS = 64 * 1024 + + +def _truncate_capture(text: str) -> tuple[str, bool]: + if len(text) <= _MAX_STD_CAPTURE_CHARS: + return text, False + + head = _MAX_STD_CAPTURE_CHARS // 2 + tail = _MAX_STD_CAPTURE_CHARS - head + omitted = len(text) - _MAX_STD_CAPTURE_CHARS + truncated = ( + f'[truncated {omitted} chars]\n' + f'{text[:head]}\n' + '[...snip...]\n' + f'{text[-tail:]}' + ) + return truncated, True + + def exception_wrap(fun: Callable | None, *args, resconn: Connection, stdconn: Connection, **kwargs): stdout_capture = io.StringIO() stderr_capture = io.StringIO() + orig_stdout = sys.stdout + orig_stderr = sys.stderr sys.stdout = stdout_capture sys.stderr = stderr_capture try: if fun is None: - return resconn.send({ 'value': None, 'error': None }) - resconn.send({ 'value': fun(*args, **kwargs), 'error': None }) + resconn.send({ 'value': None, 'error': None }) + else: + resconn.send({ 'value': fun(*args, **kwargs), 'error': None }) except BaseException as e: tb = traceback.format_exc() - resconn.send({ 'value': None, 'error': e, 'traceback': tb }) + payload = { + 'value': None, + 'error': e, + 'traceback': tb, + 'error_type': type(e).__name__, + 'error_module': type(e).__module__, + 'error_message': str(e), + } + try: + resconn.send(payload) + except Exception as send_err: + # Fallback for unpicklable exceptions. 
+ with suppress(Exception): + resconn.send({ + 'value': None, + 'error': None, + 'traceback': tb, + 'error_type': type(e).__name__, + 'error_module': type(e).__module__, + 'error_message': str(e), + 'send_error': str(send_err), + }) finally: - stdconn.send({'stdout': stdout_capture.getvalue(), 'stderr': stderr_capture.getvalue()}) + sys.stdout = orig_stdout + sys.stderr = orig_stderr + stdout_text, stdout_truncated = _truncate_capture(stdout_capture.getvalue()) + stderr_text, stderr_truncated = _truncate_capture(stderr_capture.getvalue()) + with suppress(Exception): + stdconn.send({ + 'stdout': stdout_text, + 'stderr': stderr_text, + 'stdout_truncated': stdout_truncated, + 'stderr_truncated': stderr_truncated, + }) def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daemon=None): # noqa: B006 @@ -117,30 +182,93 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem start = perf_counter_ns() p.start() _logger.debug('Subprocess PID %d started for %s, waiting for it to finish (no timeout)', p.pid, target_name) + + result = None + stdobj = { + 'stdout': '', + 'stderr': '', + 'stdout_truncated': False, + 'stderr_truncated': False, + } + got_result = False + got_std = False + + # Drain result/std pipes while child is still alive to avoid deadlock on full pipe buffers. 
+ while p.is_alive() and (not got_result or not got_std): + if not got_result and pconn.poll(0.1): + with suppress(EOFError, OSError, BrokenPipeError): + result = pconn.recv() + got_result = True + if not got_std and std_pconn.poll(): + with suppress(EOFError, OSError, BrokenPipeError): + stdobj = std_pconn.recv() + got_std = True + p.join() elapsed_ms = (perf_counter_ns() - start) / 1e6 _logger.debug( 'Subprocess PID %d for %s finished in %.2f ms (exit code: %s)', p.pid, target_name, elapsed_ms, p.exitcode, ) - stdobj = std_pconn.recv() - _logger.info(f'std info for {target_name}', extra={ - 'stdout': stdobj['stdout'], - 'stderr': stdobj['stderr'], - }) - - result = pconn.recv() - if result['error'] is not None: - _logger.error('original traceback: %s', result['traceback']) + + if not got_std: + with suppress(EOFError, OSError, BrokenPipeError): + if std_pconn.poll(): + stdobj = std_pconn.recv() + got_std = True + if stdobj['stdout'] or stdobj['stderr']: + extra = { + 'stdout': stdobj['stdout'], + 'stderr': stdobj['stderr'], + } + if stdobj.get('stdout_truncated') or stdobj.get('stderr_truncated'): + extra['stdio_truncated'] = { + 'stdout': bool(stdobj.get('stdout_truncated')), + 'stderr': bool(stdobj.get('stderr_truncated')), + } + _logger.info('std info for %s', target_name, extra=extra) + + if not got_result: + with suppress(EOFError, OSError, BrokenPipeError): + if pconn.poll(): + result = pconn.recv() + got_result = True + + if result is not None and result.get('error') is not None: + _logger.error('original traceback: %s', result.get('traceback', '')) raise result['error'] - if p.exitcode != 0: + if result is not None and result.get('error_type'): + details = ( + f"{result.get('error_module', '')}.{result.get('error_type', '')}: " + f"{result.get('error_message', '')}" + ) + if result.get('traceback'): + _logger.error('remote traceback: %s', result['traceback']) + raise SubprocessExecutionError(p.pid or 0, target_name, p.exitcode or 1, details) + + if 
p.exitcode and p.exitcode < 0: _logger.warning( - 'Subprocess PID %d for %s exited with non-zero exit code %d after %.2f ms' - ' — possible OOM kill or unhandled signal', - p.pid, target_name, p.exitcode, elapsed_ms, + 'Subprocess PID %d for %s exited due to signal %d after %.2f ms', + p.pid, target_name, abs(p.exitcode), elapsed_ms, + ) + raise SubprocessKilledError(p.pid or 0, target_name, p.exitcode) + + if p.exitcode not in (None, 0): + raise SubprocessExecutionError( + p.pid or 0, + target_name, + p.exitcode, + 'No structured exception payload received from child process', + ) + + if result is None: + raise SubprocessExecutionError( + p.pid or 0, + target_name, + 0, + 'Subprocess exited successfully but returned no result payload', ) - raise SubprocessKilledError(p.pid or 0, target_name, p.exitcode or -1) return result['value'] From 47eaf72daec83faec6d9a4a4ce9e23b231cfba31 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 11:08:34 +0530 Subject: [PATCH 41/56] one more stab at a fix Signed-off-by: Anupam Kumar --- context_chat_backend/utils.py | 37 ++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index e994a3f2..b4e93c79 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -2,6 +2,8 @@ # SPDX-FileCopyrightText: 2023 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # +import atexit +import faulthandler import io import logging import multiprocessing as mp @@ -114,6 +116,28 @@ def _truncate_capture(text: str) -> tuple[str, bool]: def exception_wrap(fun: Callable | None, *args, resconn: Connection, stdconn: Connection, **kwargs): + # --- diagnostic probes: write directly to the real stderr FD so they survive + # Python's stdout/stderr redirection below and even os._exit() won't hide them + # from the parent process's stderr stream. 
+ _diag_fd = os.dup(2) # dup before we capture sys.stderr + + def _raw_diag(msg: str) -> None: + with suppress(Exception): + os.write(_diag_fd, (msg + '\n').encode()) + + # Enable faulthandler on the real FD so crash tracebacks (SIGSEGV etc.) appear. + with suppress(Exception): + faulthandler.enable(file=os.fdopen(os.dup(_diag_fd), 'w', closefd=True), all_threads=True) + + # Atexit probe: if this message NEVER appears, it means os._exit() (C-level) + # was called with Python's cleanup phase entirely skipped. + _fun_name = getattr(fun, '__name__', str(fun)) + atexit.register( + _raw_diag, + f'[exception_wrap/atexit] pid={os.getpid()} target={_fun_name}' + ': Python atexit reached (normal Python exit)', + ) + stdout_capture = io.StringIO() stderr_capture = io.StringIO() orig_stdout = sys.stdout @@ -124,10 +148,18 @@ def exception_wrap(fun: Callable | None, *args, resconn: Connection, stdconn: Co try: if fun is None: resconn.send({ 'value': None, 'error': None }) + _raw_diag(f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}: result sent (fun=None)') else: - resconn.send({ 'value': fun(*args, **kwargs), 'error': None }) + result_value = fun(*args, **kwargs) + _raw_diag(f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}: fun() returned, sending result') + resconn.send({ 'value': result_value, 'error': None }) + _raw_diag(f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}: result pipe send complete') except BaseException as e: tb = traceback.format_exc() + _raw_diag( + f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}' + f': caught {type(e).__name__}: {e}' + ) payload = { 'value': None, 'error': e, @@ -162,6 +194,9 @@ def exception_wrap(fun: Callable | None, *args, resconn: Connection, stdconn: Co 'stdout_truncated': stdout_truncated, 'stderr_truncated': stderr_truncated, }) + _raw_diag(f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}: finally block complete') + with suppress(Exception): + os.close(_diag_fd) def 
exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daemon=None): # noqa: B006 From 309ab2bf19a54fb89c01f61550b07a9daf9d45d1 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 11:43:38 +0530 Subject: [PATCH 42/56] do not throw away the valid result even with exitcode 1 Signed-off-by: Anupam Kumar --- context_chat_backend/utils.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index b4e93c79..fe4ee96c 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -282,6 +282,23 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem _logger.error('remote traceback: %s', result['traceback']) raise SubprocessExecutionError(p.pid or 0, target_name, p.exitcode or 1, details) + # If we received a valid result payload, return it even if the exit + # code is non-zero. The non-zero code typically comes from + # multiprocessing/C-extension cleanup (e.g. util._exit_function or + # a native atexit handler) that runs *after* exception_wrap has + # already sent the result over the pipe. + if result is not None and 'value' in result: + if p.exitcode not in (None, 0): + _logger.warning( + 'Subprocess PID %d for %s exited with code %s after %.2f ms' + ' but returned a valid result — accepting the result.' 
+ ' The non-zero exit likely originates from process' + ' cleanup (multiprocessing finalizers, C-extension' + ' atexit, etc.).', + p.pid, target_name, p.exitcode, elapsed_ms, + ) + return result['value'] + if p.exitcode and p.exitcode < 0: _logger.warning( 'Subprocess PID %d for %s exited due to signal %d after %.2f ms', @@ -297,15 +314,12 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem 'No structured exception payload received from child process', ) - if result is None: - raise SubprocessExecutionError( - p.pid or 0, - target_name, - 0, - 'Subprocess exited successfully but returned no result payload', - ) - - return result['value'] + raise SubprocessExecutionError( + p.pid or 0, + target_name, + 0, + 'Subprocess exited successfully but returned no result payload', + ) def timed(func: Callable): From e1763acdcdfa590cee3c74f6ba1acadf1d9c6f9c Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 12:19:09 +0530 Subject: [PATCH 43/56] fix: use forkserver as process start method Signed-off-by: Anupam Kumar --- context_chat_backend/controller.py | 4 ---- main.py | 13 +++++++++++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 49d1d737..3a8e15a9 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -16,7 +16,6 @@ # ruff: noqa: E402 import logging -import multiprocessing as mp import os import tempfile import threading @@ -122,9 +121,6 @@ async def lifespan(app: FastAPI): index_lock = threading.Lock() _indexing = {} -# limit the number of concurrent document parsing -doc_parse_semaphore = mp.Semaphore(app_config.doc_parser_worker_limit) - # middlewares diff --git a/main.py b/main.py index 8d838d80..4e88ee9f 100755 --- a/main.py +++ b/main.py @@ -3,6 +3,7 @@ # SPDX-FileCopyrightText: 2023 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # + import logging 
from os import cpu_count, getenv @@ -44,6 +45,18 @@ def _setup_log_levels(debug: bool): if __name__ == '__main__': + import multiprocessing as mp + + # do forks from a clean process that doesn't have any threads or locks + mp.set_start_method('forkserver') + mp.set_forkserver_preload([ + 'langchain', + 'sqlalchemy', + 'numpy', + 'context_chat_backend.chain.ingest.injest', + 'context_chat_backend.vectordb.pgvector', + ]) + logging_config = get_logging_config(LOGGER_CONFIG_NAME) setup_logging(logging_config) app_config: TConfig = app.extra['CONFIG'] From 330165205127524780038280854dacc19f552e9c Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 13:16:49 +0530 Subject: [PATCH 44/56] fix(ci): consider eligible files as the total files count Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 8e6ca7d8..8ec8eabe 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -241,17 +241,17 @@ jobs: continue fi - # Extract total queued files - total_files=$(echo "$stats" | jq '.queued_documents_counts.files__default' || echo "") + # Extract total eligible files + total_eligible_files=$(echo "$stats" | jq '.eligible_files_count' || echo "") # Extract indexed documents count (files__default) indexed_count=$(echo "$stats" | jq '.vectordb_document_counts.files__default' || echo "") - echo "Total queued files: $total_files" + echo "Total eligible files: $total_eligible_files" echo "Indexed documents (files__default): $indexed_count" - diff=$((total_files - indexed_count)) - threshold=$((total_files * 3 / 100)) + diff=$((total_eligible_files - indexed_count)) + threshold=$((total_eligible_files * 3 / 100)) # Check if difference is within tolerance if [ $diff -le $threshold ]; then @@ -259,7 +259,7 @@ jobs: success=1 break else - progress=$((diff 
* 100 / total_files)) + progress=$((diff * 100 / total_eligible_files)) echo "Outside 3% tolerance: diff=$diff (${progress}%), threshold=$threshold" fi From 32aa37474547c3f3e7993cf638171ef309c1e1df Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 15:13:14 +0530 Subject: [PATCH 45/56] fix: use logging config in forkserver and other fixes Signed-off-by: Anupam Kumar --- context_chat_backend/utils.py | 12 ++++++++---- main.py | 17 +++++++++-------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index fe4ee96c..5f12d0c5 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -86,10 +86,10 @@ def __init__(self, pid: int, target_name: str, exitcode: int): class SubprocessExecutionError(RuntimeError): - """Raised when a subprocess exits non-zero without a recoverable Python exception payload.""" + """Raised when a subprocess exits without a recoverable Python exception payload.""" def __init__(self, pid: int, target_name: str, exitcode: int, details: str = ''): - msg = f'Subprocess PID {pid} for {target_name} exited with non-zero exit code {exitcode}' + msg = f'Subprocess PID {pid} for {target_name} exited with exit code {exitcode}' if details: msg = f'{msg}: {details}' super().__init__(msg) @@ -199,7 +199,11 @@ def _raw_diag(msg: str) -> None: os.close(_diag_fd) -def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daemon=None): # noqa: B006 +def exec_in_proc(group=None, target=None, name=None, args=(), kwargs=None, *, daemon=None): + if not kwargs: + kwargs = {} + + # parent, child pconn, cconn = mp.Pipe() std_pconn, std_cconn = mp.Pipe() kwargs['resconn'] = cconn @@ -318,7 +322,7 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs={}, *, daem p.pid or 0, target_name, 0, - 'Subprocess exited successfully but returned no result payload', + f'Subprocess exited successfully but returned no result payload: 
{result}', ) diff --git a/main.py b/main.py index 4e88ee9f..c2614515 100755 --- a/main.py +++ b/main.py @@ -47,21 +47,22 @@ def _setup_log_levels(debug: bool): if __name__ == '__main__': import multiprocessing as mp + logging_config = get_logging_config(LOGGER_CONFIG_NAME) + setup_logging(logging_config) + app_config: TConfig = app.extra['CONFIG'] + _setup_log_levels(app_config.debug) + # do forks from a clean process that doesn't have any threads or locks mp.set_start_method('forkserver') mp.set_forkserver_preload([ - 'langchain', - 'sqlalchemy', - 'numpy', 'context_chat_backend.chain.ingest.injest', 'context_chat_backend.vectordb.pgvector', + 'langchain', + 'logging', + 'numpy', + 'sqlalchemy', ]) - logging_config = get_logging_config(LOGGER_CONFIG_NAME) - setup_logging(logging_config) - app_config: TConfig = app.extra['CONFIG'] - _setup_log_levels(app_config.debug) - print(f'CPU count: {cpu_count()}, Memory: {psutil.virtual_memory()}') print('App config:\n' + redact_config(app_config).model_dump_json(indent=2), flush=True) From 33ee38ab24d9567f2a0152b7d55870a28ca2bbe1 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 15:23:40 +0530 Subject: [PATCH 46/56] fix: remove extra diagnostics Signed-off-by: Anupam Kumar --- .../chain/ingest/doc_loader.py | 20 +-- context_chat_backend/chain/ingest/injest.py | 15 +- context_chat_backend/task_fetcher.py | 60 ++----- context_chat_backend/utils.py | 146 +++++------------- context_chat_backend/vectordb/pgvector.py | 2 +- 5 files changed, 62 insertions(+), 181 deletions(-) diff --git a/context_chat_backend/chain/ingest/doc_loader.py b/context_chat_backend/chain/ingest/doc_loader.py index 04c611d2..832c8331 100644 --- a/context_chat_backend/chain/ingest/doc_loader.py +++ b/context_chat_backend/chain/ingest/doc_loader.py @@ -7,8 +7,6 @@ import tempfile from collections.abc import Callable from io import BytesIO -import logging -from time import perf_counter_ns import docx2txt from epub2txt import epub2txt @@ -21,8 
+19,6 @@ from ...types import IndexingException, SourceItem -logger = logging.getLogger('ccb.doc_loader') - def _temp_file_wrapper(file: BytesIO, loader: Callable, sep: str = '\n') -> str: raw_bytes = file.read() @@ -137,22 +133,10 @@ def decode_source(source: SourceItem) -> str: else: io_obj = source.content - loader_fn = _loader_map.get(source.type) - if loader_fn: - logger.debug( - 'Decoding source %r with loader %s (mime: %s) — may be slow or block', - source.title, loader_fn.__name__, source.type, - ) - t0 = perf_counter_ns() - result = loader_fn(io_obj) - elapsed_ms = (perf_counter_ns() - t0) / 1e6 - logger.debug( - 'Loader %s for %r finished in %.2f ms (%d chars)', - loader_fn.__name__, source.title, elapsed_ms, len(result), - ) + if _loader_map.get(source.type): + result = _loader_map[source.type](io_obj) return result.encode('utf-8', 'ignore').decode('utf-8', 'ignore').strip() - logger.debug('No specific loader for mime type %s, reading as plain text for %r', source.type, source.title) return io_obj.read().decode('utf-8', 'ignore').strip() except IndexingException: raise diff --git a/context_chat_backend/chain/ingest/injest.py b/context_chat_backend/chain/ingest/injest.py index 7ede94a6..8e321088 100644 --- a/context_chat_backend/chain/ingest/injest.py +++ b/context_chat_backend/chain/ingest/injest.py @@ -43,8 +43,6 @@ async def __fetch_file_content( async with semaphore: nc = AsyncNextcloudApp() try: - logger.debug('Downloading file id %d for user %s', file_id, user_id) - t0 = perf_counter_ns() # a file pointer for storing the stream in memory until it is consumed fp = BytesIO() await nc._session.download2fp( @@ -54,8 +52,6 @@ async def __fetch_file_content( params={ 'userId': user_id }, ) fp.seek(0) - elapsed_ms = (perf_counter_ns() - t0) / 1e6 - logger.debug('Downloaded file id %d for user %s in %.2f ms (%d bytes)', file_id, user_id, elapsed_ms, fp.getbuffer().nbytes) return fp except niquests.exceptions.RequestException as e: if e.response is None: @@ 
-131,11 +127,7 @@ async def __fetch_files_content( # any user id from the list should have read access to the file tasks.append(asyncio.ensure_future(__fetch_file_content(semaphore, file.file_id, file.userIds[0]))) - logger.debug('Gathering %d file download task(s) — this blocks until all downloads complete or fail', len(tasks)) - t0 = perf_counter_ns() results = await asyncio.gather(*tasks, return_exceptions=True) - elapsed_ms = (perf_counter_ns() - t0) / 1e6 - logger.debug('All %d file download task(s) completed in %.2f ms', len(tasks), elapsed_ms) for (db_id, file), result in zip(sources.items(), results, strict=True): if isinstance(file, SourceItem): continue @@ -227,10 +219,7 @@ def _sources_to_indocuments( # transform the source to have text data try: - logger.debug( - 'Decoding source %s (type: %s, title: %r) — may be slow for complex file types', - source.reference, source.type, source.title, - ) + logger.debug('Decoding source %s (type: %s)', source.reference, source.type) t0 = perf_counter_ns() content = decode_source(source) elapsed_ms = (perf_counter_ns() - t0) / 1e6 @@ -353,7 +342,7 @@ def _process_sources( source_proc_results = _increase_access_for_existing_sources(vectordb, existing_sources) logger.debug( - 'Fetching file contents for %d source(s) — this blocks on network I/O to Nextcloud', + 'Fetching file contents for %d source(s) from Nextcloud', len(to_embed_sources), ) t0 = perf_counter_ns() diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index edeabc12..c75cec0d 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -31,7 +31,6 @@ ActionsQueueItems, ActionType, AppRole, - EmbeddingException, FilesQueueItems, IndexingError, LoaderException, @@ -89,29 +88,6 @@ def files_indexing_thread(app_config: TConfig, app_enabled: Event) -> None: LOGGER.error('Error initializing vector DB loader, files indexing thread will not start:', exc_info=e) return - def _embed_one(db_id: 
int, item: SourceItem | ReceivedFileItem) -> tuple[int, IndexingError | None]: - """Run embed_sources for a single item in its own subprocess. Returns (db_id, error_or_None).""" - try: - result = exec_in_proc( - target=embed_sources, - args=(vectordb_loader, app_config, {db_id: item}), - ) - return db_id, result.get(db_id) - except SubprocessKilledError as e: - LOGGER.error( - 'embed_sources subprocess killed for individual source %s — marking as non-retryable' - ' to prevent infinite OOM retry loop', - item.reference, exc_info=e, - ) - return db_id, IndexingError(error=f'Subprocess killed (OOM?): {e}', retryable=False) - except Exception as e: - err_name = {DbException: 'DB', EmbeddingException: 'Embedding'}.get(type(e), 'Unknown') - LOGGER.error( - 'embed_sources raised a %s error for individual source %s, marking as retryable', - err_name, item.reference, exc_info=e, - ) - return db_id, IndexingError(error=str(e), retryable=True) - def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> Mapping[int, IndexingError | None]: source_refs = [s.reference for s in source_items.values()] LOGGER.info('Starting embed_sources subprocess for %d source(s): %s', len(source_items), source_refs) @@ -122,43 +98,39 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> ) errors = {k: v for k, v in result.items() if isinstance(v, IndexingError)} LOGGER.info( - 'embed_sources subprocess finished for %d source(s): %d succeeded, %d errored', - len(source_items), - len(result) - len(errors), - len(errors), - extra={'errors': errors} if errors else {}, + 'embed_sources finished for %d source(s): %d succeeded, %d errored', + len(source_items), len(result) - len(errors), len(errors), + extra={'errors': errors}, ) return result except SubprocessKilledError as e: LOGGER.error( - 'embed_sources subprocess was killed (likely OOM) for %d source(s): %s', - len(source_items), source_refs, exc_info=e, + 'embed_sources subprocess was killed for 
%d source(s) with exitcode %s: %s', + len(source_items), e.exitcode, source_refs, exc_info=e, ) if len(source_items) == 1: - # Single-item subprocess was killed — mark non-retryable to break infinite OOM loop. - LOGGER.error( - 'Single-item subprocess killed for %s — marking as non-retryable', - source_refs, + return dict.fromkeys( + source_items, + IndexingError(error=f'Subprocess killed with exitcode {e.exitcode}: {e}', retryable=False), ) - return {db_id: IndexingError(error=f'Subprocess killed (OOM?): {e}', retryable=False) - for db_id in source_items} - # Multi-item batch: fall back to one subprocess per source to pinpoint the problematic file. + # Fall back to one-by-one to isolate the problematic file. LOGGER.warning( - 'Falling back to individual processing for %d sources to isolate any OOM-causing file(s)', + 'Falling back to individual processing for %d sources', len(source_items), ) - return dict(_embed_one(db_id, item) for db_id, item in source_items.items()) - + fallback: dict[int, IndexingError | None] = {} + for db_id, item in source_items.items(): + fallback.update(_load_sources({db_id: item})) + return fallback except Exception as e: - err_name = {DbException: 'DB', EmbeddingException: 'Embedding'}.get(type(e), 'Unknown') err = IndexingError( - error=f'{err_name} Error: {e}', + error=f'{e.__class__.__name__}: {e}', retryable=True, ) LOGGER.error( 'embed_sources subprocess raised a %s error for sources %s, marking all as retryable', - err_name, source_refs, exc_info=e, + e.__class__.__name__, source_refs, exc_info=e, ) return dict.fromkeys(source_items, err) diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py index 5f12d0c5..4552e320 100644 --- a/context_chat_backend/utils.py +++ b/context_chat_backend/utils.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: 2023 Nextcloud GmbH and Nextcloud contributors # SPDX-License-Identifier: AGPL-3.0-or-later # -import atexit import faulthandler import io import logging @@ -23,6 +22,7 @@ T = 
TypeVar('T') _logger = logging.getLogger('ccb.utils') +_MAX_STD_CAPTURE_CHARS = 64 * 1024 def not_none(value: T | None) -> TypeGuard[T]: @@ -77,7 +77,7 @@ def JSONResponse( class SubprocessKilledError(RuntimeError): """Raised when a subprocess is terminated by a signal (for example SIGKILL).""" - def __init__(self, pid: int, target_name: str, exitcode: int): + def __init__(self, pid: int | None, target_name: str, exitcode: int): super().__init__( f'Subprocess PID {pid} for {target_name} exited with signal {abs(exitcode)} ' f'(raw exit code: {exitcode})' @@ -88,7 +88,7 @@ def __init__(self, pid: int, target_name: str, exitcode: int): class SubprocessExecutionError(RuntimeError): """Raised when a subprocess exits without a recoverable Python exception payload.""" - def __init__(self, pid: int, target_name: str, exitcode: int, details: str = ''): + def __init__(self, pid: int | None, target_name: str, exitcode: int, details: str = ''): msg = f'Subprocess PID {pid} for {target_name} exited with exit code {exitcode}' if details: msg = f'{msg}: {details}' @@ -96,47 +96,29 @@ def __init__(self, pid: int, target_name: str, exitcode: int, details: str = '') self.exitcode = exitcode -_MAX_STD_CAPTURE_CHARS = 64 * 1024 - - -def _truncate_capture(text: str) -> tuple[str, bool]: +def _truncate_capture(text: str) -> str: if len(text) <= _MAX_STD_CAPTURE_CHARS: - return text, False + return text head = _MAX_STD_CAPTURE_CHARS // 2 tail = _MAX_STD_CAPTURE_CHARS - head omitted = len(text) - _MAX_STD_CAPTURE_CHARS - truncated = ( + return ( f'[truncated {omitted} chars]\n' f'{text[:head]}\n' '[...snip...]\n' f'{text[-tail:]}' ) - return truncated, True def exception_wrap(fun: Callable | None, *args, resconn: Connection, stdconn: Connection, **kwargs): - # --- diagnostic probes: write directly to the real stderr FD so they survive - # Python's stdout/stderr redirection below and even os._exit() won't hide them - # from the parent process's stderr stream. 
- _diag_fd = os.dup(2) # dup before we capture sys.stderr - - def _raw_diag(msg: str) -> None: - with suppress(Exception): - os.write(_diag_fd, (msg + '\n').encode()) - - # Enable faulthandler on the real FD so crash tracebacks (SIGSEGV etc.) appear. + # Preserve real stderr FD for faulthandler before we redirect sys.stderr. + _faulthandler_fd = os.dup(2) with suppress(Exception): - faulthandler.enable(file=os.fdopen(os.dup(_diag_fd), 'w', closefd=True), all_threads=True) - - # Atexit probe: if this message NEVER appears, it means os._exit() (C-level) - # was called with Python's cleanup phase entirely skipped. - _fun_name = getattr(fun, '__name__', str(fun)) - atexit.register( - _raw_diag, - f'[exception_wrap/atexit] pid={os.getpid()} target={_fun_name}' - ': Python atexit reached (normal Python exit)', - ) + faulthandler.enable( + file=os.fdopen(_faulthandler_fd, 'w', closefd=False), + all_threads=True, + ) stdout_capture = io.StringIO() stderr_capture = io.StringIO() @@ -148,55 +130,31 @@ def _raw_diag(msg: str) -> None: try: if fun is None: resconn.send({ 'value': None, 'error': None }) - _raw_diag(f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}: result sent (fun=None)') else: - result_value = fun(*args, **kwargs) - _raw_diag(f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}: fun() returned, sending result') - resconn.send({ 'value': result_value, 'error': None }) - _raw_diag(f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}: result pipe send complete') + resconn.send({ 'value': fun(*args, **kwargs), 'error': None }) except BaseException as e: tb = traceback.format_exc() - _raw_diag( - f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}' - f': caught {type(e).__name__}: {e}' - ) payload = { 'value': None, 'error': e, 'traceback': tb, - 'error_type': type(e).__name__, - 'error_module': type(e).__module__, - 'error_message': str(e), } try: resconn.send(payload) except Exception as send_err: - # Fallback for 
unpicklable exceptions. - with suppress(Exception): - resconn.send({ - 'value': None, - 'error': None, - 'traceback': tb, - 'error_type': type(e).__name__, - 'error_module': type(e).__module__, - 'error_message': str(e), - 'send_error': str(send_err), - }) + stderr_capture.write(f'Original error: {e}, pipe send error: {send_err}') finally: sys.stdout = orig_stdout sys.stderr = orig_stderr - stdout_text, stdout_truncated = _truncate_capture(stdout_capture.getvalue()) - stderr_text, stderr_truncated = _truncate_capture(stderr_capture.getvalue()) + stdout_text = _truncate_capture(stdout_capture.getvalue()) + stderr_text = _truncate_capture(stderr_capture.getvalue()) with suppress(Exception): stdconn.send({ 'stdout': stdout_text, 'stderr': stderr_text, - 'stdout_truncated': stdout_truncated, - 'stderr_truncated': stderr_truncated, }) - _raw_diag(f'[exception_wrap/probe] pid={os.getpid()} target={_fun_name}: finally block complete') with suppress(Exception): - os.close(_diag_fd) + os.close(_faulthandler_fd) def exec_in_proc(group=None, target=None, name=None, args=(), kwargs=None, *, daemon=None): @@ -217,22 +175,17 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs=None, *, da daemon=daemon, ) target_name = getattr(target, '__name__', str(target)) - _logger.debug('Starting subprocess for %s', target_name) start = perf_counter_ns() p.start() - _logger.debug('Subprocess PID %d started for %s, waiting for it to finish (no timeout)', p.pid, target_name) + _logger.debug('Subprocess PID %d started for %s', p.pid, target_name) result = None - stdobj = { - 'stdout': '', - 'stderr': '', - 'stdout_truncated': False, - 'stderr_truncated': False, - } + stdobj = { 'stdout': '', 'stderr': '' } got_result = False got_std = False # Drain result/std pipes while child is still alive to avoid deadlock on full pipe buffers. 
+ # Pipe's buffer size is 64 KiB while p.is_alive() and (not got_result or not got_std): if not got_result and pconn.poll(0.1): with suppress(EOFError, OSError, BrokenPipeError): @@ -254,72 +207,55 @@ def exec_in_proc(group=None, target=None, name=None, args=(), kwargs=None, *, da with suppress(EOFError, OSError, BrokenPipeError): if std_pconn.poll(): stdobj = std_pconn.recv() - got_std = True - if stdobj['stdout'] or stdobj['stderr']: - extra = { - 'stdout': stdobj['stdout'], - 'stderr': stdobj['stderr'], - } - if stdobj.get('stdout_truncated') or stdobj.get('stderr_truncated'): - extra['stdio_truncated'] = { - 'stdout': bool(stdobj.get('stdout_truncated')), - 'stderr': bool(stdobj.get('stderr_truncated')), - } - _logger.info('std info for %s', target_name, extra=extra) + # no need to update got_std here + if stdobj.get('stdout') or stdobj.get('stderr'): + _logger.info('std info for %s', target_name, extra={ + 'stdout': stdobj.get('stdout', ''), + 'stderr': stdobj.get('stderr', ''), + }) if not got_result: with suppress(EOFError, OSError, BrokenPipeError): if pconn.poll(): result = pconn.recv() - got_result = True + # no need to update got_result here if result is not None and result.get('error') is not None: - _logger.error('original traceback: %s', result.get('traceback', '')) + _logger.error( + 'original traceback of %s (PID %d, exitcode: %s): %s', + target_name, + p.pid, + p.exitcode, + result.get('traceback', ''), + ) raise result['error'] - if result is not None and result.get('error_type'): - details = ( - f"{result.get('error_module', '')}.{result.get('error_type', '')}: " - f"{result.get('error_message', '')}" - ) - if result.get('traceback'): - _logger.error('remote traceback: %s', result['traceback']) - raise SubprocessExecutionError(p.pid or 0, target_name, p.exitcode or 1, details) - - # If we received a valid result payload, return it even if the exit - # code is non-zero. 
The non-zero code typically comes from - # multiprocessing/C-extension cleanup (e.g. util._exit_function or - # a native atexit handler) that runs *after* exception_wrap has - # already sent the result over the pipe. if result is not None and 'value' in result: if p.exitcode not in (None, 0): _logger.warning( 'Subprocess PID %d for %s exited with code %s after %.2f ms' - ' but returned a valid result — accepting the result.' - ' The non-zero exit likely originates from process' - ' cleanup (multiprocessing finalizers, C-extension' - ' atexit, etc.).', + ' but returned a valid result', p.pid, target_name, p.exitcode, elapsed_ms, ) return result['value'] if p.exitcode and p.exitcode < 0: _logger.warning( - 'Subprocess PID %d for %s exited due to signal %d after %.2f ms', - p.pid, target_name, abs(p.exitcode), elapsed_ms, + 'Subprocess PID %d for %s exited due to signal %d, exitcode %d after %.2f ms', + p.pid, target_name, abs(p.exitcode), p.exitcode, elapsed_ms, ) - raise SubprocessKilledError(p.pid or 0, target_name, p.exitcode) + raise SubprocessKilledError(p.pid, target_name, p.exitcode) if p.exitcode not in (None, 0): raise SubprocessExecutionError( - p.pid or 0, + p.pid, target_name, p.exitcode, - 'No structured exception payload received from child process', + f'No structured exception payload received from child process: {result}', ) raise SubprocessExecutionError( - p.pid or 0, + p.pid, target_name, 0, f'Subprocess exited successfully but returned no result payload: {result}', diff --git a/context_chat_backend/vectordb/pgvector.py b/context_chat_backend/vectordb/pgvector.py index 33dfb039..41d7f0db 100644 --- a/context_chat_backend/vectordb/pgvector.py +++ b/context_chat_backend/vectordb/pgvector.py @@ -156,7 +156,7 @@ def add_indocuments(self, indocuments: Mapping[int, InDocument]) -> Mapping[int, total_chunks = len(indoc.documents) num_batches = max(1, -(-total_chunks // batch_size)) # ceiling division logger.debug( - 'Embedding source %s: %d chunk(s) in %d 
batch(es) — blocks on embedding model', + 'Embedding source %s: %d chunk(s) in %d batch(es)', indoc.source_id, total_chunks, num_batches, ) for i in range(0, total_chunks, batch_size): From d9ebdac85772930b556f02ea501d3c73160d567b Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 17:54:44 +0530 Subject: [PATCH 47/56] fix: use zip on the subset of filtered sources Signed-off-by: Anupam Kumar --- context_chat_backend/chain/ingest/injest.py | 23 ++++++++++----------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/context_chat_backend/chain/ingest/injest.py b/context_chat_backend/chain/ingest/injest.py index 8e321088..190eebd4 100644 --- a/context_chat_backend/chain/ingest/injest.py +++ b/context_chat_backend/chain/ingest/injest.py @@ -89,6 +89,7 @@ async def __fetch_files_content( error_items = {} semaphore = asyncio.Semaphore(CONCURRENT_FILE_FETCHES) tasks = [] + task_sources = {} file_count = sum(1 for s in sources.values() if isinstance(s, ReceivedFileItem)) logger.debug('Fetching content for %d file(s) (max %d concurrent)', file_count, CONCURRENT_FILE_FETCHES) @@ -126,13 +127,18 @@ async def __fetch_files_content( continue # any user id from the list should have read access to the file tasks.append(asyncio.ensure_future(__fetch_file_content(semaphore, file.file_id, file.userIds[0]))) + task_sources[db_id] = file results = await asyncio.gather(*tasks, return_exceptions=True) - for (db_id, file), result in zip(sources.items(), results, strict=True): - if isinstance(file, SourceItem): - continue - - if isinstance(result, IndexingException): + for (db_id, file), result in zip(task_sources.items(), results, strict=True): + if isinstance(result, str) or isinstance(result, BytesIO): + source_items[db_id] = SourceItem( + **{ + **file.model_dump(), + 'content': result, + } + ) + elif isinstance(result, IndexingException): logger.error( f'Error fetching content for db id {db_id}, file id {file.file_id}, reference {file.reference}' f': 
{result}', @@ -142,13 +148,6 @@ async def __fetch_files_content( error=str(result), retryable=result.retryable, ) - elif isinstance(result, str) or isinstance(result, BytesIO): - source_items[db_id] = SourceItem( - **{ - **file.model_dump(), - 'content': result, - } - ) elif isinstance(result, BaseException): logger.error( f'Unexpected error fetching content for db id {db_id}, file id {file.file_id},' From ea77480df7060a21cb556d7dfe13f8d5da21337f Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 18:41:30 +0530 Subject: [PATCH 48/56] fix(em): use tcp socket connection check Signed-off-by: Anupam Kumar --- context_chat_backend/network_em.py | 29 ++++++++++++++++++++++++---- context_chat_backend/task_fetcher.py | 17 +++++++--------- 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/context_chat_backend/network_em.py b/context_chat_backend/network_em.py index 43ced6cc..ba1edc9e 100644 --- a/context_chat_backend/network_em.py +++ b/context_chat_backend/network_em.py @@ -3,8 +3,10 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # import logging +import socket from time import sleep from typing import Literal, TypedDict +from urllib.parse import urlparse import niquests from langchain_core.embeddings import Embeddings @@ -19,6 +21,7 @@ ) logger = logging.getLogger('ccb.nextwork_em') +TCP_CONNECT_TIMEOUT = 2.0 # seconds # Copied from llama_cpp/llama_types.py @@ -44,12 +47,30 @@ class NetworkEmbeddings(Embeddings): def __init__(self, app_config: TConfig): self.app_config = app_config - def check_connection(self) -> bool: + def _get_host_and_port(self) -> tuple[str, int]: + parsed = urlparse(self.app_config.embedding.base_url) + host = parsed.hostname + + if not host: + raise ValueError("Invalid URL: Missing hostname") + + if parsed.port: + port = parsed.port + else: + port = 443 if parsed.scheme == "https" else 80 + + return host, port + + def check_connection(self, check_origin: str) -> bool: try: - self.embed_query('hello') + host, port = 
self._get_host_and_port() + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.settimeout(TCP_CONNECT_TIMEOUT) + sock.connect((host, port)) + sock.close() return True - except EmbeddingException as e: - logger.warning('Embedding server connection failed', exc_info=e) + except (ValueError, TimeoutError, ConnectionRefusedError, socket.gaierror) as e: + logger.warning(f'[{check_origin}] Embedding server is not reachable, retrying after some time: {e}') return False def _get_embedding(self, input_: str | list[str], try_: int = 3) -> list[float] | list[list[float]]: diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index c75cec0d..c931e7df 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -83,6 +83,7 @@ class ThreadType(Enum): def files_indexing_thread(app_config: TConfig, app_enabled: Event) -> None: try: + network_em = NetworkEmbeddings(app_config) vectordb_loader = VectorDBLoader(app_config) except LoaderException as e: LOGGER.error('Error initializing vector DB loader, files indexing thread will not start:', exc_info=e) @@ -141,7 +142,7 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> return try: - if not __check_em_server(app_config): + if not network_em.check_connection(ThreadType.FILES_INDEXING.value): sleep(POLLING_COOLDOWN) continue @@ -456,6 +457,7 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: LOGGER.info('Starting task fetcher loop') try: + network_em = NetworkEmbeddings(app_config) vectordb_loader = VectorDBLoader(app_config) llm_loader = LLMModelLoader(app_config) except LoaderException as e: @@ -466,14 +468,14 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: llm: LLM = llm_loader.load() while True: - if not __check_em_server(app_config): - sleep(POLLING_COOLDOWN) - continue - if THREAD_STOP_EVENT.is_set(): LOGGER.info('Updates processing thread is stopping 
due to stop event being set') return + if not network_em.check_connection(ThreadType.REQUEST_PROCESSING.value): + sleep(POLLING_COOLDOWN) + continue + try: # Fetch pending task try: @@ -877,8 +879,3 @@ def process_search_task( task_input.get('scopeList'), ) ) - - -def __check_em_server(app_config: TConfig) -> bool: - embedding_model = NetworkEmbeddings(app_config=app_config) - return embedding_model.check_connection() From 1ce237a36addb872e3affc790faeae5583e80b28 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 18:42:59 +0530 Subject: [PATCH 49/56] fix(ci): remove github CI restrictions Signed-off-by: Anupam Kumar --- context_chat_backend/task_fetcher.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index c931e7df..004104f8 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -55,16 +55,9 @@ THREAD_STOP_EVENT = Event() LOGGER = logging.getLogger('ccb.task_fetcher') FILES_INDEXING_BATCH_SIZE = 16 # theoretical max RAM usage: 16 * 100 MiB, todo: config? -if os.getenv('GITHUB_ACTIONS'): - FILES_INDEXING_BATCH_SIZE = 4 MIN_FILES_PER_CPU = 4 -if os.getenv('GITHUB_ACTIONS'): - MIN_FILES_PER_CPU = 2 # divides the batch into these many chunks PARALLEL_FILE_PARSING_COUNT = max(1, (os.cpu_count() or 2) - 1) # todo: config? -if os.getenv('GITHUB_ACTIONS'): - # Keep CI memory usage predictable and avoid OOM-killed workers. - PARALLEL_FILE_PARSING_COUNT = max(1, min(PARALLEL_FILE_PARSING_COUNT, 2)) LOGGER.info(f'Using {PARALLEL_FILE_PARSING_COUNT} parallel file parsing workers') ACTIONS_BATCH_SIZE = 512 # todo: config? 
POLLING_COOLDOWN = 30 From d82e01b6555e4a362ba58fda1414cba83dc00023 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 18:54:20 +0530 Subject: [PATCH 50/56] fix: remove unused code and some de-duplication Signed-off-by: Anupam Kumar --- context_chat_backend/task_fetcher.py | 286 +++++++-------------------- 1 file changed, 75 insertions(+), 211 deletions(-) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 004104f8..1e456465 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -15,16 +15,14 @@ import niquests from langchain.llms.base import LLM -from langchain.schema import Document from nc_py_api import NextcloudApp, NextcloudException from niquests import JSONDecodeError, RequestException from pydantic import ValidationError -from .chain.context import do_doc_search, get_context_chunks, get_context_docs +from .chain.context import do_doc_search from .chain.ingest.injest import embed_sources from .chain.one_shot import process_context_query -from .chain.query_proc import get_pruned_query -from .chain.types import ContextException, EnrichedSourceList, LLMOutput, ScopeList, ScopeType, SearchResult +from .chain.types import ContextException, EnrichedSourceList, LLMOutput, ScopeList, SearchResult from .dyn_loader import LLMModelLoader, VectorDBLoader from .network_em import NetworkEmbeddings from .types import ( @@ -39,7 +37,6 @@ TConfig, ) from .utils import SubprocessKilledError, exec_in_proc, get_app_role -from .vectordb.base import BaseVectorDB from .vectordb.service import ( decl_update_access, delete_by_provider, @@ -498,11 +495,16 @@ def request_processing_thread(app_config: TConfig, app_enabled: Event) -> None: if task['type'] == 'context_chat:context_chat': result: LLMOutput = process_normal_task(task, vectordb_loader, llm, app_config) # Return result to Nextcloud - success = return_normal_result_to_nextcloud(task['id'], userId, result) + success = 
return_result_to_nextcloud(task['id'], userId, { + 'output': result['output'], + 'sources': enrich_sources(result['sources'], userId), + }) elif task['type'] == 'context_chat:context_chat_search': search_result: list[SearchResult] = process_search_task(task, vectordb_loader) # Return result to Nextcloud - success = return_search_result_to_nextcloud(task['id'], userId, search_result) + success = return_result_to_nextcloud(task['id'], userId, { + 'sources': enrich_sources(search_result, userId), + }) else: LOGGER.error(f'Unknown task type {task["type"]}') success = return_error_to_nextcloud(task['id'], Exception(f'Unknown task type {task["type"]}')) @@ -541,200 +543,6 @@ def wait_for_tasks(interval = None): TRIGGER.clear() - -def start_bg_threads(app_config: TConfig, app_enabled: Event): - if APP_ROLE == AppRole.INDEXING or APP_ROLE == AppRole.NORMAL: - if ( - ThreadType.FILES_INDEXING in THREADS - or ThreadType.UPDATES_PROCESSING in THREADS - ): - LOGGER.info('Background threads already running, skipping start') - return - - THREAD_STOP_EVENT.clear() - THREADS[ThreadType.FILES_INDEXING] = Thread( - target=files_indexing_thread, - args=(app_config, app_enabled), - name='FilesIndexingThread', - ) - THREADS[ThreadType.UPDATES_PROCESSING] = Thread( - target=updates_processing_thread, - args=(app_config, app_enabled), - name='UpdatesProcessingThread', - ) - THREADS[ThreadType.FILES_INDEXING].start() - THREADS[ThreadType.UPDATES_PROCESSING].start() - - if APP_ROLE == AppRole.RP or APP_ROLE == AppRole.NORMAL: - if ThreadType.REQUEST_PROCESSING in THREADS: - LOGGER.info('Background threads already running, skipping start') - return - - THREAD_STOP_EVENT.clear() - THREADS[ThreadType.REQUEST_PROCESSING] = Thread( - target=request_processing_thread, - args=(app_config, app_enabled), - name='RequestProcessingThread', - ) - THREADS[ThreadType.REQUEST_PROCESSING].start() - - -def wait_for_bg_threads(): - if APP_ROLE == AppRole.INDEXING or APP_ROLE == AppRole.NORMAL: - if 
(ThreadType.FILES_INDEXING not in THREADS or ThreadType.UPDATES_PROCESSING not in THREADS): - return - - THREAD_STOP_EVENT.set() - THREADS[ThreadType.FILES_INDEXING].join() - THREADS[ThreadType.UPDATES_PROCESSING].join() - THREADS.pop(ThreadType.FILES_INDEXING) - THREADS.pop(ThreadType.UPDATES_PROCESSING) - - if APP_ROLE == AppRole.RP or APP_ROLE == AppRole.NORMAL: - if (ThreadType.REQUEST_PROCESSING not in THREADS): - return - - THREAD_STOP_EVENT.set() - THREADS[ThreadType.REQUEST_PROCESSING].join() - THREADS.pop(ThreadType.REQUEST_PROCESSING) - - -def query_vector_database( - user_id: str, - query: str, - vectordb: BaseVectorDB, - ctx_limit: int, - scope_type: ScopeType | None = None, - scope_list: list[str] | None = None, -) -> list[Document]: - """ - Query the vector database to retrieve relevant documents. - - Args: - user_id: User ID for scoping the search - query: The search query text - vectordb: Vector database instance - ctx_limit: Maximum number of documents to return - scope_type: Optional scope type (PROVIDER or SOURCE) - scope_list: Optional list of scope identifiers - - Returns: - List of relevant Document objects - - Raises: - ContextException: If scope type is provided without scope list - """ - context_docs = get_context_docs(user_id, query, vectordb, ctx_limit, scope_type, scope_list) - LOGGER.debug('Retrieved context documents', extra={ - 'user_id': user_id, - 'num_docs': len(context_docs), - 'ctx_limit': ctx_limit, - }) - return context_docs - - -def prepare_context_chunks(context_docs: list[Document]) -> list[str]: - """ - Extract and format text chunks from documents for LLM context. 
- - Args: - context_docs: List of Document objects from vector DB - - Returns: - List of formatted text chunks including titles and content - """ - return get_context_chunks(context_docs) - - -def generate_llm_response( - llm: LLM, - app_config: TConfig, - user_id: str, - query: str, - template: str, - context_chunks: list[str], - end_separator: str = '', -) -> str: - """ - Generate LLM response using the pruned query and context. - - Args: - llm: Language model instance - app_config: Application configuration - user_id: User ID for the request - query: The original query text - template: Template for formatting the prompt - context_chunks: Context chunks to include in the prompt - end_separator: Optional separator to stop generation - - Returns: - Generated LLM output text - - Raises: - ValueError: If context length is too small to fit the query - """ - pruned_query_text = get_pruned_query(llm, app_config, query, template, context_chunks) - - stop = [end_separator] if end_separator else None - output = llm.invoke( - pruned_query_text, - stop=stop, - userid=user_id, - ).strip() - - LOGGER.debug('Generated LLM response', extra={ - 'user_id': user_id, - 'output_length': len(output), - }) - return output - - -def extract_unique_sources(context_docs: list[Document]) -> list[str]: - """ - Extract unique source IDs from context documents. - - Args: - context_docs: List of Document objects - - Returns: - List of unique source IDs - """ - unique_sources: list[str] = list({ - source for d in context_docs if (source := d.metadata.get('source')) - }) - return unique_sources - -def return_normal_result_to_nextcloud(task_id: int, userId: str, result: LLMOutput) -> bool: - """ - Return query result back to Nextcloud. 
- - Args: - task_id: Unique task identifier - result: The LLMOutput result to return - - Returns: - True if successful, False otherwise - """ - LOGGER.debug('Returning result to Nextcloud', extra={ - 'task_id': task_id, - 'output_length': len(result['output']), - 'num_sources': len(result['sources']), - }) - - nc = NextcloudApp() - - try: - nc.providers.task_processing.report_result(task_id, { - 'output': result['output'], - 'sources': enrich_sources(result['sources'], userId), - }) - except (NextcloudException, RequestException, JSONDecodeError) as e: - LOGGER.error(f"Network error reporting task result {e}", exc_info=e) - return False - - return True - - def enrich_sources(results: list[SearchResult], userId: str) -> list[str]: nc = NextcloudApp() data = nc.ocs('POST', '/ocs/v2.php/apps/context_chat/enrich_sources', json={'sources': results, 'userId': userId}) @@ -742,34 +550,32 @@ def enrich_sources(results: list[SearchResult], userId: str) -> list[str]: return [s.model_dump_json() for s in sources] -def return_search_result_to_nextcloud(task_id: int, userId: str, result: list[SearchResult]) -> bool: +def return_result_to_nextcloud(task_id: int, userId: str, result: dict[str, Any]) -> bool: """ - Return search result back to Nextcloud. + Return query result back to Nextcloud. 
Args: - task_id: Unique task identifier - result: The list of search results to return + result: dict[str, Any] Returns: True if successful, False otherwise """ - LOGGER.debug('Returning search result to Nextcloud', extra={ + LOGGER.debug('Returning result to Nextcloud', extra={ 'task_id': task_id, - 'num_sources': len(result), + 'result': result, }) nc = NextcloudApp() try: - nc.providers.task_processing.report_result(task_id, { - 'sources': enrich_sources(result, userId), - }) + nc.providers.task_processing.report_result(task_id, result) except (NextcloudException, RequestException, JSONDecodeError) as e: - LOGGER.error(f"Network error reporting search task result {e}", exc_info=e) + LOGGER.error(f"Network error reporting task result {e}", exc_info=e) return False return True + def return_error_to_nextcloud(task_id: int, e: Exception) -> bool: """ Return error result back to Nextcloud. @@ -827,6 +633,7 @@ def process_normal_task( if task_input.get('scopeType') == 'none': task_input['scopeType'] = None + # todo: document no template support return exec_in_proc(target=process_context_query, args=( user_id, @@ -872,3 +679,60 @@ def process_search_task( task_input.get('scopeList'), ) ) + + +def start_bg_threads(app_config: TConfig, app_enabled: Event): + if APP_ROLE == AppRole.INDEXING or APP_ROLE == AppRole.NORMAL: + if ( + ThreadType.FILES_INDEXING in THREADS + or ThreadType.UPDATES_PROCESSING in THREADS + ): + LOGGER.info('Background threads already running, skipping start') + return + + THREAD_STOP_EVENT.clear() + THREADS[ThreadType.FILES_INDEXING] = Thread( + target=files_indexing_thread, + args=(app_config, app_enabled), + name='FilesIndexingThread', + ) + THREADS[ThreadType.UPDATES_PROCESSING] = Thread( + target=updates_processing_thread, + args=(app_config, app_enabled), + name='UpdatesProcessingThread', + ) + THREADS[ThreadType.FILES_INDEXING].start() + THREADS[ThreadType.UPDATES_PROCESSING].start() + + if APP_ROLE == AppRole.RP or APP_ROLE == 
AppRole.NORMAL: + if ThreadType.REQUEST_PROCESSING in THREADS: + LOGGER.info('Background threads already running, skipping start') + return + + THREAD_STOP_EVENT.clear() + THREADS[ThreadType.REQUEST_PROCESSING] = Thread( + target=request_processing_thread, + args=(app_config, app_enabled), + name='RequestProcessingThread', + ) + THREADS[ThreadType.REQUEST_PROCESSING].start() + + +def wait_for_bg_threads(): + if APP_ROLE == AppRole.INDEXING or APP_ROLE == AppRole.NORMAL: + if (ThreadType.FILES_INDEXING not in THREADS or ThreadType.UPDATES_PROCESSING not in THREADS): + return + + THREAD_STOP_EVENT.set() + THREADS[ThreadType.FILES_INDEXING].join() + THREADS[ThreadType.UPDATES_PROCESSING].join() + THREADS.pop(ThreadType.FILES_INDEXING) + THREADS.pop(ThreadType.UPDATES_PROCESSING) + + if APP_ROLE == AppRole.RP or APP_ROLE == AppRole.NORMAL: + if (ThreadType.REQUEST_PROCESSING not in THREADS): + return + + THREAD_STOP_EVENT.set() + THREADS[ThreadType.REQUEST_PROCESSING].join() + THREADS.pop(ThreadType.REQUEST_PROCESSING) From 286db22e8cb664f600ddfa3b759ce8e83963ff2b Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Fri, 3 Apr 2026 19:32:28 +0530 Subject: [PATCH 51/56] fix(mp): run repairs and config file check only in MainProcess Signed-off-by: Anupam Kumar --- context_chat_backend/controller.py | 8 ++++++-- main.py | 3 +-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/context_chat_backend/controller.py b/context_chat_backend/controller.py index 3a8e15a9..9c3812e9 100644 --- a/context_chat_backend/controller.py +++ b/context_chat_backend/controller.py @@ -16,6 +16,7 @@ # ruff: noqa: E402 import logging +import multiprocessing as mp import os import tempfile import threading @@ -39,8 +40,11 @@ # setup -repair_run() -ensure_config_file() +# only run once +if mp.current_process().name == 'MainProcess': + repair_run() + ensure_config_file() + logger = logging.getLogger('ccb.controller') app_config = get_config(os.environ['CC_CONFIG_PATH']) 
__download_models_from_hf = os.environ.get('CC_DOWNLOAD_MODELS_FROM_HF', 'true').lower() in ('1', 'true', 'yes') diff --git a/main.py b/main.py index c2614515..076b7db0 100755 --- a/main.py +++ b/main.py @@ -5,6 +5,7 @@ # import logging +import multiprocessing as mp from os import cpu_count, getenv import psutil @@ -45,8 +46,6 @@ def _setup_log_levels(debug: bool): if __name__ == '__main__': - import multiprocessing as mp - logging_config = get_logging_config(LOGGER_CONFIG_NAME) setup_logging(logging_config) app_config: TConfig = app.extra['CONFIG'] From 726eb64f5624eb9a2262aa6c6b17641e04b33973 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 7 Apr 2026 16:43:07 +0530 Subject: [PATCH 52/56] fix: attach source_ids as keys in json logs Signed-off-by: Anupam Kumar --- context_chat_backend/task_fetcher.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/context_chat_backend/task_fetcher.py b/context_chat_backend/task_fetcher.py index 1e456465..be74b316 100644 --- a/context_chat_backend/task_fetcher.py +++ b/context_chat_backend/task_fetcher.py @@ -81,7 +81,9 @@ def files_indexing_thread(app_config: TConfig, app_enabled: Event) -> None: def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> Mapping[int, IndexingError | None]: source_refs = [s.reference for s in source_items.values()] - LOGGER.info('Starting embed_sources subprocess for %d source(s): %s', len(source_items), source_refs) + LOGGER.info('Starting embed_sources subprocess for %d source(s)', len(source_items), extra={ + 'source_ids': source_refs, + }) try: result = exec_in_proc( target=embed_sources, @@ -96,8 +98,10 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> return result except SubprocessKilledError as e: LOGGER.error( - 'embed_sources subprocess was killed for %d source(s) with exitcode %s: %s', - len(source_items), e.exitcode, source_refs, exc_info=e, + 'embed_sources subprocess was killed for %d 
source(s) with exitcode %s', + len(source_items), e.exitcode, exc_info=e, extra={ + 'source_ids': source_refs, + }, ) if len(source_items) == 1: return dict.fromkeys( @@ -120,8 +124,10 @@ def _load_sources(source_items: Mapping[int, SourceItem | ReceivedFileItem]) -> retryable=True, ) LOGGER.error( - 'embed_sources subprocess raised a %s error for sources %s, marking all as retryable', - e.__class__.__name__, source_refs, exc_info=e, + 'embed_sources subprocess raised a %s error for %d sources, marking all as retryable', + e.__class__.__name__, len(source_refs), exc_info=e, extra={ + 'source_ids': source_refs, + } ) return dict.fromkeys(source_items, err) From 073f9d0e4a2f7fd52c1ef0df3410ea390c70c683 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 7 Apr 2026 16:43:26 +0530 Subject: [PATCH 53/56] fix(ci): upload db dump artifacts Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 8ec8eabe..9c664838 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -89,7 +89,7 @@ jobs: POSTGRES_USER: root POSTGRES_PASSWORD: rootpassword POSTGRES_DB: nextcloud - options: --health-cmd pg_isready --health-interval 5s --health-timeout 2s --health-retries 5 + options: --health-cmd pg_isready --health-interval 5s --health-timeout 2s --health-retries 5 --name postgres --hostname postgres steps: - name: Checkout server @@ -214,6 +214,13 @@ jobs: php cron.php sleep 10 done & + sleep 30 + # list all the bg jobs + ./occ background-job:list + + - name: Initial dump of DB with context_chat_queue populated + run: | + docker exec postgres pg_dump nextcloud > /tmp/0_pgdump_nextcloud - name: Periodically check context_chat stats for 15 minutes to allow the backend to index the files run: | @@ -315,6 +322,10 @@ jobs: echo "Memory usage during scan is 
stable. No memory leak detected." fi + - name: Final dump of DB with vectordb populated + run: | + docker exec postgres pg_dump nextcloud > /tmp/1_pgdump_nextcloud + - name: Show server logs if: always() run: | @@ -350,6 +361,14 @@ jobs: run: | tail -v -n +1 context_chat_backend/persistent_storage/logs/em_server.log* || echo "No logs in logs directory" + - name: Upload database dumps + uses: actions/upload-artifact@v4 + with: + name: database-dumps + path: | + /tmp/0_pgdump_nextcloud + /tmp/1_pgdump_nextcloud + summary: permissions: contents: none From 13ea740d94841069b1c72398440dab9a2a30cd31 Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 7 Apr 2026 18:01:47 +0530 Subject: [PATCH 54/56] fix: retry PGVector object creation if table already exists Signed-off-by: Anupam Kumar --- context_chat_backend/vectordb/pgvector.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/context_chat_backend/vectordb/pgvector.py b/context_chat_backend/vectordb/pgvector.py index 41d7f0db..d7b718dc 100644 --- a/context_chat_backend/vectordb/pgvector.py +++ b/context_chat_backend/vectordb/pgvector.py @@ -120,7 +120,15 @@ def __init__(self, embedding: Embeddings | None = None, **kwargs): kwargs['connection'] = os.environ['CCB_DB_URL'] # setup langchain db + our access list table - self.client = PGVector(embedding, collection_name=COLLECTION_NAME, **kwargs) + try: + self.client = PGVector(embedding, collection_name=COLLECTION_NAME, **kwargs) + except sa.exc.IntegrityError as ie: # pyright: ignore[reportAttributeAccessIssue] + if not isinstance(ie.orig, psycopg.errors.UniqueViolation): + raise + + # tried to create the tables but it was already created in another process + # init the client again to detect it already exists, and continue from there + self.client = PGVector(embedding, collection_name=COLLECTION_NAME, **kwargs) def get_instance(self) -> VectorStore: return self.client From dcb04e7209558ea9185f902637474e301d70f1b9 Mon Sep 17 00:00:00 2001 From: 
Anupam Kumar Date: Tue, 7 Apr 2026 20:11:24 +0530 Subject: [PATCH 55/56] fix: unique db dump artifact id Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 9c664838..384e3520 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -364,7 +364,7 @@ jobs: - name: Upload database dumps uses: actions/upload-artifact@v4 with: - name: database-dumps + name: database-dumps-${{ matrix.server-versions }}-php@${{ matrix.php-versions }} path: | /tmp/0_pgdump_nextcloud /tmp/1_pgdump_nextcloud From dc1d57b15161ff13ffa56208bc4a21bb4e13b10b Mon Sep 17 00:00:00 2001 From: Anupam Kumar Date: Tue, 7 Apr 2026 20:12:51 +0530 Subject: [PATCH 56/56] fix(ci): log stats before exit Signed-off-by: Anupam Kumar --- .github/workflows/integration-test.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 384e3520..d30073ab 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -282,9 +282,6 @@ jobs: echo "::endgroup::" - ./occ context_chat:stats - ./occ context_chat:stats --json - if [ $success -ne 1 ]; then echo "Max attempts reached" exit 1 @@ -369,6 +366,11 @@ jobs: /tmp/0_pgdump_nextcloud /tmp/1_pgdump_nextcloud + - name: Final stats log + run: | + ./occ context_chat:stats + ./occ context_chat:stats --json + summary: permissions: contents: none