From 4d3dd79d4b4d12d4d06b91580b8408705d87f786 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 15 Jan 2026 13:25:07 -0500 Subject: [PATCH 01/10] Ensure crawl logs are also deleted when crawls are deleted --- backend/btrixcloud/basecrawls.py | 7 ++++++- backend/btrixcloud/crawl_logs.py | 4 ++++ backend/btrixcloud/crawls.py | 8 ++------ backend/btrixcloud/main.py | 3 ++- backend/btrixcloud/ops.py | 3 ++- 5 files changed, 16 insertions(+), 9 deletions(-) diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index b468d13b4a..d0f5b2adc7 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -60,10 +60,11 @@ from .webhooks import EventWebhookOps from .background_jobs import BackgroundJobOps from .pages import PageOps + from .crawl_logs import CrawlLogOps else: CrawlConfigOps = UserManager = OrgOps = CollectionOps = PageOps = object - StorageOps = EventWebhookOps = BackgroundJobOps = object + StorageOps = EventWebhookOps = BackgroundJobOps = CrawlLogOps = object # ============================================================================ @@ -80,6 +81,7 @@ class BaseCrawlOps: storage_ops: StorageOps event_webhook_ops: EventWebhookOps background_job_ops: BackgroundJobOps + crawl_log_ops: CrawlLogOps page_ops: PageOps def __init__( @@ -92,6 +94,7 @@ def __init__( storage_ops: StorageOps, event_webhook_ops: EventWebhookOps, background_job_ops: BackgroundJobOps, + crawl_log_ops: CrawlLogOps, ): self.crawls = mdb["crawls"] self.presigned_urls = mdb["presigned_urls"] @@ -102,6 +105,7 @@ def __init__( self.storage_ops = storage_ops self.event_webhook_ops = event_webhook_ops self.background_job_ops = background_job_ops + self.crawl_log_ops = crawl_log_ops self.page_ops = cast(PageOps, None) def set_page_ops(self, page_ops): @@ -421,6 +425,7 @@ async def delete_crawls( ) await self.page_ops.delete_crawl_pages(crawl_id, org.id) + await self.crawl_log_ops.delete_crawl_logs(crawl_id, org.id) if crawl.collectionIds: for coll_id in crawl.collectionIds: diff --git a/backend/btrixcloud/crawl_logs.py b/backend/btrixcloud/crawl_logs.py index ac738f7dea..94a2081f87 100644 --- a/backend/btrixcloud/crawl_logs.py +++ b/backend/btrixcloud/crawl_logs.py @@ -179,3 +179,7 @@ async def get_crawl_logs( log_lines = [CrawlLogLine.from_dict(res) for res in items] return log_lines, total + + async def delete_crawl_logs(self, crawl_id: str, oid: UUID): + """Delete all logs from a specific crawl""" + return await self.logs.delete_many({"crawlId": crawl_id, "oid": oid}) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index a856e6fc20..25339b2b50 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -42,7 +42,6 @@ ) from .basecrawls import BaseCrawlOps from .crawlmanager import CrawlManager -from .crawl_logs import CrawlLogOps from .models import ( ListFilterType, UpdateCrawl, @@ -102,12 +101,10 @@ class CrawlOps(BaseCrawlOps): def __init__( self, crawl_manager: CrawlManager, - log_ops: CrawlLogOps, *args, ): super().__init__(*args) self.crawl_manager = crawl_manager - self.log_ops = log_ops self.crawl_configs.set_crawl_ops(self) self.colls.set_crawl_ops(self) self.event_webhook_ops.set_crawl_ops(self) @@ -1258,7 +1255,7 @@ async def get_crawl_logs( qa_run_id: Optional[str] = None, ) -> Tuple[list[CrawlLogLine], int]: """get crawl logs""" - return await self.log_ops.get_crawl_logs( + return await self.crawl_log_ops.get_crawl_logs( org, crawl_id, page_size=page_size, @@ -1364,7 +1361,6 @@ async def recompute_crawl_file_count_and_size(crawls, crawl_id: str): # pylint: disable=too-many-arguments, too-many-locals, too-many-statements def init_crawls_api( crawl_manager: CrawlManager, - crawl_log_ops: CrawlLogOps, app, user_dep, *args, @@ -1372,7 +1368,7 @@ def init_crawls_api( """API for crawl management, including crawl done callback""" # pylint: disable=invalid-name, duplicate-code - ops = CrawlOps(crawl_manager, crawl_log_ops, *args) + ops = CrawlOps(crawl_manager, *args) org_viewer_dep = ops.orgs.org_viewer_dep org_crawl_dep = ops.orgs.org_crawl_dep diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py index 0773423dee..da6343e978 100644 --- a/backend/btrixcloud/main.py +++ b/backend/btrixcloud/main.py @@ -259,11 +259,12 @@ def main() -> None: storage_ops, event_webhook_ops, background_job_ops, + crawl_log_ops, ) base_crawl_ops = init_base_crawls_api(*base_crawl_init) - crawls = init_crawls_api(crawl_manager, crawl_log_ops, *base_crawl_init) + crawls = init_crawls_api(crawl_manager, *base_crawl_init) upload_ops = init_uploads_api(*base_crawl_init) diff --git a/backend/btrixcloud/ops.py b/backend/btrixcloud/ops.py index ebcecf322f..708145e1af 100644 --- a/backend/btrixcloud/ops.py +++ b/backend/btrixcloud/ops.py @@ -99,11 +99,12 @@ def init_ops() -> Tuple[ storage_ops, event_webhook_ops, background_job_ops, + crawl_log_ops, ) base_crawl_ops = BaseCrawlOps(*base_crawl_init) - crawl_ops = CrawlOps(crawl_manager, crawl_log_ops, *base_crawl_init) + crawl_ops = CrawlOps(crawl_manager, *base_crawl_init) upload_ops = UploadOps(*base_crawl_init) From 222b0e33e9ed09ffe863a4ac4f73d1cf7c4fe40d Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 19 Jan 2026 12:48:22 -0500 Subject: [PATCH 02/10] Add migration to delete logs for deleted crawls --- ...gration_0056_crawl_logs_deleted_cleanup.py | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 backend/btrixcloud/migrations/migration_0056_crawl_logs_deleted_cleanup.py diff --git a/backend/btrixcloud/migrations/migration_0056_crawl_logs_deleted_cleanup.py b/backend/btrixcloud/migrations/migration_0056_crawl_logs_deleted_cleanup.py new file mode 100644 index 0000000000..63cfa7e325 --- /dev/null +++ b/backend/btrixcloud/migrations/migration_0056_crawl_logs_deleted_cleanup.py @@ -0,0 +1,64 @@ +""" +Migration 0056 - Remove logs for deleted crawls +""" + +from motor.motor_asyncio import AsyncIOMotorDatabase + +from btrixcloud.migrations import BaseMigration + + +MIGRATION_VERSION = "0056" + + +class Migration(BaseMigration): + """Migration class.""" + + # pylint: disable=unused-argument + def __init__(self, mdb: AsyncIOMotorDatabase, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) + + async def migrate_up(self): + """Perform migration up. + + Delete crawl logs from database for crawls and orgs that + have since been deleted. + """ + # pylint: disable=duplicate-code + crawl_logs_mdb = self.mdb["crawl_logs"] + crawls_mdb = self.mdb["crawls"] + + crawl_logs_to_delete: list[str] = [] + + log_crawl_ids = await crawl_logs_mdb.distinct("crawlId", {}) + + crawl_count = len(log_crawl_ids) + index = 0 + + for crawl_id in log_crawl_ids: + index += 1 + res = await crawls_mdb.find({"_id": crawl_id}) + if res is None: + crawl_logs_to_delete.append(crawl_id) + + if index % 100 == 0: + print( + f"Checked {index} of {crawl_count} crawls for logs to delete", + flush=True, + ) + + if crawl_logs_to_delete: + del_count = len(crawl_logs_to_delete) + print( + f"Checked {index} crawls, deleting logs for {del_count} deleted crawls", + flush=True, + ) + + try: + await crawl_logs_mdb.delete_many( + {"crawlId": {"$in": crawl_logs_to_delete}} + ) + # pylint: disable=broad-exception-caught + except Exception as err: + print( + f"Error deleting crawl logs from deleted crawls: {err}", flush=True + ) From b57cd478736a89d34644f45e240bca53d840a2df Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 19 Jan 2026 15:20:09 -0500 Subject: [PATCH 03/10] Bump CURR_DB_VERSION to 0056 --- backend/btrixcloud/db.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py index 746a279cbb..5f5e0f7312 100644 --- a/backend/btrixcloud/db.py +++ b/backend/btrixcloud/db.py @@ -44,7 +44,7 @@ ) = object -CURR_DB_VERSION = "0055" +CURR_DB_VERSION = "0056" MIN_DB_VERSION = 7.0 From 5cc26823ff6fd723691becd4b140417d03227c40 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 19 Jan 2026 15:49:04 -0500 Subject: [PATCH 04/10] find -> find_one --- .../migrations/migration_0056_crawl_logs_deleted_cleanup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/migrations/migration_0056_crawl_logs_deleted_cleanup.py b/backend/btrixcloud/migrations/migration_0056_crawl_logs_deleted_cleanup.py index 63cfa7e325..f2aae78f1e 100644 --- a/backend/btrixcloud/migrations/migration_0056_crawl_logs_deleted_cleanup.py +++ b/backend/btrixcloud/migrations/migration_0056_crawl_logs_deleted_cleanup.py @@ -36,7 +36,7 @@ async def migrate_up(self): for crawl_id in log_crawl_ids: index += 1 - res = await crawls_mdb.find({"_id": crawl_id}) + res = await crawls_mdb.find_one({"_id": crawl_id}) if res is None: crawl_logs_to_delete.append(crawl_id) From 2eb5cd59656ee092d61957c35c8212f17c5ae87b Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 9 Apr 2026 13:06:06 -0400 Subject: [PATCH 05/10] Log number of deleted log lines in migration --- .../migrations/migration_0056_crawl_logs_deleted_cleanup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/migrations/migration_0056_crawl_logs_deleted_cleanup.py b/backend/btrixcloud/migrations/migration_0056_crawl_logs_deleted_cleanup.py index f2aae78f1e..6341caa047 100644 --- a/backend/btrixcloud/migrations/migration_0056_crawl_logs_deleted_cleanup.py +++ b/backend/btrixcloud/migrations/migration_0056_crawl_logs_deleted_cleanup.py @@ -54,9 +54,10 @@ async def migrate_up(self): ) try: - await crawl_logs_mdb.delete_many( + res = await crawl_logs_mdb.delete_many( {"crawlId": {"$in": crawl_logs_to_delete}} ) + print(f"Deleted {res.deleted_count} log lines", flush=True) # pylint: disable=broad-exception-caught except Exception as err: print( From 84aa7886bb23b21a004e66b7d8d3895e9e94c117 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 9 Apr 2026 13:18:55 -0400 Subject: [PATCH 06/10] Ensure crawl logs for qa runs are cleaned up when qa runs are deleted --- backend/btrixcloud/crawl_logs.py | 12 +++++++++--- backend/btrixcloud/crawls.py | 1 + 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/backend/btrixcloud/crawl_logs.py b/backend/btrixcloud/crawl_logs.py index 94a2081f87..608c3fa67e 100644 --- a/backend/btrixcloud/crawl_logs.py +++ b/backend/btrixcloud/crawl_logs.py @@ -1,6 +1,6 @@ """crawl logs""" -from typing import TYPE_CHECKING, Any, Optional, Dict, Tuple, List +from typing import TYPE_CHECKING, Any, Optional, Dict, Tuple, List, Union import json from uuid import UUID, uuid4 @@ -180,6 +180,12 @@ async def get_crawl_logs( return log_lines, total - async def delete_crawl_logs(self, crawl_id: str, oid: UUID): + async def delete_crawl_logs( + self, crawl_id: str, oid: UUID, qa_run_id: Optional[str] = None + ): """Delete all logs from a specific crawl""" - return await self.logs.delete_many({"crawlId": crawl_id, "oid": oid}) + query: dict[str, str | UUID] = {"crawlId": crawl_id, "oid": oid} + if qa_run_id: + query["qaRunId"] = qa_run_id + + return await self.logs.delete_many(query) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 25339b2b50..7ee4da8d2f 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -1073,6 +1073,7 @@ async def delete_crawl_qa_runs( for qa_run_id in delete_list.qa_run_ids: await self.page_ops.delete_qa_run_from_pages(crawl_id, qa_run_id) await self.delete_crawl_qa_run_files(crawl_id, qa_run_id, org) + await self.crawl_log_ops.delete_crawl_logs(crawl_id, org.id, qa_run_id) res = await self.crawls.find_one_and_update( {"_id": crawl_id, "type": "crawl"}, From a29510b2e3066c593ef96de1be60aa5fac44564b Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 9 Apr 2026 13:41:30 -0400 Subject: [PATCH 07/10] Delete crawl logs for deleted QA runs of existing crawls in migration --- ...gration_0056_crawl_logs_deleted_cleanup.py | 46 +++++++++++++++++-- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/backend/btrixcloud/migrations/migration_0056_crawl_logs_deleted_cleanup.py b/backend/btrixcloud/migrations/migration_0056_crawl_logs_deleted_cleanup.py index 6341caa047..1f39501129 100644 --- a/backend/btrixcloud/migrations/migration_0056_crawl_logs_deleted_cleanup.py +++ b/backend/btrixcloud/migrations/migration_0056_crawl_logs_deleted_cleanup.py @@ -21,12 +21,15 @@ async def migrate_up(self): """Perform migration up. Delete crawl logs from database for crawls and orgs that - have since been deleted. + have since been deleted, as well as for QA runs that were + deleted for still-existing crawls. """ - # pylint: disable=duplicate-code + # pylint: disable=duplicate-code, too-many-locals crawl_logs_mdb = self.mdb["crawl_logs"] crawls_mdb = self.mdb["crawls"] + # DELETED CRAWLS + crawl_logs_to_delete: list[str] = [] log_crawl_ids = await crawl_logs_mdb.distinct("crawlId", {}) @@ -57,9 +60,46 @@ async def migrate_up(self): res = await crawl_logs_mdb.delete_many( {"crawlId": {"$in": crawl_logs_to_delete}} ) - print(f"Deleted {res.deleted_count} log lines", flush=True) + print(f"Deleted {res.deleted_count} crawl log lines", flush=True) # pylint: disable=broad-exception-caught except Exception as err: print( f"Error deleting crawl logs from deleted crawls: {err}", flush=True ) + + # DELETED QA RUNS + + qa_run_logs_to_delete: list[str] = [] + + log_qa_run_ids = await crawl_logs_mdb.distinct("qaRunId", {}) + + qa_run_count = len(log_qa_run_ids) + qa_index = 0 + + for qa_run_id in log_qa_run_ids: + qa_index += 1 + res = await crawls_mdb.find_one({f"qaFinished.{qa_run_id}": {"$exists": 1}}) + if res is None: + qa_run_logs_to_delete.append(qa_run_id) + + if qa_index % 100 == 0: + print( + f"Checked {qa_index} of {qa_run_count} QA runs for logs to delete", + flush=True, + ) + + if qa_run_logs_to_delete: + qa_del_count = len(qa_run_logs_to_delete) + print( + f"Checked {qa_index} QA runs, deleting logs for {qa_del_count} deleted runs", + flush=True, + ) + + try: + res = await crawl_logs_mdb.delete_many( + {"qaRunId": {"$in": qa_run_logs_to_delete}} + ) + print(f"Deleted {res.deleted_count} QA run log lines", flush=True) + # pylint: disable=broad-exception-caught + except Exception as err: + print(f"Error deleting logs from deleted QA runs: {err}", flush=True) From f10f5afe4d0d8211a58b4602c1b640278d08630b Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 9 Apr 2026 13:56:05 -0400 Subject: [PATCH 08/10] Remove unused import --- backend/btrixcloud/crawl_logs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/crawl_logs.py b/backend/btrixcloud/crawl_logs.py index 608c3fa67e..5d316a75ab 100644 --- a/backend/btrixcloud/crawl_logs.py +++ b/backend/btrixcloud/crawl_logs.py @@ -1,6 +1,6 @@ """crawl logs""" -from typing import TYPE_CHECKING, Any, Optional, Dict, Tuple, List, Union +from typing import TYPE_CHECKING, Any, Optional, Dict, Tuple, List import json from uuid import UUID, uuid4 From e71419cb4c9114c04ae0387edb24e026228b2a4f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 13 Apr 2026 22:46:28 -0700 Subject: [PATCH 09/10] Apply suggestion from @ikreymer --- .../migrations/migration_0056_crawl_logs_deleted_cleanup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/migrations/migration_0056_crawl_logs_deleted_cleanup.py b/backend/btrixcloud/migrations/migration_0056_crawl_logs_deleted_cleanup.py index 1f39501129..59af1980b6 100644 --- a/backend/btrixcloud/migrations/migration_0056_crawl_logs_deleted_cleanup.py +++ b/backend/btrixcloud/migrations/migration_0056_crawl_logs_deleted_cleanup.py @@ -78,7 +78,7 @@ async def migrate_up(self): for qa_run_id in log_qa_run_ids: qa_index += 1 - res = await crawls_mdb.find_one({f"qaFinished.{qa_run_id}": {"$exists": 1}}) + res = await crawls_mdb.find_one({f"qaFinished.{qa_run_id}": {"$exists": True}}) if res is None: qa_run_logs_to_delete.append(qa_run_id) From 984001fbddca58aba3dc3a88014dc38773e20282 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 13 Apr 2026 22:50:57 -0700 Subject: [PATCH 10/10] format fix --- .../migrations/migration_0056_crawl_logs_deleted_cleanup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/migrations/migration_0056_crawl_logs_deleted_cleanup.py b/backend/btrixcloud/migrations/migration_0056_crawl_logs_deleted_cleanup.py index 59af1980b6..966c58146d 100644 --- a/backend/btrixcloud/migrations/migration_0056_crawl_logs_deleted_cleanup.py +++ b/backend/btrixcloud/migrations/migration_0056_crawl_logs_deleted_cleanup.py @@ -78,7 +78,9 @@ async def migrate_up(self): for qa_run_id in log_qa_run_ids: qa_index += 1 - res = await crawls_mdb.find_one({f"qaFinished.{qa_run_id}": {"$exists": True}}) + res = await crawls_mdb.find_one( + {f"qaFinished.{qa_run_id}": {"$exists": True}} + ) if res is None: qa_run_logs_to_delete.append(qa_run_id)