diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index b468d13b4a..d0f5b2adc7 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -60,10 +60,11 @@ from .webhooks import EventWebhookOps from .background_jobs import BackgroundJobOps from .pages import PageOps + from .crawl_logs import CrawlLogOps else: CrawlConfigOps = UserManager = OrgOps = CollectionOps = PageOps = object - StorageOps = EventWebhookOps = BackgroundJobOps = object + StorageOps = EventWebhookOps = BackgroundJobOps = CrawlLogOps = object # ============================================================================ @@ -80,6 +81,7 @@ class BaseCrawlOps: storage_ops: StorageOps event_webhook_ops: EventWebhookOps background_job_ops: BackgroundJobOps + crawl_log_ops: CrawlLogOps page_ops: PageOps def __init__( @@ -92,6 +94,7 @@ def __init__( storage_ops: StorageOps, event_webhook_ops: EventWebhookOps, background_job_ops: BackgroundJobOps, + crawl_log_ops: CrawlLogOps, ): self.crawls = mdb["crawls"] self.presigned_urls = mdb["presigned_urls"] @@ -102,6 +105,7 @@ def __init__( self.storage_ops = storage_ops self.event_webhook_ops = event_webhook_ops self.background_job_ops = background_job_ops + self.crawl_log_ops = crawl_log_ops self.page_ops = cast(PageOps, None) def set_page_ops(self, page_ops): @@ -421,6 +425,7 @@ async def delete_crawls( ) await self.page_ops.delete_crawl_pages(crawl_id, org.id) + await self.crawl_log_ops.delete_crawl_logs(crawl_id, org.id) if crawl.collectionIds: for coll_id in crawl.collectionIds: diff --git a/backend/btrixcloud/crawl_logs.py b/backend/btrixcloud/crawl_logs.py index ac738f7dea..5d316a75ab 100644 --- a/backend/btrixcloud/crawl_logs.py +++ b/backend/btrixcloud/crawl_logs.py @@ -179,3 +179,13 @@ async def get_crawl_logs( log_lines = [CrawlLogLine.from_dict(res) for res in items] return log_lines, total + + async def delete_crawl_logs( + self, crawl_id: str, oid: UUID, qa_run_id: Optional[str] = None + ): + """Delete all logs from a specific crawl""" + query: dict[str, str | UUID] = {"crawlId": crawl_id, "oid": oid} + if qa_run_id: + query["qaRunId"] = qa_run_id + + return await self.logs.delete_many(query) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index a856e6fc20..7ee4da8d2f 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -42,7 +42,6 @@ ) from .basecrawls import BaseCrawlOps from .crawlmanager import CrawlManager -from .crawl_logs import CrawlLogOps from .models import ( ListFilterType, UpdateCrawl, @@ -102,12 +101,10 @@ class CrawlOps(BaseCrawlOps): def __init__( self, crawl_manager: CrawlManager, - log_ops: CrawlLogOps, *args, ): super().__init__(*args) self.crawl_manager = crawl_manager - self.log_ops = log_ops self.crawl_configs.set_crawl_ops(self) self.colls.set_crawl_ops(self) self.event_webhook_ops.set_crawl_ops(self) @@ -1076,6 +1073,7 @@ async def delete_crawl_qa_runs( for qa_run_id in delete_list.qa_run_ids: await self.page_ops.delete_qa_run_from_pages(crawl_id, qa_run_id) await self.delete_crawl_qa_run_files(crawl_id, qa_run_id, org) + await self.crawl_log_ops.delete_crawl_logs(crawl_id, org.id, qa_run_id) res = await self.crawls.find_one_and_update( {"_id": crawl_id, "type": "crawl"}, @@ -1258,7 +1256,7 @@ async def get_crawl_logs( qa_run_id: Optional[str] = None, ) -> Tuple[list[CrawlLogLine], int]: """get crawl logs""" - return await self.log_ops.get_crawl_logs( + return await self.crawl_log_ops.get_crawl_logs( org, crawl_id, page_size=page_size, @@ -1364,7 +1362,6 @@ async def recompute_crawl_file_count_and_size(crawls, crawl_id: str): # pylint: disable=too-many-arguments, too-many-locals, too-many-statements def init_crawls_api( crawl_manager: CrawlManager, - crawl_log_ops: CrawlLogOps, app, user_dep, *args, @@ -1372,7 +1369,7 @@ def init_crawls_api( """API for crawl management, including crawl done callback""" # pylint: disable=invalid-name, duplicate-code - ops = CrawlOps(crawl_manager, crawl_log_ops, *args) + ops = CrawlOps(crawl_manager, *args) org_viewer_dep = ops.orgs.org_viewer_dep org_crawl_dep = ops.orgs.org_crawl_dep diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py index 746a279cbb..5f5e0f7312 100644 --- a/backend/btrixcloud/db.py +++ b/backend/btrixcloud/db.py @@ -44,7 +44,7 @@ ) = object -CURR_DB_VERSION = "0055" +CURR_DB_VERSION = "0056" MIN_DB_VERSION = 7.0 diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py index 0773423dee..da6343e978 100644 --- a/backend/btrixcloud/main.py +++ b/backend/btrixcloud/main.py @@ -259,11 +259,12 @@ def main() -> None: storage_ops, event_webhook_ops, background_job_ops, + crawl_log_ops, ) base_crawl_ops = init_base_crawls_api(*base_crawl_init) - crawls = init_crawls_api(crawl_manager, crawl_log_ops, *base_crawl_init) + crawls = init_crawls_api(crawl_manager, *base_crawl_init) upload_ops = init_uploads_api(*base_crawl_init) diff --git a/backend/btrixcloud/migrations/migration_0056_crawl_logs_deleted_cleanup.py b/backend/btrixcloud/migrations/migration_0056_crawl_logs_deleted_cleanup.py new file mode 100644 index 0000000000..966c58146d --- /dev/null +++ b/backend/btrixcloud/migrations/migration_0056_crawl_logs_deleted_cleanup.py @@ -0,0 +1,107 @@ +""" +Migration 0056 - Remove logs for deleted crawls +""" + +from motor.motor_asyncio import AsyncIOMotorDatabase + +from btrixcloud.migrations import BaseMigration + + +MIGRATION_VERSION = "0056" + + +class Migration(BaseMigration): + """Migration class.""" + + # pylint: disable=unused-argument + def __init__(self, mdb: AsyncIOMotorDatabase, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) + + async def migrate_up(self): + """Perform migration up. + + Delete crawl logs from database for crawls and orgs that + have since been deleted, as well as for QA runs that were + deleted for still-existing crawls. + """ + # pylint: disable=duplicate-code, too-many-locals + crawl_logs_mdb = self.mdb["crawl_logs"] + crawls_mdb = self.mdb["crawls"] + + # DELETED CRAWLS + + crawl_logs_to_delete: list[str] = [] + + log_crawl_ids = await crawl_logs_mdb.distinct("crawlId", {}) + + crawl_count = len(log_crawl_ids) + index = 0 + + for crawl_id in log_crawl_ids: + index += 1 + res = await crawls_mdb.find_one({"_id": crawl_id}) + if res is None: + crawl_logs_to_delete.append(crawl_id) + + if index % 100 == 0: + print( + f"Checked {index} of {crawl_count} crawls for logs to delete", + flush=True, + ) + + if crawl_logs_to_delete: + del_count = len(crawl_logs_to_delete) + print( + f"Checked {index} crawls, deleting logs for {del_count} deleted crawls", + flush=True, + ) + + try: + res = await crawl_logs_mdb.delete_many( + {"crawlId": {"$in": crawl_logs_to_delete}} + ) + print(f"Deleted {res.deleted_count} crawl log lines", flush=True) + # pylint: disable=broad-exception-caught + except Exception as err: + print( + f"Error deleting crawl logs from deleted crawls: {err}", flush=True + ) + + # DELETED QA RUNS + + qa_run_logs_to_delete: list[str] = [] + + log_qa_run_ids = await crawl_logs_mdb.distinct("qaRunId", {}) + + qa_run_count = len(log_qa_run_ids) + qa_index = 0 + + for qa_run_id in log_qa_run_ids: + qa_index += 1 + res = await crawls_mdb.find_one( + {f"qaFinished.{qa_run_id}": {"$exists": True}} + ) + if res is None: + qa_run_logs_to_delete.append(qa_run_id) + + if qa_index % 100 == 0: + print( + f"Checked {qa_index} of {qa_run_count} QA runs for logs to delete", + flush=True, + ) + + if qa_run_logs_to_delete: + qa_del_count = len(qa_run_logs_to_delete) + print( + f"Checked {qa_index} QA runs, deleting logs for {qa_del_count} deleted runs", + flush=True, + ) + + try: + res = await crawl_logs_mdb.delete_many( + {"qaRunId": {"$in": qa_run_logs_to_delete}} + ) + print(f"Deleted {res.deleted_count} QA run log lines", flush=True) + # pylint: disable=broad-exception-caught + except Exception as err: + print(f"Error deleting logs from deleted QA runs: {err}", flush=True) diff --git a/backend/btrixcloud/ops.py b/backend/btrixcloud/ops.py index ebcecf322f..708145e1af 100644 --- a/backend/btrixcloud/ops.py +++ b/backend/btrixcloud/ops.py @@ -99,11 +99,12 @@ def init_ops() -> Tuple[ storage_ops, event_webhook_ops, background_job_ops, + crawl_log_ops, ) base_crawl_ops = BaseCrawlOps(*base_crawl_init) - crawl_ops = CrawlOps(crawl_manager, crawl_log_ops, *base_crawl_init) + crawl_ops = CrawlOps(crawl_manager, *base_crawl_init) upload_ops = UploadOps(*base_crawl_init)