Skip to content
7 changes: 6 additions & 1 deletion backend/btrixcloud/basecrawls.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,11 @@
from .webhooks import EventWebhookOps
from .background_jobs import BackgroundJobOps
from .pages import PageOps
from .crawl_logs import CrawlLogOps

else:
CrawlConfigOps = UserManager = OrgOps = CollectionOps = PageOps = object
StorageOps = EventWebhookOps = BackgroundJobOps = object
StorageOps = EventWebhookOps = BackgroundJobOps = CrawlLogOps = object


# ============================================================================
Expand All @@ -80,6 +81,7 @@ class BaseCrawlOps:
storage_ops: StorageOps
event_webhook_ops: EventWebhookOps
background_job_ops: BackgroundJobOps
crawl_log_ops: CrawlLogOps
page_ops: PageOps

def __init__(
Expand All @@ -92,6 +94,7 @@ def __init__(
storage_ops: StorageOps,
event_webhook_ops: EventWebhookOps,
background_job_ops: BackgroundJobOps,
crawl_log_ops: CrawlLogOps,
):
self.crawls = mdb["crawls"]
self.presigned_urls = mdb["presigned_urls"]
Expand All @@ -102,6 +105,7 @@ def __init__(
self.storage_ops = storage_ops
self.event_webhook_ops = event_webhook_ops
self.background_job_ops = background_job_ops
self.crawl_log_ops = crawl_log_ops
self.page_ops = cast(PageOps, None)

def set_page_ops(self, page_ops):
Expand Down Expand Up @@ -421,6 +425,7 @@ async def delete_crawls(
)

await self.page_ops.delete_crawl_pages(crawl_id, org.id)
await self.crawl_log_ops.delete_crawl_logs(crawl_id, org.id)

if crawl.collectionIds:
for coll_id in crawl.collectionIds:
Expand Down
10 changes: 10 additions & 0 deletions backend/btrixcloud/crawl_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,3 +179,13 @@ async def get_crawl_logs(
log_lines = [CrawlLogLine.from_dict(res) for res in items]

return log_lines, total

async def delete_crawl_logs(
    self, crawl_id: str, oid: UUID, qa_run_id: Optional[str] = None
):
    """Delete all stored log lines for one crawl within an org.

    When qa_run_id is given, only log lines recorded for that QA run
    are removed; otherwise every log line for the crawl is deleted.
    Returns the result of the underlying delete_many call.
    """
    match_query: dict[str, str | UUID] = (
        {"crawlId": crawl_id, "oid": oid, "qaRunId": qa_run_id}
        if qa_run_id
        else {"crawlId": crawl_id, "oid": oid}
    )
    return await self.logs.delete_many(match_query)
9 changes: 3 additions & 6 deletions backend/btrixcloud/crawls.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@
)
from .basecrawls import BaseCrawlOps
from .crawlmanager import CrawlManager
from .crawl_logs import CrawlLogOps
from .models import (
ListFilterType,
UpdateCrawl,
Expand Down Expand Up @@ -102,12 +101,10 @@ class CrawlOps(BaseCrawlOps):
def __init__(
self,
crawl_manager: CrawlManager,
log_ops: CrawlLogOps,
*args,
):
super().__init__(*args)
self.crawl_manager = crawl_manager
self.log_ops = log_ops
self.crawl_configs.set_crawl_ops(self)
self.colls.set_crawl_ops(self)
self.event_webhook_ops.set_crawl_ops(self)
Expand Down Expand Up @@ -1076,6 +1073,7 @@ async def delete_crawl_qa_runs(
for qa_run_id in delete_list.qa_run_ids:
await self.page_ops.delete_qa_run_from_pages(crawl_id, qa_run_id)
await self.delete_crawl_qa_run_files(crawl_id, qa_run_id, org)
await self.crawl_log_ops.delete_crawl_logs(crawl_id, org.id, qa_run_id)

res = await self.crawls.find_one_and_update(
{"_id": crawl_id, "type": "crawl"},
Expand Down Expand Up @@ -1258,7 +1256,7 @@ async def get_crawl_logs(
qa_run_id: Optional[str] = None,
) -> Tuple[list[CrawlLogLine], int]:
"""get crawl logs"""
return await self.log_ops.get_crawl_logs(
return await self.crawl_log_ops.get_crawl_logs(
org,
crawl_id,
page_size=page_size,
Expand Down Expand Up @@ -1364,15 +1362,14 @@ async def recompute_crawl_file_count_and_size(crawls, crawl_id: str):
# pylint: disable=too-many-arguments, too-many-locals, too-many-statements
def init_crawls_api(
crawl_manager: CrawlManager,
crawl_log_ops: CrawlLogOps,
app,
user_dep,
*args,
):
"""API for crawl management, including crawl done callback"""
# pylint: disable=invalid-name, duplicate-code

ops = CrawlOps(crawl_manager, crawl_log_ops, *args)
ops = CrawlOps(crawl_manager, *args)

org_viewer_dep = ops.orgs.org_viewer_dep
org_crawl_dep = ops.orgs.org_crawl_dep
Expand Down
2 changes: 1 addition & 1 deletion backend/btrixcloud/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
) = object


CURR_DB_VERSION = "0055"
CURR_DB_VERSION = "0056"

MIN_DB_VERSION = 7.0

Expand Down
3 changes: 2 additions & 1 deletion backend/btrixcloud/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,11 +259,12 @@ def main() -> None:
storage_ops,
event_webhook_ops,
background_job_ops,
crawl_log_ops,
)

base_crawl_ops = init_base_crawls_api(*base_crawl_init)

crawls = init_crawls_api(crawl_manager, crawl_log_ops, *base_crawl_init)
crawls = init_crawls_api(crawl_manager, *base_crawl_init)

upload_ops = init_uploads_api(*base_crawl_init)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
"""
Migration 0056 - Remove logs for deleted crawls
"""

from motor.motor_asyncio import AsyncIOMotorDatabase

from btrixcloud.migrations import BaseMigration


MIGRATION_VERSION = "0056"


class Migration(BaseMigration):
    """Migration class."""

    # pylint: disable=unused-argument
    def __init__(self, mdb: AsyncIOMotorDatabase, **kwargs):
        super().__init__(mdb, migration_version=MIGRATION_VERSION)

    async def migrate_up(self):
        """Perform migration up.

        Delete crawl logs from database for crawls and orgs that
        have since been deleted, as well as for QA runs that were
        deleted for still-existing crawls.
        """
        # pylint: disable=duplicate-code, too-many-locals
        crawl_logs_mdb = self.mdb["crawl_logs"]
        crawls_mdb = self.mdb["crawls"]

        await self._delete_logs_for_deleted_crawls(crawl_logs_mdb, crawls_mdb)
        await self._delete_logs_for_deleted_qa_runs(crawl_logs_mdb, crawls_mdb)

    async def _delete_logs_for_deleted_crawls(self, crawl_logs_mdb, crawls_mdb):
        """Delete log lines whose crawl id no longer exists in the crawls collection."""
        crawl_logs_to_delete: list[str] = []

        log_crawl_ids = await crawl_logs_mdb.distinct("crawlId", {})

        crawl_count = len(log_crawl_ids)
        index = 0

        for crawl_id in log_crawl_ids:
            index += 1
            # Guard against null/empty ids: a None inside the $in list below
            # would match documents where crawlId is missing or null (MongoDB
            # null-equality semantics), deleting logs we intend to keep
            if not crawl_id:
                continue
            res = await crawls_mdb.find_one({"_id": crawl_id})
            if res is None:
                crawl_logs_to_delete.append(crawl_id)

            if index % 100 == 0:
                print(
                    f"Checked {index} of {crawl_count} crawls for logs to delete",
                    flush=True,
                )

        if crawl_logs_to_delete:
            del_count = len(crawl_logs_to_delete)
            print(
                f"Checked {index} crawls, deleting logs for {del_count} deleted crawls",
                flush=True,
            )

            try:
                res = await crawl_logs_mdb.delete_many(
                    {"crawlId": {"$in": crawl_logs_to_delete}}
                )
                print(f"Deleted {res.deleted_count} crawl log lines", flush=True)
            # pylint: disable=broad-exception-caught
            except Exception as err:
                print(
                    f"Error deleting crawl logs from deleted crawls: {err}", flush=True
                )

    async def _delete_logs_for_deleted_qa_runs(self, crawl_logs_mdb, crawls_mdb):
        """Delete QA-run log lines whose QA run is no longer on any crawl's qaFinished."""
        qa_run_logs_to_delete: list[str] = []

        # Only consider non-null qaRunIds: log lines from regular (non-QA)
        # crawls carry a null qaRunId, and letting None into the $in delete
        # filter below would match every one of them (MongoDB matches
        # missing-or-null fields against null in $in)
        log_qa_run_ids = await crawl_logs_mdb.distinct(
            "qaRunId", {"qaRunId": {"$ne": None}}
        )

        qa_run_count = len(log_qa_run_ids)
        qa_index = 0

        for qa_run_id in log_qa_run_ids:
            qa_index += 1
            # Defensive: skip any falsy id that slipped through the query above
            if not qa_run_id:
                continue
            res = await crawls_mdb.find_one(
                {f"qaFinished.{qa_run_id}": {"$exists": True}}
            )
            if res is None:
                qa_run_logs_to_delete.append(qa_run_id)

            if qa_index % 100 == 0:
                print(
                    f"Checked {qa_index} of {qa_run_count} QA runs for logs to delete",
                    flush=True,
                )

        if qa_run_logs_to_delete:
            qa_del_count = len(qa_run_logs_to_delete)
            print(
                f"Checked {qa_index} QA runs, deleting logs for {qa_del_count} deleted runs",
                flush=True,
            )

            try:
                res = await crawl_logs_mdb.delete_many(
                    {"qaRunId": {"$in": qa_run_logs_to_delete}}
                )
                print(f"Deleted {res.deleted_count} QA run log lines", flush=True)
            # pylint: disable=broad-exception-caught
            except Exception as err:
                print(f"Error deleting logs from deleted QA runs: {err}", flush=True)
3 changes: 2 additions & 1 deletion backend/btrixcloud/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,11 +99,12 @@ def init_ops() -> Tuple[
storage_ops,
event_webhook_ops,
background_job_ops,
crawl_log_ops,
)

base_crawl_ops = BaseCrawlOps(*base_crawl_init)

crawl_ops = CrawlOps(crawl_manager, crawl_log_ops, *base_crawl_init)
crawl_ops = CrawlOps(crawl_manager, *base_crawl_init)

upload_ops = UploadOps(*base_crawl_init)

Expand Down
Loading