From d41d4535db3b303df9c0f64b94ee39e83550ce0c Mon Sep 17 00:00:00 2001 From: Ksenia Berezina Date: Mon, 23 Mar 2026 21:21:19 -0400 Subject: [PATCH 1/3] Add backfill job for missing ML classification and translations --- server/reportmanager/cron.py | 5 + .../commands/backfill_missing_report_data.py | 196 ++++++++++++++++++ .../commands/import_reports_from_bigquery.py | 15 +- server/reportmanager/utils.py | 33 ++- server/server/settings.py | 4 + tests/test_utils.py | 32 ++- 6 files changed, 270 insertions(+), 15 deletions(-) create mode 100644 server/reportmanager/management/commands/backfill_missing_report_data.py diff --git a/server/reportmanager/cron.py b/server/reportmanager/cron.py index a5e2667a..b5a456eb 100644 --- a/server/reportmanager/cron.py +++ b/server/reportmanager/cron.py @@ -102,3 +102,8 @@ def import_reports(): ) call_command("import_reports_from_bigquery", since=since) + + +@app.task(ignore_result=True) +def backfill_missing_report_data(): + call_command("backfill_missing_report_data") diff --git a/server/reportmanager/management/commands/backfill_missing_report_data.py b/server/reportmanager/management/commands/backfill_missing_report_data.py new file mode 100644 index 00000000..50c8c506 --- /dev/null +++ b/server/reportmanager/management/commands/backfill_missing_report_data.py @@ -0,0 +1,196 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +"""Backfill missing ML classifications and translations from BigQuery. + +This command queries BigQuery for ML classification results and translations +that are missing from the database and updates ReportEntry records accordingly. + +Background +---------- +The broken_site_report_ml ETL job in docker-etl performs two operations: +1. Gets ML classification from bugbug for each report +2. 
Translates reports using ML.TRANSLATE + +However, some reports in the local database may be missing this data after +we import them with import_reports_from_bigquery due to failures in the ETL pipeline, +e.g. bugbug not returning classification results or the job being stopped +for whatever reason. + +By the time the ETL job receives the results, reports might already +be imported into the dashboard DB. This backfill job periodically queries +BigQuery for missing data and updates the local database. + +Impact on Clustering +-------------------- +Reports with ml_valid_probability=NULL are excluded from clustering entirely +and assigned to domain-based buckets. + +When reports receive new ML classifications or translations, they need to be re-triaged. +All reports receiving updates have their bucket_id cleared so triage_new_reports can +reassign them to the proper cluster-based or domain-based bucket. + +Note: this backfill job only selects reports with a missing ML classification, +not reports with a missing translation, to fetch updates for. It's possible that ML.TRANSLATE is unable +to translate text, but that is rather an edge case and mainly happens because the text +is too long (e.g. entire HTML page contents) or contains unprocessable characters. +Once a missing classification is received, the job also checks whether the translation +was missing and updates it; however, a missing ML classification is the +deciding factor for updates. 
+ +""" + +from dataclasses import dataclass +from itertools import batched +from logging import getLogger + +from django.conf import settings +from django.core.management import BaseCommand +from google.cloud import bigquery +from google.oauth2 import service_account + +from reportmanager.models import ReportEntry +from reportmanager.utils import preprocess_text, transform_ml_label + +LOG = getLogger("reportmanager.backfill") + + +@dataclass +class BackfillData: + ml_valid_probability: float | None + language_code: str | None + translated_text: str | None + + +class Command(BaseCommand): + help = "Backfill missing ML classification and translations from BigQuery" + + BATCH_SIZE = 500 + + def handle(self, *args, **options) -> None: + # Find reports needing ML updates (only those with non-empty comments) + reports_to_update = ReportEntry.objects.filter( + ml_valid_probability__isnull=True, comments__isnull=False + ).exclude(comments="") + + total_reports = reports_to_update.count() + LOG.info("Found %d reports needing ML backfill", total_reports) + + if total_reports == 0: + LOG.info("No reports need ML backfill") + return + + all_reports = list(reports_to_update) + batches = list(batched(all_reports, self.BATCH_SIZE)) + total_updated: int = 0 + + params = { + "project": settings.BIGQUERY_PROJECT, + } + + if svc_acct := getattr(settings, "BIGQUERY_SERVICE_ACCOUNT", None): + params["credentials"] = ( + service_account.Credentials.from_service_account_info(svc_acct) + ) + + client: bigquery.Client = bigquery.Client(**params) + + for batch_num, report_batch in enumerate(batches, 1): + LOG.info( + "Processing batch %d (total %d reports)...", + batch_num, + len(report_batch), + ) + + uuid_batch: list[str] = [str(report.uuid) for report in report_batch] + + query: str = f""" + SELECT r.uuid, + c.label as ml_label, c.probability as ml_probability, + t.language_code, t.translated_text + FROM `{settings.BIGQUERY_TABLE}` as r + INNER JOIN 
`{settings.BIGQUERY_CLASSIFICATION_TABLE}` c + ON r.uuid = c.report_uuid + LEFT JOIN `{settings.BIGQUERY_TRANSLATIONS_TABLE}` t + ON r.uuid = t.report_uuid + WHERE r.uuid IN UNNEST(@uuids) + """ + + job_config = bigquery.QueryJobConfig( + query_parameters=[ + bigquery.ArrayQueryParameter("uuids", "STRING", uuid_batch) + ] + ) + + result = client.query(query, job_config=job_config) + + bq_data: dict[str, BackfillData] = {} + for row in result: + ml_valid_probability = transform_ml_label( + row.ml_label, row.ml_probability + ) + bq_data[row.uuid] = BackfillData( + ml_valid_probability=ml_valid_probability, + language_code=row.language_code, + translated_text=row.translated_text, + ) + + LOG.info("Fetched data for %d reports from BigQuery", len(bq_data)) + + if not bq_data: + continue + + reports_to_update: list[ReportEntry] = [] + + for report in report_batch: + uuid = str(report.uuid) + + if uuid in bq_data: + data = bq_data[uuid] + ml_updated = False + translation_updated = False + + if ( + report.ml_valid_probability is None + and data.ml_valid_probability is not None + ): + report.ml_valid_probability = data.ml_valid_probability + ml_updated = True + + if ( + report.comments_translated is None + and data.translated_text is not None + ): + report.comments_translated = data.translated_text + report.comments_original_language = data.language_code + report.comments_preprocessed = preprocess_text( + data.translated_text + ) + translation_updated = True + + if ml_updated or translation_updated: + reports_to_update.append(report) + + # Clear bucket assignment to re-triage these reports + if report.cluster_id is None: + report.bucket_id = None + + if reports_to_update: + ReportEntry.objects.bulk_update( + reports_to_update, + [ + "ml_valid_probability", + "comments_translated", + "comments_original_language", + "comments_preprocessed", + "bucket_id", + ], + ) + total_updated += len(reports_to_update) + LOG.info( + "Updated %d reports in batch (cleared buckets for 
re-triaging)", + len(reports_to_update), + ) + + LOG.info("Backfill complete: %d reports updated", total_updated) diff --git a/server/reportmanager/management/commands/import_reports_from_bigquery.py b/server/reportmanager/management/commands/import_reports_from_bigquery.py index 7256c168..f6f8d920 100644 --- a/server/reportmanager/management/commands/import_reports_from_bigquery.py +++ b/server/reportmanager/management/commands/import_reports_from_bigquery.py @@ -14,6 +14,7 @@ from google.oauth2 import service_account from reportmanager.models import ReportEntry +from reportmanager.utils import transform_ml_label from webcompat.models import Report LOG = getLogger("reportmanager.import") @@ -57,19 +58,7 @@ def handle(self, *args, **options): ) for row in result: - # The BugBot ML prediction can assign two labels, invalid or valid, - # with a probability between 0 and 1. Having two labels makes - # filtering and sorting harder, so let's transform "invalid 95%" - # into "valid 5%". - # There is a rare chance that a bug will have no score. In this case, - # we just assign None, which will get treated as invalid in the - # frontend. 
- ml_valid_probability = None - match row.ml_label: - case "invalid": - ml_valid_probability = 1 - row.ml_probability - case "valid": - ml_valid_probability = row.ml_probability + ml_valid_probability = transform_ml_label(row.ml_label, row.ml_probability) report_obj = Report( app_name=row.app_name, diff --git a/server/reportmanager/utils.py b/server/reportmanager/utils.py index 9a54e1e9..94fa07a1 100644 --- a/server/reportmanager/utils.py +++ b/server/reportmanager/utils.py @@ -5,7 +5,7 @@ import re -def preprocess_text(text): +def preprocess_text(text: str | None) -> str: if not text or text == "": return "" @@ -13,3 +13,34 @@ def preprocess_text(text): text = str(text).strip() text = re.sub(r"\s+", " ", text) return text + + +def transform_ml_label( + ml_label: str | None, ml_probability: float | None +) -> float | None: + """Transform ML label and probability into a valid probability. + + The BugBot ML prediction can assign two labels, "invalid" or "valid", + with a probability between 0 and 1. Having two labels makes filtering + and sorting harder, so we transform "invalid 95%" into "valid 5%". + + There is a chance that a bug will have no label and score. In this case, + we just assign None, which will get treated as invalid in the + frontend. 
+ + Args: + ml_label: The ML label ("invalid" or "valid"), or None if missing + ml_probability: The probability value (0-1), or None if missing + + Returns: + The probability that the report is valid, or None if label is unknown + """ + ml_valid_probability: float | None = None + match ml_label: + case "invalid": + ml_valid_probability = ( + 1 - ml_probability if ml_probability is not None else None + ) + case "valid": + ml_valid_probability = ml_probability + return ml_valid_probability diff --git a/server/server/settings.py b/server/server/settings.py index 15e7c8bd..05ddbdc4 100644 --- a/server/server/settings.py +++ b/server/server/settings.py @@ -315,6 +315,10 @@ def resolver_context_processor(request): "task": "reportmanager.cron.unhide_buckets", "schedule": 60, }, + "Backfill missing report data evry 12 hours": { + "task": "reportmanager.cron.backfill_missing_report_data", + "schedule": 60 * 60 * 12, + }, } # Email diff --git a/tests/test_utils.py b/tests/test_utils.py index ac5f3bea..1f9d760b 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,8 @@ """Tests for reportmanager.utils.""" -from reportmanager.utils import preprocess_text +import pytest + +from reportmanager.utils import preprocess_text, transform_ml_label class TestPreprocessText: @@ -83,3 +85,31 @@ def test_combined_transformations(self): # Multiple issues in one string input_text = "\t The & symbol is\nescaped " assert preprocess_text(input_text) == "The & symbol is escaped" + + +class TestTransformMLLabel: + """Tests for transform_ml_label function.""" + + def test_valid_label_with_high_probability(self): + """Test transformation of 'valid' label with high probability.""" + assert transform_ml_label("valid", 0.95) == 0.95 + + def test_valid_label_with_low_probability(self): + """Test transformation of 'valid' label with mid probability.""" + assert transform_ml_label("valid", 0.53) == 0.53 + + def test_invalid_label_with_high_probability(self): + """Test transformation of 
'invalid' label with high probability.""" + assert transform_ml_label("invalid", 0.95) == pytest.approx(0.05) + + def test_invalid_label_mid_probability(self): + """Test transformation of 'invalid' label with mid probability.""" + assert transform_ml_label("invalid", 0.6) == pytest.approx(0.4) + + def test_none_label_returns_none(self): + """Test transformation of None label returns None.""" + assert transform_ml_label(None, 0.5) is None + + def test_empty_string_label_returns_none(self): + """Test transformation of empty string label returns None.""" + assert transform_ml_label("", 0.5) is None From 3fed8be3ad0cdc4d418e821177caa625e6a534d6 Mon Sep 17 00:00:00 2001 From: Ksenia Berezina Date: Tue, 24 Mar 2026 13:51:05 -0400 Subject: [PATCH 2/3] Add joblock for backfill job --- .../commands/backfill_missing_report_data.py | 11 ++++++++++- server/reportmanager/migrations/0016_joblock.py | 2 +- server/reportmanager/models.py | 1 + 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/server/reportmanager/management/commands/backfill_missing_report_data.py b/server/reportmanager/management/commands/backfill_missing_report_data.py index 50c8c506..78d311a1 100644 --- a/server/reportmanager/management/commands/backfill_missing_report_data.py +++ b/server/reportmanager/management/commands/backfill_missing_report_data.py @@ -50,7 +50,8 @@ from google.cloud import bigquery from google.oauth2 import service_account -from reportmanager.models import ReportEntry +from reportmanager.locking import JobLockError, acquire_job_lock +from reportmanager.models import JobLock, ReportEntry from reportmanager.utils import preprocess_text, transform_ml_label LOG = getLogger("reportmanager.backfill") @@ -69,6 +70,14 @@ class Command(BaseCommand): BATCH_SIZE = 500 def handle(self, *args, **options) -> None: + try: + with acquire_job_lock(JobLock.LockTypes.BACKFILL): + self.run_backfill() + except JobLockError as e: + LOG.warning(f"Cannot start backfill: {e}") + return + + def 
run_backfill(self) -> None: # Find reports needing ML updates (only those with non-empty comments) reports_to_update = ReportEntry.objects.filter( ml_valid_probability__isnull=True, comments__isnull=False diff --git a/server/reportmanager/migrations/0016_joblock.py b/server/reportmanager/migrations/0016_joblock.py index 6ea00717..71969c95 100644 --- a/server/reportmanager/migrations/0016_joblock.py +++ b/server/reportmanager/migrations/0016_joblock.py @@ -31,7 +31,7 @@ class Migration(migrations.Migration): fields=[ ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), ('singleton_key', models.PositiveSmallIntegerField(default=1, editable=False, help_text='Singleton key constrained to value 1 by check constraint', unique=True)), - ('lock_name', models.CharField(blank=True, choices=[('clustering', 'Clustering'), ('cleanup', 'Cleanup')], help_text='Name of operation holding the lock', max_length=50)), + ('lock_name', models.CharField(blank=True, choices=[('clustering', 'Clustering'), ('cleanup', 'Cleanup'), ('backfill', 'Backfill')], help_text='Name of operation holding the lock', max_length=50)), ('acquired_at', models.DateTimeField(blank=True, null=True)), ('acquired_by', models.CharField(blank=True, help_text='hostname:pid of process holding lock', max_length=255)), ], diff --git a/server/reportmanager/models.py b/server/reportmanager/models.py index b5ef33af..07d01e5a 100644 --- a/server/reportmanager/models.py +++ b/server/reportmanager/models.py @@ -536,6 +536,7 @@ class JobLock(models.Model): class LockTypes(models.TextChoices): CLUSTERING = "clustering", "Clustering" CLEANUP = "cleanup", "Cleanup" + BACKFILL = "backfill", "Backfill" # Locks older than 3 hours are considered stale STALE_LOCK_HOURS = 3 From bcf08fa645e7738ecdb8578aa860c593ad9edbf8 Mon Sep 17 00:00:00 2001 From: Ksenia Berezina Date: Tue, 24 Mar 2026 19:38:07 -0400 Subject: [PATCH 3/3] Code review changes --- 
.../commands/backfill_missing_report_data.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/server/reportmanager/management/commands/backfill_missing_report_data.py b/server/reportmanager/management/commands/backfill_missing_report_data.py index 78d311a1..4f7d34ea 100644 --- a/server/reportmanager/management/commands/backfill_missing_report_data.py +++ b/server/reportmanager/management/commands/backfill_missing_report_data.py @@ -67,7 +67,8 @@ class BackfillData: class Command(BaseCommand): help = "Backfill missing ML classification and translations from BigQuery" - BATCH_SIZE = 500 + BQ_BATCH_SIZE = 5000 + DB_BATCH_SIZE = 1000 def handle(self, *args, **options) -> None: try: @@ -84,14 +85,15 @@ def run_backfill(self) -> None: ).exclude(comments="") total_reports = reports_to_update.count() - LOG.info("Found %d reports needing ML backfill", total_reports) if total_reports == 0: LOG.info("No reports need ML backfill") return + LOG.info("Found %d reports needing ML backfill", total_reports) + all_reports = list(reports_to_update) - batches = list(batched(all_reports, self.BATCH_SIZE)) + batches = list(batched(all_reports, self.BQ_BATCH_SIZE)) total_updated: int = 0 params = { @@ -157,15 +159,14 @@ def run_backfill(self) -> None: if uuid in bq_data: data = bq_data[uuid] - ml_updated = False - translation_updated = False + updated = False if ( report.ml_valid_probability is None and data.ml_valid_probability is not None ): report.ml_valid_probability = data.ml_valid_probability - ml_updated = True + updated = True if ( report.comments_translated is None @@ -176,9 +177,9 @@ def run_backfill(self) -> None: report.comments_preprocessed = preprocess_text( data.translated_text ) - translation_updated = True + updated = True - if ml_updated or translation_updated: + if updated: reports_to_update.append(report) # Clear bucket assignment to re-triage these reports @@ -195,6 +196,7 @@ def run_backfill(self) -> None: 
"comments_preprocessed", "bucket_id", ], + batch_size=self.DB_BATCH_SIZE, ) total_updated += len(reports_to_update) LOG.info(