Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions server/reportmanager/cron.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,8 @@ def import_reports():
)

call_command("import_reports_from_bigquery", since=since)


@app.task(ignore_result=True)
def backfill_missing_report_data():
    """Celery task: backfill missing ML classifications/translations.

    Thin wrapper that delegates to the management command of the same name;
    locking and batching are handled inside the command itself.
    """
    call_command("backfill_missing_report_data")
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

"""Backfill missing ML classifications and translations from BigQuery.

This command queries BigQuery for ML classification results and translations
that are missing from the database and updates ReportEntry records accordingly.

Background
----------
The broken_site_report_ml ETL job in docker-etl performs two operations:
1. Gets ML classification from bugbug for each report
2. Translates reports using ML.TRANSLATE

However, some reports in the local database may be missing this data after
we import them with import_reports_from_bigquery due to failures in the ETL pipeline,
i.e. bugbug not returning classification results or the job is stopped
for whatever reason.

By the time the ETL job receives the results, reports already might
be imported into the dashboard DB. This backfill job periodically queries
BigQuery for missing data and updates the local database.

Impact on Clustering
--------------------
Reports with ml_valid_probability=NULL are excluded from clustering entirely
and assigned to domain-based buckets.

When reports receive new ML classifications or translations, they need to be re-triaged.
All reports receiving updates have their bucket_id cleared so triage_new_reports can
reassign them to a proper cluster-based or domain-based bucket.

Note: this backfill job selects only reports with a missing ML classification,
not those with only a missing translation. It's possible that ML.TRANSLATE is unable
to translate text, but it's rather an edge case and mainly happens because text
is too long (i.e. entire html page contents) or contains unprocessable characters.
Once missing classification is received the job also checks if translation
was missing and updates it, however missing ML classification is the
deciding factor for updates.

"""

from dataclasses import dataclass
from itertools import batched
from logging import getLogger

from django.conf import settings
from django.core.management import BaseCommand
from google.cloud import bigquery
from google.oauth2 import service_account

from reportmanager.locking import JobLockError, acquire_job_lock
from reportmanager.models import JobLock, ReportEntry
from reportmanager.utils import preprocess_text, transform_ml_label

LOG = getLogger("reportmanager.backfill")


@dataclass
class BackfillData:
    """Per-report values fetched from BigQuery for backfilling.

    Any field may be None when the corresponding data is still missing
    upstream (classification not produced yet, or translation failed).
    """

    # Probability (0-1) that the report is valid, after folding "invalid"
    # labels via transform_ml_label; None when no classification exists.
    ml_valid_probability: float | None
    # Source-language code reported by ML.TRANSLATE; None when untranslated.
    language_code: str | None
    # Translated report text from ML.TRANSLATE; None when untranslated.
    translated_text: str | None


class Command(BaseCommand):
    """Backfill missing ML classifications and translations from BigQuery.

    Finds ReportEntry rows with no ml_valid_probability, looks them up in
    BigQuery in batches, and writes back any classification/translation
    found. Updated reports have their bucket cleared so the triage job can
    re-bucket them with the fresh ML data.
    """

    help = "Backfill missing ML classification and translations from BigQuery"

    # Number of report UUIDs sent to BigQuery per query. BigQuery tolerates
    # large IN UNNEST arrays; bounded batches keep progress incremental.
    BQ_BATCH_SIZE = 5000
    # Passed to bulk_update so each UPDATE statement stays a manageable size.
    DB_BATCH_SIZE = 1000

    def handle(self, *args, **options) -> None:
        """Run the backfill under the BACKFILL job lock, or skip this run."""
        try:
            with acquire_job_lock(JobLock.LockTypes.BACKFILL):
                self.run_backfill()
        except JobLockError as e:
            # Another worker holds the lock; skipping is the expected outcome.
            LOG.warning("Cannot start backfill: %s", e)

    def run_backfill(self) -> None:
        """Fetch missing ML data from BigQuery and update local reports.

        Only reports with non-empty comments are considered, since the
        upstream ETL only classifies/translates reports that have text.
        """
        # Single fetch; previously the queryset was both .count()ed and
        # list()ed, issuing two queries for the same data.
        pending = list(
            ReportEntry.objects.filter(
                ml_valid_probability__isnull=True, comments__isnull=False
            ).exclude(comments="")
        )

        if not pending:
            LOG.info("No reports need ML backfill")
            return

        LOG.info("Found %d reports needing ML backfill", len(pending))

        params = {"project": settings.BIGQUERY_PROJECT}
        if svc_acct := getattr(settings, "BIGQUERY_SERVICE_ACCOUNT", None):
            params["credentials"] = (
                service_account.Credentials.from_service_account_info(svc_acct)
            )
        client = bigquery.Client(**params)

        total_updated = 0

        # Iterate batches lazily; no need to materialize them all up front.
        for batch_num, report_batch in enumerate(
            batched(pending, self.BQ_BATCH_SIZE), 1
        ):
            LOG.info(
                "Processing batch %d (total %d reports)...",
                batch_num,
                len(report_batch),
            )

            uuid_batch = [str(report.uuid) for report in report_batch]

            # Parameterized query: uuids are passed via ArrayQueryParameter,
            # never interpolated into the SQL string.
            query = f"""
                SELECT r.uuid,
                       c.label as ml_label, c.probability as ml_probability,
                       t.language_code, t.translated_text
                FROM `{settings.BIGQUERY_TABLE}` as r
                INNER JOIN `{settings.BIGQUERY_CLASSIFICATION_TABLE}` c
                    ON r.uuid = c.report_uuid
                LEFT JOIN `{settings.BIGQUERY_TRANSLATIONS_TABLE}` t
                    ON r.uuid = t.report_uuid
                WHERE r.uuid IN UNNEST(@uuids)
            """
            job_config = bigquery.QueryJobConfig(
                query_parameters=[
                    bigquery.ArrayQueryParameter("uuids", "STRING", uuid_batch)
                ]
            )
            result = client.query(query, job_config=job_config)

            bq_data: dict[str, BackfillData] = {}
            for row in result:
                bq_data[row.uuid] = BackfillData(
                    ml_valid_probability=transform_ml_label(
                        row.ml_label, row.ml_probability
                    ),
                    language_code=row.language_code,
                    translated_text=row.translated_text,
                )

            LOG.info("Fetched data for %d reports from BigQuery", len(bq_data))

            if not bq_data:
                continue

            # Renamed from "reports_to_update", which shadowed the queryset
            # name above and made the method harder to follow.
            updated_reports: list[ReportEntry] = []

            for report in report_batch:
                data = bq_data.get(str(report.uuid))
                if data is None:
                    continue

                updated = False

                if (
                    report.ml_valid_probability is None
                    and data.ml_valid_probability is not None
                ):
                    report.ml_valid_probability = data.ml_valid_probability
                    updated = True

                if (
                    report.comments_translated is None
                    and data.translated_text is not None
                ):
                    report.comments_translated = data.translated_text
                    report.comments_original_language = data.language_code
                    report.comments_preprocessed = preprocess_text(
                        data.translated_text
                    )
                    updated = True

                if updated:
                    updated_reports.append(report)
                    # Clear bucket assignment so triage_new_reports can
                    # re-bucket this report using the fresh ML data.
                    # NOTE(review): only reports without a cluster_id are
                    # cleared — confirm clustered reports should keep their
                    # bucket, since the module docstring says all updated
                    # reports are re-triaged.
                    if report.cluster_id is None:
                        report.bucket_id = None

            if updated_reports:
                ReportEntry.objects.bulk_update(
                    updated_reports,
                    [
                        "ml_valid_probability",
                        "comments_translated",
                        "comments_original_language",
                        "comments_preprocessed",
                        "bucket_id",
                    ],
                    batch_size=self.DB_BATCH_SIZE,
                )
                total_updated += len(updated_reports)
                LOG.info(
                    "Updated %d reports in batch (cleared buckets for re-triaging)",
                    len(updated_reports),
                )

        LOG.info("Backfill complete: %d reports updated", total_updated)
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from google.oauth2 import service_account

from reportmanager.models import ReportEntry
from reportmanager.utils import transform_ml_label
from webcompat.models import Report

LOG = getLogger("reportmanager.import")
Expand Down Expand Up @@ -57,19 +58,7 @@ def handle(self, *args, **options):
)

for row in result:
# The BugBot ML prediction can assign two labels, invalid or valid,
# with a probability between 0 and 1. Having two labels makes
# filtering and sorting harder, so let's transform "invalid 95%"
# into "valid 5%".
# There is a rare chance that a bug will have no score. In this case,
# we just assign None, which will get treated as invalid in the
# frontend.
ml_valid_probability = None
match row.ml_label:
case "invalid":
ml_valid_probability = 1 - row.ml_probability
case "valid":
ml_valid_probability = row.ml_probability
ml_valid_probability = transform_ml_label(row.ml_label, row.ml_probability)

report_obj = Report(
app_name=row.app_name,
Expand Down
2 changes: 1 addition & 1 deletion server/reportmanager/migrations/0016_joblock.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class Migration(migrations.Migration):
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('singleton_key', models.PositiveSmallIntegerField(default=1, editable=False, help_text='Singleton key constrained to value 1 by check constraint', unique=True)),
('lock_name', models.CharField(blank=True, choices=[('clustering', 'Clustering'), ('cleanup', 'Cleanup')], help_text='Name of operation holding the lock', max_length=50)),
('lock_name', models.CharField(blank=True, choices=[('clustering', 'Clustering'), ('cleanup', 'Cleanup'), ('backfill', 'Backfill')], help_text='Name of operation holding the lock', max_length=50)),
('acquired_at', models.DateTimeField(blank=True, null=True)),
('acquired_by', models.CharField(blank=True, help_text='hostname:pid of process holding lock', max_length=255)),
],
Expand Down
1 change: 1 addition & 0 deletions server/reportmanager/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -536,6 +536,7 @@ class JobLock(models.Model):
class LockTypes(models.TextChoices):
CLUSTERING = "clustering", "Clustering"
CLEANUP = "cleanup", "Cleanup"
BACKFILL = "backfill", "Backfill"

# Locks older than 3 hours are considered stale
STALE_LOCK_HOURS = 3
Expand Down
33 changes: 32 additions & 1 deletion server/reportmanager/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,42 @@
import re


def preprocess_text(text):
def preprocess_text(text: str | None) -> str:
if not text or text == "":
return ""

text = html.unescape(text)
text = str(text).strip()
text = re.sub(r"\s+", " ", text)
return text


def transform_ml_label(
ml_label: str | None, ml_probability: float | None
) -> float | None:
"""Transform ML label and probability into a valid probability.

The BugBot ML prediction can assign two labels, "invalid" or "valid",
with a probability between 0 and 1. Having two labels makes filtering
and sorting harder, so we transform "invalid 95%" into "valid 5%".

There is a chance that a bug will have no label and score. In this case,
we just assign None, which will get treated as invalid in the
frontend.

Args:
ml_label: The ML label ("invalid" or "valid"), or None if missing
ml_probability: The probability value (0-1), or None if missing

Returns:
The probability that the report is valid, or None if label is unknown
"""
ml_valid_probability: float | None = None
match ml_label:
case "invalid":
ml_valid_probability = (
1 - ml_probability if ml_probability is not None else None
)
case "valid":
ml_valid_probability = ml_probability
return ml_valid_probability
4 changes: 4 additions & 0 deletions server/server/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,10 @@ def resolver_context_processor(request):
"task": "reportmanager.cron.unhide_buckets",
"schedule": 60,
},
"Backfill missing report data evry 12 hours": {
"task": "reportmanager.cron.backfill_missing_report_data",
"schedule": 60 * 60 * 12,
},
}

# Email
Expand Down
32 changes: 31 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
"""Tests for reportmanager.utils."""

from reportmanager.utils import preprocess_text
import pytest

from reportmanager.utils import preprocess_text, transform_ml_label


class TestPreprocessText:
Expand Down Expand Up @@ -83,3 +85,31 @@ def test_combined_transformations(self):
# Multiple issues in one string
input_text = "\t The & symbol is\nescaped "
assert preprocess_text(input_text) == "The & symbol is escaped"


class TestTransformMLLabel:
    """Tests for transform_ml_label function."""

    def test_valid_label_with_high_probability(self):
        """A 'valid' label passes a high probability through unchanged."""
        result = transform_ml_label("valid", 0.95)
        assert result == 0.95

    def test_valid_label_with_low_probability(self):
        """A 'valid' label passes a mid-range probability through unchanged."""
        result = transform_ml_label("valid", 0.53)
        assert result == 0.53

    def test_invalid_label_with_high_probability(self):
        """An 'invalid' label at 0.95 folds into valid at 0.05."""
        result = transform_ml_label("invalid", 0.95)
        assert result == pytest.approx(0.05)

    def test_invalid_label_mid_probability(self):
        """An 'invalid' label at 0.6 folds into valid at 0.4."""
        result = transform_ml_label("invalid", 0.6)
        assert result == pytest.approx(0.4)

    def test_none_label_returns_none(self):
        """A missing label yields no valid-probability."""
        assert transform_ml_label(None, 0.5) is None

    def test_empty_string_label_returns_none(self):
        """An unrecognized (empty) label yields no valid-probability."""
        assert transform_ml_label("", 0.5) is None
Loading