Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion collectoss/application/cli/_cli_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import psutil
import signal
from urllib.parse import urlparse
import sqlalchemy as s

from collectoss.tasks.init.redis_connection import get_redis_connection

Expand Down Expand Up @@ -69,4 +70,21 @@ def raise_open_file_limit(num_files):

resource.setrlimit(resource.RLIMIT_NOFILE, (num_files, current_hard))

return
return

def get_db_version(engine):
""" a helper function to return the database version
used in print_db_version and upgrade_db_version in the db cli module
"""

db_version_sql = s.sql.text(
"""
SELECT * FROM augur_operations.augur_settings WHERE setting = 'augur_data_version'
"""
)

with engine.connect() as connection:
result = int(connection.execute(db_version_sql).fetchone()[2])

engine.dispose()
return result
17 changes: 2 additions & 15 deletions collectoss/application/cli/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
process_repo_group_csv,
)

from ._cli_util import get_db_version
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[pylint] reported by reviewdog 🐶
W0611: Unused get_db_version imported from _cli_util (unused-import)


logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -258,21 +260,6 @@ def add_github_org(ctx, organization_name):
controller.add_cli_org(organization_name)


# get_db_version is a helper function to print_db_version and upgrade_db_version
def get_db_version(engine):
db_version_sql = s.sql.text(
"""
SELECT * FROM augur_operations.augur_settings WHERE setting = 'augur_data_version'
"""
)

with engine.connect() as connection:
result = int(connection.execute(db_version_sql).fetchone()[2])

engine.dispose()
return result


@cli.command("print-db-version")
@test_connection
@test_db_connection
Expand Down
171 changes: 171 additions & 0 deletions collectoss/application/cli/selftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
# SPDX-License-Identifier: MIT
import logging
from typing import Union
import click
from sqlalchemy import Constraint, ForeignKeyConstraint, Index, select, func, Table

from collectoss.application.cli import (
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[pylint] reported by reviewdog 🐶
W0611: Unused test_connection imported from collectoss.application.cli (unused-import)

test_db_connection,
with_database,
DatabaseContext,
)
from collectoss.application.db.models.augur_data import Commit

from alembic.config import Config
from alembic import command
from alembic.util.exc import AutogenerateDiffsDetected


logger = logging.getLogger(__name__)


@click.group("selftest", short_help="CollectOSS self-testing utilities")
@click.pass_context
def cli(ctx):
ctx.obj = DatabaseContext()


def check_database_parity_with_versioned_model() -> list:
"""Checks for alignent between the versioned SQLAlchemy models and the actual database state

Returns:
list: a list of diff entries in alembic's format
"""
alembic_cfg = Config("alembic.ini")
try:
# 2. Call the 'check' command
# This will raise an AutogenerateDiffsDetected error if
# the database is out of sync with the models.
command.check(alembic_cfg)
return []


except AutogenerateDiffsDetected as e:
return list(e.diffs)


def humanize_alembic_diff(diff: list[Union[tuple, list]], debug=False) -> list[str]:
"""transform an alembic diff for printing in a more intuitive way

Args:
diff (list): A diff list created by Alembic

Returns:
list[str]: A list of findings as strings
"""
output: list[str] = []

if len(diff) == 0:
return output

# Alembic diffs use their own bespoke format, documented in https://alembic.sqlalchemy.org/en/latest/api/autogenerate.html#alembic.autogenerate.compare_metadata
for item in diff:
# coerce every item into a mini-diff list to make processing easier
changes_to_process = item if isinstance(item, list) else [item]

for change in changes_to_process:
finding = ""

if debug:
# its not the most ideal to do this here rather than bypassing the whole function,
# but debug mode still benefits from the type coercion above
finding = str(change)
output.append(finding)

continue

action, category = change[0].split("_")

action_text = ""

if action == "add":
action_text = "found in models but not database"
elif action == "modify":
action_text = "modified"
elif action == "remove":
action_text = "found in database but not models"


if category == "table":
table_def:Table = change[1]
finding += f"Table {table_def.name} in schema {table_def.schema}"

elif category == "column":
_, _, table_name, col_def = change
finding += f"Column {col_def.name} in table {table_name}"

elif category == "fk":
fk_def:ForeignKeyConstraint = change[1]
fk_name = f" called '{fk_def.name}'" if fk_def.name else ""
finding += f"Foreign Key{fk_name} on `{fk_def.table.columns[0]}` linking to `{fk_def.table}`"

attrs = []
if fk_def.onupdate:
attrs.append(f"ON UPDATE {fk_def.onupdate}")
if fk_def.ondelete:
attrs.append(f"ON DELETE {fk_def.ondelete}")
if fk_def.deferrable:
attrs.append("DEFERRABLE")
if fk_def.initially:
attrs.append(f"INITIALLY {fk_def.initially}")

if attrs:
finding += f"\n\tRules: {' | '.join(attrs)}"
elif category == "index":
index_def:Index = change[1]
cols = [c.name for c in index_def.columns]
finding += f"Index {index_def.name} on '{index_def.table}' covering {','.join(cols)}"

elif category == "constraint":
constraint_def:Constraint = change[1]
constraint_name = f" called '{constraint_def.name}'" if constraint_def.name else ""
constraint_type = constraint_def.__class__.__name__ or "Constraint"
constraint_covers = [c.name for c in constraint_def.columns]
finding += f"{constraint_type}{constraint_name} covering {','.join(constraint_covers)}"

elif category == "nullable":
_, schema, table_name, col_name, _, old, new = change
print(f"Column {col_name} in table {table_name} nullable changed: {old} -> {new}")

else:
finding += "unknown type found in alembic diff:\n"
finding += str(change)

finding += f"\n\t{action_text}"
output.append(finding)

return output


@cli.command("report")
@click.option("--debug", is_flag=True, default=False, help="Show more detailed information for debuging purposes")
@test_db_connection
@with_database
@click.pass_context
def run_selftest_report(ctx, debug):
"""
Run queries to evaluate various aspects of the collectoss system's functioning and produce a report
"""
click.echo('Generating CollectOSS selftest report....')

model_diffs = check_database_parity_with_versioned_model()

cmt_author_name_issue_233_query = (
select(func.count())
.select_from(Commit)
.where(Commit.cmt_author_name == '')
)
cmt_author_name_issue_233_count = None

with ctx.obj.engine.begin() as connection:
cmt_author_name_issue_233_count = connection.execute(cmt_author_name_issue_233_query).scalar_one()
click.echo(f'Issue 233 count: {cmt_author_name_issue_233_count} commit changes in the `commits` table contain authors with an empty string as their name')

model_diff_findings = humanize_alembic_diff(model_diffs, debug=debug)

if len(model_diff_findings) == 0:
print("✅ No drift detected. Database is in sync with models.")
else:
print(f"❌ Database drift detected:")
for f in model_diff_findings:
print(f)
Loading