From 95fab8551e6114d4a90441c904f83767f39bdbfa Mon Sep 17 00:00:00 2001 From: Benjamin Capodanno Date: Fri, 6 Mar 2026 13:38:02 -0800 Subject: [PATCH 1/2] fix: capture return value of query filter builder in search count query The count query in `search_score_sets` was discarding the return value of `build_search_score_sets_query_filter`, so the count would reflect all score sets in the database rather than only those matching the search filters. --- src/mavedb/lib/score_sets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mavedb/lib/score_sets.py b/src/mavedb/lib/score_sets.py index 134457f7..27f611fc 100644 --- a/src/mavedb/lib/score_sets.py +++ b/src/mavedb/lib/score_sets.py @@ -292,7 +292,7 @@ def search_score_sets(db: Session, owner_or_contributor: Optional[User], search: # query. score_sets = score_sets[: search.limit] count_query = db.query(ScoreSet) - build_search_score_sets_query_filter(db, count_query, owner_or_contributor, search) + count_query = build_search_score_sets_query_filter(db, count_query, owner_or_contributor, search) num_score_sets = count_query.order_by(None).limit(None).count() save_to_logging_context({"matching_resources": num_score_sets}) From 6c77ef0b9640f01bf9e6aa091b8900c1321c33ce Mon Sep 17 00:00:00 2001 From: Benjamin Capodanno Date: Fri, 6 Mar 2026 13:45:31 -0800 Subject: [PATCH 2/2] fix: use selectinload for experiment relationships in score set search Switch one-to-many experiment relationship loading (keyword_objs, doi_identifiers, publication_identifier_associations, raw_read_identifiers) from joinedload to selectinload inside the contains_eager block. This prevents row multiplication from causing the SQL LIMIT to apply to multiplied rows rather than unique score sets, which resulted in search returning fewer results than expected on databases with rich experiment metadata. 
--- src/mavedb/lib/score_sets.py | 16 +++++++++--- tests/routers/test_score_set.py | 46 +++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/src/mavedb/lib/score_sets.py b/src/mavedb/lib/score_sets.py index 27f611fc..d6fa605f 100644 --- a/src/mavedb/lib/score_sets.py +++ b/src/mavedb/lib/score_sets.py @@ -238,18 +238,26 @@ def search_score_sets(db: Session, owner_or_contributor: Optional[User], search: score_sets: list[ScoreSet] = ( query.join(ScoreSet.experiment) .options( + # Use selectinload for one-to-many experiment relationships to avoid row + # multiplication in the main query. joinedload would LEFT OUTER JOIN these + # into the main SQL query, and because they're nested inside contains_eager, + # SQLAlchemy's subquery-wrapping logic doesn't protect the LIMIT clause from + # being applied to multiplied rows rather than unique score sets. This would + # cause the number of returned score sets to be less than the requested limit, + # and the count query would be triggered even when the number of unique score + # sets in the main query results exceeds the limit. 
contains_eager(ScoreSet.experiment).options( joinedload(Experiment.experiment_set), - joinedload(Experiment.keyword_objs).joinedload( + selectinload(Experiment.keyword_objs).joinedload( ExperimentControlledKeywordAssociation.controlled_keyword ), joinedload(Experiment.created_by), joinedload(Experiment.modified_by), - joinedload(Experiment.doi_identifiers), - joinedload(Experiment.publication_identifier_associations).joinedload( + selectinload(Experiment.doi_identifiers), + selectinload(Experiment.publication_identifier_associations).joinedload( ExperimentPublicationIdentifierAssociation.publication ), - joinedload(Experiment.raw_read_identifiers), + selectinload(Experiment.raw_read_identifiers), selectinload(Experiment.score_sets).options( joinedload(ScoreSet.doi_identifiers), joinedload(ScoreSet.publication_identifier_associations).joinedload( diff --git a/tests/routers/test_score_set.py b/tests/routers/test_score_set.py index e170bedd..f412b16a 100644 --- a/tests/routers/test_score_set.py +++ b/tests/routers/test_score_set.py @@ -38,6 +38,7 @@ TEST_BRNICH_SCORE_CALIBRATION_CLASS_BASED, TEST_BRNICH_SCORE_CALIBRATION_RANGE_BASED, TEST_CROSSREF_IDENTIFIER, + TEST_EXPERIMENT_WITH_KEYWORD, TEST_GNOMAD_DATA_VERSION, TEST_INACTIVE_LICENSE, TEST_MAPPED_VARIANT_WITH_HGVS_G_EXPRESSION, @@ -2403,6 +2404,51 @@ def test_search_filter_options_hidden_by_published_superseding_version( assert target_name not in target_names +def test_search_score_sets_reports_correct_total_count_with_limit( + session, data_provider, client, setup_router_db, data_files +): + """When more published score sets exist than the search limit, num_score_sets should reflect the true total.""" + num_score_sets = 3 + for i in range(num_score_sets): + experiment = create_experiment(client, {"title": f"Experiment {i}"}) + score_set = create_seq_score_set(client, experiment["urn"], update={"title": f"Score Set {i}"}) + score_set = mock_worker_variant_insertion(client, session, data_provider, score_set, 
data_files / "scores.csv") + + with patch.object(arq.ArqRedis, "enqueue_job", return_value=None): + publish_score_set(client, score_set["urn"]) + + search_payload = {"limit": 2} + response = client.post("/api/v1/score-sets/search", json=search_payload) + assert response.status_code == 200 + assert len(response.json()["scoreSets"]) == 2 + assert response.json()["numScoreSets"] == num_score_sets + + +def test_search_score_sets_not_affected_by_experiment_metadata( + session, data_provider, client, setup_router_db, data_files +): + """Experiments with multiple keywords should not reduce the number of score sets returned by search. + + This is a regression test for a bug where joinedload on one-to-many experiment relationships caused row + multiplication in the main SQL query. The LIMIT clause was applied to the multiplied rows rather than unique + score sets, resulting in fewer results than expected. + """ + num_score_sets = 3 + for i in range(num_score_sets): + experiment = create_experiment(client, {**TEST_EXPERIMENT_WITH_KEYWORD, "title": f"Experiment {i}"}) + score_set = create_seq_score_set(client, experiment["urn"], update={"title": f"Score Set {i}"}) + score_set = mock_worker_variant_insertion(client, session, data_provider, score_set, data_files / "scores.csv") + + with patch.object(arq.ArqRedis, "enqueue_job", return_value=None): + publish_score_set(client, score_set["urn"]) + + search_payload = {"limit": 2} + response = client.post("/api/v1/score-sets/search", json=search_payload) + assert response.status_code == 200 + assert len(response.json()["scoreSets"]) == 2 + assert response.json()["numScoreSets"] == num_score_sets + + ######################################################################################################################## # Score set deletion ########################################################################################################################