diff --git a/src/mavedb/lib/score_sets.py b/src/mavedb/lib/score_sets.py index 134457f7..d6fa605f 100644 --- a/src/mavedb/lib/score_sets.py +++ b/src/mavedb/lib/score_sets.py @@ -238,18 +238,26 @@ def search_score_sets(db: Session, owner_or_contributor: Optional[User], search: score_sets: list[ScoreSet] = ( query.join(ScoreSet.experiment) .options( + # Use selectinload for one-to-many experiment relationships to avoid row + # multiplication in the main query. joinedload would LEFT OUTER JOIN these + # into the main SQL query, and because they're nested inside contains_eager, + # SQLAlchemy's subquery-wrapping logic doesn't protect the LIMIT clause from + # being applied to multiplied rows rather than unique score sets. This would + # cause the count of returned score sets to be less than the requested limit, + # and the count query would be triggered even when the number of unique score + # sets in the main query results exceeds the limit. contains_eager(ScoreSet.experiment).options( joinedload(Experiment.experiment_set), - joinedload(Experiment.keyword_objs).joinedload( + selectinload(Experiment.keyword_objs).joinedload( ExperimentControlledKeywordAssociation.controlled_keyword ), joinedload(Experiment.created_by), joinedload(Experiment.modified_by), - joinedload(Experiment.doi_identifiers), - joinedload(Experiment.publication_identifier_associations).joinedload( + selectinload(Experiment.doi_identifiers), + selectinload(Experiment.publication_identifier_associations).joinedload( ExperimentPublicationIdentifierAssociation.publication ), - joinedload(Experiment.raw_read_identifiers), + selectinload(Experiment.raw_read_identifiers), selectinload(Experiment.score_sets).options( joinedload(ScoreSet.doi_identifiers), joinedload(ScoreSet.publication_identifier_associations).joinedload( @@ -292,7 +300,7 @@ def search_score_sets(db: Session, owner_or_contributor: Optional[User], search: # query. score_sets = score_sets[: search.limit] count_query = db.query(ScoreSet) - build_search_score_sets_query_filter(db, count_query, owner_or_contributor, search) + count_query = build_search_score_sets_query_filter(db, count_query, owner_or_contributor, search) num_score_sets = count_query.order_by(None).limit(None).count() save_to_logging_context({"matching_resources": num_score_sets}) diff --git a/tests/routers/test_score_set.py b/tests/routers/test_score_set.py index e170bedd..f412b16a 100644 --- a/tests/routers/test_score_set.py +++ b/tests/routers/test_score_set.py @@ -38,6 +38,7 @@ TEST_BRNICH_SCORE_CALIBRATION_CLASS_BASED, TEST_BRNICH_SCORE_CALIBRATION_RANGE_BASED, TEST_CROSSREF_IDENTIFIER, + TEST_EXPERIMENT_WITH_KEYWORD, TEST_GNOMAD_DATA_VERSION, TEST_INACTIVE_LICENSE, TEST_MAPPED_VARIANT_WITH_HGVS_G_EXPRESSION, @@ -2403,6 +2404,51 @@ def test_search_filter_options_hidden_by_published_superseding_version( assert target_name not in target_names +def test_search_score_sets_reports_correct_total_count_with_limit( + session, data_provider, client, setup_router_db, data_files +): + """When more published score sets exist than the search limit, num_score_sets should reflect the true total.""" + num_score_sets = 3 + for i in range(num_score_sets): + experiment = create_experiment(client, {"title": f"Experiment {i}"}) + score_set = create_seq_score_set(client, experiment["urn"], update={"title": f"Score Set {i}"}) + score_set = mock_worker_variant_insertion(client, session, data_provider, score_set, data_files / "scores.csv") + + with patch.object(arq.ArqRedis, "enqueue_job", return_value=None): + publish_score_set(client, score_set["urn"]) + + search_payload = {"limit": 2} + response = client.post("/api/v1/score-sets/search", json=search_payload) + assert response.status_code == 200 + assert len(response.json()["scoreSets"]) == 2 + assert response.json()["numScoreSets"] == num_score_sets + + +def test_search_score_sets_not_affected_by_experiment_metadata( + session, data_provider, client, setup_router_db, data_files +): + """Experiments with multiple keywords should not reduce the number of score sets returned by search. + + This is a regression test for a bug where joinedload on one-to-many experiment relationships caused row + multiplication in the main SQL query. The LIMIT clause was applied to the multiplied rows rather than unique + score sets, resulting in fewer results than expected. + """ + num_score_sets = 3 + for i in range(num_score_sets): + experiment = create_experiment(client, {**TEST_EXPERIMENT_WITH_KEYWORD, "title": f"Experiment {i}"}) + score_set = create_seq_score_set(client, experiment["urn"], update={"title": f"Score Set {i}"}) + score_set = mock_worker_variant_insertion(client, session, data_provider, score_set, data_files / "scores.csv") + + with patch.object(arq.ArqRedis, "enqueue_job", return_value=None): + publish_score_set(client, score_set["urn"]) + + search_payload = {"limit": 2} + response = client.post("/api/v1/score-sets/search", json=search_payload) + assert response.status_code == 200 + assert len(response.json()["scoreSets"]) == 2 + assert response.json()["numScoreSets"] == num_score_sets + + ######################################################################################################################## # Score set deletion ########################################################################################################################