From 185fece775fb9f1aff8419178c42291e4cada3ac Mon Sep 17 00:00:00 2001 From: anna-parker <50943381+anna-parker@users.noreply.github.com> Date: Tue, 12 May 2026 20:00:37 +0200 Subject: [PATCH] feat(prepro): fix alignment ANY requirement - prepro should error if the segment cant be assigned and warn if can be assigned but not aligned --- kubernetes/loculus/values.schema.json | 2 +- .../src/loculus_preprocessing/config.py | 4 +- .../src/loculus_preprocessing/nextclade.py | 38 +++------------ .../tests/test_nextclade_preprocessing.py | 47 ++++++++----------- 4 files changed, 29 insertions(+), 62 deletions(-) diff --git a/kubernetes/loculus/values.schema.json b/kubernetes/loculus/values.schema.json index c81b46a3c9..795392ca47 100644 --- a/kubernetes/loculus/values.schema.json +++ b/kubernetes/loculus/values.schema.json @@ -714,7 +714,7 @@ "docsIncludePrefix": false, "type": "string", "enum": ["ALL", "ANY"], - "description": "If multi-segmented viruses should require ALL segments (that the user provides) align or ANY segment aligns" + "description": "If multi-segmented viruses should require ALL segments (that the user provides) align or ANY segment aligns (in case ANY it must still be possible to assign the segments)" }, "segment_classification_method": { "groups": ["nextcladePipelineConfigFile"], diff --git a/preprocessing/nextclade/src/loculus_preprocessing/config.py b/preprocessing/nextclade/src/loculus_preprocessing/config.py index f720a9ac5c..4ef56009fe 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/config.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/config.py @@ -44,8 +44,8 @@ class ProcessingSpec(BaseModel): class AlignmentRequirement(StrEnum): # Determines whether ALL or ANY segments that a user provides must align. - # ANY: warn if some segments fail and some segments align - # ALL: error if any segment fails even if some segments align + # ANY: warn if some segments can be assigned and not aligned while other segments align + # ALL: error if any segment fails alignment even if some segments align # NONE: do not align any segments, just process them as-is # - set if no nextclade dataset is provided ANY = "ANY" diff --git a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py index 20dfb488f3..7cca9f7835 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py @@ -17,7 +17,7 @@ from loculus_preprocessing.sequence_checks import error_on_excess_sequences -from .config import AlignmentRequirement, Config, NextcladeSequenceAndDataset, SequenceName +from .config import Config, NextcladeSequenceAndDataset, SequenceName from .datatypes import ( AccessionVersion, Alert, @@ -376,7 +376,7 @@ def is_valid_dataset_match(method, best_dataset_id, dataset): return best_dataset_id in accepted__dataset_matches_or_default(dataset) -def assign_segment( # noqa: C901 +def assign_segment( entry: UnprocessedEntry, id_map: dict[tuple[AccessionVersion, FastaId], str], best_hits: pd.DataFrame, @@ -385,33 +385,24 @@ def assign_segment( # noqa: C901 """ Assign sequences to segments based on best hits from nextclade align, nextclade sort or diamond If a segment has multiple references assign to the reference with highest alignment score - 1. If no best hit for a sequence, add error/warning about unaligned sequence - 2. If best hit does not match any accepted reference, add error/warning about unaligned sequence + 1. If no best hit for a sequence, add error about unaligned sequence + 2. If best hit does not match any accepted reference, add error about unaligned sequence 3. If multiple sequences match the same segment (also if they match different references of that segment), add error about duplicate segments - 4. If no sequences assigned and no errors about unaligned/duplicate, add error about - no sequence data found (e.g. when alignment requirement is ANY and all sequences miss) """ sort_results_map: dict[SegmentName, list[AssignedSequence]] = defaultdict(list) sequence_assignment = SequenceAssignment() - has_unaligned_sequence = False - has_duplicate_segments = False - for fasta_id in entry.data.unalignedNucleotideSequences: seq_id = id_map[entry.accessionVersion, fasta_id] if seq_id not in best_hits[SequenceIdentifier].unique(): - has_unaligned_sequence = True method = config.segment_classification_method.display_name annotation = sequence_annotation( f"Sequence with fasta id {fasta_id} does not match any reference for " f"organism: {config.organism} per `{method}`. " "Double check you are submitting to the correct organism." ) - if config.alignment_requirement == AlignmentRequirement.ALL: - sequence_assignment.alert.errors.append(annotation) - else: - sequence_assignment.alert.warnings.append(annotation) + sequence_assignment.alert.errors.append(annotation) continue best_hit = best_hits[best_hits[SequenceIdentifier] == seq_id] @@ -429,21 +420,16 @@ def assign_segment( # noqa: C901 break if not_found: - has_unaligned_sequence = True annotation = sequence_annotation( f"Sequence {fasta_id} best matches {best_dataset_id}, " "which is currently not an accepted option for organism: " f"{config.organism}. It is therefore not possible to release. " "Contact the administrator if you think this message is an error." ) - if config.alignment_requirement == AlignmentRequirement.ALL: - sequence_assignment.alert.errors.append(annotation) - else: - sequence_assignment.alert.warnings.append(annotation) + sequence_assignment.alert.errors.append(annotation) for segment, ids in sort_results_map.items(): if len(ids) > 1: - has_duplicate_segments = True sequence_assignment.alert.errors.append( sequence_annotation( f"Multiple sequences (with fasta ids: {', '.join([id.fasta_id for id in ids])})" @@ -457,18 +443,6 @@ def assign_segment( # noqa: C901 entry.data.unalignedNucleotideSequences[ids[0].fasta_id] ) - if ( - len(sequence_assignment.unalignedNucleotideSequences) == 0 - and not has_duplicate_segments - and (not has_unaligned_sequence or config.alignment_requirement == AlignmentRequirement.ANY) - ): - sequence_assignment.alert.errors.append( - sequence_annotation( - "No sequence data could be classified - " - "check you are submitting to the correct organism.", - ) - ) - return sequence_assignment diff --git a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py index 8744efa722..d1de045cad 100644 --- a/preprocessing/nextclade/tests/test_nextclade_preprocessing.py +++ b/preprocessing/nextclade/tests/test_nextclade_preprocessing.py @@ -101,6 +101,11 @@ def sequence_with_mutation( return str(seq[: pos - 1]) + "A" + str(seq[pos:]) +def fragment(type: Literal["single"] | Literal["ebola-sudan"] | Literal["ebola-zaire"]) -> str: + seq = consensus_sequence(type) + return str(seq[:90]) + + def ebola_sudan_aa(nuc: str, gene: Literal["VP35", "NP"]) -> str: match gene: case "NP": @@ -739,14 +744,6 @@ def invalid_sequence() -> str: "length_ebola-zaire": 0, }, expected_errors=build_processing_annotations( - [ - ProcessingAnnotationHelper.sequence_annotation_helper( - "No sequence data could be classified - " - "check you are submitting to the correct organism.", - ) - ] - ), - expected_warnings=build_processing_annotations( [ ProcessingAnnotationHelper.sequence_annotation_helper( "Sequence with fasta id fastaHeader1 does not match any reference " @@ -755,6 +752,7 @@ def invalid_sequence() -> str: ) ] ), + expected_warnings=[], expected_processed_alignment=ProcessedAlignment( unalignedNucleotideSequences={}, alignedNucleotideSequences={}, @@ -765,10 +763,10 @@ def invalid_sequence() -> str: ), ), Case( - name="with one failed alignment, one succeeded", + name="with one failed alignment (but classified), one succeeded", input_metadata={}, input_sequence={ - "fastaHeader1": invalid_sequence(), + "fastaHeader1": fragment("ebola-sudan"), "fastaHeader2": sequence_with_mutation("ebola-zaire"), }, accession_id="1", @@ -776,7 +774,7 @@ def invalid_sequence() -> str: "totalInsertedNucs_ebola-sudan": None, "totalSnps_ebola-sudan": None, "totalDeletedNucs_ebola-sudan": None, - "length_ebola-sudan": 0, + "length_ebola-sudan": len(fragment("ebola-sudan")), "totalInsertedNucs_ebola-zaire": 0, "totalSnps_ebola-zaire": 1, "totalDeletedNucs_ebola-zaire": 0, @@ -785,16 +783,18 @@ def invalid_sequence() -> str: expected_errors=[], expected_warnings=build_processing_annotations( [ - ProcessingAnnotationHelper.sequence_annotation_helper( - "Sequence with fasta id fastaHeader1 does not match any reference" - " for organism: multi-ebola-test per `nextclade sort`. " - "Double check you are submitting to the correct organism.", - ) + ProcessingAnnotationHelper( + ["ebola-sudan"], + ["ebola-sudan"], + "Nucleotide sequence for ebola-sudan failed to align", + AnnotationSourceType.NUCLEOTIDE_SEQUENCE, + ), ] ), expected_processed_alignment=ProcessedAlignment( unalignedNucleotideSequences={ "ebola-zaire": sequence_with_mutation("ebola-zaire"), + "ebola-sudan": fragment("ebola-sudan"), }, alignedNucleotideSequences={ "ebola-zaire": sequence_with_mutation("ebola-zaire"), @@ -805,7 +805,7 @@ def invalid_sequence() -> str: "LEbolaZaire": ebola_zaire_aa(sequence_with_mutation("ebola-zaire"), "L"), }, aminoAcidInsertions={}, - sequenceNameToFastaId={"ebola-zaire": "fastaHeader2"}, + sequenceNameToFastaId={"ebola-zaire": "fastaHeader2", "ebola-sudan": "fastaHeader1"}, ), ), ] @@ -827,14 +827,6 @@ def invalid_sequence() -> str: "length_ebola-zaire": 0, }, expected_errors=build_processing_annotations( - [ - ProcessingAnnotationHelper.sequence_annotation_helper( - "No sequence data could be classified - " - "check you are submitting to the correct organism.", - ) - ] - ), - expected_warnings=build_processing_annotations( [ ProcessingAnnotationHelper.sequence_annotation_helper( "Sequence with fasta id fastaHeader1 does not match any reference for " @@ -843,6 +835,7 @@ def invalid_sequence() -> str: ) ] ), + expected_warnings=[], expected_processed_alignment=ProcessedAlignment( unalignedNucleotideSequences={}, alignedNucleotideSequences={}, @@ -870,8 +863,8 @@ def invalid_sequence() -> str: "totalDeletedNucs_ebola-zaire": 0, "length_ebola-zaire": len(consensus_sequence("ebola-zaire")), }, - expected_errors=[], - expected_warnings=build_processing_annotations( + expected_warnings=[], + expected_errors=build_processing_annotations( [ ProcessingAnnotationHelper.sequence_annotation_helper( "Sequence with fasta id fastaHeader1 does not match any reference for "