Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion kubernetes/loculus/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -686,7 +686,7 @@
"docsIncludePrefix": false,
"type": "string",
"enum": ["ALL", "ANY"],
"description": "If multi-segmented viruses should require ALL segments (that the user provides) align or ANY segment aligns"
"description": "If multi-segmented viruses should require ALL segments (that the user provides) align or ANY segment aligns (in case ANY it must still be possible to assign the segments)"
},
"segment_classification_method": {
"groups": ["nextcladePipelineConfigFile"],
Expand Down
4 changes: 2 additions & 2 deletions preprocessing/nextclade/src/loculus_preprocessing/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ class ProcessingSpec(BaseModel):

class AlignmentRequirement(StrEnum):
# Determines whether ALL or ANY segments that a user provides must align.
# ANY: warn if some segments fail and some segments align
# ALL: error if any segment fails even if some segments align
# ANY: warn if some segments can be assigned and not aligned while other segments align
# ALL: error if any segment fails alignment even if some segments align
# NONE: do not align any segments, just process them as-is
# - set if no nextclade dataset is provided
ANY = "ANY"
Expand Down
38 changes: 6 additions & 32 deletions preprocessing/nextclade/src/loculus_preprocessing/nextclade.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

from loculus_preprocessing.sequence_checks import error_on_excess_sequences

from .config import AlignmentRequirement, Config, NextcladeSequenceAndDataset, SequenceName
from .config import Config, NextcladeSequenceAndDataset, SequenceName
from .datatypes import (
AccessionVersion,
Alert,
Expand Down Expand Up @@ -376,7 +376,7 @@ def is_valid_dataset_match(method, best_dataset_id, dataset):
return best_dataset_id in accepted__dataset_matches_or_default(dataset)


def assign_segment( # noqa: C901
def assign_segment(
entry: UnprocessedEntry,
id_map: dict[tuple[AccessionVersion, FastaId], str],
best_hits: pd.DataFrame,
Expand All @@ -385,33 +385,24 @@ def assign_segment( # noqa: C901
"""
Assign sequences to segments based on best hits from nextclade align, nextclade sort or diamond
If a segment has multiple references assign to the reference with highest alignment score
1. If no best hit for a sequence, add error/warning about unaligned sequence
2. If best hit does not match any accepted reference, add error/warning about unaligned sequence
1. If no best hit for a sequence, add error about unaligned sequence
2. If best hit does not match any accepted reference, add error about unaligned sequence
3. If multiple sequences match the same segment (also if they match different references of
that segment), add error about duplicate segments
4. If no sequences assigned and no errors about unaligned/duplicate, add error about
no sequence data found (e.g. when alignment requirement is ANY and all sequences miss)
"""
sort_results_map: dict[SegmentName, list[AssignedSequence]] = defaultdict(list)
sequence_assignment = SequenceAssignment()

has_unaligned_sequence = False
has_duplicate_segments = False

for fasta_id in entry.data.unalignedNucleotideSequences:
seq_id = id_map[entry.accessionVersion, fasta_id]
if seq_id not in best_hits[SequenceIdentifier].unique():
has_unaligned_sequence = True
method = config.segment_classification_method.display_name
annotation = sequence_annotation(
f"Sequence with fasta id {fasta_id} does not match any reference for "
f"organism: {config.organism} per `{method}`. "
"Double check you are submitting to the correct organism."
)
if config.alignment_requirement == AlignmentRequirement.ALL:
sequence_assignment.alert.errors.append(annotation)
else:
sequence_assignment.alert.warnings.append(annotation)
sequence_assignment.alert.errors.append(annotation)
continue

best_hit = best_hits[best_hits[SequenceIdentifier] == seq_id]
Expand All @@ -429,21 +420,16 @@ def assign_segment( # noqa: C901
break

if not_found:
has_unaligned_sequence = True
annotation = sequence_annotation(
f"Sequence {fasta_id} best matches {best_dataset_id}, "
"which is currently not an accepted option for organism: "
f"{config.organism}. It is therefore not possible to release. "
"Contact the administrator if you think this message is an error."
)
if config.alignment_requirement == AlignmentRequirement.ALL:
sequence_assignment.alert.errors.append(annotation)
else:
sequence_assignment.alert.warnings.append(annotation)
sequence_assignment.alert.errors.append(annotation)

for segment, ids in sort_results_map.items():
if len(ids) > 1:
has_duplicate_segments = True
sequence_assignment.alert.errors.append(
sequence_annotation(
f"Multiple sequences (with fasta ids: {', '.join([id.fasta_id for id in ids])})"
Expand All @@ -457,18 +443,6 @@ def assign_segment( # noqa: C901
entry.data.unalignedNucleotideSequences[ids[0].fasta_id]
)

if (
len(sequence_assignment.unalignedNucleotideSequences) == 0
and not has_duplicate_segments
and (not has_unaligned_sequence or config.alignment_requirement == AlignmentRequirement.ANY)
):
sequence_assignment.alert.errors.append(
sequence_annotation(
"No sequence data could be classified - "
"check you are submitting to the correct organism.",
)
)

return sequence_assignment
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Keep empty-segment submissions from passing classification

Returning immediately here drops the previous safeguard that emitted an error when sequence_assignment.unalignedNucleotideSequences stayed empty without any duplicate/unaligned diagnostics. In multi-segment mode, an entry with no uploaded nucleotide sequences now reaches later processing with no classification error, which can make a no-sequence submission look valid instead of being rejected with an explicit alignment/classification message.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If classification fails then nothing will align and the function alignment_errors_warnings will add an error so this should be fine



Expand Down
47 changes: 20 additions & 27 deletions preprocessing/nextclade/tests/test_nextclade_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,11 @@ def sequence_with_mutation(
return str(seq[: pos - 1]) + "A" + str(seq[pos:])


def fragment(type: Literal["single"] | Literal["ebola-sudan"] | Literal["ebola-zaire"]) -> str:
seq = consensus_sequence(type)
return str(seq[:90])


def ebola_sudan_aa(nuc: str, gene: Literal["VP35", "NP"]) -> str:
match gene:
case "NP":
Expand Down Expand Up @@ -761,14 +766,6 @@ def invalid_sequence() -> str:
"length_ebola-zaire": 0,
},
expected_errors=build_processing_annotations(
[
ProcessingAnnotationHelper.sequence_annotation_helper(
"No sequence data could be classified - "
"check you are submitting to the correct organism.",
)
]
),
expected_warnings=build_processing_annotations(
[
ProcessingAnnotationHelper.sequence_annotation_helper(
"Sequence with fasta id fastaHeader1 does not match any reference "
Expand All @@ -777,6 +774,7 @@ def invalid_sequence() -> str:
)
]
),
expected_warnings=[],
expected_processed_alignment=ProcessedAlignment(
unalignedNucleotideSequences={},
alignedNucleotideSequences={},
Expand All @@ -787,18 +785,18 @@ def invalid_sequence() -> str:
),
),
Case(
name="with one failed alignment, one succeeded",
name="with one failed alignment (but classified), one succeeded",
input_metadata={},
input_sequence={
"fastaHeader1": invalid_sequence(),
"fastaHeader1": fragment("ebola-sudan"),
"fastaHeader2": sequence_with_mutation("ebola-zaire"),
},
accession_id="1",
expected_metadata={
"totalInsertedNucs_ebola-sudan": None,
"totalSnps_ebola-sudan": None,
"totalDeletedNucs_ebola-sudan": None,
"length_ebola-sudan": 0,
"length_ebola-sudan": len(fragment("ebola-sudan")),
"totalInsertedNucs_ebola-zaire": 0,
"totalSnps_ebola-zaire": 1,
"totalDeletedNucs_ebola-zaire": 0,
Expand All @@ -807,16 +805,18 @@ def invalid_sequence() -> str:
expected_errors=[],
expected_warnings=build_processing_annotations(
[
ProcessingAnnotationHelper.sequence_annotation_helper(
"Sequence with fasta id fastaHeader1 does not match any reference"
" for organism: multi-ebola-test per `nextclade sort`. "
"Double check you are submitting to the correct organism.",
)
ProcessingAnnotationHelper(
["ebola-sudan"],
["ebola-sudan"],
"Nucleotide sequence for ebola-sudan failed to align",
AnnotationSourceType.NUCLEOTIDE_SEQUENCE,
),
]
),
expected_processed_alignment=ProcessedAlignment(
unalignedNucleotideSequences={
"ebola-zaire": sequence_with_mutation("ebola-zaire"),
"ebola-sudan": fragment("ebola-sudan"),
},
alignedNucleotideSequences={
"ebola-zaire": sequence_with_mutation("ebola-zaire"),
Expand All @@ -827,7 +827,7 @@ def invalid_sequence() -> str:
"LEbolaZaire": ebola_zaire_aa(sequence_with_mutation("ebola-zaire"), "L"),
},
aminoAcidInsertions={},
sequenceNameToFastaId={"ebola-zaire": "fastaHeader2"},
sequenceNameToFastaId={"ebola-zaire": "fastaHeader2", "ebola-sudan": "fastaHeader1"},
),
),
]
Expand All @@ -849,14 +849,6 @@ def invalid_sequence() -> str:
"length_ebola-zaire": 0,
},
expected_errors=build_processing_annotations(
[
ProcessingAnnotationHelper.sequence_annotation_helper(
"No sequence data could be classified - "
"check you are submitting to the correct organism.",
)
]
),
expected_warnings=build_processing_annotations(
[
ProcessingAnnotationHelper.sequence_annotation_helper(
"Sequence with fasta id fastaHeader1 does not match any reference for "
Expand All @@ -865,6 +857,7 @@ def invalid_sequence() -> str:
)
]
),
expected_warnings=[],
expected_processed_alignment=ProcessedAlignment(
unalignedNucleotideSequences={},
alignedNucleotideSequences={},
Expand Down Expand Up @@ -892,8 +885,8 @@ def invalid_sequence() -> str:
"totalDeletedNucs_ebola-zaire": 0,
"length_ebola-zaire": len(consensus_sequence("ebola-zaire")),
},
expected_errors=[],
expected_warnings=build_processing_annotations(
expected_warnings=[],
expected_errors=build_processing_annotations(
[
ProcessingAnnotationHelper.sequence_annotation_helper(
"Sequence with fasta id fastaHeader1 does not match any reference for "
Expand Down
Loading