From df0aaf355caf70233bbe446df71f518e3b48e80e Mon Sep 17 00:00:00 2001 From: Steve Wooster Date: Mon, 9 Mar 2026 14:12:23 -0700 Subject: [PATCH 1/5] Test that sequences actually strip spaces, not allow them through. --- src/rust_api.rs | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/src/rust_api.rs b/src/rust_api.rs index ca8837c..05e7573 100644 --- a/src/rust_api.rs +++ b/src/rust_api.rs @@ -769,19 +769,21 @@ mod tests { } #[test] - fn test_empty_spaces() { - // this test will unwrap() if it cannot parse the DNA - dna("gcantacctaangtnattag "); - dna(" gcantac\tctaangtnattag "); - dna(" gca ntac ctaangtnattag \t"); - - dna_strict("gcactacctaacgtcattag "); - dna_strict(" gcactac\tctaacgtcattag "); - dna_strict(" gca ctac ctaacgtcattag \t"); - - protein("angtnattag "); - protein(" angtnattag "); - protein(" an gtnattag \t"); + fn test_empty_spaces_are_stripped() { + let expected = dna("gcantacctaangtnattag"); + assert_eq!(dna("gcantacctaangtnattag "), expected); + assert_eq!(dna(" gcantac\tctaangtnattag "), expected); + assert_eq!(dna(" gca ntac ctaangtnattag \t"), expected); + + let expected = dna_strict("gcactacctaacgtcattag"); + assert_eq!(dna_strict("gcactacctaacgtcattag "), expected); + assert_eq!(dna_strict(" gcactac\tctaacgtcattag "), expected); + assert_eq!(dna_strict(" gca ctac ctaacgtcattag \t"), expected); + + let expected = protein("angtnattag"); + assert_eq!(protein("angtnattag "), expected); + assert_eq!(protein(" angtnattag "), expected); + assert_eq!(protein(" an gtnattag \t"), expected); } #[test] From 429b88e7dd5ca164cea4898b8ca3d06d82061508 Mon Sep 17 00:00:00 2001 From: Steve Wooster Date: Mon, 9 Mar 2026 14:15:56 -0700 Subject: [PATCH 2/5] Fix proteins not stripping spaces. 
--- src/rust_api.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/rust_api.rs b/src/rust_api.rs index 05e7573..5c88d05 100644 --- a/src/rust_api.rs +++ b/src/rust_api.rs @@ -135,6 +135,7 @@ impl TryFrom<&[u8]> for ProteinSequence { fn try_from(value: &[u8]) -> Result<Self, Self::Error> { if value.is_ascii() { let mut vec = value.to_vec(); + vec.retain(|c| *c != b' ' && *c != b'\t'); vec.make_ascii_uppercase(); Ok(Self { amino_acids: vec }) } else { From 39edec51306e21c81472d2a79f87aa168351978e Mon Sep 17 00:00:00 2001 From: Steve Wooster Date: Mon, 9 Mar 2026 14:18:28 -0700 Subject: [PATCH 3/5] Add test for parsing invalid proteins. --- src/errors.rs | 2 ++ src/fasta.rs | 13 +++++++++++++ 2 files changed, 15 insertions(+) diff --git a/src/errors.rs b/src/errors.rs index 6108a2f..2c3fc6b 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -22,6 +22,8 @@ pub enum TranslationError { NonAsciiChar(char), #[error("bad nucleotide: {:?}", .0)] BadNucleotide(char), + #[error("bad amino acid: {:?}", .0)] + BadAminoAcid(char), #[error("unexpected ambiguous nucleotide: {:?}", .0)] UnexpectedAmbiguousNucleotide(char), #[error("not a ncbi translation table: {}", .0)] diff --git a/src/fasta.rs b/src/fasta.rs index 64e7cf8..597ec49 100644 --- a/src/fasta.rs +++ b/src/fasta.rs @@ -1339,6 +1339,19 @@ mod tests { ); } + #[test] + fn test_protein_invalid_fasta() { + // Note the missing newline between records. + assert_parse_err!( + ">Virus1\nAAAA\nAAAA>Virus2\nCCCC\nCCCC\n", + FastaParser::::default(), + Located { + line_number: 3, + error: FastaParseError::ParseError(TranslationError::BadAminoAcid('>')) + } + ); + } + #[test] fn test_to_string() { let parser = FastaParser::>::default(); From 3cb5306fb0fce10dc08e93b3a1f5d2e9566cde0b Mon Sep 17 00:00:00 2001 From: Steve Wooster Date: Mon, 9 Mar 2026 14:20:56 -0700 Subject: [PATCH 4/5] Fix proteins accepting invalid amino acids. 
--- src/rust_api.rs | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/rust_api.rs b/src/rust_api.rs index 5c88d05..8908655 100644 --- a/src/rust_api.rs +++ b/src/rust_api.rs @@ -133,14 +133,21 @@ impl TryFrom<&[u8]> for ProteinSequence { type Error = TranslationError; fn try_from(value: &[u8]) -> Result<Self, Self::Error> { - if value.is_ascii() { + let is_bad_aa = |&&c: &&u8| match c { + b'*' | b' ' | b'\t' => false, + c => !c.is_ascii_alphabetic(), + }; + if let Some(bad_aa) = value.iter().find(is_bad_aa) { + if bad_aa.is_ascii() { + Err(TranslationError::BadAminoAcid(char::from(*bad_aa))) + } else { + Err(TranslationError::NonAsciiByte(*bad_aa)) + } + } else { let mut vec = value.to_vec(); - vec.retain(|c| *c != b' ' && *c != b'\t'); vec.make_ascii_uppercase(); + vec.retain(|c| *c != b' ' && *c != b'\t'); Ok(Self { amino_acids: vec }) - } else { - let first_non_ascii = *value.iter().find(|b| !b.is_ascii()).unwrap(); - Err(TranslationError::NonAsciiByte(first_non_ascii)) } } } From bafd80fa8cddff17dc6445f91b380f301a220ebc Mon Sep 17 00:00:00 2001 From: Steve Wooster Date: Mon, 9 Mar 2026 20:17:52 -0700 Subject: [PATCH 5/5] Refactor `is_bad_aa` to `is_seq_char`. 
(also refactor two dereferences into one reference destructuring) --- src/rust_api.rs | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/rust_api.rs b/src/rust_api.rs index 8908655..a1c98b9 100644 --- a/src/rust_api.rs +++ b/src/rust_api.rs @@ -133,15 +133,12 @@ impl TryFrom<&[u8]> for ProteinSequence { type Error = TranslationError; fn try_from(value: &[u8]) -> Result<Self, Self::Error> { - let is_bad_aa = |&&c: &&u8| match c { - b'*' | b' ' | b'\t' => false, - c => !c.is_ascii_alphabetic(), - }; - if let Some(bad_aa) = value.iter().find(is_bad_aa) { + let is_seq_char = |c| matches!(c, b'*' | b' ' | b'\t') || c.is_ascii_alphabetic(); + if let Some(&bad_aa) = value.iter().find(|&&c| !is_seq_char(c)) { if bad_aa.is_ascii() { - Err(TranslationError::BadAminoAcid(char::from(*bad_aa))) + Err(TranslationError::BadAminoAcid(char::from(bad_aa))) } else { - Err(TranslationError::NonAsciiByte(*bad_aa)) + Err(TranslationError::NonAsciiByte(bad_aa)) } } else { let mut vec = value.to_vec();