From 99b63868c88afb6ffdfa3516aae34a6694eb893f Mon Sep 17 00:00:00 2001 From: gp201 Date: Tue, 10 Mar 2026 11:32:36 -0700 Subject: [PATCH 1/5] Warn if any samples are missing in FASTA Only print per-sample missing warnings when debug=True. Always print a summary warning when any samples are missing. --- barcodeforge/ref_muts.py | 15 ++++-- tests/test_ref_muts.py | 101 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 111 insertions(+), 5 deletions(-) diff --git a/barcodeforge/ref_muts.py b/barcodeforge/ref_muts.py index 3f24aae..e17dab7 100755 --- a/barcodeforge/ref_muts.py +++ b/barcodeforge/ref_muts.py @@ -163,6 +163,7 @@ def process_and_reroot_lineages( ] root_seqs = [] + missing_count = 0 for sample_id, muts_str in valid.itertuples(index=False, name=None): # build root mutations and fetch the sequence by direct dict lookup root_muts = _reverse_mutations_to_root( @@ -171,12 +172,20 @@ def process_and_reroot_lineages( seq = seqs.get(sample_id, None) if seq is None: # It's better to raise an error or handle this case explicitly - console.print( - f"[{STYLES['warning']}]Warning: Sample {sample_id} not found in FASTA file. Skipping.[/{STYLES['warning']}]" - ) + if debug: + console.print( + f"[{STYLES['warning']}]Warning: Sample {sample_id} not found in FASTA file. Skipping.[/{STYLES['warning']}]" + ) + missing_count += 1 continue root_seqs.append(_construct_root_sequence(root_muts, seq)) + total_count = len(valid) + if missing_count > 0: + console.print( + f"[{STYLES['warning']}]Warning: {missing_count} out of {total_count} samples ({missing_count / total_count:.1%}) were not found in the FASTA file.[/{STYLES['warning']}]" + ) + if not root_seqs: raise ValueError( "No valid root sequences could be generated. Check input FASTA and sample mutations."
diff --git a/tests/test_ref_muts.py b/tests/test_ref_muts.py index 43a97a5..bf43265 100644 --- a/tests/test_ref_muts.py +++ b/tests/test_ref_muts.py @@ -368,6 +368,7 @@ def test_process_and_reroot_lineages_warning_missing_sample_in_fasta( mocker, ): # sample_muts_path with sampleA (valid) and sampleMissing (not in FASTA) + # 1 out of 2 samples missing, so summary warning is printed muts_content = "sampleA\tgene1:A2T,A7G\nsampleMissing\tgene1:C1G" muts_file = tmp_path / "missing_fasta_sample_muts.tsv" muts_file.write_text(muts_content) @@ -400,15 +401,22 @@ def test_process_and_reroot_lineages_warning_missing_sample_in_fasta( assert output_additional_muts.exists() assert output_rerooted_lineages.exists() - # Verify warning for missing sample + # Verify summary warning is shown (any missing samples triggers warning) expected_warning_call_substr = ( - "[yellow]Warning: Sample sampleMissing not found in FASTA file. Skipping." + "1 out of 2 samples" ) assert any( expected_warning_call_substr in str(c_args) for c_args in mocked_console.print.call_args_list ) + # Verify per-sample warning is NOT shown when debug=False + per_sample_warning_substr = "Sample sampleMissing not found in FASTA file. Skipping." 
+ assert not any( + per_sample_warning_substr in str(c_args) + for c_args in mocked_console.print.call_args_list + ) + # Verify inference based on sampleA df_add_muts = pd.read_csv(output_additional_muts, sep="\t") assert "position" in df_add_muts.columns or df_add_muts.empty @@ -424,3 +432,92 @@ def test_process_and_reroot_lineages_warning_missing_sample_in_fasta( ) else: pd.testing.assert_frame_equal(original_lineages_df, rerooted_lineages_df) + + +def test_process_and_reroot_lineages_debug_shows_per_sample_warning( + sample_ref_fasta_file, # ref_genome + sample_lineage_paths_file, + tmp_path, + mocker, +): + # When debug=True, the per-sample warning should be shown for each missing sample + muts_content = "sampleA\tgene1:A2T,A7G\nsampleMissing\tgene1:C1G" + muts_file = tmp_path / "debug_missing_muts.tsv" + muts_file.write_text(muts_content) + + seqs_content = ">sampleA\nATAAAAAGAA\n>sampleOther\nCCCCCCCCCC" + seqs_file = tmp_path / "debug_missing_seqs.fasta" + seqs_file.write_text(seqs_content) + + output_additional_muts = tmp_path / "additional_muts_debug.tsv" + output_rerooted_lineages = tmp_path / "rerooted_lineages_debug.tsv" + + mocked_console = MagicMock(spec=Console) + mocker.patch("barcodeforge.ref_muts.console", mocked_console) + + process_and_reroot_lineages( + debug=True, + sample_muts_path=str(muts_file), + reference_fasta_path=sample_ref_fasta_file, + sequences_fasta_path=str(seqs_file), + input_lineage_paths_path=sample_lineage_paths_file, + output_additional_muts_path=str(output_additional_muts), + output_rerooted_lineage_paths_path=str(output_rerooted_lineages), + ) + + # Verify per-sample warning IS shown when debug=True + per_sample_warning_substr = "Sample sampleMissing not found in FASTA file. Skipping." 
+ assert any( + per_sample_warning_substr in str(c_args) + for c_args in mocked_console.print.call_args_list + ) + + +def test_process_and_reroot_lineages_summary_warning_any_missing( + sample_ref_fasta_file, + sample_lineage_paths_file, + tmp_path, + mocker, +): + # 1 out of 5 samples missing — warning is always shown when any samples are missing + muts_content = ( + "sampleA\tgene1:A2T\n" + "sampleB\tgene1:A6G\n" + "sampleC\tgene1:A3T\n" + "sampleD\tgene1:A4T\n" + "sampleMissing\tgene1:C1G" + ) + muts_file = tmp_path / "below_threshold_muts.tsv" + muts_file.write_text(muts_content) + + seqs_content = ( + ">sampleA\nATAAAAAGAA\n" + ">sampleB\nAAAAAAGAAA\n" + ">sampleC\nAAATAAAAAA\n" + ">sampleD\nAAAATAAAAA\n" + ) + seqs_file = tmp_path / "below_threshold_seqs.fasta" + seqs_file.write_text(seqs_content) + + output_additional_muts = tmp_path / "additional_muts_below.tsv" + output_rerooted_lineages = tmp_path / "rerooted_lineages_below.tsv" + + mocked_console = MagicMock(spec=Console) + mocker.patch("barcodeforge.ref_muts.console", mocked_console) + + process_and_reroot_lineages( + debug=False, + sample_muts_path=str(muts_file), + reference_fasta_path=sample_ref_fasta_file, + sequences_fasta_path=str(seqs_file), + input_lineage_paths_path=sample_lineage_paths_file, + output_additional_muts_path=str(output_additional_muts), + output_rerooted_lineage_paths_path=str(output_rerooted_lineages), + ) + + # Verify summary warning IS shown even when only 1 out of 5 samples is missing + summary_warning_substr = "were not found in the FASTA file" + assert any( + summary_warning_substr in str(c_args) + for c_args in mocked_console.print.call_args_list + ) From 99eaa53073e022e80cbb8c71077c02d82612e7ef Mon Sep 17 00:00:00 2001 From: gp201 Date: Tue, 10 Mar 2026 11:33:45 -0700 Subject: [PATCH 2/5] lint: fix lint for test_ref_muts.py --- tests/test_ref_muts.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/test_ref_muts.py 
b/tests/test_ref_muts.py index bf43265..03f44a0 100644 --- a/tests/test_ref_muts.py +++ b/tests/test_ref_muts.py @@ -402,16 +402,16 @@ def test_process_and_reroot_lineages_warning_missing_sample_in_fasta( assert output_rerooted_lineages.exists() # Verify summary warning is shown (any missing samples triggers warning) - expected_warning_call_substr = ( - "1 out of 2 samples" - ) + expected_warning_call_substr = "1 out of 2 samples" assert any( expected_warning_call_substr in str(c_args) for c_args in mocked_console.print.call_args_list ) # Verify per-sample warning is NOT shown when debug=False - per_sample_warning_substr = "Sample sampleMissing not found in FASTA file. Skipping." + per_sample_warning_substr = ( + "Sample sampleMissing not found in FASTA file. Skipping." + ) assert not any( per_sample_warning_substr in str(c_args) for c_args in mocked_console.print.call_args_list @@ -466,7 +466,9 @@ def test_process_and_reroot_lineages_debug_shows_per_sample_warning( ) # Verify per-sample warning IS shown when debug=True - per_sample_warning_substr = "Sample sampleMissing not found in FASTA file. Skipping." + per_sample_warning_substr = ( + "Sample sampleMissing not found in FASTA file. Skipping." + ) assert any( per_sample_warning_substr in str(c_args) for c_args in mocked_console.print.call_args_list From a94bc99583a097ec50910d7c69f323274e0117ce Mon Sep 17 00:00:00 2001 From: gp201 Date: Tue, 10 Mar 2026 11:48:15 -0700 Subject: [PATCH 3/5] update comment --- barcodeforge/ref_muts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/barcodeforge/ref_muts.py b/barcodeforge/ref_muts.py index e17dab7..c78efdb 100755 --- a/barcodeforge/ref_muts.py +++ b/barcodeforge/ref_muts.py @@ -171,7 +171,7 @@ def process_and_reroot_lineages( ) seq = seqs.get(sample_id, None) if seq is None: - # It's better to raise an error or handle this case explicitly + # In debug mode, log a per-sample warning and track missing samples for a summary warning below. 
if debug: console.print( f"[{STYLES['warning']}]Warning: Sample {sample_id} not found in FASTA file. Skipping.[/{STYLES['warning']}]" From 5a26b6378222aef71bca5c5fae0986732997b007 Mon Sep 17 00:00:00 2001 From: gp201 Date: Tue, 10 Mar 2026 12:02:48 -0700 Subject: [PATCH 4/5] Warn about missing samples and remove ValueError --- barcodeforge/ref_muts.py | 22 +++++++++------------ tests/test_ref_muts.py | 41 ---------------------------------------- 2 files changed, 9 insertions(+), 54 deletions(-) diff --git a/barcodeforge/ref_muts.py b/barcodeforge/ref_muts.py index c78efdb..9eee585 100755 --- a/barcodeforge/ref_muts.py +++ b/barcodeforge/ref_muts.py @@ -133,6 +133,15 @@ def process_and_reroot_lineages( seqs = SeqIO.to_dict(SeqIO.parse(sequences_fasta_path, "fasta")) ref = SeqIO.read(reference_fasta_path, "fasta") + sample_ids = set(sample_muts_df["sample"]) + missing_count = len(sample_ids - set(seqs.keys())) + if missing_count: + total_count = len(sample_ids) + console.print( + f"[{STYLES['warning']}]Warning: {missing_count} out of {total_count} samples ({missing_count / total_count:.1%}) were not found in the FASTA file.[/{STYLES['warning']}]" + ) + + # if reference in the sample mutations file, use that as the root if sample_muts_df[sample_muts_df["sample"] == ref.id].shape[0] > 0: console.print( @@ -163,7 +172,6 @@ def process_and_reroot_lineages( ] root_seqs = [] - missing_count = 0 for sample_id, muts_str in valid.itertuples(index=False, name=None): # build root mutations and fetch the sequence by direct dict lookup root_muts = _reverse_mutations_to_root( @@ -176,21 +184,9 @@ def process_and_reroot_lineages( console.print( f"[{STYLES['warning']}]Warning: Sample {sample_id} not found in FASTA file. 
Skipping.[/{STYLES['warning']}]" ) - missing_count += 1 continue root_seqs.append(_construct_root_sequence(root_muts, seq)) - total_count = len(valid) - if missing_count > 0: - console.print( - f"[{STYLES['warning']}]Warning: {missing_count} out of {total_count} samples ({missing_count / total_count:.1%}) were not found in the FASTA file.[/{STYLES['warning']}]" - ) - - if not root_seqs: - raise ValueError( - "No valid root sequences could be generated. Check input FASTA and sample mutations." - ) - root = _derive_root_sequence(root_seqs) additional_muts = _compare_sequences(ref, root) diff --git a/tests/test_ref_muts.py b/tests/test_ref_muts.py index 03f44a0..377b35f 100644 --- a/tests/test_ref_muts.py +++ b/tests/test_ref_muts.py @@ -320,47 +320,6 @@ def test_process_and_reroot_lineages_ref_not_in_muts_infer_root( pd.testing.assert_frame_equal(original_lineages_df, rerooted_lineages_df) -def test_process_and_reroot_lineages_value_error_empty_root_seqs( - sample_ref_fasta_file, - sample_lineage_paths_file, - tmp_path, - mocker, -): - # sample_muts_df is not empty, but no corresponding sequences are found. - muts_content = ( - "sampleD\tgene1:A1T\nsampleE\tgene1:C2G" # These samples are not in seqs_file - ) - muts_file = tmp_path / "empty_root_seqs_muts.tsv" - muts_file.write_text(muts_content) - - # Empty sequences.fasta or sequences that don't match sampleD/sampleE - seqs_content = ">sampleA\nATGC\n>sampleB\nCGTA" - seqs_file = tmp_path / "empty_root_seqs_sequences.fasta" - seqs_file.write_text(seqs_content) - - output_additional_muts = tmp_path / "additional_muts_value_error.tsv" - output_rerooted_lineages = tmp_path / "rerooted_lineages_value_error.tsv" - - mocked_console = MagicMock( - spec=Console - ) # Though not strictly needed for ValueError, good for consistency - mocker.patch("barcodeforge.ref_muts.console", mocked_console) - - with pytest.raises( - ValueError, - match="No valid root sequences could be generated. 
Check input FASTA and sample mutations.", - ): - process_and_reroot_lineages( - debug=False, - sample_muts_path=str(muts_file), - reference_fasta_path=sample_ref_fasta_file, - sequences_fasta_path=str(seqs_file), - input_lineage_paths_path=sample_lineage_paths_file, - output_additional_muts_path=str(output_additional_muts), - output_rerooted_lineage_paths_path=str(output_rerooted_lineages), - ) - - def test_process_and_reroot_lineages_warning_missing_sample_in_fasta( sample_ref_fasta_file, # ref_genome sample_lineage_paths_file, From 72851937fd4d0ce16d65b96ad36776c8e6c0d19c Mon Sep 17 00:00:00 2001 From: gp201 Date: Tue, 10 Mar 2026 12:03:57 -0700 Subject: [PATCH 5/5] lint: lint fix --- barcodeforge/ref_muts.py | 1 - 1 file changed, 1 deletion(-) diff --git a/barcodeforge/ref_muts.py b/barcodeforge/ref_muts.py index 9eee585..70f7807 100755 --- a/barcodeforge/ref_muts.py +++ b/barcodeforge/ref_muts.py @@ -141,7 +141,6 @@ def process_and_reroot_lineages( f"[{STYLES['warning']}]Warning: {missing_count} out of {total_count} samples ({missing_count / total_count:.1%}) were not found in the FASTA file.[/{STYLES['warning']}]" ) - # if reference in the sample mutations file, use that as the root if sample_muts_df[sample_muts_df["sample"] == ref.id].shape[0] > 0: console.print(