22import itertools
33import os
44import tarfile
5+ from collections import defaultdict
56from typing import Callable , Dict , List , Optional , Tuple , Union
67
78import ngs_tools as ngs
@@ -71,6 +72,10 @@ def generate_mismatches(name, sequence):
7172 lengths = set ()
7273 features = {}
7374 variants = {}
75+
76+ # Store all original sequences to check for collisions with variants
77+ original_sequences = set ()
78+
7479 # Generate all feature barcode variations before saving to check for collisions.
7580 for i , row in df_features .iterrows ():
7681 # Check that the first column contains the sequence
@@ -83,6 +88,8 @@ def generate_mismatches(name, sequence):
8388
8489 lengths .add (len (row .sequence ))
8590 features [row ['name' ]] = row .sequence
91+ original_sequences .add (row .sequence )
92+
8693 variants [row ['name' ]] = {
8794 name : seq
8895 for name , seq in generate_mismatches (row ['name' ], row .sequence )
@@ -103,45 +110,36 @@ def generate_mismatches(name, sequence):
103110 ',' .join (str (l ) for l in lengths ) # noqa
104111 )
105112 )
106- # Find & remove collisions between barcode and variants
107- for feature in variants .keys ():
108- _variants = variants [feature ]
109- collisions = set (_variants .values ()) & set (features .values ())
110- if collisions :
111- # Remove collisions
113+
114+ # Invert variants: sequence -> list of (feature_name, variant_name)
115+ seq_to_variants = defaultdict (list )
116+ for feature_name , feature_variants in variants .items ():
117+ for variant_name , seq in feature_variants .items ():
118+ seq_to_variants [seq ].append ((feature_name , variant_name ))
119+
120+ # Process collisions
121+ for seq , variant_list in seq_to_variants .items ():
122+ # 1. Check collision with original barcodes
123+ if seq in original_sequences :
112124 logger .warning (
113- f'Colision detected between variants of feature barcode { feature } '
114- 'and feature barcode(s) . These variants will be removed.'
125+ f'Collision detected between variants of feature barcode(s) { "," . join ( set ( v [ 0 ] for v in variant_list )) } '
126+ f 'and original feature barcode { seq } . These variants will be removed.'
115127 )
116- variants [feature ] = {
117- name : seq
118- for name , seq in _variants .items ()
119- if seq not in collisions
120- }
121-
122- # Find & remove collisions between variants
123- for f1 , f2 in itertools .combinations (variants .keys (), 2 ):
124- v1 = variants [f1 ]
125- v2 = variants [f2 ]
126-
127- collisions = set (v1 .values ()) & set (v2 .values ())
128- if collisions :
128+ for feature_name , variant_name in variant_list :
129+ if variant_name in variants [feature_name ]:
130+ del variants [feature_name ][variant_name ]
131+ continue
132+
133+ # 2. Check collision between variants of DIFFERENT features
134+ features_involved = set (v [0 ] for v in variant_list )
135+ if len (features_involved ) > 1 :
129136 logger .warning (
130- f'Collision(s) detected between variants of feature barcodes { f1 } and { f2 } : '
131- f'{ "," . join ( collisions ) } . These variants will be removed.'
137+ f'Collision(s) detected between variants of feature barcodes { "," . join ( features_involved ) } : '
138+ f'{ seq } . These variants will be removed.'
132139 )
133-
134- # Remove collisions
135- variants [f1 ] = {
136- name : seq
137- for name , seq in v1 .items ()
138- if seq not in collisions
139- }
140- variants [f2 ] = {
141- name : seq
142- for name , seq in v2 .items ()
143- if seq not in collisions
144- }
140+ for feature_name , variant_name in variant_list :
141+ if variant_name in variants [feature_name ]:
142+ del variants [feature_name ][variant_name ]
145143
146144 # Write FASTA
147145 with ngs .fasta .Fasta (out_path , 'w' ) as f :
0 commit comments