Skip to content

Commit 373e649

Browse files
authored
Merge pull request #308 from svdrecbd/optimize-scaling-and-parity
High-performance scaling optimizations for large-scale datasets
2 parents 0391b45 + 7005275 commit 373e649

5 files changed

Lines changed: 228 additions & 192 deletions

File tree

kb_python/ref.py

Lines changed: 33 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import itertools
33
import os
44
import tarfile
5+
from collections import defaultdict
56
from typing import Callable, Dict, List, Optional, Tuple, Union
67

78
import ngs_tools as ngs
@@ -71,6 +72,10 @@ def generate_mismatches(name, sequence):
7172
lengths = set()
7273
features = {}
7374
variants = {}
75+
76+
# Store all original sequences to check for collisions with variants
77+
original_sequences = set()
78+
7479
# Generate all feature barcode variations before saving to check for collisions.
7580
for i, row in df_features.iterrows():
7681
# Check that the first column contains the sequence
@@ -83,6 +88,8 @@ def generate_mismatches(name, sequence):
8388

8489
lengths.add(len(row.sequence))
8590
features[row['name']] = row.sequence
91+
original_sequences.add(row.sequence)
92+
8693
variants[row['name']] = {
8794
name: seq
8895
for name, seq in generate_mismatches(row['name'], row.sequence)
@@ -103,45 +110,36 @@ def generate_mismatches(name, sequence):
103110
','.join(str(l) for l in lengths) # noqa
104111
)
105112
)
106-
# Find & remove collisions between barcode and variants
107-
for feature in variants.keys():
108-
_variants = variants[feature]
109-
collisions = set(_variants.values()) & set(features.values())
110-
if collisions:
111-
# Remove collisions
113+
114+
# Invert variants: sequence -> list of (feature_name, variant_name)
115+
seq_to_variants = defaultdict(list)
116+
for feature_name, feature_variants in variants.items():
117+
for variant_name, seq in feature_variants.items():
118+
seq_to_variants[seq].append((feature_name, variant_name))
119+
120+
# Process collisions
121+
for seq, variant_list in seq_to_variants.items():
122+
# 1. Check collision with original barcodes
123+
if seq in original_sequences:
112124
logger.warning(
113-
f'Colision detected between variants of feature barcode {feature} '
114-
'and feature barcode(s). These variants will be removed.'
125+
f'Collision detected between variants of feature barcode(s) {",".join(set(v[0] for v in variant_list))} '
126+
f'and original feature barcode {seq}. These variants will be removed.'
115127
)
116-
variants[feature] = {
117-
name: seq
118-
for name, seq in _variants.items()
119-
if seq not in collisions
120-
}
121-
122-
# Find & remove collisions between variants
123-
for f1, f2 in itertools.combinations(variants.keys(), 2):
124-
v1 = variants[f1]
125-
v2 = variants[f2]
126-
127-
collisions = set(v1.values()) & set(v2.values())
128-
if collisions:
128+
for feature_name, variant_name in variant_list:
129+
if variant_name in variants[feature_name]:
130+
del variants[feature_name][variant_name]
131+
continue
132+
133+
# 2. Check collision between variants of DIFFERENT features
134+
features_involved = set(v[0] for v in variant_list)
135+
if len(features_involved) > 1:
129136
logger.warning(
130-
f'Collision(s) detected between variants of feature barcodes {f1} and {f2}: '
131-
f'{",".join(collisions)}. These variants will be removed.'
137+
f'Collision(s) detected between variants of feature barcodes {",".join(features_involved)}: '
138+
f'{seq}. These variants will be removed.'
132139
)
133-
134-
# Remove collisions
135-
variants[f1] = {
136-
name: seq
137-
for name, seq in v1.items()
138-
if seq not in collisions
139-
}
140-
variants[f2] = {
141-
name: seq
142-
for name, seq in v2.items()
143-
if seq not in collisions
144-
}
140+
for feature_name, variant_name in variant_list:
141+
if variant_name in variants[feature_name]:
142+
del variants[feature_name][variant_name]
145143

146144
# Write FASTA
147145
with ngs.fasta.Fasta(out_path, 'w') as f:

0 commit comments

Comments
 (0)