2222 format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" ,
2323 handlers = [
2424 logging .FileHandler ("demux_kmeans.log" ),
25- logging .StreamHandler (sys .stdout )
26- ]
25+ logging .StreamHandler (sys .stdout ),
26+ ],
2727)
2828
2929
30- def hto_demux (path_hto_umi_count_dir ):
30+ def hto_demux (path_hto_umi_count_dir : str , mode : int , min_count_threshold : int ):
3131
32- matrix = scipy .io .mmread (
33- os .path .join (path_hto_umi_count_dir , "matrix.mtx.gz" )
34- )
32+ matrix = scipy .io .mmread (os .path .join (path_hto_umi_count_dir , "matrix.mtx.gz" ))
3533 barcodes = pd .read_csv (
36- os .path .join (path_hto_umi_count_dir , "barcodes.tsv.gz" ),
37- header = None
34+ os .path .join (path_hto_umi_count_dir , "barcodes.tsv.gz" ), header = None
3835 )[0 ]
3936 features = pd .read_csv (
40- os .path .join (path_hto_umi_count_dir , "features.tsv.gz" ),
41- header = None
37+ os .path .join (path_hto_umi_count_dir , "features.tsv.gz" ), header = None
4238 )[0 ]
4339
4440 # convert to numeric cell barcode
4541 dna3bit = DNA3Bit ()
4642 numeric_barcodes = barcodes .apply (lambda cb : dna3bit .encode (cb ))
4743
48- df_umi = pd .DataFrame (
49- matrix .todense (),
50- columns = numeric_barcodes ,
51- index = features
52- ).T
44+ df_umi = pd .DataFrame (matrix .todense (), columns = numeric_barcodes , index = features ).T
5345
5446 logger .info (
55- "Loaded HTO UMI count matrix ({} x {})" .format (
56- df_umi .shape [0 ], df_umi .shape [1 ]
57- )
47+ "Loaded HTO UMI count matrix ({} x {})" .format (df_umi .shape [0 ], df_umi .shape [1 ])
5848 )
5949
6050 # drop the column `unmapped`
6151 df_umi = df_umi .iloc [:, 0 :- 1 ]
6252
63- logger .info ("Computing centered log-ratio (CLR)..." )
64- # centered log-ratio (CLR) transformation
65- # HTO_301-ACCCACCAGTAAGAC HTO_302-GGTCGAGAGCATTCA HTO_303-CTTGCCGCATGTCAT HTO_304-AAAGCATTCTTCACG
66- # 227929296066909 2.609550 0.076485 2.049975 0.137688
67- # 164640656084404 2.477301 0.054396 0.046804 3.561632
68- # 121748877338358 2.501004 0.091309 0.034176 3.327706
69- # 134463437596589 3.060824 2.458869 0.053883
70- df_clr = df_umi .apply (lambda row : np .log1p (
71- (row + 1 ) / scipy .stats .mstats .gmean (row + 1 )), axis = 1 )
53+ logger .info (f"Running in mode { mode } ..." )
54+ if mode == 1 :
55+ # centered log-ratio (CLR) transformation
56+ # HTO_301-ACCCACCAGTAAGAC HTO_302-GGTCGAGAGCATTCA HTO_303-CTTGCCGCATGTCAT HTO_304-AAAGCATTCTTCACG
57+ # 227929296066909 2.609550 0.076485 2.049975 0.137688
58+ # 164640656084404 2.477301 0.054396 0.046804 3.561632
59+ # 121748877338358 2.501004 0.091309 0.034176 3.327706
60+ # 134463437596589 3.060824 2.458869 0.053883
61+ df_clr = df_umi .apply (
62+ lambda row : np .log1p ((row + 1 ) / scipy .stats .mstats .gmean (row + 1 )), axis = 1
63+ )
64+ elif mode == 2 :
65+ # very noisy methanol-based
66+ df_clr = df_umi .apply (lambda row : row - np .mean (row ), axis = 1 )
67+ df_clr = df_clr .applymap (lambda x : 0 if x < 0 else x )
68+ df_clr = df_clr .apply (
69+ lambda row : np .log1p ((row + 1 ) / scipy .stats .mstats .gmean (row + 1 )), axis = 1
70+ )
71+ elif mode == 3 :
72+ # aggresively rescue from doublets if in doubt
73+ df_clr = df_umi .apply (
74+ lambda row : row / scipy .stats .mstats .gmean (row + 1 ), axis = 1
75+ )
76+ else :
77+ raise Exception ("Unrecognized mode..." )
7278
7379 # change column name to column index so that we can access by e.g. x[1]
7480 df_tmp = df_umi
@@ -95,12 +101,14 @@ def kemans_per_row(row):
95101 df_kmeans = df_clr .apply (lambda row : kemans_per_row (row ), axis = 1 )
96102
97103 df_kmeans_hotencoded = df_kmeans .apply (
98- lambda x : "" .join (str (y ) for y in x )).to_frame ()
104+ lambda x : "" .join (str (y ) for y in x )
105+ ).to_frame ()
99106
100107 # shorten and replace _ with -
101108 # ['HTO-301', 'HTO-302', 'HTO-303', 'HTO-304']
102- hto_names = list (map (lambda name : name .split (
103- "-" )[0 ].replace ("_" , "-" ), df_clr .columns ))
109+ hto_names = list (
110+ map (lambda name : name .split ("-" )[0 ].replace ("_" , "-" ), df_clr .columns )
111+ )
104112
105113 def demux_pass2 (cb ):
106114
@@ -123,15 +131,15 @@ def demux_pass2(cb):
123131 )
124132 df_class .columns = ["CB" , "hashID" ]
125133 df_class .set_index ("CB" , inplace = True )
126- df_class
134+
135+ # mark as negative
136+ # if the total count for a given CB is less than min-count threshold
137+ mask_negative = df_umi .sum (axis = 1 ) < min_count_threshold
138+ df_class .where (~ mask_negative , other = "Negative" , inplace = True )
127139
128140 logger .debug (df_class .groupby (by = "hashID" ).size ())
129141
130- df_class .to_csv (
131- "classification.tsv.gz" ,
132- sep = "\t " ,
133- compression = "gzip"
134- )
142+ df_class .to_csv ("classification.tsv.gz" , sep = "\t " , compression = "gzip" )
135143
136144 return df_class
137145
@@ -154,7 +162,25 @@ def parse_arguments():
154162 action = "store" ,
155163 dest = "path_hto_umi_count_dir" ,
156164 help = "path to UMI count outputs generated by CITE-Seq-Count" ,
157- required = True
165+ required = True ,
166+ )
167+
168+ parser .add_argument (
169+ "--min-count" ,
170+ action = "store" ,
171+ dest = "min_count_threshold" ,
172+ type = int ,
173+ help = "total count for CB less than this threshold will be marked as negative (unreliable observations)" ,
174+ default = 0 ,
175+ )
176+
177+ parser .add_argument (
178+ "--mode" ,
179+ action = "store" ,
180+ dest = "mode" ,
181+ type = int ,
182+ help = "processing mode (1=default)" ,
183+ default = 1 ,
158184 )
159185
160186 # parse arguments
@@ -170,7 +196,7 @@ def parse_arguments():
170196 logger .info ("Starting..." )
171197
172198 df_class = hto_demux (
173- params .path_hto_umi_count_dir
199+ params .path_hto_umi_count_dir , params . mode , params . min_count_threshold
174200 )
175201
176202 logger .info ("Writing statistics..." )
0 commit comments