Skip to content

Commit 0148a2f

Browse files
committed
Merge branch 'dev'
2 parents d4702c2 + 79c2960 commit 0148a2f

19 files changed

Lines changed: 3407 additions & 2691 deletions

.vscode/launch.json

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,20 @@
44
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
55
"version": "0.2.0",
66
"configurations": [
7+
{
8+
"name": "Python: hto-demux-kmeans/demux_kmeans.py",
9+
"type": "python",
10+
"request": "launch",
11+
"program": "${workspaceFolder}/dockers/hto-demux-kmeans/demux_kmeans.py",
12+
"console": "integratedTerminal",
13+
"cwd": "${workspaceFolder}/dockers/hto-demux-kmeans/",
14+
"args": [
15+
// "--hto-umi-count-dir", "../../scratch/umi_count/",
16+
"--hto-umi-count-dir", "/Users/chunj/projects/sharp/manual-inspection/hashtag/2537_CellLines/67179a12-baeb-4a31-95a3-ff861aeb2963/umis/",
17+
"--mode", "3",
18+
"--min-count", "0",
19+
]
20+
},
721
{
822
"name": "Python: hto-adt-postprocess/combine.py",
923
"type": "python",

Hashtag.wdl

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,16 @@ workflow Sharp {
5151
File denseCountMatrix
5252

5353
Boolean runSeuratDemux = false
54+
Int demuxMode = 1
55+
Int minCount = 0
5456

5557
Map[String, Int] resourceSpec
5658
}
5759

60+
parameter_meta {
61+
demuxMode: { help: "1=default, 2=noisy methanol, 3=aggressively rescue from doublets" }
62+
}
63+
5864
call Preprocess.Preprocess {
5965
input:
6066
uriFastqR1 = uriFastqR1,
@@ -85,7 +91,9 @@ workflow Sharp {
8591
# HTO demux using KMeans
8692
call HtoDemuxKMeans.HtoDemuxKMeans {
8793
input:
88-
umiCountFiles = Preprocess.umiCountMatrix
94+
umiCountFiles = Preprocess.umiCountMatrix,
95+
minCount = minCount,
96+
mode = demuxMode
8997
}
9098
9199
# HTO demux using Seurat

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ Explanation about the output:
3030
## Setup
3131

3232
```bash
33-
aws s3 cp s3://dp-lab-home/software/install-sharp-0.0.6.sh - | bash
33+
aws s3 cp s3://dp-lab-home/software/install-sharp-0.0.7.sh - | bash
3434
```
3535

3636
```

Sharp.deps.zip

89 Bytes
Binary file not shown.

configs/cellplex.inputs.json

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
{
2+
"Sharp.uriFastqR1": [
3+
"s3://dp-lab-test/cellplex/datasets/cell-lines/2537_CellLines_CellPlex_test_CPL_IGO_11874_11_S11_L001_R1_001.fastq.gz",
4+
"s3://dp-lab-test/cellplex/datasets/cell-lines/2537_CellLines_CellPlex_test_CPL_IGO_11874_11_S11_L002_R1_001.fastq.gz"
5+
],
6+
"Sharp.uriFastqR2": [
7+
"s3://dp-lab-test/cellplex/datasets/cell-lines/2537_CellLines_CellPlex_test_CPL_IGO_11874_11_S11_L001_R2_001.fastq.gz",
8+
"s3://dp-lab-test/cellplex/datasets/cell-lines/2537_CellLines_CellPlex_test_CPL_IGO_11874_11_S11_L002_R2_001.fastq.gz"
9+
],
10+
"Sharp.sampleName": "2537_CellLines",
11+
"Sharp.scRnaSeqPlatform": "10x_v3",
12+
"Sharp.lengthR1": 28,
13+
"Sharp.lengthR2": 15,
14+
"Sharp.cellBarcodeWhitelistUri": "s3://dp-lab-data/collaborators/lowe/CellplexPilot/CellLines_CellPlex_test/seqc-results/2537_CellLines_CellPlex_test_IGO_11963_2_dense.csv",
15+
"Sharp.cellBarcodeWhiteListMethod": "SeqcDenseCountsMatrixCsv",
16+
"Sharp.translate10XBarcodes": true,
17+
"Sharp.hashTagList": "s3://dp-lab-test/cellplex/datasets/cell-lines/tag-list.csv",
18+
"Sharp.cbStartPos": 1,
19+
"Sharp.cbEndPos": 16,
20+
"Sharp.umiStartPos": 17,
21+
"Sharp.umiEndPos": 28,
22+
"Sharp.slidingWindowSearch": false,
23+
"Sharp.cbCollapsingDistance": 1,
24+
"Sharp.umiCollapsingDistance": 1,
25+
"Sharp.numExpectedCells": 0,
26+
"Sharp.denseCountMatrix": "s3://dp-lab-data/collaborators/lowe/CellplexPilot/CellLines_CellPlex_test/seqc-results/2537_CellLines_CellPlex_test_IGO_11963_2_dense.csv",
27+
"Sharp.resourceSpec": {
28+
"cpu": 32,
29+
"memory": 64
30+
},
31+
"Sharp.demuxMode": 3
32+
}

configs/cellplex.labels.json

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{
2+
"pipelineType": "Sharp",
3+
"project": "Project Dev",
4+
"sample": "2537_CellLines",
5+
"owner": "chunj",
6+
"destination": "s3://dp-lab-test/sharp/cellplex/cell-lines/",
7+
"transfer": "skip",
8+
"comment": "CellPlex"
9+
}

dockers/hto-demux-kmeans/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM ubuntu:18.04
1+
FROM ubuntu:20.04
22

33
LABEL maintainer="Jaeyoung Chun (chunj@mskcc.org)"
44

dockers/hto-demux-kmeans/README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,12 @@
55
```bash
66
$ docker run -it --rm \
77
-v /Users/chunj/projects/sharp/scratch:/data/ \
8-
cromwell-hto-demux-kmeans:0.4.0
8+
cromwell-hto-demux-kmeans:0.5.0
99
```
1010

1111
```bash
1212
$ python3 demux_kmeans.py \
13-
--hto-umi-count-dir /data/umi_count
13+
--hto-umi-count-dir /data/umi_count \
14+
--min-count 0 \
15+
--mode 1
1416
```

dockers/hto-demux-kmeans/build.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
#!/bin/bash
22

3-
docker build -t cromwell-hto-demux-kmeans:0.4.0 .
3+
docker build -t cromwell-hto-demux-kmeans:0.5.0 .

dockers/hto-demux-kmeans/demux_kmeans.py

Lines changed: 64 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -22,53 +22,59 @@
2222
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
2323
handlers=[
2424
logging.FileHandler("demux_kmeans.log"),
25-
logging.StreamHandler(sys.stdout)
26-
]
25+
logging.StreamHandler(sys.stdout),
26+
],
2727
)
2828

2929

30-
def hto_demux(path_hto_umi_count_dir):
30+
def hto_demux(path_hto_umi_count_dir: str, mode: int, min_count_threshold: int):
3131

32-
matrix = scipy.io.mmread(
33-
os.path.join(path_hto_umi_count_dir, "matrix.mtx.gz")
34-
)
32+
matrix = scipy.io.mmread(os.path.join(path_hto_umi_count_dir, "matrix.mtx.gz"))
3533
barcodes = pd.read_csv(
36-
os.path.join(path_hto_umi_count_dir, "barcodes.tsv.gz"),
37-
header=None
34+
os.path.join(path_hto_umi_count_dir, "barcodes.tsv.gz"), header=None
3835
)[0]
3936
features = pd.read_csv(
40-
os.path.join(path_hto_umi_count_dir, "features.tsv.gz"),
41-
header=None
37+
os.path.join(path_hto_umi_count_dir, "features.tsv.gz"), header=None
4238
)[0]
4339

4440
# convert to numeric cell barcode
4541
dna3bit = DNA3Bit()
4642
numeric_barcodes = barcodes.apply(lambda cb: dna3bit.encode(cb))
4743

48-
df_umi = pd.DataFrame(
49-
matrix.todense(),
50-
columns=numeric_barcodes,
51-
index=features
52-
).T
44+
df_umi = pd.DataFrame(matrix.todense(), columns=numeric_barcodes, index=features).T
5345

5446
logger.info(
55-
"Loaded HTO UMI count matrix ({} x {})".format(
56-
df_umi.shape[0], df_umi.shape[1]
57-
)
47+
"Loaded HTO UMI count matrix ({} x {})".format(df_umi.shape[0], df_umi.shape[1])
5848
)
5949

6050
# drop the column `unmapped`
6151
df_umi = df_umi.iloc[:, 0:-1]
6252

63-
logger.info("Computing centered log-ratio (CLR)...")
64-
# centered log-ratio (CLR) transformation
65-
# HTO_301-ACCCACCAGTAAGAC HTO_302-GGTCGAGAGCATTCA HTO_303-CTTGCCGCATGTCAT HTO_304-AAAGCATTCTTCACG
66-
# 227929296066909 2.609550 0.076485 2.049975 0.137688
67-
# 164640656084404 2.477301 0.054396 0.046804 3.561632
68-
# 121748877338358 2.501004 0.091309 0.034176 3.327706
69-
# 134463437596589 3.060824 2.458869 0.053883
70-
df_clr = df_umi.apply(lambda row: np.log1p(
71-
(row + 1) / scipy.stats.mstats.gmean(row + 1)), axis=1)
53+
logger.info(f"Running in mode {mode}...")
54+
if mode == 1:
55+
# centered log-ratio (CLR) transformation
56+
# HTO_301-ACCCACCAGTAAGAC HTO_302-GGTCGAGAGCATTCA HTO_303-CTTGCCGCATGTCAT HTO_304-AAAGCATTCTTCACG
57+
# 227929296066909 2.609550 0.076485 2.049975 0.137688
58+
# 164640656084404 2.477301 0.054396 0.046804 3.561632
59+
# 121748877338358 2.501004 0.091309 0.034176 3.327706
60+
# 134463437596589 3.060824 2.458869 0.053883
61+
df_clr = df_umi.apply(
62+
lambda row: np.log1p((row + 1) / scipy.stats.mstats.gmean(row + 1)), axis=1
63+
)
64+
elif mode == 2:
65+
# very noisy methanol-based
66+
df_clr = df_umi.apply(lambda row: row - np.mean(row), axis=1)
67+
df_clr = df_clr.applymap(lambda x: 0 if x < 0 else x)
68+
df_clr = df_clr.apply(
69+
lambda row: np.log1p((row + 1) / scipy.stats.mstats.gmean(row + 1)), axis=1
70+
)
71+
elif mode == 3:
72+
# aggressively rescue from doublets if in doubt
73+
df_clr = df_umi.apply(
74+
lambda row: row / scipy.stats.mstats.gmean(row + 1), axis=1
75+
)
76+
else:
77+
raise Exception("Unrecognized mode...")
7278

7379
# change column name to column index so that we can access by e.g. x[1]
7480
df_tmp = df_umi
@@ -95,12 +101,14 @@ def kemans_per_row(row):
95101
df_kmeans = df_clr.apply(lambda row: kemans_per_row(row), axis=1)
96102

97103
df_kmeans_hotencoded = df_kmeans.apply(
98-
lambda x: "".join(str(y) for y in x)).to_frame()
104+
lambda x: "".join(str(y) for y in x)
105+
).to_frame()
99106

100107
# shorten and replace _ with -
101108
# ['HTO-301', 'HTO-302', 'HTO-303', 'HTO-304']
102-
hto_names = list(map(lambda name: name.split(
103-
"-")[0].replace("_", "-"), df_clr.columns))
109+
hto_names = list(
110+
map(lambda name: name.split("-")[0].replace("_", "-"), df_clr.columns)
111+
)
104112

105113
def demux_pass2(cb):
106114

@@ -123,15 +131,15 @@ def demux_pass2(cb):
123131
)
124132
df_class.columns = ["CB", "hashID"]
125133
df_class.set_index("CB", inplace=True)
126-
df_class
134+
135+
# mark as negative
136+
# if the total count for a given CB is less than min-count threshold
137+
mask_negative = df_umi.sum(axis=1) < min_count_threshold
138+
df_class.where(~mask_negative, other="Negative", inplace=True)
127139

128140
logger.debug(df_class.groupby(by="hashID").size())
129141

130-
df_class.to_csv(
131-
"classification.tsv.gz",
132-
sep="\t",
133-
compression="gzip"
134-
)
142+
df_class.to_csv("classification.tsv.gz", sep="\t", compression="gzip")
135143

136144
return df_class
137145

@@ -154,7 +162,25 @@ def parse_arguments():
154162
action="store",
155163
dest="path_hto_umi_count_dir",
156164
help="path to UMI count outputs generated by CITE-Seq-Count",
157-
required=True
165+
required=True,
166+
)
167+
168+
parser.add_argument(
169+
"--min-count",
170+
action="store",
171+
dest="min_count_threshold",
172+
type=int,
173+
help="total count for CB less than this threshold will be marked as negative (unreliable observations)",
174+
default=0,
175+
)
176+
177+
parser.add_argument(
178+
"--mode",
179+
action="store",
180+
dest="mode",
181+
type=int,
182+
help="processing mode (1=default)",
183+
default=1,
158184
)
159185

160186
# parse arguments
@@ -170,7 +196,7 @@ def parse_arguments():
170196
logger.info("Starting...")
171197

172198
df_class = hto_demux(
173-
params.path_hto_umi_count_dir
199+
params.path_hto_umi_count_dir, params.mode, params.min_count_threshold
174200
)
175201

176202
logger.info("Writing statistics...")

0 commit comments

Comments
 (0)