hisplan
diff --git a/‎.vscode/launch.json‎
Lines changed: 14 additions & 0 deletions b/‎.vscode/launch.json‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎Hashtag.wdl‎
Lines changed: 9 additions & 1 deletion b/‎Hashtag.wdl‎
Lines changed: 9 additions & 1 deletion
diff --git a/‎README.md‎
Lines changed: 1 addition & 1 deletion b/‎README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎Sharp.deps.zip‎
89 Bytes b/‎Sharp.deps.zip‎
89 Bytes
diff --git a/‎configs/cellplex.inputs.json‎
Lines changed: 32 additions & 0 deletions b/‎configs/cellplex.inputs.json‎
Lines changed: 32 additions & 0 deletions
diff --git a/‎configs/cellplex.labels.json‎
Lines changed: 9 additions & 0 deletions b/‎configs/cellplex.labels.json‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎dockers/hto-demux-kmeans/Dockerfile‎
Lines changed: 1 addition & 1 deletion b/‎dockers/hto-demux-kmeans/Dockerfile‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎dockers/hto-demux-kmeans/README.md‎
Lines changed: 4 additions & 2 deletions b/‎dockers/hto-demux-kmeans/README.md‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎dockers/hto-demux-kmeans/build.sh‎
Lines changed: 1 addition & 1 deletion b/‎dockers/hto-demux-kmeans/build.sh‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎dockers/hto-demux-kmeans/demux_kmeans.py‎
Lines changed: 64 additions & 38 deletions b/‎dockers/hto-demux-kmeans/demux_kmeans.py‎
Lines changed: 64 additions & 38 deletions
@@ -4,6 +4,20 @@
     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
     "version": "0.2.0",
     "configurations": [
+        {
+            "name": "Python: hto-demux-kmeans/demux_kmeans.py",
+            "type": "python",
+            "request": "launch",
+            "program": "${workspaceFolder}/dockers/hto-demux-kmeans/demux_kmeans.py",
+            "console": "integratedTerminal",
+            "cwd": "${workspaceFolder}/dockers/hto-demux-kmeans/",
+            "args": [
+                // "--hto-umi-count-dir", "../../scratch/umi_count/",
+                "--hto-umi-count-dir", "/Users/chunj/projects/sharp/manual-inspection/hashtag/2537_CellLines/67179a12-baeb-4a31-95a3-ff861aeb2963/umis/",
+                "--mode", "3",
+                "--min-count", "0",
+            ]
+        },        
         {
             "name": "Python: hto-adt-postprocess/combine.py",
             "type": "python",
 
@@ -51,10 +51,16 @@ workflow Sharp {
         File denseCountMatrix
 
         Boolean runSeuratDemux = false
+        Int demuxMode = 1
+        Int minCount = 0
 
         Map[String, Int] resourceSpec
     }
 
+    parameter_meta {
+        demuxMode: { help: "1=default, 2=noisy methanol, 3=aggressively rescue from doublets" }
+    }
+
     call Preprocess.Preprocess {
         input:
             uriFastqR1 = uriFastqR1,
@@ -85,7 +91,9 @@ workflow Sharp {
     # HTO demux using KMeans
     call HtoDemuxKMeans.HtoDemuxKMeans {
         input:
-            umiCountFiles = Preprocess.umiCountMatrix
+            umiCountFiles = Preprocess.umiCountMatrix,
+            minCount = minCount,
+            mode = demuxMode
     }
 
     # HTO demux using Seurat
 
@@ -30,7 +30,7 @@ Explanation about the output:
 ## Setup
 
 ```bash
-aws s3 cp s3://dp-lab-home/software/install-sharp-0.0.6.sh - | bash
+aws s3 cp s3://dp-lab-home/software/install-sharp-0.0.7.sh - | bash
 ```
 
 ```
 
@@ -0,0 +1,32 @@
+{
+    "Sharp.uriFastqR1": [
+        "s3://dp-lab-test/cellplex/datasets/cell-lines/2537_CellLines_CellPlex_test_CPL_IGO_11874_11_S11_L001_R1_001.fastq.gz",
+        "s3://dp-lab-test/cellplex/datasets/cell-lines/2537_CellLines_CellPlex_test_CPL_IGO_11874_11_S11_L002_R1_001.fastq.gz"
+    ],
+    "Sharp.uriFastqR2": [
+        "s3://dp-lab-test/cellplex/datasets/cell-lines/2537_CellLines_CellPlex_test_CPL_IGO_11874_11_S11_L001_R2_001.fastq.gz",
+        "s3://dp-lab-test/cellplex/datasets/cell-lines/2537_CellLines_CellPlex_test_CPL_IGO_11874_11_S11_L002_R2_001.fastq.gz"
+    ],
+    "Sharp.sampleName": "2537_CellLines",
+    "Sharp.scRnaSeqPlatform": "10x_v3",
+    "Sharp.lengthR1": 28,
+    "Sharp.lengthR2": 15,
+    "Sharp.cellBarcodeWhitelistUri": "s3://dp-lab-data/collaborators/lowe/CellplexPilot/CellLines_CellPlex_test/seqc-results/2537_CellLines_CellPlex_test_IGO_11963_2_dense.csv",
+    "Sharp.cellBarcodeWhiteListMethod": "SeqcDenseCountsMatrixCsv",
+    "Sharp.translate10XBarcodes": true,
+    "Sharp.hashTagList": "s3://dp-lab-test/cellplex/datasets/cell-lines/tag-list.csv",
+    "Sharp.cbStartPos": 1,
+    "Sharp.cbEndPos": 16,
+    "Sharp.umiStartPos": 17,
+    "Sharp.umiEndPos": 28,
+    "Sharp.slidingWindowSearch": false,
+    "Sharp.cbCollapsingDistance": 1,
+    "Sharp.umiCollapsingDistance": 1,
+    "Sharp.numExpectedCells": 0,
+    "Sharp.denseCountMatrix": "s3://dp-lab-data/collaborators/lowe/CellplexPilot/CellLines_CellPlex_test/seqc-results/2537_CellLines_CellPlex_test_IGO_11963_2_dense.csv",
+    "Sharp.resourceSpec": {
+        "cpu": 32,
+        "memory": 64
+    },
+    "Sharp.demuxMode": 3
+}
@@ -0,0 +1,9 @@
+{
+    "pipelineType": "Sharp",
+    "project": "Project Dev",
+    "sample": "2537_CellLines",
+    "owner": "chunj",
+    "destination": "s3://dp-lab-test/sharp/cellplex/cell-lines/",
+    "transfer": "skip",
+    "comment": "CellPlex"
+}
@@ -1,4 +1,4 @@
-FROM ubuntu:18.04
+FROM ubuntu:20.04
 
 LABEL maintainer="Jaeyoung Chun (chunj@mskcc.org)"
 
 
@@ -5,10 +5,12 @@
 ```bash
 $ docker run -it --rm \
     -v /Users/chunj/projects/sharp/scratch:/data/ \
-    cromwell-hto-demux-kmeans:0.4.0
+    cromwell-hto-demux-kmeans:0.5.0
 ```
 
 ```bash
 $ python3 demux_kmeans.py \
-    --hto-umi-count-dir /data/umi_count
+    --hto-umi-count-dir /data/umi_count \
+    --min-count 0 \
+    --mode 1
 ```
@@ -1,3 +1,3 @@
 #!/bin/bash
 
-docker build -t cromwell-hto-demux-kmeans:0.4.0 .
+docker build -t cromwell-hto-demux-kmeans:0.5.0 .
@@ -22,53 +22,59 @@
     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
     handlers=[
         logging.FileHandler("demux_kmeans.log"),
-        logging.StreamHandler(sys.stdout)
-    ]
+        logging.StreamHandler(sys.stdout),
+    ],
 )
 
 
-def hto_demux(path_hto_umi_count_dir):
+def hto_demux(path_hto_umi_count_dir: str, mode: int, min_count_threshold: int):
 
-    matrix = scipy.io.mmread(
-        os.path.join(path_hto_umi_count_dir, "matrix.mtx.gz")
-    )
+    matrix = scipy.io.mmread(os.path.join(path_hto_umi_count_dir, "matrix.mtx.gz"))
     barcodes = pd.read_csv(
-        os.path.join(path_hto_umi_count_dir, "barcodes.tsv.gz"),
-        header=None
+        os.path.join(path_hto_umi_count_dir, "barcodes.tsv.gz"), header=None
     )[0]
     features = pd.read_csv(
-        os.path.join(path_hto_umi_count_dir, "features.tsv.gz"),
-        header=None
+        os.path.join(path_hto_umi_count_dir, "features.tsv.gz"), header=None
     )[0]
 
     # convert to numeric cell barcode
     dna3bit = DNA3Bit()
     numeric_barcodes = barcodes.apply(lambda cb: dna3bit.encode(cb))
 
-    df_umi = pd.DataFrame(
-        matrix.todense(),
-        columns=numeric_barcodes,
-        index=features
-    ).T
+    df_umi = pd.DataFrame(matrix.todense(), columns=numeric_barcodes, index=features).T
 
     logger.info(
-        "Loaded HTO UMI count matrix ({} x {})".format(
-            df_umi.shape[0], df_umi.shape[1]
-        )
+        "Loaded HTO UMI count matrix ({} x {})".format(df_umi.shape[0], df_umi.shape[1])
     )
 
     # drop the column `unmapped`
     df_umi = df_umi.iloc[:, 0:-1]
 
-    logger.info("Computing centered log-ratio (CLR)...")
-    # centered log-ratio (CLR) transformation
-    #     	            HTO_301-ACCCACCAGTAAGAC	HTO_302-GGTCGAGAGCATTCA	HTO_303-CTTGCCGCATGTCAT	HTO_304-AAAGCATTCTTCACG
-    # 227929296066909	2.609550	0.076485	2.049975	0.137688
-    # 164640656084404	2.477301	0.054396	0.046804	3.561632
-    # 121748877338358	2.501004	0.091309	0.034176	3.327706
-    # 134463437596589	3.060824	2.458869	0.053883
-    df_clr = df_umi.apply(lambda row: np.log1p(
-        (row + 1) / scipy.stats.mstats.gmean(row + 1)), axis=1)
+    logger.info(f"Running in mode {mode}...")
+    if mode == 1:
+        # centered log-ratio (CLR) transformation
+        #     	            HTO_301-ACCCACCAGTAAGAC	HTO_302-GGTCGAGAGCATTCA	HTO_303-CTTGCCGCATGTCAT	HTO_304-AAAGCATTCTTCACG
+        # 227929296066909	2.609550	0.076485	2.049975	0.137688
+        # 164640656084404	2.477301	0.054396	0.046804	3.561632
+        # 121748877338358	2.501004	0.091309	0.034176	3.327706
+        # 134463437596589	3.060824	2.458869	0.053883
+        df_clr = df_umi.apply(
+            lambda row: np.log1p((row + 1) / scipy.stats.mstats.gmean(row + 1)), axis=1
+        )
+    elif mode == 2:
+        # very noisy methanol-based data: subtract the per-row mean and clip negatives to 0 before the CLR transform
+        df_clr = df_umi.apply(lambda row: row - np.mean(row), axis=1)
+        df_clr = df_clr.applymap(lambda x: 0 if x < 0 else x)
+        df_clr = df_clr.apply(
+            lambda row: np.log1p((row + 1) / scipy.stats.mstats.gmean(row + 1)), axis=1
+        )
+    elif mode == 3:
+        # aggressively rescue from doublets if in doubt
+        df_clr = df_umi.apply(
+            lambda row: row / scipy.stats.mstats.gmean(row + 1), axis=1
+        )
+    else:
+        raise Exception("Unrecognized mode...")
 
     # change column name to column index so that we can access by e.g. x[1]
     df_tmp = df_umi
@@ -95,12 +101,14 @@ def kemans_per_row(row):
         df_kmeans = df_clr.apply(lambda row: kemans_per_row(row), axis=1)
 
     df_kmeans_hotencoded = df_kmeans.apply(
-        lambda x: "".join(str(y) for y in x)).to_frame()
+        lambda x: "".join(str(y) for y in x)
+    ).to_frame()
 
     # shorten and replace _ with -
     # ['HTO-301', 'HTO-302', 'HTO-303', 'HTO-304']
-    hto_names = list(map(lambda name: name.split(
-        "-")[0].replace("_", "-"), df_clr.columns))
+    hto_names = list(
+        map(lambda name: name.split("-")[0].replace("_", "-"), df_clr.columns)
+    )
 
     def demux_pass2(cb):
 
@@ -123,15 +131,15 @@ def demux_pass2(cb):
     )
     df_class.columns = ["CB", "hashID"]
     df_class.set_index("CB", inplace=True)
-    df_class
+
+    # mark as negative
+    # if the total count for a given CB is less than min-count threshold
+    mask_negative = df_umi.sum(axis=1) < min_count_threshold
+    df_class.where(~mask_negative, other="Negative", inplace=True)
 
     logger.debug(df_class.groupby(by="hashID").size())
 
-    df_class.to_csv(
-        "classification.tsv.gz",
-        sep="\t",
-        compression="gzip"
-    )
+    df_class.to_csv("classification.tsv.gz", sep="\t", compression="gzip")
 
     return df_class
 
@@ -154,7 +162,25 @@ def parse_arguments():
         action="store",
         dest="path_hto_umi_count_dir",
         help="path to UMI count outputs generated by CITE-Seq-Count",
-        required=True
+        required=True,
+    )
+
+    parser.add_argument(
+        "--min-count",
+        action="store",
+        dest="min_count_threshold",
+        type=int,
+        help="total count for CB less than this threshold will be marked as negative (unreliable observations)",
+        default=0,
+    )
+
+    parser.add_argument(
+        "--mode",
+        action="store",
+        dest="mode",
+        type=int,
+        help="processing mode (1=default)",
+        default=1,
     )
 
     # parse arguments
@@ -170,7 +196,7 @@ def parse_arguments():
     logger.info("Starting...")
 
     df_class = hto_demux(
-        params.path_hto_umi_count_dir
+        params.path_hto_umi_count_dir, params.mode, params.min_count_threshold
     )
 
     logger.info("Writing statistics...")
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-FROM ubuntu:18.04`
	`1`	`+FROM ubuntu:20.04`
`2`	`2`
`3`	`3`	`LABEL maintainer="Jaeyoung Chun (chunj@mskcc.org)"`
`4`	`4`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,3 @@`
`1`	`1`	`#!/bin/bash`
`2`	`2`
`3`		`-docker build -t cromwell-hto-demux-kmeans:0.4.0 .`
	`3`	`+docker build -t cromwell-hto-demux-kmeans:0.5.0 .`