FertigLab · dimalvovs · Jul 9, 2025 · May 9, 2025 · May 9, 2025 · May 9, 2025
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -9,9 +9,8 @@
 ^inst/benchmarks/
 ^inst/profiling/
 ^inst/vignettes/
-^nextflow
 
-^.git
+^\.git
 
 ^src/Makevars(?!.in$|.win$)
 ^src/Cogaps.o
@@ -66,3 +65,11 @@
 ^src/gibbs_sampler/SparseNormalModel.o
 
 ^src/math/VectorMath\.o$
+
+^nextflow
+^main\.nf$
+^work$
+^\.nextflow
+^\.cirro$
+^tests/nextflow$
+
diff --git a/.cirro/process-compute.config b/.cirro/process-compute.config
@@ -0,0 +1,5 @@
+process {
+    executor = 'awsbatch'
+    errorStrategy = 'retry'
+    maxRetries = 2
+}
diff --git a/.cirro/process-form.json b/.cirro/process-form.json
@@ -0,0 +1,42 @@
+{
+  "form": {
+    "type": "object",
+    "properties": {
+      "npatterns": {
+        "title": "npatterns",
+        "description": "number of patterns to find, e.g. 5 or 5,6,7",
+        "type": "string"
+      },
+      "niterations": {
+        "title": "niterations",
+        "description": "number of iterations to run",
+        "type": "integer",
+        "default": 1000
+      },
+      "n_top_genes": {
+        "title": "n_top_genes",
+        "description": "number of top genes (by variance) to use",
+        "type": "integer",
+        "default": 1000
+      },
+      "distributed": {
+        "title": "distributed",
+        "description": "single run: none, distributed: genome-wide, single-cell",
+        "type": "string",
+        "enum": [
+          "none",
+          "genome-wide",
+          "single-cell"
+        ]
+      },
+      "nsets": {
+        "title": "nsets",
+        "description": "If distributed, number of sets to split into",
+        "type": "integer",
+        "default": 4
+      }
+    },
+    "required": []
+  },
+  "ui": {}
+}
diff --git a/.cirro/process-input.json b/.cirro/process-input.json
@@ -0,0 +1,10 @@
+
+{
+  "niterations": "$.dataset.params.niterations",
+  "n_top_genes": "$.dataset.params.n_top_genes",
+  "distributed": "$.dataset.params.distributed",
+  "nsets": "$.dataset.params.nsets",
+  "outdir": "$.dataset.dataPath",
+  "input": "$.inputs.[*].dataPath",
+  "npatterns": "$.dataset.params.npatterns"
+}
diff --git a/.cirro/process-output.json b/.cirro/process-output.json
@@ -0,0 +1,3 @@
+{
+    "commands": []
+}
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,5 +1,5 @@
 Package: CoGAPS
-Version: 3.27.4
+Version: 3.27.5
 Date: 2025-03-11
 Title: Coordinated Gene Activity in Pattern Sets
 Author: Jeanette Johnson, Ashley Tsang, Jacob Mitchell, Thomas Sherman, Wai-shing Lee, Conor Kelton, Ondrej Maxian, Jacob Carey,

diff --git a/README.md b/README.md
@@ -28,3 +28,23 @@ devtools::install_github("FertigLab/CoGAPS")
 # Using CoGAPS
 
 Follow the vignette [here](https://github.com/FertigLab/CoGAPS/blob/master/vignettes/CoGAPS.Rmd) and available as static html [here](https://rpubs.com/jeanettejohnson/1018399)
+
+# Run as nextflow pipeline
+The example below runs CoGAPS with 3 and 4 patterns on every `.rds` and `.h5ad` file in the input folder (`tests/nextflow`).
+```
+nextflow run main.nf --input tests/nextflow --outdir out -c nextflow.config -profile docker --max_memory 10GB --npatterns 3,4
+```
+
+Supported CLI params and their defaults are:
+
+    npatterns = "5"
+    nsets = 1
+    niterations = 100
+    sparse = 0
+    seed = 42
+    distributed = "null"
+    nthreads = 1
+    max_memory = '128.GB'
+    max_cpus = 8
+    max_time = '72.h'
+    n_top_genes = 5000
diff --git a/nextflow/main.nf → main.nf b/nextflow/main.nf → main.nf
@@ -1,6 +1,6 @@
 process COGAPS {
   tag "$prefix"
-  label 'process_medium'
+  label 'process_high'
   label 'process_long'
   container 'ghcr.io/fertiglab/cogaps:master'
 
@@ -31,6 +31,14 @@ process COGAPS {
   mkdir -p "${prefix}"
   Rscript -e 'library("CoGAPS");
       sparse <- readRDS("$dgCMatrix");
+      #keep the n_top_genes rows with the highest row-wise variance
+      message("finding top ", ${params.n_top_genes}, " genes");
+      vars <- apply(sparse, 1, var);
+      ngenes <- min(length(vars),${params.n_top_genes});
+      top_genes <- order(vars, decreasing=TRUE)[seq_len(ngenes)]; #seq_len avoids 1:0 yielding c(1,0)
+      sparse <- sparse[top_genes, , drop=FALSE]; #drop=FALSE keeps a matrix when only one row is selected
+      message("selected top ", length(top_genes), " genes of ", length(vars));
+
       data <- as.matrix(sparse);
       #avoid errors with distributed params
       dist_param <- NULL;
@@ -42,10 +50,18 @@ process COGAPS {
                              sparseOptimization = as.logical($cparams.sparse),
                              distributed=dist_param);
       if (!(is.null(dist_param))){
-        params <- setDistributedParams(params, nSets = $cparams.nsets);
+        nsets <- $cparams.nsets;
+        allow_cpus <- as.numeric($task.cpus);
+        if( allow_cpus < 2){
+          stop("Error: distributed mode requires at least 2 cpus")
+        }
+        if (nsets > allow_cpus){
+          message("Warning: nsets is greater than available cpus. Setting nsets to ", allow_cpus);
+        } 
+        params <- setDistributedParams(params, nSets = min(nsets,allow_cpus));
       };
       cogapsResult <- CoGAPS(data = data, params = params, nThreads = $cparams.nthreads,
-                             outputFrequency = floor($cparams.niterations/10));
+                             outputFrequency = 100);
       saveRDS(cogapsResult, file = "${prefix}/cogapsResult.rds")'
 
   cat <<-END_VERSIONS > versions.yml
@@ -88,7 +104,6 @@ process COGAPS_TENX2DGC {
   mkdir "${prefix}"
 
   Rscript -e 'res <- Seurat::Read10X("$data/filtered_feature_bc_matrix/");
-              res <- Seurat::NormalizeData(res);
               saveRDS(res, file="${prefix}/dgCMatrix.rds")';
 
   cat <<-END_VERSIONS > versions.yml
@@ -101,7 +116,7 @@ process COGAPS_TENX2DGC {
 
 process COGAPS_ADATA2DGC {
   tag "$meta.id"
-  label 'process_low'
+  label 'process_medium'
   container 'docker.io/satijalab/seurat:5.0.0'
 
   input:
@@ -157,9 +172,6 @@ process COGAPS_ADATA2DGC {
               if(transpose){
                 res <- Matrix::t(res)
               }; 
-
-              message("Normalizing data");
-              res <- Seurat::NormalizeData(res);
               message("Saving dgCMatrix");
               saveRDS(res, file="${prefix}/dgCMatrix.rds")';
 
@@ -173,22 +185,39 @@ process COGAPS_ADATA2DGC {
   """
 }
 
-//example channel with data folders, for example
-ch_data = Channel.fromPath('./test/**gist.rds')
-  .map { tuple([id:it.getParent().getName()], it)}
 
-//example channel with cparams
-ch_cparams = Channel.of([npatterns: 7, niterations: 100, sparse: 1, distributed: 'null', nsets:1, nthreads:1],
-                        [npatterns: 7, niterations: 100, sparse: 0, distributed: 'null', nsets:1, nthreads:1])
+//example workflow
+workflow {
+  //example channel with data folders, for example
+  ch_adata = Channel.fromPath("${params.input}/**.h5ad")
+    .map { tuple([id:it.getName().replace('.', '-')], it)}
 
-// combine the two channels as input to CoGAPS
-ch_input = ch_data.combine(ch_cparams)
+  ch_rds = Channel.fromPath("${params.input}/**.rds")
+    .map { tuple([id:it.getName().replace('.', '-')], it)}
+
+  //make a channel with desired pattern number; toString() because a bare CLI number (--npatterns 5) arrives as an Integer, not a String
+  def patterns = params.npatterns.toString().split(',').collect { it.trim().toInteger() }
+  ch_patterns = Channel.fromList(patterns)
+
+  //example channel with cparams
+  ch_fixed_params = Channel.of([niterations: params.niterations, sparse: params.sparse, distributed: params.distributed, nsets:params.nsets, nthreads:1])
-  ch_fixed_params = Channel.of([niterations: params.niterations, sparse: params.sparse, distributed: params.distributed, nsets:params.nsets, nthreads:1])
+  ch_fixed_params = Channel.of([niterations: params.niterations, sparse: params.sparse, distributed: params.distributed, nsets:params.nsets, nthreads: params.nthreads])
-  ch_fixed_params = Channel.of([niterations: params.niterations, sparse: params.sparse, distributed: params.distributed, nsets:params.nsets, nthreads:1])
+  ch_fixed_params = Channel.of([niterations: params.niterations, sparse: params.sparse, distributed: params.distributed, nsets:params.nsets, nthreads: params.nthreads])
+
+  ch_cparams = ch_patterns
+    .combine(ch_fixed_params)
+    .map { tuple([id:it[0].toString(), npatterns:it[0], niterations:it[1].niterations, sparse:it[1].sparse, distributed:it[1].distributed, nsets:it[1].nsets, nthreads:it[1].nthreads]) }
+
+  // convert adata to dgCMatrix
+  COGAPS_ADATA2DGC(ch_adata)
+
+  // ch_cogaps_input of converted adatas and rdses
+  ch_input = COGAPS_ADATA2DGC.out.dgCMatrix
+  ch_input = ch_input.mix(ch_rds)
+
+  // combine the two channels as input to CoGAPS
+  ch_input = ch_input.combine(ch_cparams)
 
-//run the workflow
-workflow {
   COGAPS(ch_input)
 }
 
 //example:
-//nextflow run main.nf -profile docker -resume
-//nextflow run main.nf -profile slurm -resume
+//nextflow run main.nf --input tests/nextflow --outdir out -c nextflow.config -profile docker 
diff --git a/nextflow/nextflow.config → nextflow.config b/nextflow/nextflow.config → nextflow.config
@@ -1,20 +1,22 @@
 // default params
 params {
-    outdir = 'out'
-    input = ''
+    outdir = null
+    input = null
 
     //cogaps params
-    npatterns = 7
+    npatterns = "5"
     nsets = 1
     niterations = 100
-    sparse = 1
+    sparse = 0
     seed = 42
-    distributed = 'null'
+    distributed = "null"
     nthreads = 1
 
-    max_memory = '8.GB'
-    max_cpus = 4
-    max_time = '48.h'
+    max_memory = '128.GB'
+    max_cpus = 8
+    max_time = '72.h'
+
+    n_top_genes = 5000
 }
 
 //reporting
@@ -151,8 +153,6 @@ profiles {
         singularity.autoMounts = true
         process {
             executor = 'slurm'
-            cpus = 10
-            memory = '10 GB'
         }
     }
     test {

diff --git a/tests/nextflow/gist.h5ad b/tests/nextflow/gist.h5ad
diff --git a/tests/nextflow/gist.rds b/tests/nextflow/gist.rds