FertigLab · dimalvovs · Sep 16, 2025 · Apr 15, 2025 · Apr 15, 2025 · May 9, 2025
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,5 +1,5 @@
 Package: CoGAPS
-Version: 3.27.5
+Version: 3.29.1
 Date: 2025-03-11
 Title: Coordinated Gene Activity in Pattern Sets
 Author: Jeanette Johnson, Ashley Tsang, Jacob Mitchell, Thomas Sherman, Wai-shing Lee, Conor Kelton, Ondrej Maxian, Jacob Carey,

diff --git a/Dockerfile b/Dockerfile
@@ -9,7 +9,7 @@ RUN sudo apt-get update -y && \
     apt-get install libhdf5-dev build-essential patch -y
 
 #packages below didn't install with devtools::install_deps, needed BiocManager
-RUN Rscript -e 'BiocManager::install(c("S4Vectors", "SingleCellExperiment", "SummarizedExperiment", "rhdf5", "fgsea"), ask=FALSE)'
+RUN Rscript -e 'BiocManager::install(c("S4Vectors", "SingleCellExperiment", "SummarizedExperiment", "rhdf5", "fgsea", "sparseMatrixStats"), ask=FALSE)'
 
 #install all other dependencies
 RUN Rscript -e 'devtools::install_deps(".", dependencies=TRUE)'

diff --git a/R/DistributedCogaps.R b/R/DistributedCogaps.R
@@ -87,7 +87,7 @@ distributedCogaps <- function(data, allParams, uncertainty)
     allParams$gaps@fixedPatterns <- matchedPatterns$consensus
     allParams$gaps@whichMatrixFixed <- ifelse(allParams$gaps@distributed
         == "genome-wide", "P", "A")
-        
+
     # run final phase with fixed matrix
     gapsCat(allParams, "Running Final Stage...\n\n")
     finalResult <- bplapply(1:length(sets), BPPARAM=allParams$BPPARAM,
@@ -233,7 +233,7 @@ stitchTogether <- function(result, allParams, sets)
         Asd <- do.call(rbind, lapply(result, function(x) x@loadingStdDev))
 
         # copy P matrix - same for all sets
-        Pmean <- result[[1]]@sampleFactors
+        Pmean <- result[[1]]@metadata$params@fixedPatterns
         Psd <- matrix(0, nrow=nrow(Pmean), ncol=ncol(Pmean))
 
         # if each feature was used once, re-order to match data
@@ -255,7 +255,7 @@ stitchTogether <- function(result, allParams, sets)
         Psd <- do.call(rbind, lapply(result, function(x) x@factorStdDev))
 
         # copy A matrix - same for all sets
-        Amean <- result[[1]]@featureLoadings
+        Amean <- result[[1]]@metadata$params@fixedPatterns
         Asd <- matrix(0, nrow=nrow(Amean), ncol=ncol(Amean))
 
         # if each sample was used once, re-order to match data
@@ -276,4 +276,3 @@ stitchTogether <- function(result, allParams, sets)
         "sampleNames"=rownames(Pmean),
         "meanChiSq"=sum(sapply(result, function(r) r@metadata$meanChiSq))))
 }
-
diff --git a/main.nf b/main.nf
@@ -11,34 +11,13 @@ process COGAPS {
     tuple val(meta), path("${prefix}/cogapsResult.rds"), emit: cogapsResult
     path  "versions.yml",                                emit: versions
 
-  stub:
-  def args = task.ext.args ?: ''
-  prefix = task.ext.prefix ?: "${meta.id}/${cparams.niterations}-${cparams.npatterns}-${cparams.sparse}-${cparams.distributed}"
-  """
-  mkdir "${prefix}"
-  touch "${prefix}/cogapsResult.rds"
-  cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        CoGAPS: \$(Rscript -e 'print(packageVersion("CoGAPS"))' | awk '{print \$2}')
-        R: \$(Rscript -e 'print(packageVersion("base"))' | awk '{print \$2}')
-  END_VERSIONS
-  """
-
   script:
   def args = task.ext.args ?: ''
   prefix = task.ext.prefix ?: "${meta.id}/${cparams.niterations}-${cparams.npatterns}-${cparams.sparse}-${cparams.distributed}"
   """
   mkdir -p "${prefix}"
   Rscript -e 'library("CoGAPS");
       sparse <- readRDS("$dgCMatrix");
-      #select top 5K genes
-      message("finding top ", ${params.n_top_genes}, " genes");
-      vars <- apply(sparse, 1, var);
-      ngenes <- min(length(vars),${params.n_top_genes});
-      top_genes <- order(vars, decreasing=TRUE)[1:ngenes];
-      sparse <- sparse[top_genes,];
-      message("selected top ", length(top_genes), " genes of ", length(vars));
-
       data <- as.matrix(sparse);
       #avoid errors with distributed params
       dist_param <- NULL;
@@ -70,6 +49,20 @@ process COGAPS {
         R: \$(Rscript -e 'print(packageVersion("base"))' | awk '{print \$2}')
   END_VERSIONS
   """
+
+  stub:
+  def args = task.ext.args ?: ''
+  prefix = task.ext.prefix ?: "${meta.id}/${cparams.niterations}-${cparams.npatterns}-${cparams.sparse}-${cparams.distributed}"
+  """
+  mkdir "${prefix}"
+  touch "${prefix}/cogapsResult.rds"
+  cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        CoGAPS: \$(Rscript -e 'print(packageVersion("CoGAPS"))' | awk '{print \$2}')
+        R: \$(Rscript -e 'print(packageVersion("base"))' | awk '{print \$2}')
+  END_VERSIONS
+  """
+
 }
 
 process COGAPS_TENX2DGC {
@@ -83,28 +76,30 @@ process COGAPS_TENX2DGC {
       tuple val(meta), path("${prefix}/dgCMatrix.rds"), emit: dgCMatrix
       path "versions.yml"                             , emit: versions
 
-  stub:
+
+  script:
   def args = task.ext.args ?: ''
   prefix = task.ext.prefix ?: "${meta.id}"
-
   """
   mkdir "${prefix}"
-  touch "${prefix}/dgCMatrix.rds"
+
+  Rscript -e 'res <- Seurat::Read10X("$data/filtered_feature_bc_matrix/");
+              saveRDS(res, file="${prefix}/dgCMatrix.rds")';
+
   cat <<-END_VERSIONS > versions.yml
     "${task.process}":
         seurat: \$(Rscript -e 'print(packageVersion("Seurat"))' | awk '{print \$2}')
         R: \$(Rscript -e 'print(packageVersion("base"))' | awk '{print \$2}')
   END_VERSIONS
   """
 
-  script:
+  stub:
   def args = task.ext.args ?: ''
   prefix = task.ext.prefix ?: "${meta.id}"
+
   """
   mkdir "${prefix}"
-
-  Rscript -e 'res <- Seurat::Read10X("$data/filtered_feature_bc_matrix/");
-              saveRDS(res, file="${prefix}/dgCMatrix.rds")';
+  touch "${prefix}/dgCMatrix.rds"
 
   cat <<-END_VERSIONS > versions.yml
     "${task.process}":
@@ -125,20 +120,6 @@ process COGAPS_ADATA2DGC {
       tuple val(meta), path("${prefix}/dgCMatrix.rds"), emit: dgCMatrix
       path "versions.yml"                             , emit: versions
 
-  stub:
-  def args = task.ext.args ?: ''
-  prefix = task.ext.prefix ?: "${meta.id}"
-
-  """
-  mkdir "${prefix}"
-  touch "${prefix}/dgCMatrix.rds"
-  cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        hdf5r: \$(Rscript -e 'print(packageVersion("Seurat"))' | awk '{print \$2}')
-        R: \$(Rscript -e 'print(packageVersion("base"))' | awk '{print \$2}')
-  END_VERSIONS
-  """
-
   script:
   def args = task.ext.args ?: ''
   prefix = task.ext.prefix ?: "${meta.id}"
@@ -183,6 +164,88 @@ process COGAPS_ADATA2DGC {
         R: \$(Rscript -e 'print(packageVersion("base"))' | awk '{print \$2}')
   END_VERSIONS
   """
+
+  stub:
+  def args = task.ext.args ?: ''
+  prefix = task.ext.prefix ?: "${meta.id}"
+
+  """
+  mkdir "${prefix}"
+  touch "${prefix}/dgCMatrix.rds"
+  cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        hdf5r: \$(Rscript -e 'print(packageVersion("hdf5r"))' | awk '{print \$2}')
+        R: \$(Rscript -e 'print(packageVersion("base"))' | awk '{print \$2}')
+  END_VERSIONS
+  """
+}
+
+process COGAPS_PREPROCESS {
+  tag "$prefix"
+  label 'process_medium'
+  container 'ghcr.io/fertiglab/cogaps:master'
+  // Reads an RDS sparse matrix, drops near-empty rows/columns, keeps the top-variance rows, re-saves it.
+  input:
+    tuple val(meta), path(dgCMatrix)
+
+  output:
+    tuple val(meta), path("${prefix}/dgCMatrix.rds"),    emit: dgCMatrix
+    path  "versions.yml",                                emit: versions
+
+  script:
+  def args = task.ext.args ?: ''
+  prefix = task.ext.prefix ?: "${meta.id}"
+  """
+  mkdir -p "${prefix}"
+  Rscript -e 'library("Matrix");
+      library("sparseMatrixStats");
+      sparse <- readRDS("$dgCMatrix");
+
+      #fraction of zero entries - nnzero avoids a dense comparison and prod(dim) avoids integer overflow
+      message("sparsity: ", 1 - Matrix::nnzero(sparse) / prod(dim(sparse)));
+
+      #keep only rows where more than 5% of the entries are nonzero
+      message("filtering rows with >95% zeros");
+      nz <- rowSums(sparse != 0);
+      sparse <- sparse[nz > 0.05 * ncol(sparse), , drop=FALSE];
+      message("filtered to ", nrow(sparse), " rows of ", length(nz));
+
+      #keep only columns where more than 5% of the remaining entries are nonzero
+      message("filtering columns with >95% zeros");
+      nz <- colSums(sparse != 0);
+      sparse <- sparse[, nz > 0.05 * nrow(sparse), drop=FALSE];
+      message("filtered to ", ncol(sparse), " columns of ", length(nz));
+
+      #fraction of zero entries after filtering
+      message("sparsity: ", 1 - Matrix::nnzero(sparse) / prod(dim(sparse)));
+
+      #select top N rows by variance - seq_len keeps the index valid when N is 0
+      message("finding top ", ${params.n_top_genes}, " genes");
+      vars <- rowVars(sparse);
+      ngenes <- min(length(vars),${params.n_top_genes});
+      top_genes <- order(vars, decreasing=TRUE)[seq_len(ngenes)];
+      sparse <- sparse[top_genes, , drop=FALSE];
+      message("selected top ", length(top_genes), " genes of ", length(vars));
+
+      saveRDS(sparse, file = "${prefix}/dgCMatrix.rds")'
+
+  cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        R: \$(Rscript -e 'print(packageVersion("base"))' | awk '{print \$2}')
+  END_VERSIONS
+  """
+
+  stub:
+  def args = task.ext.args ?: ''
+  prefix = task.ext.prefix ?: "${meta.id}"
+  """
+  mkdir "${prefix}"
+  touch "${prefix}/dgCMatrix.rds"
+  cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        R: \$(Rscript -e 'print(packageVersion("base"))' | awk '{print \$2}')
+  END_VERSIONS
+  """
 }
 
 
@@ -209,9 +272,17 @@ workflow {
   // convert adata to dgCMatrix
   COGAPS_ADATA2DGC(ch_adata)
 
+  // preprocess dgCMatrix
+  ch_preprocess = COGAPS_ADATA2DGC.out.dgCMatrix
+    .map { tuple(it[0], it[1]) }
+
+  ch_preprocess = ch_preprocess.mix(ch_rds)
+
+  COGAPS_PREPROCESS(ch_preprocess)
+
   // ch_cogaps_input of converted adatas and rdses
-  ch_input = COGAPS_ADATA2DGC.out.dgCMatrix
-  ch_input = ch_input.mix(ch_rds)
+  ch_input = COGAPS_PREPROCESS.out.dgCMatrix
+    .map { tuple(it[0], it[1]) }
 
   // combine the two channels as input to CoGAPS
   ch_input = ch_input.combine(ch_cparams)
@@ -220,4 +291,5 @@ workflow {
 }
 
 //example:
-//nextflow run main.nf --input tests/nextflow --outdir out -c nextflow.config -profile docker 
+//nextflow run main.nf --input tests/nextflow --outdir out -c nextflow.config -profile docker --max_memory 10GB --max_cpus 8
+
diff --git a/nextflow.config b/nextflow.config
@@ -12,8 +12,8 @@ params {
     distributed = "null"
     nthreads = 1
 
-    max_memory = '128.GB'
-    max_cpus = 8
+    max_memory = '200.GB'
+    max_cpus = 16
     max_time = '72.h'
 
     n_top_genes = 5000
@@ -91,6 +91,8 @@ process {
     memory = { check_max( 6.GB * task.attempt, 'memory' ) }
     time   = { check_max( 4.h  * task.attempt, 'time'   ) }
 
+    resourceLimits = [cpus: params.max_cpus, memory: params.max_memory, time: params.max_time]
+
     errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' }
     maxRetries    = 1
     maxErrors     = '-1'
@@ -103,30 +105,31 @@ process {
     // TODO nf-core: Customise requirements for specific processes.
     // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
     withLabel:process_single {
-        cpus   = { check_max( 1                  , 'cpus'    ) }
-        memory = { check_max( 6.GB * task.attempt, 'memory'  ) }
-        time   = { check_max( 4.h  * task.attempt, 'time'    ) }
+        cpus   = {  1                    }
+        memory = {  6.GB * task.attempt  }
+        time   = {  4.h  * task.attempt   }
     }
     withLabel:process_low {
-        cpus   = { check_max( 2     * task.attempt, 'cpus'    ) }
-        memory = { check_max( 12.GB * task.attempt, 'memory'  ) }
-        time   = { check_max( 4.h   * task.attempt, 'time'    ) }
+        cpus   = {  2     * task.attempt }
+        memory = {  12.GB * task.attempt }
+        time   = {  4.h   * task.attempt }
     }
     withLabel:process_medium {
-        cpus   = { check_max( 6     * task.attempt, 'cpus'    ) }
-        memory = { check_max( 24.GB * task.attempt, 'memory'  ) }
-        time   = { check_max( 8.h   * task.attempt, 'time'    ) }
+        cpus   = {  6     * task.attempt }
+        memory = {  24.GB * task.attempt }
+        time   = {  8.h   * task.attempt }
     }
     withLabel:process_high {
-        cpus   = { check_max( 12    * task.attempt, 'cpus'    ) }
-        memory = { check_max( 72.GB * task.attempt, 'memory'  ) }
-        time   = { check_max( 16.h  * task.attempt, 'time'    ) }
+        cpus   = {  16    * task.attempt }
+        memory = {  72.GB * task.attempt }
+        time   = {  16.h  * task.attempt }
     }
     withLabel:process_long {
-        time   = { check_max( 48.h  * task.attempt, 'time'    ) }
+        time   = {  48.h  * task.attempt }
     }
     withLabel:process_high_memory {
-        memory = { check_max( 200.GB * task.attempt, 'memory' ) }
+        memory = {  200.GB * task.attempt }
+
     }
     withLabel:error_ignore {
         errorStrategy = 'ignore'

diff --git a/tests/testthat/test_DistributedCogaps.R b/tests/testthat/test_DistributedCogaps.R
@@ -0,0 +1,47 @@
+test_that("featureLoadings and sampleFactors are not all 0s in single-cell", {
+  params <- CogapsParams(seed = 42,
+                         nIterations = 100,
+                         nPatterns = 2,
+                         sparseOptimization = FALSE,
+                         distributed = "single-cell")
+
+  params <- setDistributedParams(params, nSets = 2)
+  data(GIST)
+  cg <- CoGAPS(GIST.matrix, params = params)
+
+  featureLoadings <- cg@featureLoadings
+  sampleFactors <- cg@sampleFactors
+
+  # Orientation-agnostic shape check: the smaller dimension must equal
+  # nPatterns and the larger one the matching dimension of the input data.
+  expect_false(all(featureLoadings == 0))
+  expect_false(all(sampleFactors == 0))
+  expect_equal(min(dim(sampleFactors)), params@nPatterns)
+  expect_equal(min(dim(featureLoadings)), params@nPatterns)
+  expect_equal(max(dim(sampleFactors)), ncol(GIST.matrix))
+  expect_equal(max(dim(featureLoadings)), nrow(GIST.matrix))
+})
+
+test_that("featureLoadings and sampleFactors are not all 0s in genome-wide", {
+  params <- CogapsParams(seed = 42,
+                         nIterations = 100,
+                         nPatterns = 2,
+                         sparseOptimization = FALSE,
+                         distributed = "genome-wide")
+
+  params <- setDistributedParams(params, nSets = 2)
+  data(GIST)
+  cg <- CoGAPS(GIST.matrix, params = params)
+
+  featureLoadings <- cg@featureLoadings
+  sampleFactors <- cg@sampleFactors
+
+  # Orientation-agnostic shape check: the smaller dimension must equal
+  # nPatterns and the larger one the matching dimension of the input data.
+  expect_false(all(featureLoadings == 0))
+  expect_false(all(sampleFactors == 0))
+  expect_equal(min(dim(sampleFactors)), params@nPatterns)
+  expect_equal(min(dim(featureLoadings)), params@nPatterns)
+  expect_equal(max(dim(sampleFactors)), ncol(GIST.matrix))
+  expect_equal(max(dim(featureLoadings)), nrow(GIST.matrix))
+})