Merged
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
@@ -40,6 +40,7 @@ jobs:
test_profile:
- "test_stub"
- "test_build"
- "test_stub_bam"
compute_profile:
- "docker"
- "singularity"
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -31,6 +31,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added `CTATSPLICING_PREPGENOMELIB` to update the starfusion genome library directory with a cancer splicing index. [#610](https://github.com/nf-core/rnafusion/pull/610)
- Add nf-test to local subworkflow: `FUSIONREPORT_WORKFLOW`. [#607](https://github.com/nf-core/rnafusion/pull/607)
- Add nf-test to local module: `ARRIBA_VISUALISATION`. [#625](https://github.com/nf-core/rnafusion/pull/625)
- Added the following fields to the samplesheet [#647](https://github.com/nf-core/rnafusion/pull/647):
  - `bam`: A BAM file aligned with STAR. It is the pipeline user's responsibility to make sure this file has been created correctly.
  - `bai`: The index of the BAM file. This is optional when a `bam` file has been given, but providing it can slightly speed up the pipeline.
  - `cram`: A CRAM file aligned with STAR. It is the pipeline user's responsibility to make sure this file has been created correctly.
  - `crai`: The index of the CRAM file. This is optional when a `cram` file has been given, but providing it can slightly speed up the pipeline.
  - `junctions`: A file containing the chimeric junctions determined by STAR (needed by `starfusion` and `ctatsplicing`).
  - `splice_junctions`: A file containing the splice junctions determined by STAR (needed by `ctatsplicing`).

### Changed

65 changes: 64 additions & 1 deletion assets/schema_input.json
@@ -27,12 +27,75 @@
"pattern": "^\\S+f(ast)?q\\.gz$",
"errorMessage": "FastQ file for reads 2 cannot contain spaces, has to exist and must have extension '.fq.gz' or '.fastq.gz'"
},
"bam": {
"type": "string",
"format": "file-path",
"exists": true,
"pattern": "^\\S+\\.bam$",
"errorMessage": "BAM file cannot contain spaces, has to exist and must have extension '.bam'"
},
"bai": {
"type": "string",
"format": "file-path",
"exists": true,
"pattern": "^\\S+\\.bam\\.bai$",
"errorMessage": "BAI file cannot contain spaces, has to exist and must have extension '.bam.bai'"
},
"cram": {
"type": "string",
"format": "file-path",
"exists": true,
"pattern": "^\\S+\\.cram$",
"errorMessage": "CRAM file cannot contain spaces, has to exist and must have extension '.cram'"
},
"crai": {
"type": "string",
"format": "file-path",
"exists": true,
"pattern": "^\\S+\\.cram\\.crai$",
"errorMessage": "CRAI file cannot contain spaces, has to exist and must have extension '.cram.crai'"
},
"junctions": {
"type": "string",
"format": "file-path",
"exists": true,
"pattern": "^\\S+\\.junction$",
"errorMessage": "Junctions file cannot contain spaces, has to exist and must have extension '.junction'"
},
"splice_junctions": {
"type": "string",
"format": "file-path",
"exists": true,
"pattern": "^\\S+\\.SJ.out.tab$",
"errorMessage": "Split junctions file cannot contain spaces, has to exist and must have extension '.SJ.out.tab'"
},
"strandedness": {
"type": "string",
"enum": ["forward", "reverse", "unstranded", "unknown"],
"errorMessage": "Strandedness has to be 'forward', 'reverse', 'unstranded' or 'unknown'"
}
},
"required": ["sample", "fastq_1", "fastq_2", "strandedness"]
"dependentRequired": {
"bai": ["bam"],
"crai": ["cram"]
},
"required": ["sample", "strandedness"],
"anyOf": [
{
"required": ["fastq_1", "fastq_2"]
},
{
"required": ["bam"]
},
{
"required": ["cram"]
},
{
"required": ["junctions"]
},
{
"required": ["splice_junctions"]
}
]
}
}
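The `dependentRequired` and `anyOf` rules added above can be paraphrased in plain Python. The helper below is an illustrative sketch, not part of the pipeline, and it simplifies the `anyOf` branch (the real schema requires `fastq_1` and `fastq_2` together for FASTQ input):

```python
# Sketch of the samplesheet rules encoded in assets/schema_input.json
# (illustrative only): an index column requires its alignment column,
# and at least one primary input column must be present.
def validate_row(row: dict) -> list[str]:
    errors = []
    for field in ("sample", "strandedness"):
        if not row.get(field):
            errors.append(f"'{field}' is required")
    # dependentRequired: an index is only valid next to its alignment file
    for index, target in (("bai", "bam"), ("crai", "cram")):
        if row.get(index) and not row.get(target):
            errors.append(f"'{index}' requires '{target}'")
    # anyOf (simplified): at least one primary input must be given
    primary = ("fastq_1", "bam", "cram", "junctions", "splice_junctions")
    if not any(row.get(f) for f in primary):
        errors.append("one of " + ", ".join(primary) + " is required")
    return errors
```

A row with only `bai` set fails, while the same row with `bam` added passes, matching the schema's behaviour.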
2 changes: 1 addition & 1 deletion conf/test.config
@@ -15,7 +15,7 @@ params {
config_profile_description = 'Minimal test dataset to check pipeline function'

// Input data
input = params.pipelines_testdata_base_path +'rnafusion/testdata/human/samplesheet_valid.csv'
input = "${projectDir}/tests/csv/fastq.csv"
tools = "all"
no_cosmic = true
}
2 changes: 1 addition & 1 deletion conf/test_build.config
@@ -35,7 +35,7 @@ params {

// Input data
references_only = true
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/rnafusion/testdata/human/samplesheet_valid.csv'
input = "${projectDir}/tests/csv/fastq.csv"
no_cosmic = true
tools = "arriba,starfusion,fusionreport,salmon,fusioninspector"
fasta = 'https://github.com/STAR-Fusion/STAR-Fusion-Tutorial/raw/master/minigenome.fa'
65 changes: 56 additions & 9 deletions docs/usage.md
@@ -83,7 +83,7 @@ nextflow run nf-core/rnafusion \
-profile <docker/singularity/.../institute> \
--references_only \
--cosmic_username <EMAIL> --cosmic_passwd <PASSWORD> \
--fusionreport \
--tools fusionreport \
--genomes_base <PATH/TO/REFERENCES> \
--outdir <OUTPUT/PATH>
```
@@ -111,7 +111,7 @@ The references are only built based on ensembl version 102. It is not possible c

### Samplesheet input

You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. The pipeline will detect whether a sample is single- or paired-end from the samplesheet - the `fastq_2` column is empty for single-end. The samplesheet has to be a comma-separated file (.csv) but can have as many columns as you desire. There is a strict requirement for the first 4 columns to match those defined in the table below with the header row included.
You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. The pipeline will detect whether a sample is single- or paired-end from the samplesheet - the `fastq_2` column is empty for single-end. The samplesheet can be a comma-separated (`.csv`), tab-separated (`.tsv`), YAML (`.yaml`/`.yml`) or JSON (`.json`) file and can have as many columns as you desire. The `sample` and `strandedness` columns are strictly required, and at least one of the following columns must also be provided: `fastq_1`, `bam`, `cram`, `junctions` or `splice_junctions`.
A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice.

```csv title="samplesheet.csv"
@@ -125,14 +125,61 @@ TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz,,forward
TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz,,forward
```

As you can see above for multiple runs of the same sample, the `sample` name has to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis.
As you can see above for multiple runs of the same sample, the `sample` name has to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Note that multiple rows per sample are not supported for samples that contain `bam`, `cram`, `junctions` and/or `splice_junctions` files.

| Column | Description |
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
| `strandedness` | Strandedness: forward or reverse. |
| Column | Description | Required |
| ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------ |
| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | :white_check_mark: |
| `strandedness` | Strandedness: forward or reverse. | :white_check_mark: |
| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File must exist, has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". It's recommended to always provide the FASTQ files because the pipeline will be able to create any missing files from these. The FASTQ files are required to run `salmon`, `fusioninspector` and `fusioncatcher`. | :grey_question: |
| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File must exist, has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". It's recommended to always provide the FASTQ files because the pipeline will be able to create any missing files from these. The FASTQ files are required to run `salmon`, `fusioninspector` and `fusioncatcher`. | :grey_question: |
| `bam` | Full path to the BAM file created with STAR. File has to exist and must have the extension ".bam". It's the responsibility of the pipeline user to make sure this file has been correctly created, see the [prepare chapter](#preparing-bamcramjunctionssplice_junctions) for more information. The BAM file is required to run `ctatsplicing`, `stringtie`, `fusioninspector` and `arriba` when the `fastq_1` and `cram` fields are empty. | :grey_question: |
| `bai` | Full path to the index of the BAM file. File has to exist and must have the extension ".bam.bai". | :x: |
| `cram` | Full path to the CRAM file created with STAR. File has to exist and must have the extension ".cram". It's the responsibility of the pipeline user to make sure this file has been correctly created, see the [prepare chapter](#preparing-bamcramjunctionssplice_junctions) for more information. The CRAM file is required to run `ctatsplicing`, `stringtie`, `fusioninspector` and `arriba` when the `fastq_1` and `bam` fields are empty. | :grey_question: |
| `crai` | Full path to the index of the CRAM file. File has to exist and must have the extension ".cram.crai". | :x: |
| `junctions` | Full path to the file containing chimeric junctions determined by STAR. File has to exist and must have the extension ".junction". It's the responsibility of the pipeline user to make sure this file has been correctly created, see the [prepare chapter](#preparing-bamcramjunctionssplice_junctions) for more information. The junctions file is required to run `starfusion` and `ctatsplicing` when the `fastq_1` field is empty. | :grey_question: |
| `splice_junctions` | Full path to the file containing splice junctions determined by STAR. File has to exist and must have the extension ".SJ.out.tab". It's the responsibility of the pipeline user to make sure this file has been correctly created, see the [prepare chapter](#preparing-bamcramjunctionssplice_junctions) for more information. The splice junctions file is required to run `ctatsplicing` when the `fastq_1` field is empty. | :grey_question: |

:white_check_mark: = Required
:x: = Not required
:grey_question: = One of these columns should be provided
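For example, a samplesheet starting from pre-aligned files rather than FASTQs might look like the one below (file names are illustrative; column order is flexible because columns are matched by header):

```csv title="samplesheet.csv"
sample,strandedness,bam,bai,junctions,splice_junctions
CONTROL_REP1,forward,CONTROL_REP1.bam,CONTROL_REP1.bam.bai,CONTROL_REP1.Chimeric.out.junction,CONTROL_REP1.SJ.out.tab
TREATMENT_REP1,forward,TREATMENT_REP1.bam,,TREATMENT_REP1.Chimeric.out.junction,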

### Preparing BAM/CRAM/junctions/splice_junctions

The pipeline uses the following STAR command to produce the required files:

```bash
STAR \
--genomeDir <path-to-star-index> \
--readFilesIn <comma-separated-list-of-forward-fastqs> <comma-separated-list-of-reverse-fastqs> \
--runThreadN <threads> \
--outFileNamePrefix <sample-name>. \
--outSAMattrRGline 'ID:<sample-name>' 'SM:<sample-name>' \
--outReadsUnmapped None \
--outSAMstrandField intronMotif \
--chimOutJunctionFormat 1 \
--twopassMode None \
--outFilterMultimapNmax 50 \
--chimMultimapNmax 50 \
--quantMode GeneCounts \
--outSAMunmapped Within \
--readFilesCommand zcat \
--alignSJstitchMismatchNmax 5 -1 5 5 \
--outSAMtype BAM SortedByCoordinate \
--chimSegmentMin 10 \
--peOverlapNbasesMin 10 \
--alignSplicedMateMapLminOverLmate 0.5 \
--chimJunctionOverhangMin 10 \
--chimScoreJunctionNonGTAG 0 \
--chimScoreDropMax 30 \
--chimScoreSeparation 1 \
--chimSegmentReadGapMax 3 \
    --chimOutType Junctions WithinBAM
```

We found that this command produces the best results for all downstream processes in the pipeline, so it is highly recommended to use the same command to generate the input BAM, CRAM, junctions and splice_junctions files.

The pipeline will still work when another command has been used, but the results may differ significantly from those of the standard workflow.
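Given the `--outFileNamePrefix <sample-name>.` convention in the command above, STAR's default output names map onto the samplesheet columns roughly as sketched below (the helper itself is illustrative, not part of the pipeline):

```python
# Map STAR output files for one sample onto the samplesheet columns.
# File names follow STAR's default naming given `--outFileNamePrefix <sample>.`
# and `--outSAMtype BAM SortedByCoordinate`; the helper is illustrative only.
def star_outputs_to_columns(sample: str) -> dict:
    return {
        "bam": f"{sample}.Aligned.sortedByCoord.out.bam",
        "junctions": f"{sample}.Chimeric.out.junction",
        "splice_junctions": f"{sample}.SJ.out.tab",
    }
```

The `bam` file can then be indexed (e.g. with `samtools index`) to fill in the optional `bai` column.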

### Starting commands

2 changes: 1 addition & 1 deletion modules/local/fusionreport/detect/main.nf
@@ -9,7 +9,7 @@ process FUSIONREPORT {


input:
tuple val(meta), path(reads), path(arriba_fusions), path(starfusion_fusions), path(fusioncatcher_fusions)
tuple val(meta), path(arriba_fusions), path(starfusion_fusions), path(fusioncatcher_fusions)
tuple val(meta2), path(fusionreport_ref)
val(tools_cutoff)

3 changes: 2 additions & 1 deletion modules/local/vcf_collect/main.nf
@@ -21,6 +21,7 @@ process VCF_COLLECT {

script:
def prefix = task.ext.prefix ?: "${meta.id}"
// TODO use BGZIP to compress the VCF file instead of GZIP
"""
vcf_collect.py --fusioninspector $fusioninspector_tsv --fusionreport $fusionreport_report --fusioninspector_gtf $fusioninspector_gtf_tsv --fusionreport_csv $fusionreport_csv --hgnc $hgnc_ref --sample ${prefix} --out ${prefix}_fusion_data.vcf
gzip ${prefix}_fusion_data.vcf
@@ -35,7 +36,7 @@
stub:
def prefix = task.ext.prefix ?: "${meta.id}"
"""
touch ${prefix}.vcf
touch ${prefix}_fusion_data.vcf.gz

cat <<-END_VERSIONS > versions.yml
"${task.process}":