diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..ab5eaca --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,81 @@ +name: Tests + +on: + push: + branches: [master, dev, looper-update] + pull_request: + branches: [master, dev] + workflow_dispatch: + inputs: + run_integration: + description: "Run integration tests (requires self-hosted runner)" + required: false + default: "false" + +jobs: + # -------------------------------------------------------------------------- + # Tier 1: Unit tests — no genome data or bioinformatics tools required. + # Runs on every push and pull request. + # -------------------------------------------------------------------------- + unit-tests: + name: Unit tests (Python ${{ matrix.python-version }}) + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.9", "3.11", "3.12"] + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Python dependencies + run: pip install -r requirements.txt pytest + + - name: Run unit tests + run: pytest tests/test_unit.py -v --tb=short + + # -------------------------------------------------------------------------- + # Tier 2: Integration tests — full pipeline runs. + # Requires a self-hosted runner with genome indices and tools installed. + # Runs only on a manual workflow_dispatch with run_integration=true; + # the test step itself sets RUN_INTEGRATION_TESTS=true for pytest. 
+ # -------------------------------------------------------------------------- + integration-tests: + name: Integration tests (${{ matrix.scenario }}) + if: > + github.event_name == 'workflow_dispatch' && + github.event.inputs.run_integration == 'true' + runs-on: self-hosted + strategy: + fail-fast: false + matrix: + scenario: + - se_basic + - pe_basic + - se_groseq + - se_umi + - pe_umi + - se_fastp + - se_fastx + - se_fqdedup + - se_scale + - se_no_complexity + - se_nofifo + - se_coverage + + steps: + - uses: actions/checkout@v4 + + - name: Install Python dependencies + run: pip install -r requirements.txt pytest + + - name: Run integration test for ${{ matrix.scenario }} + env: + RUN_INTEGRATION_TESTS: "true" + run: > + pytest tests/test_integration.py -v --tb=short + -k "${{ matrix.scenario }}" diff --git a/Makefile b/Makefile index 342e1b4..72a7dea 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,52 @@ test: - python pipelines/peppro.py -P 3 -M 100 -O peppro_test -R -S test -G hg38 -Q single -C peppro.yaml --genome-size hs --prealignments rCRSd human_repeats -I examples/data/test_R1.fq.gz + python pipelines/peppro.py -P 3 -M 100 -O peppro_test -R -S test -G hg38 \ + -Q single -C peppro.yaml \ + --protocol PRO \ + --prealignment-names rCRSd human_repeats \ + --genome-index $$(refgenie seek hg38/bowtie2_index --seek-key dir) \ + --chrom-sizes $$(refgenie seek hg38/fasta --seek-key chrom_sizes) \ + --pipestat-schema peppro_output_schema.yaml \ + -I examples/data/test_r1.fq.gz + +# ----------------------------------------------------------------------- +# Test suite targets +# ----------------------------------------------------------------------- + +# Run only unit tests (no genome data or external tools required) +test-unit: + pytest tests/test_unit.py -v --tb=short + +# Run a single integration scenario (e.g. 
make test-scenario SCENARIO=se_basic) +SCENARIO ?= se_basic +test-scenario: + RUN_INTEGRATION_TESTS=true pytest tests/test_integration.py -v --tb=short -k "$(SCENARIO)" + +# Run all SE integration scenarios +test-se: + RUN_INTEGRATION_TESTS=true pytest tests/test_integration.py -v --tb=short \ + -k "se_basic or se_groseq or se_umi or se_fastp or se_fastx or se_fqdedup or se_scale or se_no_complexity or se_nofifo or se_coverage" + +# Run all PE integration scenarios +test-pe: + RUN_INTEGRATION_TESTS=true pytest tests/test_integration.py -v --tb=short \ + -k "pe_basic or pe_umi" + +# Run recovery regression tests +test-recovery: + RUN_INTEGRATION_TESTS=true pytest tests/test_integration.py -v --tb=short \ + -k "recovery" + +# Run all integration tests (SE + PE + recovery) +test-integration: + RUN_INTEGRATION_TESTS=true pytest tests/test_integration.py -v --tb=short + +# Run both unit and integration tests +test-all: + RUN_INTEGRATION_TESTS=true pytest tests/ -v --tb=short + +# Regenerate test FASTQ data files from the source R1 read file +test-data: + bash tests/scripts/generate_test_data.sh docker: docker build -t databio/peppro -f containers/peppro.Dockerfile . diff --git a/PEPPROr/DESCRIPTION b/PEPPROr/DESCRIPTION index e5ee59c..d7f302b 100644 --- a/PEPPROr/DESCRIPTION +++ b/PEPPROr/DESCRIPTION @@ -1,10 +1,10 @@ Package: PEPPROr Title: Functions and libraries to analyze pro-seq (or gro-seq) data -Version: 0.0.2.0000 +Version: 0.0.3.0000 Authors@R: person("Jason", "Smith", email = "jasonsmith@virginia.edu", role = c("aut", "cre")) Maintainer: Jason Smith Description: Installs required libraries to calculate the fraction of reads in features, to plot library complexity curves, TSS enrichments, and fragment length distributions. 
-Depends: R (>= 3.5.1), data.table, pepr, ggplot2, optigrab, GenomicDistributions +Depends: R (>= 3.5.1), data.table, pepr, ggplot2, GenomicDistributions License: BSD 2-Clause "Simplified" License Encoding: UTF-8 LazyData: true diff --git a/PEPPROr/R/PEPPROr.R b/PEPPROr/R/PEPPROr.R index 1188eef..c40dd24 100644 --- a/PEPPROr/R/PEPPROr.R +++ b/PEPPROr/R/PEPPROr.R @@ -2361,7 +2361,7 @@ calcCountsTable = function(project, results_subdir) { #' @export createAssetsSummary <- function(project, output_dir, results_subdir) { # Convenience - project_name <- config(project)$name + project_name <- pepr::config(project)$name # Create assets_summary file project_samples <- pepr::sampleTable(project)$sample_name diff --git a/README.md b/README.md index 9a21eb4..d898323 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,30 @@ PEPPRO is a pipeline designed to process PRO-seq (and GRO-seq) data. For more information see: http://peppro.databio.org/ +## Install + +```bash +pip install piper pipestat looper +``` + +**Note:** The pypiper PyPI package is `piper` (not `pypiper`, which is an unrelated package). + +## Testing + +Unit tests need no special setup: + +```bash +pytest tests/test_unit.py -v +``` + +Integration tests require bioinformatics tools via [bulker](https://bulker.io). Use the wrapper script: + +```bash +bash tests/scripts/test-integration.sh +``` + +This runs `bulker exec databio/peppro:1.1.0` to provide samtools, bowtie2, bedtools, etc. Do NOT run integration tests without bulker — they will fail with missing tools. 
+ ## Docs Develop docs with: diff --git a/checkinstall b/checkinstall index bc44b3f..a1e67ef 100755 --- a/checkinstall +++ b/checkinstall @@ -3,8 +3,8 @@ # PEPPRO pipeline installation check # -if [ $# -gt 0 ] ; then - echo "Usage: checkinstall" +if [ $# -gt 1 ] ; then + echo "Usage: checkinstall [conda_env_name]" exit 1 fi @@ -29,8 +29,12 @@ trim() { printf '%s' "$var" } +CONDA_RUN=() + is_executable() { - if [ -x "$(command -v $1)" ]; then + if [ "${#CONDA_RUN[@]}" -gt 0 ]; then + "${CONDA_RUN[@]}" which "$1" > /dev/null 2>&1 + elif [ -x "$(command -v $1)" ]; then return 0 else return 1 @@ -38,7 +42,7 @@ is_executable() { } pip_show() { - if pip show -q $1; then + if "${CONDA_RUN[@]}" pip show -q $1 2>/dev/null; then return 0 else return 1 @@ -70,7 +74,7 @@ BASE_REQS=0 declare -a requiredPkgs=("looper") for package in ${requiredPkgs[@]}; do - if ! pip_show $package; then + if ! pip_show $package && ! is_executable $package; then echo $(fail "ERROR: PEPPRO requires the Python package, $package. Try pip install $package.") printf "\n" exit 1 @@ -89,14 +93,19 @@ NATIVE_INSTALL=0 # Check Python if ! is_executable "python"; then - echo $(warn "WARNING: PEPPRO requires python 3.0 or greater. Install python and checkinstall again.") + echo $(warn "WARNING: PEPPRO requires python >=3.9,<3.14. Install python and checkinstall again.") printf "\n" NATIVE_INSTALL=1 BULKER_INSTALL=1 else - ver=$(python -V 2>&1 | sed 's/.* \([0-9]\).\([0-9]\).*/\1\2/') - if [ "$ver" -lt "30" ]; then - echo $(warn "WARNING: PEPPRO requires python 3.0 or greater. Update python and checkinstall again.") + ver=$(python -V 2>&1 | sed 's/.* \([0-9]\).\([0-9]*\).*/\1\2/') + if [ "$ver" -lt "39" ]; then + echo $(warn "WARNING: PEPPRO requires python >=3.9,<3.14. Update python and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + elif [ "$ver" -ge "314" ]; then + echo $(warn "WARNING: PEPPRO requires python >=3.9,<3.14. 
Looper 2.x is incompatible with Python 3.14+.") printf "\n" NATIVE_INSTALL=1 BULKER_INSTALL=1 @@ -197,20 +206,20 @@ done ## Check R packages if ! is_executable "R"; then - echo $(warn "WARNING: PEPPRO requires R 3.5 or greater. Install R and checkinstall again.") + echo $(warn "WARNING: PEPPRO requires R 4.0 or greater. Install R and checkinstall again.") printf "\n" NATIVE_INSTALL=1 else rVer=$(R --version 2>&1 | grep 'R version' | awk '{print $3}') rVer=$(echo "${rVer//.}") - if [ "$rVer" -lt "350" ]; then - echo $(warn "WARNING: Please update R to >=3.5 and checkinstall again.") + if [ "$rVer" -lt "400" ]; then + echo $(warn "WARNING: PEPPRO requires R 4.0 or greater. Update R and checkinstall again.") printf "\n" NATIVE_INSTALL=1 fi fi -declare -a requiredRPackages=("argparser" "optigrab" "devtools" "GenomicDistributions" "GenomicDistributionsData" "R.utils" "PEPPROr" "data.table" "pepr" "gplots" "grid" "ggplot2" "scales" "IRanges" "GenomicRanges") +declare -a requiredRPackages=("argparser" "devtools" "GenomicDistributions" "GenomicDistributionsData" "R.utils" "PEPPROr" "data.table" "pepr" "grid" "ggplot2" "scales" "IRanges" "GenomicRanges") for package in ${requiredRPackages[@]}; do cmd=$(echo "Rscript -e 'library(\"$package\")'") packageInstalled=$(eval $cmd 2>&1) @@ -226,45 +235,46 @@ done ################################################################################ echo -e "-----------------------------------------------------------" echo -e "Checking conda installation... " + CONDA_INSTALL=0 if ! is_executable "conda"; then - echo $(warn "WARNING: Install conda to use conda environments and checkinstall again.") + echo $(warn "WARNING: conda not installed, skipping conda check.") printf "\n" CONDA_INSTALL=1 else - eval "$(conda shell.bash hook)" - conda activate peppro - - unset PYTHONPATH - unset R_LIBS - - # Check Python - if ! is_executable "python"; then - echo $(warn "WARNING: PEPPRO requires python 3.0 or greater. 
Install python and checkinstall again.") - printf "\n" - CONDA_INSTALL=1 + if [ $# -eq 1 ]; then + CONDA_CNAME="$1" + elif [ -n "$CONDA_DEFAULT_ENV" ] && [ "$CONDA_DEFAULT_ENV" != "base" ]; then + CONDA_CNAME="$CONDA_DEFAULT_ENV" else - #echo "which python: $(which python)" - ver=$(python -V 2>&1 | sed 's/.* \([0-9]\).\([0-9]\).*/\1\2/') - if [ "$ver" -lt "30" ]; then - echo $(warn "WARNING: PEPPRO requires python 3.0 or greater. Update python and checkinstall again.") - printf "\n" - CONDA_INSTALL=1 - fi + CONDA_CNAME="peppro" fi - # Check Python packages - if ! is_executable "pip"; then - echo $(warn "WARNING: PEPPRO requires pip. Please install pip and checkinstall again.") + CONDA_ENVS=$(conda env list) + + if echo "$CONDA_ENVS" | grep -q "$CONDA_CNAME"; then + echo -e $(success "SUCCESS: Found conda environment named, $CONDA_CNAME.") + else + echo $(warn "WARNING: Could not find a conda environment named, $CONDA_CNAME. Checkinstall expects to find one, skipping conda check.") printf "\n" CONDA_INSTALL=1 fi + CURR_CONDA=$(echo $CONDA_DEFAULT_ENV) + + if [[ "$CURR_CONDA" == "$CONDA_CNAME" ]]; then + CONDA_RUN=() + elif [[ "$CONDA_CNAME" == /* ]]; then + CONDA_RUN=(conda run --no-capture-output -p "$CONDA_CNAME") + else + CONDA_RUN=(conda run --no-capture-output -n "$CONDA_CNAME") + fi + if [ -f "requirements.txt" ]; then REQS="requirements.txt" else - curl https://raw.githubusercontent.com/databio/peppro/master/requirements.txt --silent --output curl_requirements.txt && mv requirements.txt curl_requirements.txt + curl https://raw.githubusercontent.com/databio/peppro/master/requirements.txt --silent --output requirements.txt && mv requirements.txt curl_requirements.txt REQS="curl_requirements.txt" fi @@ -288,10 +298,10 @@ else CONDA_INSTALL=1 else if [ $package == "cutadapt" ]; then - installed=$(cutadapt --version) + installed=$("${CONDA_RUN[@]}" cutadapt --version) installed=$(trim ${installed}) else - installed=$(pip show ${package} | grep -iw 'Version' | awk 
-F':' '{print $2}' | tr -d '\n') + installed=$("${CONDA_RUN[@]}" pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') installed=$(trim ${installed}) fi IFS='.' read -r -a installed_version <<< "$installed" @@ -346,23 +356,22 @@ else ## Check R packages if ! is_executable "R"; then - echo $(warn "WARNING: PEPPRO requires R 3.5 or greater.\n Please install R>=3.5 and checkinstall again.") + echo $(warn "WARNING: PEPPRO requires R 4.0 or greater.\n Please install R>=4.0 and checkinstall again.") printf "\n" exit 1 else - rVer=$(R --version 2>&1 | grep 'R version' | awk '{print $3}') + rVer=$("${CONDA_RUN[@]}" R --version 2>&1 | grep 'R version' | awk '{print $3}') rVer=$(echo "${rVer//.}") - if [ "$rVer" -lt "350" ]; then - echo $(warn "WARNING: PEPPRO requires R 3.5 or greater. Update R and checkinstall again.") + if [ "$rVer" -lt "400" ]; then + echo $(warn "WARNING: PEPPRO requires R 4.0 or greater. Update R and checkinstall again.") printf "\n" CONDA_INSTALL=1 fi fi - declare -a requiredRPackages=("optigrab" "devtools" "argparser" "R.utils" "GenomicDistributions" "GenomicDistributionsData" "PEPPROr" "data.table" "pepr" "gplots" "grid" "ggplot2" "scales" "IRanges" "GenomicRanges") + declare -a requiredRPackages=("devtools" "argparser" "R.utils" "GenomicDistributions" "GenomicDistributionsData" "PEPPROr" "data.table" "pepr" "grid" "ggplot2" "scales" "IRanges" "GenomicRanges") for package in ${requiredRPackages[@]}; do - cmd=$(echo "Rscript -e 'library(\"$package\")'") - packageInstalled=$(eval $cmd 2>&1) + packageInstalled=$("${CONDA_RUN[@]}" Rscript -e "library('$package')" 2>&1) if [[ "$packageInstalled" == *Error* ]]; then echo $(warn "WARNING: Please install the R package, $package, and checkinstall again.") printf "\n" @@ -372,7 +381,7 @@ else fi done - conda deactivate + CONDA_RUN=() fi ################################################################################ diff --git a/docs/changelog.md b/docs/changelog.md index 
6b1aca4..76c5402 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,6 +1,20 @@ # Change log All notable changes to this project will be documented in this file. +## [0.11.0] -- 2026-03-06 + +### Changed + + - Migrated to looper 2.x / pipestat stack (looper>=2.1.0, pipestat>=0.13.1, piper>=0.15.1) + - Native pipestat integration for results reporting + - Updated pipeline interfaces to looper 2.x format with Jinja2 templating + +### Fixed + + - Off-by-one errors in smoothWig.pl smoothing window and boundary handling + - Coordinate conversion in bamSitesToWig.py (0-based BAM to 1-based wiggle) + - PEPPROr R package: replaced deprecated optigrab with base R argument parsing + ## [0.10.2] -- 2022-11-30 ### Changed diff --git a/docs/custom_adapters.md b/docs/custom_adapters.md index 95032f5..f91f375 100644 --- a/docs/custom_adapters.md +++ b/docs/custom_adapters.md @@ -13,3 +13,26 @@ TGGAATTCTCGGGTGCCAAGG >3prime GATCGTCGGACTGTAGAACTCTGAAC ``` + +## Using a custom pipeline configuration file with looper + +If you want to use a custom `peppro.yaml` without modifying the original, specify the path via the `config_file` attribute in your project configuration file or sample sheet. This is passed to the pipeline as the `-C` argument. + +### Apply to all samples (project-wide) + +Add `config_file` under `sample_modifiers.append` in your project config: + +```yaml +sample_modifiers: + append: + config_file: /path/to/my_peppro.yaml +``` + +### Apply to individual samples + +Add a `config_file` column to your sample sheet CSV: + +``` +sample_name,genome,protocol,read1,config_file +my_sample,hg38,PRO,/path/to/reads.fq.gz,/path/to/my_peppro.yaml +``` diff --git a/docs/detailed-install.md b/docs/detailed-install.md index c77a61e..d970d99 100644 --- a/docs/detailed-install.md +++ b/docs/detailed-install.md @@ -2,7 +2,7 @@ This guide walks you through the minutiae of how to install each prerequisite component. We'll presume you're installing this in a Linux environment. 
If not the case, you'll need to go to each tool's respective site to find alternative installation approaches and options. -You have several options for installing the software prerequisites: 1) use a container, either [a single container](run-container.md) or with a [multi-container environment manager](run-bulker.md), in which case you need only either `docker` or `singularity`; 2) [install via `conda`](run-conda.md) or 3) install all prerequisites natively. We'll install everything natively in this guide. +You have several options for installing the software prerequisites: 1) use a container with a [multi-container environment manager](run-bulker.md), in which case you need only either `docker` or `singularity`; 2) [install via `conda`](run-conda.md) or 3) install all prerequisites natively. We'll install everything natively in this guide. ## 1. Install required software @@ -180,7 +180,6 @@ That should do it! - [GenomicDistributionsData (>= v1.6.0)](https://bioconductor.org/packages/release/data/experiment/html/GenomicDistributionsData.html) - [GenomicRanges (>= v1.50.1)](https://bioconductor.org/packages/release/bioc/html/GenomicRanges.html) - [ggplot2 (>= v3.4.0)](https://cran.r-project.org/package=ggplot2) - - [optigrab (>= v0.9.2.1)](https://cran.r-project.org/web/packages/optigrab/index.html) - [pepr (>= v0.4.0)](https://cran.r-project.org/web/packages/pepr/index.html) - [R.utils (>= v2.12.2)](https://cran.r-project.org/web/packages/R.utils/index.html) @@ -189,7 +188,6 @@ To install the needed packages, enter the following command in the pipeline fold Rscript -e 'install.packages("devtools")' Rscript -e 'install.packages("data.table")' Rscript -e 'install.packages("argparser")' -Rscript -e 'install.packages("optigrab")' Rscript -e 'install.packages("R.utils")' Rscript -e 'install.packages("ggplot2")' Rscript -e 'install.packages("pepr")' diff --git a/docs/install.md b/docs/install.md index 458fdac..89895d0 100644 --- a/docs/install.md +++ b/docs/install.md 
@@ -3,9 +3,8 @@ We provide several methods to setup `PEPPRO`. A fundamental challenge of any complex pipeline is that they rely on many independent tools. Installing all of these from scratch can be a chore, although the common use of many of the required bioinformatic tools means they are likely to already be available on an HPC or server. Installation can also be addressed through the use of containers, although that requires setting up and learning to use containers. No single approach appears to resolve all challenges for all users, but we've done our best to provide various ways to ease setup as much as possible. 1. [Run the pipeline using the multi-container environment manager, `bulker`.](run-bulker.md) -2. [Run the pipeline using a single, monolithic container.](run-container.md) -3. [Run the pipeline in a conda environment.](run-conda.md) -4. [Run the pipeline natively.](detailed-install.md) +2. [Run the pipeline in a conda environment.](run-conda.md) +3. [Run the pipeline natively.](detailed-install.md) ## Confirm installation diff --git a/docs/tutorial.md b/docs/tutorial.md index e0888d9..3f838b5 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -54,10 +54,10 @@ refgenie pull human_rDNA/fasta human_rDNA/bowtie2_index ## 3. Download tutorial read files -We're going to work with some files a little larger than the test data included in the pipeline so we can see all the features included in a full run of the pipeline. Go ahead and download the [tutorial_r1.fastq.gz](http://big.databio.org/peppro/fastq/tutorial_r1.fq.gz) and [tutorial_r2.fq.gz](http://big.databio.org/peppro/fastq/tutorial_r2.fq.gz) files. +We're going to work with some files a little larger than the test data included in the pipeline so we can see all the features included in a full run of the pipeline. 
Go ahead and download the [tutorial_r1.fq.gz](https://virginia.box.com/s/iqnu4j55a3m5kyksfyadbzj17pqu1lvt) and [tutorial_r2.fq.gz](https://virginia.box.com/s/wa5ncixn13dxxs50q6m8qz6moftqwuj5) files. ```console -wget http://big.databio.org/peppro/fastq/tutorial_r1.fq.gz -wget http://big.databio.org/peppro/fastq/tutorial_r2.fq.gz +curl -L https://virginia.box.com/shared/static/iqnu4j55a3m5kyksfyadbzj17pqu1lvt --output tutorial_r1.fq.gz +curl -L https://virginia.box.com/shared/static/wa5ncixn13dxxs50q6m8qz6moftqwuj5 --output tutorial_r2.fq.gz ``` To simplify the rest of this tutorial, let's put those files in a standard location we'll use for the rest of this guide. @@ -78,18 +78,14 @@ nano tutorial_refgenie.yaml The following is what you should see in that configuration file. ```console # Run tutorial samples through PEPPRO +# Use with .looper.yaml configuration file +# This config uses refgenie for genome assets name: PEPPRO_tutorial pep_version: 2.0.0 sample_table: tutorial.csv -looper: - output_dir: "${TUTORIAL}/processed/peppro/tutorial" - pipeline_interfaces: "${TUTORIAL}/tools/peppro/project_pipeline_interface.yaml"] - sample_modifiers: - append: - pipeline_interfaces: "${TUTORIAL}/tools/peppro/sample_pipeline_interface.yaml" derive: attributes: [read1, read2] sources: @@ -102,7 +98,22 @@ sample_modifiers: genome: "hg38" prealignment_names: ["human_rDNA"] ``` -There is also a sample annotation file referenced in our configuration file. The sample annotation file contains metadata and other information about our sample. Just like before, this file, named [`tutorial.csv`](https://github.com/databio/peppro/blob/master/examples/meta/tutorial.csv) has been provided. You may check it out if you wish, otherwise we're all set. 
+And the tutorial's `.looper.yaml` file: +```console +# Looper 2.0 configuration for PEPPRO tutorial +pep_config: tutorial_refgenie.yaml # Use tutorial.yaml for hardcoded paths + +output_dir: "${TUTORIAL}/processed/peppro/tutorial" + +pipeline_interfaces: + - "${TUTORIAL}/tools/peppro/sample_pipeline_interface.yaml" + - "${TUTORIAL}/tools/peppro/project_pipeline_interface.yaml" + +pipestat: + results_file_path: "${TUTORIAL}/processed/peppro/tutorial/results_pipeline/{record_identifier}/stats.yaml" +``` + +There is also a sample annotation file referenced in our configuration file. The sample annotation file contains metadata and other information about our sample. Just like before, this file, named [`tutorial.csv`](https://github.com/databio/peppro/blob/master/examples/meta/tutorial/tutorial.csv) has been provided. You may check it out if you wish, otherwise we're all set. If you choose to open `tutorial.csv`, you should see the following: ```console @@ -168,13 +179,13 @@ cd ${TUTORIAL}/tools/peppro/ Now, we'll use `looper` to run the sample pipeline locally. ```console -looper run examples/meta/tutorial_refgenie.yaml +looper run -c examples/meta/tutorial/.looper.yaml ``` Congratulations! Your first sample should be running through the pipeline now. It takes right around 25 minutes for this process to complete using a single core and maxes at about 3.5 GB of memory. We will also use `looper` to run the project pipeline locally. At the project level we can aggregate all the samples in our project (just 1 in this simple case) and view everything together. ```console -looper runp examples/meta/tutorial_refgenie.yaml +looper runp -c examples/meta/tutorial/.looper.yaml ``` After the pipeline is finished, we can look through the output directory together. We've provided a breakdown of that directory in the [browse output page](browse_output.md). 
@@ -184,12 +195,12 @@ After the pipeline is finished, we can look through the output directory togethe Let's take full advantage of `looper` and generate a pipeline `HTML` report that makes all our results easy to view and browse. If you'd like to skip right to the results and see what it looks like, [check out the tutorial results](files/examples/tutorial/PEPPRO_tutorial_summary.html). Otherwise, let's generate a report ourselves. Using our same configuration file we used to run the samples through the pipeline, we'll now employ the `report` function of `looper`. ```console -looper report examples/meta/tutorial_refgenie.yaml +looper report -c examples/meta/tutorial/.looper.yaml ``` That's it! Easy, right? `Looper` conveniently provides you with the location where the HTML report is produced. You may either open the report with your preferred internet browser using the PATH provided, or we can change directories to the report's location and open it there. Let's go ahead and change into the directory that contains the report. ```console -cd ${TUTORIAL}/processed/peppro/tutorial/ -firefox PEPPRO_tutorial_summary.html +cd ${TUTORIAL}/processed/peppro/tutorial/reports/PEPPRO/ +firefox index.html ``` The `HTML` report contains a summary page that integrates the project level summary table and any project level objects. The status page lists all the samples in this project along with their current status, a link to their log files, the time it took to run the sample and the peak memory used during the run. The objects page provides links to separate pages for each object type. On each object page, all the individual samples' objects are provided. Similarly, the samples page contains links to individual pages for each sample. The sample pages list the individual summary statistics for that sample as well as links to log files, command logs, and summary files. The sample pages also provide links and thumbnails for any individual objects generated for that sample. 
Of course, all of these files are present in the sample directory, but the report provides easy access to them all. @@ -227,10 +238,10 @@ Success! If you had any issues, feel free to [reach out to us with questions](co ## 2: Download tutorial read files -We're going to work with some files a little larger than the test data included in the pipeline so we can see all the features included in a full run of the pipeline. Go ahead and download the [tutorial_r1.fastq.gz](http://big.databio.org/peppro/fastq/tutorial_r1.fq.gz) and [tutorial_r2.fq.gz](http://big.databio.org/peppro/fastq/tutorial_r2.fq.gz) files. +We're going to work with some files a little larger than the test data included in the pipeline so we can see all the features included in a full run of the pipeline. Go ahead and download the [tutorial_r1.fq.gz](https://virginia.box.com/s/iqnu4j55a3m5kyksfyadbzj17pqu1lvt) and [tutorial_r2.fq.gz](https://virginia.box.com/s/wa5ncixn13dxxs50q6m8qz6moftqwuj5) files. ```console -wget http://big.databio.org/peppro/fastq/tutorial_r1.fq.gz -wget http://big.databio.org/peppro/fastq/tutorial_r2.fq.gz +curl -L https://virginia.box.com/shared/static/iqnu4j55a3m5kyksfyadbzj17pqu1lvt --output tutorial_r1.fq.gz +curl -L https://virginia.box.com/shared/static/wa5ncixn13dxxs50q6m8qz6moftqwuj5 --output tutorial_r2.fq.gz ``` To simplify the rest of this tutorial, let's put those files in a standard location we'll use for the rest of this guide. @@ -400,14 +411,14 @@ cd ${TUTORIAL}/tools/peppro/ Now, we'll use `looper` to run the sample pipeline locally. ```console -looper run examples/meta/tutorial.yaml +looper run -c examples/meta/tutorial/tutorial.yaml ``` Congratulations! Your first sample should be running through the pipeline now. It takes right around 25 minutes for this process to complete using a single core and maxes at about 3.5 GB of memory. 
## 6: Use `looper` to run the project level pipeline The pipeline also includes project level analyses that work on all samples concurrently. This allows for analyses that require output produced by individual sample analysis. We'll run the project analysis much like we run the sample analysis: ```console -looper runp examples/meta/tutorial.yaml +looper runp -c examples/meta/tutorial/tutorial.yaml ``` This should take about a minute on the tutorial sample and will generate a `summary/` directory containing project level output in the parent project directory. @@ -417,11 +428,11 @@ Let's take full advantage of `looper` and generate a pipeline `HTML` report that Using our same configuration file we used to run the samples through the pipeline, we'll now employ the `report` function of `looper`. ```console -looper report examples/meta/tutorial.yaml +looper report -c examples/meta/tutorial/tutorial.yaml ``` That's it! Easy, right? `Looper` conveniently provides you with the location where the HTML report is produced. You may either open the report with your preferred internet browser using the PATH returned with `looper report`, or we can change directories to the report's location and open it there. Let's go ahead and change into the directory that contains the report. ```console -cd $TUTORIAL/processed/peppro/tutorial -firefox PEPPRO_tutorial_summary.html +cd $TUTORIAL/processed/peppro/tutorial/reports/PEPPRO +firefox index.html ``` The `HTML` report contains a summary page that integrates the project level summary table and any project level objects including: raw aligned reads, percent aligned reads, TSS enrichment scores, and library complexity plots. The status page lists all the samples in this project along with their current status, a link to their log files, the time it took to run the sample and the peak memory used during the run. The objects page provides links to separate pages for each object type. 
On each object page, all the individual samples' objects are provided. Similarly, the samples page contains links to individual pages for each sample. The sample pages list the individual summary statistics for that sample as well as links to log files, command logs, and summary files. The sample pages also provide links and thumbnails for any individual objects generated for that sample. Of course, all of these files are present in the sample directory, but the report provides easy access to them all. diff --git a/docs/umi.md b/docs/umi.md index 8f9265b..cc2a0e9 100644 --- a/docs/umi.md +++ b/docs/umi.md @@ -4,7 +4,7 @@ By default, the pipeline assumes there is *not* a UMI. In other words, the param ## Specify a UMI length -There are three approaches for specifying the `umi_len` parameter for your samples. +There are two approaches for specifying the `umi_len` parameter for your samples. ### 1: Pass the `--umi-len` parameter at the command line @@ -20,19 +20,7 @@ For example: -O $HOME/peppro_example/ ``` -### 2: Pass the `--umi-len` parameter to the pipeline using `looper` - -If you're running `PEPPRO` with `looper`, you can also pass any number of additional arguments to `looper` that will be automatically passed to the pipeline. -For example: -``` -looper run examples/meta/peppro_test.yaml -d \ - --package slurm \ - --umi-len 8 -``` - -In this case, `looper` will automatically pass the `--umi-len 8` argument to each sample in the `peppro_test.yaml` file. - -### 3: Specify a `--umi-len` argument in the project configuration file +### 2: Specify a `--umi-len` argument in the project configuration file If you're using `looper` and you'd like to set the `--umi-len` for individual samples that is entirely possible with some customization to the configuration and annotation files. 
For a real life example, check out the [`peppro_paper.yaml`](https://github.com/databio/ppqc/blob/master/peppro_paper.yaml) and [`peppro_paper.csv`](https://github.com/databio/ppqc/blob/master/peppro_paper.csv) project files. diff --git a/docs/usage.md b/docs/usage.md index f992317..59d4be9 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -6,12 +6,14 @@ `python pipelines/peppro.py --help` ```{console} +/project/gomezlab/code/.conda/envs/peppro/lib/python3.13/site-packages/refgenconf/refgenconf.py:24: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81. + from pkg_resources import iter_entry_points usage: peppro.py [-h] [-R] [-N] [-D] [-F] [-T] [--silent] [--verbosity V] [--logdev] [-C CONFIG_FILE] -O PARENT_OUTPUT_FOLDER - [-M MEMORY_LIMIT] [-P NUMBER_OF_CORES] -S SAMPLE_NAME -I - INPUT_FILES [INPUT_FILES ...] - [-I2 [INPUT_FILES2 [INPUT_FILES2 ...]]] -G GENOME_ASSEMBLY - [-Q SINGLE_OR_PAIRED] + [-M MEMORY_LIMIT] [-P NUMBER_OF_CORES] + [--pipeline-name PIPELINE_NAME] -S SAMPLE_NAME + -I INPUT_FILES [INPUT_FILES ...] 
[-I2 [INPUT_FILES2 ...]] + -G GENOME_ASSEMBLY [-Q SINGLE_OR_PAIRED] [--protocol {PRO,pro,PRO-SEQ,PRO-seq,proseq,PROSEQ,GRO,gro,groseq,GROSEQ,GRO-SEQ,GRO-seq}] [--adapter-tool {cutadapt,fastp}] [--dedup-tool {seqkit,fqdedup}] @@ -19,17 +21,17 @@ usage: peppro.py [-h] [-R] [-N] [-D] [-F] [-T] [--silent] [--verbosity V] [--max-len MAX_LEN] [--sob] [--scale] [--prealignment-names PREALIGNMENT_NAMES [PREALIGNMENT_NAMES ...]] [--prealignment-index PREALIGNMENT_INDEX [PREALIGNMENT_INDEX ...]] - --genome-index GENOME_INDEX [--fasta FASTA] --chrom-sizes - CHROM_SIZES [--TSS-name TSS_NAME] [--pi-tss PI_TSS] - [--pi-body PI_BODY] [--pre-name PRE_NAME] + --genome-index GENOME_INDEX [--fasta FASTA] + --chrom-sizes CHROM_SIZES [--TSS-name TSS_NAME] + [--pi-tss PI_TSS] [--pi-body PI_BODY] [--pre-name PRE_NAME] [--anno-name ANNO_NAME] [--exon-name EXON_NAME] [--intron-name INTRON_NAME] [--search-file SEARCH_FILE] [--coverage] [--keep] [--keep-mito] [--noFIFO] [--no-complexity] [--prioritize] [-V] -PEPPRO version 0.10.2 +PEPPRO version 0.12.0 -optional arguments: +options: -h, --help show this help message and exit -R, --recover Overwrite locks to recover from previous failed run -N, --new-start Overwrite all results to start a fresh run @@ -39,18 +41,20 @@ optional arguments: --silent Silence logging. Overrides verbosity. --verbosity V Set logging level (1-5 or logging module level name) --logdev Expand content of logging message format. - -C CONFIG_FILE, --config CONFIG_FILE + -C, --config CONFIG_FILE Pipeline configuration file (YAML). Relative paths are with respect to the pipeline script. - -M MEMORY_LIMIT, --mem MEMORY_LIMIT + -M, --mem MEMORY_LIMIT Memory limit for processes accepting such. Default units are megabytes unless specified using the suffix [K|M|G|T]. 
- -P NUMBER_OF_CORES, --cores NUMBER_OF_CORES + -P, --cores NUMBER_OF_CORES Number of cores for parallelized processes - -I2 [INPUT_FILES2 [INPUT_FILES2 ...]], --input2 [INPUT_FILES2 [INPUT_FILES2 ...]] + --pipeline-name PIPELINE_NAME + Name of the pipeline + -I2, --input2 [INPUT_FILES2 ...] Secondary input files, such as read2 - -Q SINGLE_OR_PAIRED, --single-or-paired SINGLE_OR_PAIRED + -Q, --single-or-paired SINGLE_OR_PAIRED Single- or paired-end sequencing protocol --protocol {PRO,pro,PRO-SEQ,PRO-seq,proseq,PROSEQ,GRO,gro,groseq,GROSEQ,GRO-SEQ,GRO-seq} Run on sequencing type. @@ -109,12 +113,12 @@ optional arguments: -V, --version show program's version number and exit required named arguments: - -O PARENT_OUTPUT_FOLDER, --output-parent PARENT_OUTPUT_FOLDER + -O, --output-parent PARENT_OUTPUT_FOLDER Parent output directory of project - -S SAMPLE_NAME, --sample-name SAMPLE_NAME + -S, --sample-name SAMPLE_NAME Name for sample to run - -I INPUT_FILES [INPUT_FILES ...], --input INPUT_FILES [INPUT_FILES ...] + -I, --input INPUT_FILES [INPUT_FILES ...] 
One or more primary input files - -G GENOME_ASSEMBLY, --genome GENOME_ASSEMBLY + -G, --genome GENOME_ASSEMBLY Identifier for genome assembly ``` diff --git a/examples/meta/.looper.yaml b/examples/meta/.looper.yaml new file mode 100644 index 0000000..74c9198 --- /dev/null +++ b/examples/meta/.looper.yaml @@ -0,0 +1,13 @@ +# Looper configuration file for PEPPRO +# Place this file in your project directory and run: looper run + +pep_config: peppro_test_refgenie.yaml # Path to your PEP config file + +output_dir: "peppro_test" # Output directory for results + +pipeline_interfaces: + - ../../sample_pipeline_interface.yaml + - ../../project_pipeline_interface.yaml + +pipestat: + results_file_path: "peppro_test/results_pipeline/{record_identifier}/stats.yaml" diff --git a/examples/meta/K562/.looper.yaml b/examples/meta/K562/.looper.yaml new file mode 100644 index 0000000..2f6598c --- /dev/null +++ b/examples/meta/K562/.looper.yaml @@ -0,0 +1,11 @@ +# Looper 2.0 configuration for K562 PEPPRO example +pep_config: K562_example.yaml + +output_dir: "$PROCESSED/pro_example/" + +pipeline_interfaces: + - "$CODE/peppro/sample_pipeline_interface.yaml" + - "$CODE/peppro/project_pipeline_interface.yaml" + +pipestat: + results_file_path: "$PROCESSED/pro_example/results_pipeline/{record_identifier}/stats.yaml" diff --git a/examples/meta/K562_example.csv b/examples/meta/K562/K562_example.csv similarity index 100% rename from examples/meta/K562_example.csv rename to examples/meta/K562/K562_example.csv diff --git a/examples/meta/K562_example.yaml b/examples/meta/K562/K562_example.yaml similarity index 59% rename from examples/meta/K562_example.yaml rename to examples/meta/K562/K562_example.yaml index a6f6d85..f561ec3 100644 --- a/examples/meta/K562_example.yaml +++ b/examples/meta/K562/K562_example.yaml @@ -1,16 +1,11 @@ # Run K562_example sample through PEPPRO +# Use with .looper.yaml configuration file name: K562_example pep_version: 2.0.0 sample_table: "K562_example.csv" -looper: - 
output_dir: "$PROCESSED/pro_example/" - pipeline_interfaces: ["$CODE/peppro/project_pipeline_interface.yaml"] - sample_modifiers: - append: - pipeline_interfaces: ["$CODE/peppro/sample_pipeline_interface.yaml"] derive: attributes: [read1, read2] sources: @@ -20,4 +15,4 @@ sample_modifiers: organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"] then: genome: hg38 - prealignments: human_rDNA \ No newline at end of file + prealignment_names: ["human_rDNA"] diff --git a/examples/meta/peppro_test.yaml b/examples/meta/peppro_test.yaml index 7fed4a4..18c5212 100644 --- a/examples/meta/peppro_test.yaml +++ b/examples/meta/peppro_test.yaml @@ -1,23 +1,11 @@ # Run test sample through PEPPRO +# Use with .looper.yaml configuration file name: test pep_version: 2.0.0 sample_table: "peppro_test.csv" -looper: - output_dir: peppro_test - pipeline_interfaces: ../../project_pipeline_interface.yaml # PATH to the directory where looper will find the pipeline repository. - sample_modifiers: - append: - pipeline_interfaces: ../../sample_pipeline_interface.yaml # PATH to the directory where looper will find the pipeline repository. - #prioritize: True # Default is FALSE. Pass flag to prioritize features by the order they appear in the feat_annotation asset when calculating FRiF/PRiF - #sob: True # Default is FALSE. Pass flag to use seqOutBias for signal track generation and to incorporate mappability - #no_scale: True # Default is FALSE. Pass flag to not scale signal tracks - #coverage: True # Default is FALSE. Pass flag to use coverage when producing library complexity plots. - #keep: True # Default is FALSE. Pass flag to keep prealignment BAM files. - #noFIFO: True # Default is FALSE. Pass flag to NOT use named pipes during prealignments. - #complexity: False # Default is TRUE. Pass flag to disable library complexity calculation. Faster. 
derive: attributes: [read1] sources: @@ -36,4 +24,4 @@ sample_modifiers: exon_name: default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_exons.bed intron_name: default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_introns.bed pi_tss: default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_ensembl_TSS.bed - pi_body: default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_ensembl_gene_body.bed + pi_body: default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_ensembl_gene_body.bed diff --git a/examples/meta/peppro_test_refgenie.yaml b/examples/meta/peppro_test_refgenie.yaml index 012c649..2ede82b 100644 --- a/examples/meta/peppro_test_refgenie.yaml +++ b/examples/meta/peppro_test_refgenie.yaml @@ -1,16 +1,11 @@ -# Run test sample through PEPPRO +# Run test sample through PEPPRO using Refgenie for genome assets +# Use with .looper.yaml configuration file name: test pep_version: 2.0.0 sample_table: peppro_test.csv -looper: - output_dir: peppro_test - pipeline_interfaces: ../../project_pipeline_interface.yaml # PATH to the directory where looper will find the pipeline repository. 
- sample_modifiers: - append: - pipeline_interfaces: ../../sample_pipeline_interface.yaml # PATH to the directory where looper will derive: attributes: [read1] sources: diff --git a/examples/meta/tutorial/.looper.yaml b/examples/meta/tutorial/.looper.yaml new file mode 100644 index 0000000..cc5ebe4 --- /dev/null +++ b/examples/meta/tutorial/.looper.yaml @@ -0,0 +1,11 @@ +# Looper 2.0 configuration for PEPPRO tutorial +pep_config: tutorial_refgenie.yaml # Use tutorial.yaml for hardcoded paths + +output_dir: "${TUTORIAL}/processed/peppro/tutorial" + +pipeline_interfaces: + - "${TUTORIAL}/peppro/sample_pipeline_interface.yaml" + - "${TUTORIAL}/peppro/project_pipeline_interface.yaml" + +pipestat: + results_file_path: "${TUTORIAL}/processed/peppro/tutorial/results_pipeline/{record_identifier}/stats.yaml" diff --git a/examples/meta/tutorial.csv b/examples/meta/tutorial/tutorial.csv similarity index 100% rename from examples/meta/tutorial.csv rename to examples/meta/tutorial/tutorial.csv diff --git a/examples/meta/tutorial.yaml b/examples/meta/tutorial/tutorial.yaml similarity index 83% rename from examples/meta/tutorial.yaml rename to examples/meta/tutorial/tutorial.yaml index 7287131..9e873da 100644 --- a/examples/meta/tutorial.yaml +++ b/examples/meta/tutorial/tutorial.yaml @@ -1,16 +1,12 @@ # Run tutorial samples through PEPPRO +# Use with .looper.yaml configuration file +# This config uses hardcoded paths (no refgenie) name: PEPPRO_tutorial pep_version: 2.0.0 sample_table: tutorial.csv -looper: - output_dir: ${TUTORIAL}/processed/peppro/tutorial - pipeline_interfaces: ${TUTORIAL}/tools/peppro/project_pipeline_interface.yaml - sample_modifiers: - append: - pipeline_interfaces: ${TUTORIAL}/tools/peppro/sample_pipeline_interface.yaml derive: attributes: [read1, read2] sources: @@ -31,4 +27,4 @@ sample_modifiers: exon_name: default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_exons.bed intron_name: default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_introns.bed 
pi_tss: default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_ensembl_TSS.bed - pi_body: default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_ensembl_gene_body.bed + pi_body: default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_ensembl_gene_body.bed diff --git a/examples/meta/tutorial/tutorial_refgenie.yaml b/examples/meta/tutorial/tutorial_refgenie.yaml new file mode 100644 index 0000000..53c6a67 --- /dev/null +++ b/examples/meta/tutorial/tutorial_refgenie.yaml @@ -0,0 +1,20 @@ +# Run tutorial samples through PEPPRO +# Use with .looper.yaml configuration file +# This config uses refgenie for genome assets +name: PEPPRO_tutorial + +pep_version: 2.0.0 +sample_table: tutorial.csv + +sample_modifiers: + derive: + attributes: [read1, read2] + sources: + R1: "${TUTORIAL}/peppro/examples/data/{sample_name}_r1.fq.gz" + R2: "${TUTORIAL}/peppro/examples/data/{sample_name}_r2.fq.gz" + imply: + - if: + organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"] + then: + genome: "hg38" + prealignment_names: ["human_rDNA"] diff --git a/examples/meta/tutorial_refgenie.yaml b/examples/meta/tutorial_refgenie.yaml deleted file mode 100644 index 54e3952..0000000 --- a/examples/meta/tutorial_refgenie.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Run tutorial samples through PEPPRO -name: PEPPRO_tutorial - -pep_version: 2.0.0 -sample_table: tutorial.csv - -looper: - output_dir: "${TUTORIAL}/processed/peppro/tutorial" - pipeline_interfaces: "${TUTORIAL}/tools/peppro/project_pipeline_interface.yaml" - -sample_modifiers: - append: - pipeline_interfaces: "${TUTORIAL}/tools/peppro/sample_pipeline_interface.yaml" - derive: - attributes: [read1, read2] - sources: - R1: "${TUTORIAL}/tools/peppro/examples/data/{sample_name}_r1.fq.gz" - R2: "${TUTORIAL}/tools/peppro/examples/data/{sample_name}_r2.fq.gz" - imply: - - if: - organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"] - then: - genome: "hg38" - prealignment_names: ["human_rDNA"] diff --git a/peppro_input_schema.yaml 
b/peppro_input_schema.yaml index 57dc25c..6fd4cc4 100644 --- a/peppro_input_schema.yaml +++ b/peppro_input_schema.yaml @@ -20,9 +20,12 @@ properties: genome: type: string description: "Refgenie genome registry identifier" - prealignments: + prealignment_names: type: ["string", "array"] - descrption: "Refgenie genome registry identifiers" + description: "Refgenie genome registry identifiers for prealignment genomes" + prealignment_index: + type: ["string", "array"] + description: "Prealignment genome name and bowtie2 index path pairs (e.g. rCRSd=/path/to/index)" read_type: type: string description: "Is this single or paired-end data?" @@ -76,26 +79,29 @@ properties: type: string description: "GenomeTools Tallymer read-length matched index search file (by default uses Refgenie asset)" sob: - type: boolean + type: ["boolean", "string"] description: "Use seqOutBias to generate signal tracks" scale: - type: boolean + type: ["boolean", "string"] description: "Scale signal tracks" coverage: - type: boolean + type: ["boolean", "string"] description: "Report library complexity using coverage: reads / (bases in genome / read length)" keep: - type: boolean + type: ["boolean", "string"] description: "Keep prealignment BAM files" noFIFO: - type: boolean + type: ["boolean", "string"] description: "Do NOT use named pipes during prealignments" complexity: - type: boolean + type: ["boolean", "string"] description: "Disable library complexity calculation (faster)" prioritize: - type: boolean + type: ["boolean", "string"] description: "Plot cFRiF/FRiF using mutually exclusive priority ranked features based on the order of feature appearance in the feature annotation asset" + config_file: + type: string + description: "Path to a custom pipeline configuration file (peppro.yaml)" required: - sample_name - protocol diff --git a/peppro_output_schema.yaml b/peppro_output_schema.yaml index 8212587..b33e30d 100644 --- a/peppro_output_schema.yaml +++ b/peppro_output_schema.yaml @@ -1,46 
+1,355 @@ -description: objects produced by PEPPRO pipeline. +pipeline_name: PEPPRO +title: PEPPRO output schema +description: A pipeline that uses pipestat to report sample and project level results. +type: object properties: + pipeline_name: PEPPRO samples: type: array items: type: object properties: - cutadapt_report: - path: "cutadapt/{sample_name}_cutadapt.txt" + File_mb: + type: number + description: "size of file" + Read_type: type: string - description: "Test sample property" - plus_bw: - path: "signal_{genome}/{sample_name}_plus_body_0-mer.bw" + description: "read_type" + Genome: type: string - description: "Test sample property" - minus_bw: - path: "signal_{genome}/{sample_name}_minus_body_0-mer.bw" + description: "e.g. hg38" + Genome_size: + type: number + description: "The number of bases in a genome" + Raw_reads: type: string - description: "Test sample property" - plus_bam: - path: "aligned_{genome}/{sample_name}_plus.bam" + description: "raw reads" + Fastq_reads: + type: number + description: "fastq_reads" + Reads_with_adapter: + type: number + description: "Number of reads with adapter sequence" + Uninformative_adapter_reads: + type: number + description: "Number of reads that are adapter-adapter ligation products" + Pct_uninformative_adapter_reads: + type: number + description: "Percentage of reads that are adapter-adapter ligation products" + Trimmed_reads_R1: + type: number + description: "Number of reads that were trimmed from Read1 data" + Trim_loss_rate_R1: + type: number + description: "The amount of loss due to trimming of Read1 data" + Trimmed_reads_R2: + type: number + description: "Number of reads that were trimmed from Read2 data" + Trim_loss_rate_R2: + type: number + description: "The amount of loss due to trimming of Read2 data" + FastQC report R1: + title: "FastQC report R1" + description: "FastQC report R1" + type: object + object_type: file + properties: + path: + type: string + thumbnail_path: + type: ["string", "null"] + title: + 
type: string + required: + - path + - title + FastQC report R2: + title: "FastQC report R2" + description: "FastQC report R2" + type: object + object_type: file + properties: + path: + type: string + thumbnail_path: + type: ["string", "null"] + title: + type: string + required: + - path + - title + Adapter insertion distribution: + title: "Adapter insertion distribution" + description: "Adapter insertion distribution" + type: object + object_type: image + properties: + path: + type: string + thumbnail_path: + type: ["string", "null"] + title: + type: string + required: + - path + - thumbnail_path + - title + Peak_adapter_insertion_size: + type: number + description: "The most frequent adapter insertion size" + Degradation_ratio: + type: number + description: "Measure of the RNA integrity. Libraries with a ratio < 1 should be considered high quality" + Aligned_reads_human_rDNA: + type: number + description: "Number of reads aligned to the human ribosomal DNA sequence" + Alignment_rate_human_rDNA: + type: number + description: "Percentage of reads aligned to the human ribosomal DNA sequence" + Mapped_reads: + type: number + description: "The number of mapped reads" + Unmapped_reads: + type: number + description: "The number of unmapped reads" + QC_filtered_reads: + type: number + description: "QC_filtered_reads" + Aligned_reads: + type: number + description: "Number of reads aligned to primary genome" + Alignment_rate: + type: number + description: "Alignment rate to primary genome" + Read_depth: + type: number + description: "Libraries with at least 75% of reads unique at a sequencing depth of 10 million can be considered high quality" + Total_efficiency: + type: number + description: "Alignment efficiency" + Mitochondrial_reads: + type: number + description: "Number of reads aligned to mitochondrial sequence" + Maximum_read_length: + type: number + description: "The maximum length of a read" + NRF: + type: number + description: "NRF (Non-Redundant Fraction) 
measures the ratio of uniquely mapped reads to the total number of mapped reads as a quality control metric for library complexity" + PBC1: + type: number + description: "PBC1 (PCR Bottleneck Coefficient 1) is a quality control metric used to measure library complexity and identify potential over-amplification" + PBC2: + type: number + description: "PBC2 (PCR Bottleneck Coefficient 2) is a quality control metric used to evaluate the complexity of a library and estimate the extent of PCR duplication" + TSS enrichment: + title: "TSS enrichment distribution" + description: "TSS enrichment distribution" + type: object + object_type: image + properties: + path: + type: string + thumbnail_path: + type: ["string", "null"] + title: + type: string + required: + - path + - thumbnail_path + - title + TSS_coding_score: + type: number + description: "Assessment of run-on efficiency for the coding strand. The normalized TSS enrichment score is the ratio of the average coverage in 100-bp windows, with the numerator centered at the TSS peak summit and the denominator in the background at the edge of the 2000-bp window." + TSS_non-coding_score: + type: number + description: "Assessment of run-on efficiency for the non-coding strand. The normalized TSS enrichment score is the ratio of the average coverage in 100-bp windows, with the numerator centered at the TSS peak summit and the denominator in the background at the edge of the 2000-bp window." + Pause_index: + type: number + description: "Pause indices are calculated as the ratio of read density in the promoter proximal region versus read density in the gene body. For genes with multiple TSSs, the pause window is defined as the region +20-120 bases from each identified TSS per gene. We determine the read density at every annotated pause window per gene and identify the predominant, singular pause window as the pause window with the greatest density. 
This singular pause window is used to calculate and reported as the overall pause index." + Pause index: + title: "Pause index distribution" + description: "Pause index distribution" + type: object + object_type: image + properties: + path: + type: string + thumbnail_path: + type: ["string", "null"] + title: + type: string + required: + - path + - thumbnail_path + - title + cFRiF: + title: "cFRiF" + description: "cFRiF" + type: object + object_type: image + properties: + path: + type: string + thumbnail_path: + type: ["string", "null"] + title: + type: string + required: + - path + - thumbnail_path + - title + FRiF: + title: "FRiF" + description: "FRiF" + type: object + object_type: image + properties: + path: + type: string + thumbnail_path: + type: ["string", "null"] + title: + type: string + required: + - path + - thumbnail_path + - title + Plus_FRiP: + type: number + description: "The fraction of reads in peaks for the plus strand" + Minus_FRiP: + type: number + description: "The fraction of reads in peaks for the minus strand" + mRNA_contamination: + type: number + description: "A measure of mRNA contamination (i.e. nascent RNA purity) in the sample. Represents the exon to intron read density ratio. A nascent RNA sequencing library without polymerase pausing would have a ratio of exon density to intron density of ≈1. Because promoter-proximal pausing inflates this ratio, PEPPRO excludes the first exon from this calculation. In testing, the median exon-intron ratio is between 1.0 and 1.8 for high quality libraries." 
+ mRNA contamination: + title: "mRNA contamination distribution" + description: "mRNA contamination distribution" + type: object + object_type: image + properties: + path: + type: string + thumbnail_path: + type: ["string", "null"] + title: + type: string + required: + - path + - thumbnail_path + - title + cutadapt_report: + type: object + description: "Cutadapt adapter trimming report" + properties: + path: + type: string + title: + type: string + plus_bw: + type: object + description: "Plus strand signal bigWig file" + properties: + path: + type: string + title: + type: string + minus_bw: + type: object + description: "Minus strand signal bigWig file" + properties: + path: + type: string + title: + type: string + plus_bam: + type: object + description: "Plus strand aligned BAM file" + properties: + path: + type: string + title: + type: string + minus_bam: + type: object + description: "Minus strand aligned BAM file" + properties: + path: + type: string + title: + type: string + gene_counts_bed: + type: object + description: "Gene coverage BED file" + properties: + path: + type: string + title: + type: string + pause_indicies_bed: + type: object + description: "Pause index BED file" + properties: + path: + type: string + title: + type: string + Time: type: string - description: "Test sample property" - minus_bam: - path: "aligned_{genome}/{sample_name}_minus.bam" + description: "time" + Success: type: string - description: "Test sample property" - gene_counts_bed: - path: "signal_{genome}/{sample_name}_gene_coverage.bed" - type: string - description: "Test sample property" - pause_indicies_bed: - path: "QC_{genome}/{sample_name}_pause_index.bed.gz" - type: string - description: "Test sample property" - library_complexity_file: - title: "Library complexity file" - description: "Plots each sample's library complexity on a single plot." 
- thumbnail_path: "summary/{name}_libComplexity.png" - path: "summary/{name}_libComplexity.pdf" - type: image - counts_table: - title: "Gene counts table" - description: "Combines all sample gene count files into a project level gene counts table." - path: "summary/{name}_countData.csv" - type: link + description: "success" + project: + pipeline_name: PEPPRO_collator + type: object + properties: + library_complexity_file: + type: object + description: "Library complexity plot comparing all samples" + properties: + path: + type: string + title: + type: string + thumbnail_path: + type: string + counts_table: + type: object + description: "Combined gene counts table for all samples" + properties: + path: + type: string + title: + type: string + Time: + type: string + description: "time" + Success: + type: string + description: "success" +$defs: + file: + type: object + properties: + path: + type: string + title: + type: string + required: + - path + - title + image: + type: object + properties: + path: + type: string + title: + type: string + thumbnail_path: + type: string + required: + - path + - title diff --git a/pipelines/peppro.py b/pipelines/peppro.py index b724ecb..3cbe08b 100755 --- a/pipelines/peppro.py +++ b/pipelines/peppro.py @@ -5,7 +5,7 @@ __author__ = ["Jason Smith", "Nathan Sheffield", "Mike Guertin"] __email__ = "jasonsmith@virginia.edu" -__version__ = "0.10.2" +__version__ = "0.11.0" from argparse import ArgumentParser import os @@ -35,6 +35,14 @@ DEFAULT_UMI_LEN = 0 DEFAULT_MAX_LEN = -1 +def _get_stat_float(pm, stat_name, default=0.0): + """Read a pipeline stat as a float; return default if it is missing (None) or not numeric.""" + try: + return float(pm.get_stat(stat_name)) + except (TypeError, ValueError): + return default + + def parse_arguments(): """ Parse command-line arguments passed to the pipeline. 
@@ -43,13 +51,13 @@ def parse_arguments(): ########################################################################### parser = ArgumentParser(description='PEPPRO version ' + __version__) parser = pypiper.add_pypiper_args(parser, groups= - ['pypiper', 'looper', 'ngs'], + ['pypiper', 'looper', 'ngs', 'pipestat'], required=["input", "genome", "sample-name", "output-parent", "chrom_sizes", "genome_index"]) # Pipeline-specific arguments parser.add_argument("--protocol", dest="protocol", - default="pro", choices=RUNON_SOURCE, + default=None, choices=RUNON_SOURCE, help="Run on sequencing type.") parser.add_argument("--adapter-tool", dest="adapter", @@ -173,6 +181,10 @@ def parse_arguments(): dest="complexity", help="Disable library complexity calculation (faster).") + parser.add_argument("--no-bw", action='store_true', default=False, + dest="no_bw", + help="Skip bigWig signal track generation (faster, for testing).") + parser.add_argument("--prioritize", action='store_true', default=False, dest="prioritize", help="Plot cFRiF/FRiF using mutually exclusive priority" @@ -191,6 +203,13 @@ def parse_arguments(): return args +def _cutadapt_report_path(outfolder, sample_name, read2): + """Build the cutadapt report path (R2 report if read2, else R1) under outfolder/cutadapt.""" + report_name = sample_name + ("_R2_cutadapt.txt" if read2 else "_R1_cutadapt.txt") + report_dir = os.path.join(outfolder, "cutadapt") + return os.path.join(report_dir, report_name) + + def _remove_adapters(args, res, tools, read2, fq_file, outfolder): """ A helper function to build a command for adapter removal. 
@@ -214,15 +233,13 @@ def _remove_adapters(args, res, tools, read2, fq_file, outfolder): fastp_folder = os.path.join(outfolder, "fastp") fastq_folder = os.path.join(outfolder, "fastq") + cutadapt_report = _cutadapt_report_path(outfolder, sname, read2) + if read2: - cutadapt_report = os.path.join(cutadapt_folder, - sname + "_R2_cutadapt.txt") noadap_fastq = os.path.join(fastq_folder, sname + "_R2_noadap.fastq") short_fastq = os.path.join(fastq_folder, sname + "_R2_short.fastq") fastp_pfx = os.path.join(fastp_folder, sname + "_R2_fastp_adapter") else: - cutadapt_report = os.path.join(cutadapt_folder, - sname + "_R1_cutadapt.txt") noadap_fastq = os.path.join(fastq_folder, sname + "_R1_noadap.fastq") short_fastq = os.path.join(fastq_folder, sname + "_R1_short.fastq") fastp_pfx = os.path.join(fastp_folder, sname + "_R1_fastp_adapter") @@ -1076,13 +1093,7 @@ def _process_fastq(args, tools, res, read2, fq_file, outfolder): processed_fastq = os.path.join(fastq_folder, sname + "_R1_processed.fastq") if args.adapter == "cutadapt": - cutadapt_folder = os.path.join(outfolder, "cutadapt") - if read2: - cutadapt_report = os.path.join(cutadapt_folder, - sname + "_R2_cutadapt.txt") - else: - cutadapt_report = os.path.join(cutadapt_folder, - sname + "_R1_cutadapt.txt") + cutadapt_report = _cutadapt_report_path(outfolder, sname, read2) adapter_report = cutadapt_report else: adapter_report = os.path.join(fastqc_folder, @@ -1226,12 +1237,8 @@ def plot_fragments(infolder, outfolder): flash_notCombined_fq2 = os.path.join(outfolder, args.sample_name + ".notCombined_2.fastq.gz") - tmp = float(pm.get_stat("Raw_reads")) - if tmp: - rr = float(tmp) - else: - rr = 0 - if (rr < 1): + rr = _get_stat_float(pm, "Raw_reads", default=0.0) + if rr < 1: pm.fail_pipeline(RuntimeError("Raw_reads were not reported. 
" "Check output ({})".format(param.outfolder))) @@ -1367,25 +1374,23 @@ def check_trim(trimmed_fastq, paired_end, n_trim = float(ngstk.count_reads(trimmed_fastq, paired_end)) pm.report_result("Trimmed_reads_R1", int(n_trim)) - try: - rr = float(pm.get_stat("Raw_reads")) - except: - print("Can't calculate trim loss rate without raw read result.") - else: + rr = _get_stat_float(pm, "Raw_reads", default=0.0) + if rr > 0: pm.report_result("Trim_loss_rate_R1", round((rr - n_trim) * 100 / rr, 2)) + else: + print("Can't calculate trim loss rate without raw read result.") if paired_end and trimmed_fastq_R2: n_trim = float(ngstk.count_reads(trimmed_fastq_R2, paired_end)) pm.report_result("Trimmed_reads_R2", int(n_trim)) - try: - rr = float(pm.get_stat("Raw_reads")) - except: - print("Can't calculate trim loss rate without raw read result.") - else: + rr = _get_stat_float(pm, "Raw_reads", default=0.0) + if rr > 0: pm.report_result("Trim_loss_rate_R2", round((rr - n_trim) * 100 / rr, 2)) + else: + print("Can't calculate trim loss rate without raw read result.") # Also run a fastqc (if installed/requested) if fastqc_folder: @@ -1413,12 +1418,9 @@ def check_trim(trimmed_fastq, paired_end, # Put it all together paired_end = args.paired_end if read2: - pm.run([adapter_command, trim_command], trimmed_fq2) - if not _itsa_file(fastqc_report) or args.new_start: - cmd = ("echo '### Calculated the number of trimmed reads'") - pm.run(cmd, fastqc_report, - follow=check_trim(processed_fastq, paired_end, trimmed_fq2, - fastqc_folder=fastqc_folder)) + pm.run([adapter_command, trim_command], trimmed_fq2, + follow=lambda: check_trim(processed_fastq, paired_end, trimmed_fq2, + fastqc_folder=fastqc_folder)) if args.adapter == "cutadapt": output_folder = os.path.join(outfolder, "cutadapt") else: @@ -1637,19 +1639,16 @@ def _align_with_bt2(args, tools, paired, useFIFO, unmap_fq1, unmap_fq2, # report aligned reads pm.report_result("Aligned_reads_" + assembly_identifier, ar) - try: - # wrapped in 
try block in case Trimmed_reads is not reported - # in this pipeline. - tr = float(pm.get_stat("Trimmed_reads_R1")) - except: - print("Trimmed reads is not reported.") - else: + tr = _get_stat_float(pm, "Trimmed_reads_R1", default=0.0) + if tr > 0: res_key = "Alignment_rate_" + assembly_identifier if float(ar) > 0: pm.report_result(res_key, round(float(ar) * 100 / float(tr), 2)) else: pm.report_result(res_key, 0) + else: + print("Trimmed reads is not reported.") if paired: unmap_fq1 = out_fastq_r1 @@ -1867,7 +1866,7 @@ def _add_resources(args, res, asset_dict=None): required_list = [] # Check that bowtie2 indicies exist for specified prealignments - for reference in args.prealignments: + for reference in args.prealignment_names: for asset in [BT2_IDX_KEY]: try: res[asset] = rgc.seek(reference, asset) @@ -1917,21 +1916,29 @@ def _add_resources(args, res, asset_dict=None): "config file or point directly to the file using the noted " "command-line arguments:") + def _fmt_asset(x, include_arg=False): + sk = x["seek_key"] or x["asset_name"] + tn = x["tag_name"] or "default" + s = "{}.{}:{}".format(x["asset_name"], sk, tn) + if include_arg and x.get("user_arg"): + s += " (--{})".format(x["user_arg"]) + return s + if len(key_errors) > 0: if required_list: err_msg = "Required assets missing from REFGENIE config file: {}" - pm.fail_pipeline(IOError(err_msg.format(", ".join(["{asset_name}.{seek_key}:{tag_name}".format(**x) for x in required_list])))) + pm.fail_pipeline(IOError(err_msg.format(", ".join([_fmt_asset(x) for x in required_list])))) else: warning_msg = "Optional assets missing from REFGENIE config file: {}" - pm.info(warning_msg.format(", ".join(["{asset_name}.{seek_key}:{tag_name}".format(**x) for x in key_errors]))) + pm.info(warning_msg.format(", ".join([_fmt_asset(x) for x in key_errors]))) if len(exist_errors) > 0: if required_list: err_msg = "Required assets not existing: {}" - pm.fail_pipeline(IOError(err_msg.format(", 
".join(["{asset_name}.{seek_key}:{tag_name} (--{user_arg})".format(**x) for x in required_list])))) + pm.fail_pipeline(IOError(err_msg.format(", ".join([_fmt_asset(x, include_arg=True) for x in required_list])))) else: warning_msg = "Optional assets not existing: {}" - pm.info(warning_msg.format(", ".join(["{asset_name}.{seek_key}:{tag_name} (--{user_arg})".format(**x) for x in exist_errors]))) + pm.info(warning_msg.format(", ".join([_fmt_asset(x, include_arg=True) for x in exist_errors]))) return res, rgc @@ -1977,7 +1984,10 @@ def main(): os.path.join(args.output_parent, args.sample_name)) global pm pm = pypiper.PipelineManager( - name="PEPPRO", outfolder=outfolder, args=args, version=__version__) + name="PEPPRO", outfolder=outfolder, + pipestat_record_identifier=args.sample_name, + pipestat_pipeline_type="sample", + args=args, version=__version__) global ngstk ngstk = pypiper.NGSTk(pm=pm) @@ -2009,6 +2019,12 @@ def main(): err_msg = "Missing required tools. See message above." pm.fail_pipeline(RuntimeError(err_msg)) + if args.protocol is None: + err_msg = ("--protocol is required. 
Set 'protocol' in your sample " + "sheet or pass --protocol with one of: {}.".format( + ", ".join(RUNON_SOURCE))) + pm.fail_pipeline(RuntimeError(err_msg)) + if args.input2 and not args.paired_end: err_msg = (f"Incompatible settings: You specified single-end, " f"but provided --input2.") @@ -2189,8 +2205,8 @@ def main(): unmap_fq2_dups = out_fastq_pre + '_unmap_dups_R2.fq' cutadapt_folder = os.path.join(outfolder, "cutadapt") - cutadapt_report = os.path.join(cutadapt_folder, - args.sample_name + "_R1_cutadapt.txt") + # SE-only degradation analysis uses R1 report; PE degradation is handled earlier + cutadapt_report = _cutadapt_report_path(outfolder, args.sample_name, False) processed_target_R1 = os.path.join(fastq_folder, "processed_R1.flag") processed_target_R2 = os.path.join(fastq_folder, "processed_R2.flag") @@ -2207,25 +2223,38 @@ def main(): "for single end data.".format(args.adapter)) if args.paired_end: + r1_expected = out_fastq_pre + "_R1_processed.fastq" + r2_expected = out_fastq_pre + "_R2_trimmed.fastq" if not args.complexity and int(args.umi_len) > 0: - if not os.path.exists(processed_target_R1) or args.new_start: + if (not os.path.exists(processed_target_R1) or args.new_start or + not os.path.exists(r1_expected)): unmap_fq1, unmap_fq1_dups = _process_fastq( args, tools, res, False, untrimmed_fastq1, outfolder=param.outfolder) + else: + unmap_fq1 = r1_expected + unmap_fq1_dups = out_fastq_pre + "_R1_trimmed.fastq" cmd = ("touch " + processed_target_R1) pm.run(cmd, processed_target_R1) else: - if not os.path.exists(processed_target_R1) or args.new_start: + if (not os.path.exists(processed_target_R1) or args.new_start or + not os.path.exists(r1_expected)): unmap_fq1 = _process_fastq( args, tools, res, False, untrimmed_fastq1, outfolder=param.outfolder) + else: + unmap_fq1 = r1_expected cmd = ("touch " + processed_target_R1) pm.run(cmd, processed_target_R1) - if not os.path.exists(processed_target_R2) or args.new_start: + if (not 
os.path.exists(processed_target_R2) or args.new_start or + not os.path.exists(r2_expected)): unmap_fq2, unmap_fq2_dups = _process_fastq( args, tools, res, True, untrimmed_fastq2, outfolder=param.outfolder) + else: + unmap_fq2 = r2_expected + unmap_fq2_dups = out_fastq_pre + "_R2_trimmed_dups.fastq" cmd = ("touch " + processed_target_R2) pm.run(cmd, processed_target_R2) @@ -2252,12 +2281,8 @@ def main(): r2_repair_single = os.path.join( fastq_folder, args.sample_name + "_R2_trimmed.fastq.single.fq") - tmp = float(pm.get_stat("Raw_reads")) - if tmp: - rr = float(tmp) - else: - rr = 0 - if (rr < 1): + rr = _get_stat_float(pm, "Raw_reads", default=0.0) + if rr < 1: pm.fail_pipeline(RuntimeError("Raw_reads were not reported. Check output ({})".format(param.outfolder))) if args.adapter == "fastp" and int(args.umi_len) > 0: @@ -2592,6 +2617,8 @@ def main(): else: bt2_options = param.bowtie2.params + bt2_orientation = getattr(param.bowtie2, 'orientation', '--rf') or '--rf' + # samtools sort needs a temporary directory tempdir = tempfile.mkdtemp(dir=map_genome_folder) os.chmod(tempdir, 0o771) @@ -2617,7 +2644,7 @@ def main(): cmd += " --rg-id " + args.sample_name cmd += " -x " + res.genome_index if args.paired_end: - cmd += " --rf -1 " + unmap_fq1 + " -2 " + unmap_fq2 + cmd += " " + bt2_orientation + " -1 " + unmap_fq1 + " -2 " + unmap_fq2 else: cmd += " -U " + unmap_fq1 cmd += " | " + tools.samtools + " view -bS - -@ 1 " @@ -2640,7 +2667,7 @@ def main(): cmd_dups += " --rg-id " + args.sample_name cmd_dups += " -x " + res.genome_index if args.paired_end: - cmd_dups += " --rf -1 " + unmap_fq1_dups + " -2 " + unmap_fq2_dups + cmd_dups += " " + bt2_orientation + " -1 " + unmap_fq1_dups + " -2 " + unmap_fq2_dups else: cmd_dups += " -U " + unmap_fq1_dups cmd_dups += " | " + tools.samtools + " view -bS - -@ 1 " @@ -2672,17 +2699,8 @@ def check_alignment_genome(temp_bam, bam): if args.paired_end: ar = float(ar)/2 - tmp = pm.get_stat("Raw_reads") - if tmp: - rr = float(tmp) - else: 
- rr = 0 - - tmp = pm.get_stat("Trimmed_reads_R1") - if tmp: - tr = float(tmp) - else: - tr = 0 + rr = _get_stat_float(pm, "Raw_reads", default=0.0) + tr = _get_stat_float(pm, "Trimmed_reads_R1", default=0.0) if os.path.exists(res.pre_name): cmd = (tools.samtools + " depth -b " + @@ -2694,14 +2712,20 @@ def check_alignment_genome(temp_bam, bam): " | awk '{counter++;sum+=$3}END{print sum/counter}'") rd = pm.checkprint(cmd) - pm.report_result("Mapped_reads", mr) + pm.report_result("Mapped_reads", round(float(mr))) pm.report_result("QC_filtered_reads", round(float(mr)) - round(float(ar))) - pm.report_result("Aligned_reads", ar) - pm.report_result("Alignment_rate", round(float(ar) * 100 / - float(tr), 2)) - pm.report_result("Total_efficiency", round(float(ar) * 100 / - float(rr), 2)) + pm.report_result("Aligned_reads", round(float(ar))) + if tr > 0: + pm.report_result("Alignment_rate", round(float(ar) * 100 / + float(tr), 2)) + else: + pm.info("Skipping Alignment_rate: Trimmed_reads_R1 is 0 or missing") + if rr > 0: + pm.report_result("Total_efficiency", round(float(ar) * 100 / + float(rr), 2)) + else: + pm.info("Skipping Total_efficiency: Raw_reads is 0 or missing") if rd and rd.strip(): pm.report_result("Read_depth", round(float(rd), 2)) @@ -2792,7 +2816,12 @@ def check_alignment_genome(temp_bam, bam): " | " + tools.samtools + " sort - -@ " + str(pm.cores) + " > " + mapping_pe2_bam) pm.run([cmd1, cmd2], [mapping_pe1_bam, mapping_pe2_bam]) - mapping_genome_bam = mapping_pe1_bam + # --rf (reverse-forward): nascent RNA signal is on PE1 + # --fr (forward-reverse): nascent RNA signal is on PE2 + if bt2_orientation == "--fr": + mapping_genome_bam = mapping_pe2_bam + else: + mapping_genome_bam = mapping_pe1_bam ############################################################################ # Determine maximum read length and add seqOutBias resource # @@ -3021,11 +3050,20 @@ def count_unmapped_reads(): minus_bam = os.path.join( map_genome_folder, args.sample_name + 
"_minus.bam") + # --rf (reverse-forward): forward-mapped reads = plus strand nascent RNA + # --fr (forward-reverse): reverse-mapped reads = plus strand nascent RNA + if bt2_orientation == "--fr": + plus_flag = ("-f", 16) + minus_flag = ("-F", 20) + else: + plus_flag = ("-F", 20) + minus_flag = ("-f", 16) + cmd1 = build_command([ tools.samtools, "view", "-bh", - ("-F", 20), + plus_flag, mapping_genome_bam, (">", plus_bam) ]) @@ -3034,7 +3072,7 @@ def count_unmapped_reads(): tools.samtools, "view", "-bh", - ("-f", 16), + minus_flag, mapping_genome_bam, (">", minus_bam) ]) @@ -3264,17 +3302,19 @@ def count_unmapped_reads(): cmd4 = ("awk -F '\t' 'NR==FNR {id[$1]; next} $4 in id' " + PI_shared_genes + " " + body_density + " > " + shared_body_density) - cmd5 = ("awk 'BEGIN{FS=OFS=\"\t\"} FNR>0 && " + - "FNR==NR{a[$4]=$4 OFS $0; next} " + + cmd5 = ("awk 'BEGIN{FS=OFS=\"\t\"} FNR>0 && " + + "FNR==NR{a[$4]=$4 OFS $0; next} " + "FNR>0{print $0,a[$4]?a[$4]:\"\t\"}' " + shared_TSS_density + " " + shared_body_density + - " | awk -v OFS='\t' '{ if ($6 == \"+\")" + - "{print $9, $10, $3, $4," + - "sqrt((($15+$7)/sqrt(($3-$10)^2))^2)," + - "($15/sqrt(($11-$10)^2))/($7/sqrt(($3-$2)^2)), $6} " + - "else {print $9, $10, $3, $12," + - "sqrt((($15+$7)/sqrt(($10-$2)^2))^2)," + - "($15/sqrt(($11-$10)^2))/($7/sqrt(($3-$2)^2)), $6}}' " + + " | awk -v OFS='\t' '{ if ($6 == \"+\")" + + "{ if ($3!=$10 && $11!=$10 && $3!=$2)" + + "{print $9, $10, $3, $4," + + "sqrt((($15+$7)/sqrt(($3-$10)^2))^2)," + + "($15/sqrt(($11-$10)^2))/($7/sqrt(($3-$2)^2)), $6}}" + + " else { if ($10!=$2 && $11!=$10 && $3!=$2)" + + "{print $9, $10, $3, $12," + + "sqrt((($15+$7)/sqrt(($10-$2)^2))^2)," + + "($15/sqrt(($11-$10)^2))/($7/sqrt(($3-$2)^2)), $6}}}' " + "| env LC_COLLATE=C sort -k1,1 -k2,2n > " + temp.name) pm.run([cmd1, cmd2, cmd3, cmd4, cmd5], pause_index, nofail=True) temp.close() @@ -3477,7 +3517,8 @@ def count_unmapped_reads(): for pos, anno in enumerate(ft_list): # working files anno_file = 
os.path.join(QC_folder, str(anno)) - valid_name = str(re.sub('[^\w_.)( -]', '', anno).strip().replace(' ', '_')) + #valid_name = str(re.sub('[^\w_.)( -]', '', anno).strip().replace(' ', '_')) + valid_name = str(re.sub(r'[^\w_.)( -]', '', anno).strip().replace(' ', '_')) file_name = os.path.join(QC_folder, valid_name) anno_sort = os.path.join(QC_folder, valid_name + "_sort.bed") @@ -3571,7 +3612,7 @@ def count_unmapped_reads(): for pos, anno in enumerate(ft_list): # working files anno_file = os.path.join(QC_folder, str(anno)) - valid_name = str(re.sub('[^\w_.)( -]', '', anno).strip().replace(' ', '_')) + valid_name = str(re.sub(r'[^\w_.)( -]', '', anno).strip().replace(' ', '_')) file_name = os.path.join(QC_folder, valid_name) anno_sort = os.path.join(QC_folder, valid_name + "_sort.bed") @@ -3726,7 +3767,7 @@ def count_unmapped_reads(): pm.clean_add(introns_cov) # need Total Reads divided by 1M - ar = float(pm.get_stat("Aligned_reads")) + ar = _get_stat_float(pm, "Aligned_reads", default=0.0) scaling_factor = float(ar/1000000) exons_rpkm = os.path.join(QC_folder, args.sample_name + @@ -3861,15 +3902,17 @@ def count_unmapped_reads(): signal_folder, args.sample_name + "_minus_exact_body_0-mer.bw") minus_smooth_bw = os.path.join( signal_folder, args.sample_name + "_minus_smooth_body_0-mer.bw") - - if not args.sob: + + if args.no_bw: + pm.timestamp("### Skipping bigWig generation (--no-bw)") + elif not args.sob: # If not scaling we don't need to use seqOutBias to generate the # separate strand bigWigs; just convert the BAM's directly with # bamSitesToWig.py which uses UCSC wigToBigWig pm.timestamp("### Produce bigWig files") # need Total Reads divided by 1M - ar = float(pm.get_stat("Aligned_reads")) + ar = _get_stat_float(pm, "Aligned_reads", default=0.0) scaling_factor = float(ar/1000000) wig_cmd_callable = ngstk.check_command("wigToBigWig") diff --git a/pipelines/peppro.yaml b/pipelines/peppro.yaml index d7ee090..866e3ab 100644 --- a/pipelines/peppro.yaml +++ 
b/pipelines/peppro.yaml @@ -15,7 +15,7 @@ tools: # absolute paths to required tools bigWigCat: bigWigCat wigToBigWig: wigToBigWig # for PyPiper NGTSK - picard: ${PICARD} + picard: picard # optional tools fqdedup: fqdedup fastx: fastx @@ -42,6 +42,9 @@ parameters: # parameters passed to bioinformatic tools, subclassed by tool # pipeline default: --very-sensitive -X 2000 # --very-sensitive: -D 20 -R 3 -N 0 -L 20 -i S,1,0.50 # -X 2000: paired-end maximum fragment length + orientation: "--rf" + # --rf: reverse-forward (default; reverse-stranded PRO-seq/GRO-seq) + # --fr: forward-reverse (forward-stranded libraries) samtools: params: "-q 10" # -q: skip alignments with MAPQ < 10. diff --git a/pipelines/peppro_collator.py b/pipelines/peppro_collator.py index 8eb9c3e..dccf490 100755 --- a/pipelines/peppro_collator.py +++ b/pipelines/peppro_collator.py @@ -5,7 +5,7 @@ __author__ = ["Michal Stolarczyk", "Jason Smith"] __email__ = "jasonsmith@virginia.edu" -__version__ = "0.0.3" +__version__ = "0.2.0" from argparse import ArgumentParser import os @@ -33,7 +33,7 @@ def parse_arguments(): """ parser = VersionInHelpParser(prog="PEPPRO collator", description='PEPPRO collator' , version=__version__) - parser = pypiper.add_pypiper_args(parser, groups=['pypiper', 'looper']) + parser = pypiper.add_pypiper_args(parser, groups=['pypiper', 'looper', 'pipestat', 'common']) parser.add_argument("-n", "--name", help="Name of the project to use.", type=str) parser.add_argument("-r", "--results", @@ -46,8 +46,11 @@ def main(): args = parse_arguments() outfolder = os.path.abspath(os.path.join(args.output_parent, "summary")) - pm = pypiper.PipelineManager(name="PEPPRO collator", outfolder=outfolder, - args=args, version=__version__) + pm = pypiper.PipelineManager( + name="PEPPRO collator", outfolder=outfolder, + pipestat_record_identifier="summary", + pipestat_pipeline_type="project", + args=args, version=__version__) #pm.info("args: {}\n".format(args)) diff --git 
a/project_pipeline_interface.yaml b/project_pipeline_interface.yaml index 120fb33..6e041a3 100644 --- a/project_pipeline_interface.yaml +++ b/project_pipeline_interface.yaml @@ -1,24 +1,21 @@ -pipeline_name: PEPPRO_summarizer -pipeline_type: project -path: pipelines/peppro_collator.py +pipeline_name: PEPPRO_collator input_schema: peppro_input_schema.yaml output_schema: peppro_output_schema.yaml -command_template: > - looper table {looper.pep_config} && - {pipeline.path} - --config {looper.pep_config} - -O {looper.output_dir} - -P {compute.cores} - -M {compute.mem} - -n {project.name} - -r {looper.results_subdir} +project_interface: + command_template: > + {looper.piface_dir}/pipelines/peppro_collator.py + --config {looper.pep_config} + -O {looper.output_dir} + -P {compute.cores} + -M {compute.mem} + -n {project.name} + -r {looper.results_subdir} + --pipestat-config {pipestat.config_file} + compute: - singularity_image: ${SIMAGES}peppro - docker_image: databio/peppro - bulker_crate: databio/peppro:1.0.1 + bulker_crate: databio/peppro:1.1.0 size_dependent_variables: resources.tsv bioconductor: readFunName: readPepproGeneCounts readFunPath: BiocProject/readPepproGeneCounts.R - \ No newline at end of file diff --git a/requirements-conda.yml b/requirements-conda.yml index ec70f2d..9a50afd 100644 --- a/requirements-conda.yml +++ b/requirements-conda.yml @@ -3,402 +3,388 @@ channels: - conda-forge - bioconda - r - - defaults dependencies: - _libgcc_mutex=0.1=conda_forge - - _openmp_mutex=4.5=1_gnu + - _openmp_mutex=4.5=2_gnu - _r-mutex=1.0.1=anacondar_1 - - alsa-lib=1.2.3=h516909a_0 - - attmap=0.13.0=pyhd8ed1ab_0 - - attrs=21.2.0=pyhd8ed1ab_0 - - bedtools=2.30.0=h7d7f7ad_2 - - binutils_impl_linux-64=2.36.1=h193b22a_2 - - binutils_linux-64=2.36=hf3e587d_1 - - bioconductor-annotationdbi=1.54.0=r41hdfd78af_0 - - bioconductor-annotationfilter=1.16.0=r41hdfd78af_0 - - bioconductor-annotationhub=3.0.0=r41hdfd78af_0 - - bioconductor-biobase=2.52.0=r41hd029910_0 - - 
bioconductor-biocfilecache=2.0.0=r41hdfd78af_0 - - bioconductor-biocgenerics=0.38.0=r41hdfd78af_0 - - bioconductor-biocio=1.2.0=r41hdfd78af_0 - - bioconductor-biocparallel=1.26.0=r41h399db7b_0 - - bioconductor-biocversion=3.13.1=r41hdfd78af_0 - - bioconductor-biomart=2.48.0=r41hdfd78af_0 - - bioconductor-biostrings=2.60.0=r41hd029910_0 - - bioconductor-bsgenome=1.60.0=r41hdfd78af_0 - - bioconductor-delayedarray=0.18.0=r41hd029910_0 - - bioconductor-ensembldb=2.16.0=r41hdfd78af_0 - - bioconductor-experimenthub=2.0.0=r41hdfd78af_0 - - bioconductor-genomeinfodb=1.28.0=r41hdfd78af_0 - - bioconductor-genomeinfodbdata=1.2.6=r41hdfd78af_0 - - bioconductor-genomicalignments=1.28.0=r41hd029910_0 - - bioconductor-genomicfeatures=1.44.0=r41hdfd78af_0 - - bioconductor-genomicranges=1.44.0=r41hd029910_0 - - bioconductor-interactivedisplaybase=1.30.0=r41hdfd78af_0 - - bioconductor-iranges=2.26.0=r41hd029910_0 - - bioconductor-keggrest=1.32.0=r41hdfd78af_0 - - bioconductor-matrixgenerics=1.4.0=r41hdfd78af_0 - - bioconductor-protgenerics=1.24.0=r41hdfd78af_0 - - bioconductor-rhtslib=1.24.0=r41hd029910_0 - - bioconductor-rsamtools=2.8.0=r41h399db7b_0 - - bioconductor-rtracklayer=1.52.0=r41hd029910_0 - - bioconductor-s4vectors=0.30.0=r41hd029910_0 - - bioconductor-summarizedexperiment=1.22.0=r41hdfd78af_0 - - bioconductor-xvector=0.32.0=r41hd029910_0 - - bioconductor-zlibbioc=1.38.0=r41hd029910_0 - - bowtie2=2.4.2=py39hc9c6fcd_2 - - brotlipy=0.7.0=py39h3811e60_1001 - - bwidget=1.9.14=ha770c72_1 - - bzip2=1.0.8=h7f98852_4 - - c-ares=1.18.1=h7f98852_0 - - ca-certificates=2021.10.8=ha878542_0 - - cairo=1.16.0=h6cf1ce9_1008 - - cffi=1.14.6=py39h4bc2ebd_1 - - chardet=4.0.0=py39hf3d152e_1 - - colorama=0.4.4=pyh9f0ad1d_0 - - commonmark=0.9.1=py_0 - - coreutils=8.31=h516909a_0 - - coverage=6.1.1=py39h3811e60_0 - - cryptography=35.0.0=py39h95dcef6_1 - - curl=7.79.1=h2574ce0_1 - - cutadapt>=2.9 - - cykhash=1.0.2=py39hf149a3a_2 - - cython=0.29.24=py39he80948d_0 - - fastqc=0.11.9=hdfd78af_1 - - 
fastq-pair - - fastp - - flash + - argcomplete=3.6.3=pyhd8ed1ab_0 + - bedtools=2.31.1=h13024bc_3 + - binutils_impl_linux-64=2.45=default_hfdba357_105 + - bioconductor-annotationdbi=1.64.1=r43hdfd78af_0 + - bioconductor-annotationfilter=1.26.0=r43hdfd78af_0 + - bioconductor-annotationhub=3.10.0=r43hdfd78af_0 + - bioconductor-biobase=2.62.0=r43ha9d7317_3 + - bioconductor-biocfilecache=2.10.1=r43hdfd78af_0 + - bioconductor-biocgenerics=0.48.1=r43hdfd78af_2 + - bioconductor-biocio=1.12.0=r43hdfd78af_0 + - bioconductor-biocparallel=1.36.0=r43hf17093f_2 + - bioconductor-biocversion=3.18.1=r43hdfd78af_0 + - bioconductor-biomart=2.58.0=r43hdfd78af_0 + - bioconductor-biostrings=2.70.1=r43ha9d7317_2 + - bioconductor-bsgenome=1.70.1=r43hdfd78af_0 + - bioconductor-data-packages=20250625=hdfd78af_0 + - bioconductor-delayedarray=0.28.0=r43ha9d7317_2 + - bioconductor-ensembldb=2.26.0=r43hdfd78af_0 + - bioconductor-experimenthub=2.10.0=r43hdfd78af_0 + - bioconductor-genomeinfodb=1.38.1=r43hdfd78af_1 + - bioconductor-genomeinfodbdata=1.2.11=r43hdfd78af_1 + - bioconductor-genomicalignments=1.38.0=r43ha9d7317_1 + - bioconductor-genomicdistributions=1.10.0=r43hdfd78af_0 + - bioconductor-genomicdistributionsdata=1.10.0=r43hdfd78af_0 + - bioconductor-genomicfeatures=1.54.1=r43hdfd78af_0 + - bioconductor-genomicranges=1.54.1=r43ha9d7317_2 + - bioconductor-interactivedisplaybase=1.40.0=r43hdfd78af_0 + - bioconductor-iranges=2.36.0=r43ha9d7317_2 + - bioconductor-keggrest=1.42.0=r43hdfd78af_0 + - bioconductor-matrixgenerics=1.14.0=r43hdfd78af_3 + - bioconductor-protgenerics=1.34.0=r43hdfd78af_0 + - bioconductor-rhtslib=2.4.0=r43ha9d7317_2 + - bioconductor-rsamtools=2.18.0=r43hf17093f_2 + - bioconductor-rtracklayer=1.62.0=r43ha9d7317_1 + - bioconductor-s4arrays=1.2.0=r43ha9d7317_2 + - bioconductor-s4vectors=0.40.2=r43ha9d7317_2 + - bioconductor-sparsearray=1.2.2=r43ha9d7317_2 + - bioconductor-summarizedexperiment=1.32.0=r43hdfd78af_0 + - bioconductor-xvector=0.42.0=r43ha9d7317_2 + - 
bioconductor-zlibbioc=1.48.0=r43ha9d7317_2 + - bowtie2=2.5.4=he96a11b_7 + - bwidget=1.10.1=ha770c72_1 + - bzip2=1.0.8=hda65f42_8 + - c-ares=1.34.6=hb03c661_0 + - ca-certificates=2026.1.4=hbd8a1cb_0 + - cairo=1.18.0=hbb29018_2 + - curl=8.18.0=h4e3cde8_0 + - fastp=1.1.0=heae3180_0 + - fastq-pair=1.0=h503566f_6 + - flash=1.2.11=hadc24fc_2 - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 - font-ttf-inconsolata=3.000=h77eed37_0 - font-ttf-source-code-pro=2.038=h77eed37_0 - - font-ttf-ubuntu=0.83=hab24e00_0 - - fontconfig=2.13.1=hba837de_1005 + - font-ttf-ubuntu=0.83=h77eed37_3 + - fontconfig=2.15.0=h7e30c49_1 - fonts-conda-ecosystem=1=0 - - fonts-conda-forge=1=0 - - freetype=2.10.4=h0708190_1 - - fribidi=1.0.10=h36c2ea0_0 - - future=0.18.2=py39hf3d152e_3 - - gawk=5.1.0=h7f98852_0 - - gcc_impl_linux-64=9.4.0=h03d3576_11 - - gcc_linux-64=9.4.0=h391b98a_1 - - genrich=0.6.1=h5bf99c6_1 - - gettext=0.19.8.1=h73d1719_1008 - - gfortran_impl_linux-64=9.4.0=h0003116_11 - - gfortran_linux-64=9.4.0=hf0ab688_1 - - giflib=5.2.1=h36c2ea0_2 - - graphite2=1.3.13=h58526e2_1001 - - grep=3.4=h9d02d08_1 - - gsl=2.6=he838d99_2 - - gxx_impl_linux-64=9.4.0=h03d3576_11 - - gxx_linux-64=9.4.0=h0316aca_1 - - harfbuzz=2.9.1=h83ec7ef_1 - - hmmratac=1.2.10=hdfd78af_1 - - homer=4.11=pl5262h7d875b9_5 - - htslib=1.14=h9093b5e_0 - - icu=68.2=h9c3ff4c_0 - - importlib-metadata=4.8.1=py39hf3d152e_0 - - iniconfig=1.1.1=pyh9f0ad1d_0 - - jbig=2.1=h7f98852_2003 - - jinja2=3.0.2=pyhd8ed1ab_0 - - jpeg=9d=h36c2ea0_0 - - jsonschema=4.1.2=pyhd8ed1ab_0 - - kernel-headers_linux-64=2.6.32=he073ed8_15 - - krb5=1.19.2=hcc1bbae_2 - - lcms2=2.12=hddcbb42_0 - - ld_impl_linux-64=2.36.1=hea4e1c9_2 - - lerc=2.2.1=h9c3ff4c_0 - - libblas=3.9.0=12_linux64_openblas - - libcblas=3.9.0=12_linux64_openblas - - libcurl=7.79.1=h2574ce0_1 - - libdeflate=1.7=h7f98852_5 - - libedit=3.1.20191231=he28a2e2_2 - - libev=4.33=h516909a_1 - - libffi=3.4.2=h9c3ff4c_4 - - libgcc-devel_linux-64=9.4.0=hd854feb_11 - - libgcc-ng=11.2.0=h1d223b6_11 - - 
libgfortran-ng=11.2.0=h69a702a_11 - - libgfortran5=11.2.0=h5c6108e_11 - - libgit2=1.3.0=hee63804_1 - - libglib=2.70.0=h174f98d_1 - - libgomp=11.2.0=h1d223b6_11 - - libiconv=1.16=h516909a_0 - - libidn2=2.3.2=h7f98852_0 - - liblapack=3.9.0=12_linux64_openblas - - libnghttp2=1.43.0=h812cca2_1 - - libopenblas=0.3.18=pthreads_h8fe5266_0 - - libpng=1.6.37=h21135ba_2 - - libsanitizer=9.4.0=h79bfe98_11 - - libssh2=1.10.0=ha56f1ee_2 - - libstdcxx-devel_linux-64=9.4.0=hd854feb_11 - - libstdcxx-ng=11.2.0=he4da1e4_11 - - libtiff=4.3.0=hf544144_1 - - libunistring=0.9.10=h7f98852_0 - - libuuid=2.32.1=h7f98852_1000 - - libwebp-base=1.2.1=h7f98852_0 - - libxcb=1.13=h7f98852_1003 - - libxml2=2.9.12=h72842e0_0 - - libzlib=1.2.11=h36c2ea0_1013 - - lz4-c=1.9.3=h9c3ff4c_1 - - make=4.3=hd18ef5c_1 - - markupsafe=2.0.1=py39h3811e60_0 - - more-itertools=8.10.0=pyhd8ed1ab_0 - - mysql-connector-c=6.1.11=h6eb9d5d_1007 - - ncurses=6.2=h58526e2_4 - - numpy=1.21.3=py39hdbf815f_0 - - openblas=0.3.18=pthreads_h4748800_0 - - openjdk=11.0.9.1=h5cc2fde_1 - - openssl=1.1.1l=h7f98852_0 - - oyaml=1.0=pyhd8ed1ab_0 - - packaging=21.0=pyhd8ed1ab_0 - - pango=1.48.10=hb8ff022_1 - - pathlib2=2.3.6=py39hf3d152e_0 - - pcre=8.45=h9c3ff4c_0 - - pcre2=10.37=h032f7d1_0 - - peppy=0.31.1=pyhd8ed1ab_0 - - perl=5.26.2=h36c2ea0_1008 - - picard=2.26.4=hdfd78af_0 - - pigz=2.6=h27826a3_0 - - pip=21.3.1=pyhd8ed1ab_0 - - piper=0.12.1=py_1 - - pixman=0.40.0=h36c2ea0_0 - - pluggy=1.0.0=py39hf3d152e_1 - - preseq=2.0.3=hc216eb9_5 - - psutil=5.8.0=py39h3811e60_1 - - pthread-stubs=0.4=h36c2ea0_1001 - - py=1.10.0=pyhd3deb0d_0 - - pycparser=2.20=pyh9f0ad1d_2 - - pyfaidx=0.6.3.1=pyh5e36f6f_0 - - pygments=2.10.0=pyhd8ed1ab_0 - - pyopenssl=21.0.0=pyhd8ed1ab_0 - - pyparsing=3.0.4=pyhd8ed1ab_0 - - pysocks=1.7.1=py39hf3d152e_3 - - pytest=6.2.5=py39hf3d152e_0 - - python=3.9.7=hb7a2778_3_cpython - - python-dateutil=2.8.2=pyhd8ed1ab_0 - - python-levenshtein=0.12.2=py39h3811e60_0 - - python_abi=3.9=2_cp39 - - pytz=2021.3=pyhd8ed1ab_0 - - 
r-askpass=1.1=r41hcfec24a_2 - - r-assertthat=0.2.1=r41hc72bb7e_2 - - r-backports=1.3.0=r41hcfec24a_0 - - r-base=4.1.1=hb67fd72_0 - - r-essentials=4.1 - - r-gert - - r-base64enc=0.1_3=r41hcfec24a_1004 - - r-bh=1.75.0_0=r41hc72bb7e_0 - - r-biocmanager=1.30.16=r41hc72bb7e_0 - - r-bit=4.0.4=r41hcfec24a_0 - - r-bit64=4.0.5=r41hcfec24a_0 - - r-bitops=1.0_7=r41hcfec24a_0 - - r-blob=1.2.2=r41hc72bb7e_0 - - r-brew=1.0_6=r41hc72bb7e_1003 - - r-brio=1.1.2=r41hcfec24a_0 - - r-bslib=0.3.1=r41hc72bb7e_0 - - r-cachem=1.0.6=r41hcfec24a_0 - - r-callr=3.7.0=r41hc72bb7e_0 - - r-catools=1.18.2=r41h03ef668_0 - - r-cli=3.1.0=r41h03ef668_0 - - r-clipr=0.7.1=r41hc72bb7e_0 - - r-colorspace=2.0_2=r41hcfec24a_0 - - r-commonmark=1.7=r41hcfec24a_1002 - - r-covr=3.5.1=r41h03ef668_0 - - r-cpp11=0.4.0=r41hc72bb7e_0 - - r-crayon=1.4.2=r41hc72bb7e_0 - - r-credentials=1.3.1=r41hc72bb7e_0 - - r-crosstalk=1.1.1=r41hc72bb7e_0 - - r-curl=4.3.2=r41hcfec24a_0 - - r-data.table=1.14.2=r41hcfec24a_0 - - r-dbi=1.1.1=r41hc72bb7e_0 - - r-dbplyr=2.1.1=r41hc72bb7e_0 - - r-desc=1.4.0=r41hc72bb7e_0 - - r-devtools=2.4.2=r41hc72bb7e_0 - - r-diffobj=0.3.5=r41hcfec24a_0 - - r-digest=0.6.28=r41h03ef668_0 - - r-dplyr=1.0.7=r41h03ef668_0 - - r-dt=0.19=r41hc72bb7e_0 - - r-ellipsis=0.3.2=r41hcfec24a_0 - - r-evaluate=0.14=r41hc72bb7e_2 - - r-fansi=0.4.2=r41hcfec24a_0 - - r-farver=2.1.0=r41h03ef668_0 - - r-fastmap=1.1.0=r41h03ef668_0 - - r-filelock=1.0.2=r41hcfec24a_1002 - - r-fontawesome=0.2.2=r41hc72bb7e_0 - - r-formatr=1.11=r41hc72bb7e_0 - - r-fs=1.5.0=r41h03ef668_0 - - r-futile.logger=1.4.3=r41hc72bb7e_1003 - - r-futile.options=1.0.1=r41hc72bb7e_1002 - - r-generics=0.1.1=r41hc72bb7e_0 - - r-gert=1.4.1=r41h29657ab_1 - - r-ggplot2=3.3.5=r41hc72bb7e_0 - - r-gh=1.3.0=r41hc72bb7e_0 - - r-git2r=0.28.0=r41hf628c3e_1 - - r-gitcreds=0.1.1=r41hc72bb7e_0 - - r-glue=1.4.2=r41hcfec24a_0 - - r-gplots=3.1.1=r41hc72bb7e_0 - - r-gtable=0.3.0=r41hc72bb7e_3 - - r-gtools=3.9.2=r41hcfec24a_0 - - r-highr=0.9=r41hc72bb7e_0 - - 
r-hms=1.1.1=r41hc72bb7e_0 - - r-htmltools=0.5.2=r41h03ef668_0 - - r-htmlwidgets=1.5.4=r41hc72bb7e_0 - - r-httpuv=1.6.3=r41h03ef668_0 - - r-httr=1.4.2=r41hc72bb7e_0 - - r-ini=0.3.1=r41hc72bb7e_1003 - - r-isoband=0.2.5=r41h03ef668_0 - - r-jquerylib=0.1.4=r41hc72bb7e_0 - - r-jsonlite=1.7.2=r41hcfec24a_0 - - r-kernsmooth=2.23_20=r41h742201e_0 - - r-knitr=1.35=r41hc72bb7e_0 - - r-labeling=0.4.2=r41hc72bb7e_0 - - r-lambda.r=1.2.4=r41hc72bb7e_1 - - r-later=1.2.0=r41h03ef668_0 - - r-lattice=0.20_45=r41hcfec24a_0 - - r-lazyeval=0.2.2=r41hcfec24a_2 - - r-lifecycle=1.0.1=r41hc72bb7e_0 - - r-magrittr=2.0.1=r41hcfec24a_1 - - r-markdown=1.1=r41hcfec24a_1 - - r-mass=7.3_54=r41hcfec24a_0 - - r-matrix=1.3_4=r41he454529_0 - - r-matrixstats=0.61.0=r41hcfec24a_0 - - r-memoise=2.0.0=r41hc72bb7e_0 - - r-mgcv=1.8_38=r41he454529_0 - - r-mime=0.12=r41hcfec24a_0 - - r-munsell=0.5.0=r41hc72bb7e_1003 - - r-nlme=3.1_153=r41h859d828_0 - - r-openssl=1.4.5=r41he36bf35_1 - - r-pillar=1.6.4=r41hc72bb7e_0 - - r-pkgbuild=1.2.0=r41hc72bb7e_0 - - r-pkgconfig=2.0.3=r41hc72bb7e_1 - - r-pkgload=1.2.3=r41h03ef668_0 - - r-plogr=0.2.0=r41hc72bb7e_1003 - - r-png=0.1_7=r41hcfec24a_1004 - - r-praise=1.0.0=r41hc72bb7e_1004 - - r-prettyunits=1.1.1=r41hc72bb7e_1 - - r-processx=3.5.2=r41hcfec24a_0 - - r-progress=1.2.2=r41hc72bb7e_2 - - r-promises=1.2.0.1=r41h03ef668_0 - - r-ps=1.6.0=r41hcfec24a_0 - - r-purrr=0.3.4=r41hcfec24a_1 - - r-r6=2.5.1=r41hc72bb7e_0 - - r-rappdirs=0.3.3=r41hcfec24a_0 - - r-rcmdcheck=1.4.0=r41h785f33e_0 - - r-rcolorbrewer=1.1_2=r41h785f33e_1003 - - r-rcpp=1.0.7=r41h03ef668_0 - - r-rcurl=1.98_1.5=r41hcfec24a_0 - - r-rematch2=2.1.2=r41hc72bb7e_1 - - r-remotes=2.4.1=r41hc72bb7e_0 - - r-restfulr=0.0.13=r41hdf9a8c9_1 - - r-rex=1.2.0=r41hc72bb7e_1 - - r-rjson=0.2.20=r41h03ef668_1002 - - r-rlang=0.4.12=r41hcfec24a_0 - - r-roxygen2=7.1.2=r41h03ef668_0 - - r-rprojroot=2.0.2=r41hc72bb7e_0 - - r-rsqlite=2.2.8=r41h03ef668_0 - - r-rstudioapi=0.13=r41hc72bb7e_0 - - r-rversions=2.1.1=r41hc72bb7e_0 - - 
r-sass=0.4.0=r41h03ef668_0 - - r-scales=1.1.1=r41hc72bb7e_0 - - r-sessioninfo=1.2.0=r41hc72bb7e_0 - - r-shiny=1.7.1=r41h785f33e_0 - - r-snow=0.4_4=r41hc72bb7e_0 - - r-sourcetools=0.1.7=r41h9c3ff4c_1002 - - r-stringi=1.7.5=r41hcabe038_0 - - r-stringr=1.4.0=r41hc72bb7e_2 - - r-sys=3.4=r41hcfec24a_0 - - r-testthat=3.1.0=r41h03ef668_0 - - r-tibble=3.1.5=r41hcfec24a_0 - - r-tidyselect=1.1.1=r41hc72bb7e_0 - - r-usethis=2.1.3=r41hc72bb7e_0 - - r-utf8=1.2.2=r41hcfec24a_0 - - r-vctrs=0.3.8=r41hcfec24a_1 - - r-viridislite=0.4.0=r41hc72bb7e_0 - - r-waldo=0.3.1=r41hc72bb7e_0 - - r-whisker=0.4=r41hc72bb7e_1 - - r-withr=2.4.2=r41hc72bb7e_0 - - r-xfun=0.27=r41h03ef668_0 - - r-xml=3.99_0.8=r41hcfec24a_0 - - r-xml2=1.3.2=r41h03ef668_1 - - r-xopen=1.0.0=r41hc72bb7e_1003 - - r-xtable=1.8_4=r41hc72bb7e_3 - - r-yaml=2.2.1=r41hcfec24a_1 - - r-zip=2.2.0=r41hcfec24a_0 - - readline=8.1=h46c0cb4_0 - - refgenie=0.12.0=pyhdfd78af_0 - - rich=10.12.0=py39hf3d152e_0 - - rust=1.56.0=h61edd41_0 - - rust-std-x86_64-unknown-linux-gnu=1.56.0=hc1431ca_0 - - samblaster=0.1.26=h7d875b9_1 - - samtools=1.14=hb421002_0 - - sed=4.8=he412f7d_0 - - seqkit - - seqtk - - six=1.16.0=pyh6c4a22f_0 - - sqlite=3.36.0=h9cd32fc_2 - - sysroot_linux-64=2.12=he073ed8_15 - - tbb=2020.2=h4bd325d_4 - - tk=8.6.11=h27826a3_1 - - tktable=2.10=hb7b940f_3 - - toml=0.10.2=pyhd8ed1ab_0 - - tomli=1.2.2=pyhd8ed1ab_0 - - typing_extensions=3.10.0.2=pyha770c72_0 - - tzdata=2021e=he74cb21_0 - - ubiquerg=0.6.1=pyh9f0ad1d_0 - - ucsc-bedgraphtobigwig=377=h0b8a92a_2 - - ucsc-bedtobigbed=377=h0b8a92a_2 - - ucsc-bigwigcat=377=h0b8a92a_2 - - ucsc-bigwigmerge=377=h0b8a92a_2 - - ucsc-wigtobigwig=377=h0b8a92a_2 - - unzip=6.0=h7f98852_2 - - urllib3=1.26.7=pyhd8ed1ab_0 - - veracitools=0.1.3=py_0 - - wget=1.20.3=ha56f1ee_1 - - wheel=0.37.0=pyhd8ed1ab_1 - - xorg-fixesproto=5.0=h7f98852_1002 - - xorg-inputproto=2.3.2=h7f98852_1002 - - xorg-kbproto=1.0.7=h7f98852_1002 - - xorg-libice=1.0.10=h7f98852_0 - - xorg-libsm=1.2.3=hd9c2040_1000 - - 
xorg-libx11=1.7.2=h7f98852_0 - - xorg-libxau=1.0.9=h7f98852_0 - - xorg-libxdmcp=1.1.3=h7f98852_0 - - xorg-libxext=1.3.4=h7f98852_1 - - xorg-libxfixes=5.0.3=h7f98852_1004 - - xorg-libxi=1.7.10=h7f98852_0 - - xorg-libxrender=0.9.10=h7f98852_1003 - - xorg-libxt=1.2.1=h7f98852_2 - - xorg-libxtst=1.2.3=h7f98852_1002 - - xorg-recordproto=1.14.2=h7f98852_1002 - - xorg-renderproto=0.11.1=h7f98852_1002 - - xorg-xextproto=7.3.0=h7f98852_1002 - - xorg-xproto=7.0.31=h7f98852_1007 - - xz=5.2.5=h516909a_1 - - yacman=0.8.3=pyhd8ed1ab_0 - - yaml=0.2.5=h516909a_0 - - zipp=3.6.0=pyhd8ed1ab_0 - - zlib=1.2.11=h36c2ea0_1013 - - zstd=1.5.0=ha95c52a_0 + - fonts-conda-forge=1=hc364b38_1 + - freetype=2.14.1=ha770c72_0 + - fribidi=1.0.16=hb03c661_0 + - gcc_impl_linux-64=15.2.0=hc5723f1_16 + - gfortran_impl_linux-64=15.2.0=h281d09f_16 + - graphite2=1.3.14=hecca717_2 + - gsl=1.16=0 + - gxx_impl_linux-64=15.2.0=hda75c37_16 + - harfbuzz=9.0.0=hfac3d4d_0 + - htslib=1.23=h566b1c6_0 + - icu=73.2=h59595ed_0 + - isa-l=2.31.1=hb9d3cd8_1 + - jq=1.8.1=h73b1eb8_0 + - kernel-headers_linux-64=4.18.0=he073ed8_9 + - keyutils=1.6.3=hb9d3cd8_0 + - krb5=1.21.3=h659f571_0 + - ld_impl_linux-64=2.45=default_hbd61a6d_105 + - lerc=4.0.0=h0aef613_1 + - libblas=3.11.0=5_h4a7cf45_openblas + - libcurl=8.18.0=h4e3cde8_0 + - libdeflate=1.22=hb9d3cd8_0 + - libedit=3.1.20250104=pl5321h7949ede_0 + - libev=4.33=hd590300_2 + - libexpat=2.7.3=hecca717_0 + - libffi=3.5.2=h3435931_0 + - libfreetype=2.14.1=ha770c72_0 + - libfreetype6=2.14.1=h73754d4_0 + - libgcc=15.2.0=he0feb66_16 + - libgcc-devel_linux-64=15.2.0=hcc6f6b0_116 + - libgcc-ng=15.2.0=h69a702a_16 + - libgfortran=15.2.0=h69a702a_16 + - libgfortran-ng=15.2.0=h69a702a_16 + - libgfortran5=15.2.0=h68bc16d_16 + - libgit2=1.8.4=hd24f944_1 + - libglib=2.84.0=h2ff4ddf_0 + - libgomp=15.2.0=he0feb66_16 + - libiconv=1.18=h3b78370_2 + - libjpeg-turbo=3.1.2=hb03c661_0 + - liblapack=3.11.0=5_h47877c9_openblas + - liblzma=5.8.2=hb03c661_0 + - liblzma-devel=5.8.2=hb03c661_0 + - 
libmpdec=4.0.0=hb03c661_1 + - libnghttp2=1.67.0=had1ee68_0 + - libopenblas=0.3.30=pthreads_h94d23a6_4 + - libopenssl-static=3.6.1=hb03c661_1 + - libpng=1.6.54=h421ea60_0 + - libsanitizer=15.2.0=h90f66d4_16 + - libsqlite=3.51.2=h0c1763c_0 + - libssh2=1.11.1=hcf80075_0 + - libstdcxx=15.2.0=h934c35e_16 + - libstdcxx-devel_linux-64=15.2.0=hd446a21_116 + - libstdcxx-ng=15.2.0=hdf11a46_16 + - libtiff=4.7.0=hc4654cb_2 + - libuuid=2.41.3=h5347b49_0 + - libuv=1.51.0=hb03c661_1 + - libwebp-base=1.6.0=hd42ef1d_0 + - libxcb=1.17.0=h8a09558_0 + - libxcrypt=4.4.36=hd590300_1 + - libxml2=2.12.7=h4c95cb1_3 + - libzlib=1.3.1=hb9d3cd8_2 + - make=4.4.1=hb9d3cd8_2 + - mysql-connector-c=6.1.11=h659d440_1008 + - ncurses=6.5=h2d0b736_3 + - oniguruma=6.9.10=hb9d3cd8_0 + - openssl=3.6.1=h35e630c_1 + - pandoc=3.9=ha770c72_0 + - pango=1.54.0=h4c5309f_1 + - pcre2=10.44=hc749103_2 + - perl=5.32.1=7_hd590300_perl5 + - pip=26.0=pyh145f28c_0 + - pixman=0.46.4=h54a6638_1 + - preseq=2.0.2=gsl1.16_0 + - pthread-stubs=0.4=hb9d3cd8_1002 + - python=3.13.11=hc97d973_101_cp313 + - python_abi=3.13=8_cp313 + - pyyaml=6.0.3=py313h3dea7bd_0 + - r-abind=1.4_5=r43hc72bb7e_1006 + - r-argparser=0.7.2=r43hc72bb7e_1 + - r-askpass=1.2.1=r43h2b5f3a1_0 + - r-assertthat=0.2.1=r43hc72bb7e_5 + - r-backports=1.5.0=r43hb1dbf0f_1 + - r-base=4.3.3=h8ee917f_2 + - r-base64enc=0.1_3=r43hb1dbf0f_1007 + - r-bh=1.87.0_1=r43hc72bb7e_0 + - r-biocmanager=1.30.26=r43hc72bb7e_0 + - r-bit=4.6.0=r43h2b5f3a1_0 + - r-bit64=4.6.0_1=r43h2b5f3a1_0 + - r-bitops=1.0_9=r43h2b5f3a1_0 + - r-blob=1.2.4=r43hc72bb7e_2 + - r-brew=1.0_10=r43hc72bb7e_1 + - r-brio=1.1.5=r43hb1dbf0f_1 + - r-broom=1.0.9=r43hc72bb7e_0 + - r-bslib=0.9.0=r43hc72bb7e_0 + - r-cachem=1.1.0=r43hb1dbf0f_1 + - r-callr=3.7.6=r43hc72bb7e_1 + - r-cli=3.6.5=r43h93ab643_0 + - r-clipr=0.8.0=r43hc72bb7e_3 + - r-codetools=0.2_20=r43hc72bb7e_1 + - r-colorspace=2.1_1=r43hdb488b9_0 + - r-commonmark=2.0.0=r43h2b5f3a1_0 + - r-cpp11=0.5.2=r43h785f33e_1 + - r-crayon=1.5.3=r43hc72bb7e_1 + - 
r-credentials=2.0.3=r43hc72bb7e_0 + - r-crosstalk=1.2.2=r43hc72bb7e_0 + - r-curl=7.0.0=r43h10955f1_0 + - r-data.table=1.17.8=r43h1c8cec4_0 + - r-dbi=1.2.3=r43hc72bb7e_1 + - r-dbplyr=2.5.1=r43hc72bb7e_0 + - r-desc=1.4.3=r43hc72bb7e_1 + - r-devtools=2.4.5=r43hc72bb7e_3 + - r-diffobj=0.3.6=r43h2b5f3a1_0 + - r-digest=0.6.37=r43h0d4f4ea_0 + - r-downlit=0.4.4=r43hc72bb7e_1 + - r-dplyr=1.1.4=r43h0d4f4ea_1 + - r-dt=0.34.0=r43hc72bb7e_0 + - r-ellipsis=0.3.2=r43hb1dbf0f_3 + - r-evaluate=1.0.5=r43hc72bb7e_0 + - r-fansi=1.0.6=r43hb1dbf0f_1 + - r-farver=2.1.2=r43ha18555a_1 + - r-fastmap=1.2.0=r43ha18555a_1 + - r-filelock=1.0.3=r43hb1dbf0f_1 + - r-fontawesome=0.5.3=r43hc72bb7e_0 + - r-formatr=1.14=r43hc72bb7e_2 + - r-fs=1.6.6=r43h93ab643_0 + - r-futile.logger=1.4.3=r43hc72bb7e_1006 + - r-futile.options=1.0.1=r43hc72bb7e_1005 + - r-generics=0.1.4=r43hc72bb7e_0 + - r-gert=2.1.4=r43h017ce79_0 + - r-ggplot2=3.5.2=r43hc72bb7e_0 + - r-gh=1.5.0=r43hc72bb7e_0 + - r-gitcreds=0.1.2=r43hc72bb7e_3 + - r-glue=1.8.0=r43h2b5f3a1_0 + - r-gtable=0.3.6=r43hc72bb7e_0 + - r-highr=0.11=r43hc72bb7e_1 + - r-hms=1.1.3=r43hc72bb7e_2 + - r-htmltools=0.5.8.1=r43ha18555a_1 + - r-htmlwidgets=1.6.4=r43h785f33e_3 + - r-httpuv=1.6.16=r43h6d565e7_0 + - r-httr=1.4.7=r43hc72bb7e_1 + - r-httr2=1.2.1=r43hc72bb7e_0 + - r-ini=0.3.1=r43hc72bb7e_1006 + - r-isoband=0.2.7=r43ha18555a_3 + - r-jquerylib=0.1.4=r43hc72bb7e_3 + - r-jsonlite=2.0.0=r43h2b5f3a1_0 + - r-knitr=1.50=r43hc72bb7e_0 + - r-labeling=0.4.3=r43hc72bb7e_1 + - r-lambda.r=1.2.4=r43hc72bb7e_4 + - r-later=1.4.4=r43h3697838_0 + - r-lattice=0.22_7=r43h2b5f3a1_0 + - r-lazyeval=0.2.2=r43hb1dbf0f_5 + - r-lifecycle=1.0.4=r43hc72bb7e_1 + - r-lobstr=1.1.2=r43ha18555a_4 + - r-magrittr=2.0.3=r43hb1dbf0f_3 + - r-mass=7.3_60.0.1=r43hb1dbf0f_1 + - r-matrix=1.6_5=r43he966344_1 + - r-matrixstats=1.5.0=r43h2b5f3a1_0 + - r-memoise=2.0.1=r43hc72bb7e_3 + - r-mgcv=1.9_3=r43h2ae2be5_0 + - r-mime=0.13=r43h2b5f3a1_0 + - r-miniui=0.1.2=r43hc72bb7e_0 + - r-munsell=0.5.1=r43hc72bb7e_1 
+ - r-nlme=3.1_168=r43hb67ce94_0 + - r-openssl=2.3.3=r43he8289e2_0 + - r-pillar=1.11.0=r43hc72bb7e_0 + - r-pkgbuild=1.4.8=r43hc72bb7e_0 + - r-pkgconfig=2.0.3=r43hc72bb7e_4 + - r-pkgdown=2.1.3=r43hc72bb7e_0 + - r-pkgload=1.4.0=r43hc72bb7e_0 + - r-plogr=0.2.0=r43hc72bb7e_1006 + - r-plyr=1.8.9=r43h3697838_2 + - r-png=0.1_8=r43h21f035c_2 + - r-praise=1.0.0=r43hc72bb7e_1008 + - r-prettyunits=1.2.0=r43hc72bb7e_1 + - r-processx=3.8.6=r43h2b5f3a1_0 + - r-profvis=0.4.0=r43h2b5f3a1_0 + - r-progress=1.2.3=r43hc72bb7e_1 + - r-promises=1.3.3=r43h3697838_0 + - r-pryr=0.1.6=r43h0d4f4ea_2 + - r-ps=1.9.1=r43h2b5f3a1_0 + - r-purrr=1.1.0=r43h54b55ab_0 + - r-r.methodss3=1.8.2=r43hc72bb7e_3 + - r-r.oo=1.27.1=r43hc72bb7e_0 + - r-r.utils=2.13.0=r43hc72bb7e_0 + - r-r6=2.6.1=r43hc72bb7e_0 + - r-ragg=1.5.0=r43h9f1dc4d_0 + - r-rappdirs=0.3.3=r43hb1dbf0f_3 + - r-rcmdcheck=1.4.0=r43h785f33e_3 + - r-rcolorbrewer=1.1_3=r43h785f33e_3 + - r-rcpp=1.1.0=r43h93ab643_0 + - r-rcurl=1.98_1.16=r43he8228da_1 + - r-rematch2=2.1.2=r43hc72bb7e_4 + - r-remotes=2.5.0=r43hc72bb7e_1 + - r-reshape2=1.4.4=r43h3697838_5 + - r-restfulr=0.0.15=r43h56115f1_4 + - r-rjson=0.2.23=r43h93ab643_0 + - r-rlang=1.1.6=r43h93ab643_0 + - r-rmarkdown=2.29=r43hc72bb7e_0 + - r-roxygen2=7.3.3=r43h3697838_0 + - r-rprojroot=2.1.1=r43hc72bb7e_0 + - r-rsqlite=2.4.3=r43h3697838_0 + - r-rstudioapi=0.17.1=r43hc72bb7e_0 + - r-rversions=2.1.2=r43hc72bb7e_3 + - r-sass=0.4.10=r43h93ab643_0 + - r-scales=1.4.0=r43hc72bb7e_0 + - r-sessioninfo=1.2.3=r43hc72bb7e_0 + - r-shiny=1.11.1=r43h785f33e_0 + - r-snow=0.4_4=r43hc72bb7e_3 + - r-sourcetools=0.1.7_1=r43ha18555a_2 + - r-stringi=1.8.4=r43ha8ce623_2 + - r-stringr=1.5.2=r43h785f33e_0 + - r-sys=3.4.3=r43h2b5f3a1_0 + - r-systemfonts=1.2.3=r43h74f4acd_0 + - r-testthat=3.2.3=r43h3697838_1 + - r-textshaping=0.4.0=r43ha47bcaa_2 + - r-tibble=3.3.0=r43h2b5f3a1_0 + - r-tidyr=1.3.1=r43h0d4f4ea_1 + - r-tidyselect=1.2.1=r43hc72bb7e_1 + - r-tinytex=0.57=r43hc72bb7e_0 + - r-urlchecker=1.0.1=r43hc72bb7e_3 + - 
r-usethis=3.2.1=r43hc72bb7e_0 + - r-utf8=1.2.6=r43h2b5f3a1_0 + - r-vctrs=0.6.5=r43h0d4f4ea_1 + - r-viridislite=0.4.2=r43hc72bb7e_2 + - r-waldo=0.6.2=r43hc72bb7e_0 + - r-whisker=0.4.1=r43hc72bb7e_2 + - r-withr=3.0.2=r43hc72bb7e_0 + - r-xfun=0.53=r43h3697838_0 + - r-xml=3.99_0.17=r43he716329_1 + - r-xml2=1.3.6=r43h8194278_2 + - r-xopen=1.0.1=r43hc72bb7e_1 + - r-xtable=1.8_4=r43hc72bb7e_6 + - r-yaml=2.3.10=r43hdb488b9_0 + - r-zip=2.3.3=r43h2b5f3a1_0 + - readline=8.3=h853b02a_0 + - samtools=1.23=h96c455f_0 + - sed=4.9=h6688a6e_0 + - seqkit=2.12.0=he881be0_1 + - seqtk=1.5=h577a1d6_1 + - setuptools=80.10.2=pyh332efcf_0 + - sysroot_linux-64=2.28=h4ee821c_9 + - tk=8.6.13=noxft_h366c992_103 + - tktable=2.10=h8d826fa_7 + - toml=0.10.2=pyhcf101f3_3 + - tomlkit=0.14.0=pyha770c72_0 + - ucsc-bigwigcat=482=h0b57e2e_0 + - ucsc-wigtobigwig=482=hdc0a859_1 + - xmltodict=1.0.2=pyhcf101f3_0 + - xorg-libice=1.1.2=hb9d3cd8_0 + - xorg-libsm=1.2.6=he73a12e_0 + - xorg-libx11=1.8.12=h4f16b4b_0 + - xorg-libxau=1.0.12=hb03c661_1 + - xorg-libxdmcp=1.1.5=hb03c661_1 + - xorg-libxext=1.3.7=hb03c661_0 + - xorg-libxrender=0.9.12=hb9d3cd8_0 + - xorg-libxt=1.3.1=hb9d3cd8_0 + - xz=5.8.2=ha02ee65_0 + - xz-gpl-tools=5.8.2=ha02ee65_0 + - xz-tools=5.8.2=hb03c661_0 + - yaml=0.2.5=h280c20c_3 + - yq=3.4.3=pyhe01879c_2 + - zlib=1.3.1=hb9d3cd8_2 + - zstd=1.5.7=hb78ec9c_6 - pip: - - bio==1.3.2 - - biopython==1.79 - - biothings-client==0.2.6 - - certifi==2021.10.8 - - charset-normalizer==2.0.7 - - codecov==2.1.12 - - divvy==0.6.0 - - eido==0.1.5 - - hypothesis==4.38.0 - - idna==3.3 - - logmuse==0.2.7 - - looper==1.3.1 - - mygene==3.2.2 - - pandas==1.3.4 - - pararead==0.7.0 - - piper>=0.12.3 - - pyrsistent==0.18.0 - - pysam==0.17.0 - - pyyaml==6.0 - - refgenconf==0.12.1 - - requests==2.26.0 - - setuptools==58.4.0 - - tqdm==4.62.3 - + - annotated-types==0.7.0 + - attmap==0.13.2 + - attrs==25.4.0 + - certifi==2026.1.4 + - charset-normalizer==3.4.4 + - click==8.3.1 + - colorama==0.4.6 + - coloredlogs==15.0.1 + - 
cutadapt==5.2 + - divvy==0.6.0 + - dnaio==1.2.4 + - eido==0.2.5 + - future==1.0.0 + - humanfriendly==10.0 + - idna==3.11 + - isal==1.8.0 + - jinja2==3.1.6 + - jsonschema==4.26.0 + - jsonschema-specifications==2025.9.1 + - logmuse==0.2.8 + - looper==2.1.0 + - markdown-it-py==4.0.0 + - markupsafe==3.0.3 + - mdurl==0.1.2 + - numpy==2.4.2 + - oyaml==1.0 + - packaging==26.0 + - pandas==2.3.3 + - pararead==0.8.1 + - pephubclient==0.5.0 + - peppy==0.40.8 + - piper==0.14.5 + - pipestat==0.13.0 + - psutil==7.2.2 + - pydantic==2.12.5 + - pydantic-argparse==0.10.0 + - pydantic-core==2.41.5 + - pydantic-settings==2.13.0 + - pyfaidx==0.9.0.3 + - pygments==2.19.2 + - pysam==0.23.3 + - python-dateutil==2.9.0.post0 + - python-dotenv==1.2.1 + - pytz==2025.2 + - referencing==0.37.0 + - refgenconf==0.12.2 + - refgenie==0.12.1 + - requests==2.32.5 + - rich==14.3.2 + - rpds-py==0.30.0 + - shellingham==1.5.4 + - six==1.17.0 + - tqdm==4.67.3 + - typer==0.21.1 + - typing-extensions==4.15.0 + - typing-inspection==0.4.2 + - tzdata==2025.3 + - ubiquerg==0.8.2 + - urllib3==2.6.3 + - xopen==2.0.2 + - yacman==0.9.5 + - zlib-ng==1.0.0 diff --git a/requirements.txt b/requirements.txt index 2db2556..7e39002 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,14 +1,18 @@ +# Requires Python >=3.9,<3.14 (looper 2.x incompatible with Python 3.14+) +# WARNING: pypiper's PyPI package is "piper" (NOT "pypiper", which is unrelated) +setuptools>=68.0,<81 # provides pkg_resources; no longer bundled with Python 3.12+ attmap>=0.13.0 -divvy>=0.6.0 -eido>=0.1.3 cutadapt>=2.9 -looper>=1.3.1 +divvy>=0.6.0 +eido>=0.2.0 +looper>=2.0.0 numpy>=1.17 pandas>=1.3.4 pararead>=0.7.0 -peppy>=0.31.1 -piper>=0.12.3 -refgenconf>=0.7.0 -refgenie>=0.12.1 # Optional v0.9.12+ +peppy>=0.40.0 +pipestat>=0.6.0 +piper>=0.14.0 +refgenconf>=0.12.0 +refgenie>=0.12.1 ubiquerg>=0.6.1 -yacman>=0.6.7 \ No newline at end of file +yacman>=0.9.0 diff --git a/sample_pipeline_interface.yaml b/sample_pipeline_interface.yaml index 
01ffa61..d2baabe 100644 --- a/sample_pipeline_interface.yaml +++ b/sample_pipeline_interface.yaml @@ -1,59 +1,62 @@ pipeline_name: PEPPRO -pipeline_type: sample -path: pipelines/peppro.py input_schema: peppro_input_schema.yaml output_schema: peppro_output_schema.yaml -command_template: > - {pipeline.path} - --sample-name {sample.sample_name} - --genome {sample.genome} - --input {sample.read1} - --single-or-paired {sample.read_type} - -O {looper.results_subdir} - -P {compute.cores} - -M {compute.mem} - {% if sample.read2 is defined %} --input2 {sample.read2} {% endif %} - {% if sample.protocol is defined %} --protocol {sample.protocol} {% endif %} - {% if sample.adapter is defined %} --adapter-tool {sample.adapter} {% endif %} - {% if sample.dedup is defined %} --dedup-tool {sample.dedup} {% endif %} - {% if sample.trimmer is defined %} --trimmer-tool {sample.trimmer} {% endif %} - {% if sample.umi_len is defined %} --umi-len {sample.umi_len} {% endif %} - {% if sample.max_len is defined %} --max-len {sample.max_len} {% endif %} - {% if sample.sob is defined %} --sob {% endif %} - {% if sample.scale is defined %} --scale {% endif %} - {% if sample.genome_index is defined %} --genome-index { sample.genome_index } {% elif refgenie[sample.genome].bowtie2_index is defined %} --genome-index { refgenie[sample.genome].bowtie2_index.dir } {% endif %} - {% if sample.chrom_sizes is defined %} --chrom-sizes { sample.chrom_sizes } {% elif refgenie[sample.genome].fasta is defined %} --chrom-sizes { refgenie[sample.genome].fasta.chrom_sizes } {% endif %} - {% if sample.prealignment_index is defined %} --prealignment-index { sample.prealignment_index } {% endif %} - {% if sample.prealignment_names is defined %} --prealignment-index {% for p in sample.prealignment_names %} { p ~ '=' ~ refgenie[p].bowtie2_index.dir } {% endfor %} {% endif %} - {% if sample.TSS_name is defined %} --TSS-name { sample.TSS_name } {% elif refgenie[sample.genome].refgene_anno is defined %} --TSS-name { 
refgenie[sample.genome].refgene_anno.refgene_tss } {% endif %} - {% if sample.pi_tss is defined %} --pi-tss { sample.pi_tss } {% elif refgenie[sample.genome].ensembl_gtf is defined %} --pi-tss { refgenie[sample.genome].ensembl_gtf.ensembl_tss } {% endif %} - {% if sample.pi_body is defined %} --pi-body { sample.pi_body } {% elif refgenie[sample.genome].ensembl_gtf is defined %} --pi-body { refgenie[sample.genome].ensembl_gtf.ensembl_gene_body } {% endif %} - {% if sample.pre_name is defined %} --pre-name { sample.pre_name } {% elif refgenie[sample.genome].refgene_anno is defined %} --pre-name { refgenie[sample.genome].refgene_anno.refgene_pre_mRNA } {% endif %} - {% if sample.exon_name is defined %} --exon-name { sample.exon_name } {% elif refgenie[sample.genome].refgene_anno is defined %} --exon-name { refgenie[sample.genome].refgene_anno.refgene_exon } {% endif %} - {% if sample.intron_name is defined %} --intron-name { sample.intron_name } {% elif refgenie[sample.genome].refgene_anno is defined %} --intron-name { refgenie[sample.genome].refgene_anno.refgene_intron } {% endif %} - {% if sample.anno_name is defined %} --anno-name { sample.anno_name } {% elif refgenie[sample.genome].feat_annotation is defined %} --anno-name { refgenie[sample.genome].feat_annotation.feat_annotation } {% endif %} - {% if sample.sob is defined %} {% if refgenie[sample.genome].tallymer_index is defined %} --search-file { refgenie[sample.genome].tallymer_index.search_file } {% endif %} {% endif %} - {% if sample.sob is defined %} {% if refgenie[sample.genome].fasta is defined %} --fasta { refgenie[sample.genome].fasta.fasta } {% endif %} {% endif %} - {% if sample.fasta is defined %} --fasta { sample.fasta } {% elif refgenie[sample.genome].fasta is defined %} --fasta { refgenie[sample.genome].fasta.fasta } {% endif %} - {% if sample.search_file is defined %} --search-file { sample.search_file } {% elif refgenie[sample.genome].tallymer_index is defined %} --search-file { 
refgenie[sample.genome].tallymer_index.search_file } {% endif %} - {% if sample.coverage is defined %} --coverage {% endif %} - {% if sample.keep is defined %} --keep {% endif %} - {% if sample.keep_mito is defined %} --keep-mito {% endif %} - {% if sample.no_fifo is defined %} --noFIFO {% endif %} - {% if sample.complexity is defined %} --no-complexity {% endif %} - {% if sample.prioritize is defined %} --prioritize {% endif %} +sample_interface: + command_template: > + {looper.piface_dir}/pipelines/peppro.py + --sample-name {sample.sample_name} + --genome {sample.genome} + --input {sample.read1} + --single-or-paired {sample.read_type} + -O {looper.results_subdir} + -P {compute.cores} + -M {compute.mem} + {% if sample.read2 is defined %} --input2 {sample.read2} {% endif %} + --protocol {sample.protocol} + {% if sample.adapter is defined %} --adapter-tool {sample.adapter} {% endif %} + {% if sample.dedup is defined %} --dedup-tool {sample.dedup} {% endif %} + {% if sample.trimmer is defined %} --trimmer-tool {sample.trimmer} {% endif %} + {% if sample.umi_len is defined %} --umi-len {sample.umi_len} {% endif %} + {% if sample.max_len is defined %} --max-len {sample.max_len} {% endif %} + {% if sample.sob is defined %} --sob {% endif %} + {% if sample.scale is defined %} --scale {% endif %} + {% if sample.genome_index is defined %} --genome-index { sample.genome_index } {% elif refgenie[sample.genome].bowtie2_index is defined %} --genome-index { refgenie[sample.genome].bowtie2_index.dir } {% endif %} + {% if sample.chrom_sizes is defined %} --chrom-sizes { sample.chrom_sizes } {% elif refgenie[sample.genome].fasta is defined %} --chrom-sizes { refgenie[sample.genome].fasta.chrom_sizes } {% endif %} + {% if sample.prealignment_index is defined %} --prealignment-index { sample.prealignment_index } {% endif %} + {% if sample.prealignment_names is defined %} --prealignment-index {% for p in sample.prealignment_names %} { p ~ '=' ~ refgenie[p].bowtie2_index.dir } {% 
endfor %} {% endif %} + {% if sample.TSS_name is defined %} --TSS-name { sample.TSS_name } {% elif refgenie[sample.genome].refgene_anno is defined %} --TSS-name { refgenie[sample.genome].refgene_anno.refgene_tss } {% endif %} + {% if sample.pi_tss is defined %} --pi-tss { sample.pi_tss } {% elif refgenie[sample.genome].ensembl_gtf is defined %} --pi-tss { refgenie[sample.genome].ensembl_gtf.ensembl_tss } {% endif %} + {% if sample.pi_body is defined %} --pi-body { sample.pi_body } {% elif refgenie[sample.genome].ensembl_gtf is defined %} --pi-body { refgenie[sample.genome].ensembl_gtf.ensembl_gene_body } {% endif %} + {% if sample.pre_name is defined %} --pre-name { sample.pre_name } {% elif refgenie[sample.genome].refgene_anno is defined %} --pre-name { refgenie[sample.genome].refgene_anno.refgene_pre_mRNA } {% endif %} + {% if sample.exon_name is defined %} --exon-name { sample.exon_name } {% elif refgenie[sample.genome].refgene_anno is defined %} --exon-name { refgenie[sample.genome].refgene_anno.refgene_exon } {% endif %} + {% if sample.intron_name is defined %} --intron-name { sample.intron_name } {% elif refgenie[sample.genome].refgene_anno is defined %} --intron-name { refgenie[sample.genome].refgene_anno.refgene_intron } {% endif %} + {% if sample.anno_name is defined %} --anno-name { sample.anno_name } {% elif refgenie[sample.genome].feat_annotation is defined %} --anno-name { refgenie[sample.genome].feat_annotation.feat_annotation } {% endif %} + {% if sample.sob is defined %} {% if refgenie[sample.genome].tallymer_index is defined %} --search-file { refgenie[sample.genome].tallymer_index.search_file } {% endif %} {% endif %} + {% if sample.sob is defined %} {% if refgenie[sample.genome].fasta is defined %} --fasta { refgenie[sample.genome].fasta.fasta } {% endif %} {% endif %} + {% if sample.fasta is defined %} --fasta { sample.fasta } {% elif refgenie[sample.genome].fasta is defined %} --fasta { refgenie[sample.genome].fasta.fasta } {% endif %} + {% if 
sample.search_file is defined %} --search-file { sample.search_file } {% elif refgenie[sample.genome].tallymer_index is defined %} --search-file { refgenie[sample.genome].tallymer_index.search_file } {% endif %} + {% if sample.coverage is defined %} --coverage {% endif %} + {% if sample.keep is defined %} --keep {% endif %} + {% if sample.keep_mito is defined %} --keep-mito {% endif %} + {% if sample.no_fifo is defined %} --noFIFO {% endif %} + {% if sample.complexity is defined %} --no-complexity {% endif %} + {% if sample.no_bw is defined %} --no-bw {% endif %} + {% if sample.prioritize is defined %} --prioritize {% endif %} + {% if sample.config_file is defined %} -C {sample.config_file} {% endif %} + --pipestat-config {pipestat.config_file} compute: - singularity_image: ${SIMAGES}peppro - docker_image: databio/peppro - bulker_crate: databio/peppro:1.0.1 + conda_env: peppro + bulker_crate: databio/peppro:1.1.0 size_dependent_variables: resources.tsv + var_templates: refgenie_config: "$REFGENIE" + pre_submit: python_functions: - refgenconf.looper_refgenie_populate + bioconductor: readFunName: readPepproGeneCounts readFunPath: BiocProject/readPepproGeneCounts.R - diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 0000000..d91f42f --- /dev/null +++ b/tests/README.md @@ -0,0 +1,172 @@ +# PEPPRO Test Suite + +This directory contains the PEPPRO test suite, organized into two tiers: + +- **Unit tests** — fast, no genome data or external bioinformatics tools required; run on every push/PR via GitHub Actions +- **Integration tests** — full pipeline runs; require a self-hosted runner with genome indices and all tools installed + +--- + +## Directory Structure + +``` +tests/ +├── data/ # Small test FASTQ files (~3 MB total) +│ ├── test_R1.fastq.gz # SE reads (12,500 reads) +│ ├── test_R2.fastq.gz # PE reverse reads (rev-comp of R1) +│ └── test_R1_umi.fastq.gz # R1 with 8-nt UMI prefix for UMI tests +├── pep_configs/ # PEP project configs for each 
scenario +│ ├── se_basic.yaml / .csv +│ ├── pe_basic.yaml / .csv +│ └── ... +├── looper_configs/ # Looper run configs for each scenario +│ ├── .looper_se_basic.yaml +│ └── ... +├── scripts/ +│ └── generate_test_data.sh # Regenerate test FASTQ data from source +├── test_unit.py # Unit tests (no tools/genome needed) +├── test_integration.py # Integration tests (full pipeline runs) +└── README.md # This file +``` + +--- + +## Unit Tests + +Unit tests cover: + +- **Constants**: `RUNON_SOURCE`, `ADAPTER_REMOVERS`, `TRIMMERS`, `DEDUPLICATORS` values and defaults +- **PEP loading**: Each test config loads correctly with expected sample attributes +- **Schema validation**: eido validation passes for valid configs; regression tests ensure invalid inputs (e.g., integer `umi_len` in YAML `imply`, invalid `protocol`/`adapter`/`trimmer`/`dedup` enum values) fail correctly +- **Argument parsing**: All CLI flags parse correctly, defaults are correct, invalid choices raise `SystemExit` +- **Recovery paths**: Expected output file naming conventions are documented and verified + +### Running unit tests + +```bash +# Via pytest directly +pytest tests/test_unit.py -v + +# Via Makefile +make test-unit +``` + +No environment variables or external tools are needed. + +--- + +## Integration Tests + +Integration tests run the full PEPPRO pipeline for each scenario and verify: + +1. Pipeline exits with status `0` +2. Key output files exist (BAM, bigWig, stats.yaml) +3. `stats.yaml` contains the expected result keys +4. 
The `TestRecovery` class additionally tests checkpoint skipping and the `unmap_R1.fq` recovery regression + +### Prerequisites + +The integration tests require a machine with all PEPPRO dependencies installed and genome assets configured via refgenie: + +| Tool | Version tested | +|------|---------------| +| bowtie2 | ≥2.4 | +| samtools | ≥1.13 | +| bedtools | ≥2.30 | +| cutadapt | ≥4.0 | +| fastp | ≥0.23 | +| seqtk | ≥1.3 | +| fastx_toolkit | any | +| seqkit | ≥2.0 | +| fqdedup | any | +| fastq_pair | any | +| wigToBigWig | UCSC | +| bedGraphToBigWig | UCSC | + +**Genome assets** (via refgenie, pointed to by `$REFGENIE`): + +- `hg38/bowtie2_index` +- `human_rDNA/bowtie2_index` +- `hg38/fasta` (for chromosome sizes) +- `hg38/blacklist` (optional, for coverage tests) + +### Running integration tests + +**Important notes:** + +- The PyPI package for pypiper is **`piper`** (not `pypiper`, which is an unrelated package). +- Bioinformatics tools (samtools, bowtie2, etc.) are provided via bulker. The wrapper script handles this automatically, or you can use `bulker activate` / `bulker exec` directly. +- Tests run with `-p local` (divvy local compute package) so the pipeline executes inline rather than being submitted to a job scheduler. 
+ +```bash +# Recommended: use the wrapper script (runs pytest via bulker exec) +bash tests/scripts/test-integration.sh + +# Or manually: activate bulker, then run pytest +bulker activate databio/peppro:1.1.0 +RUN_INTEGRATION_TESTS=true pytest tests/test_integration.py -v +bulker deactivate + +# Run a specific scenario +bash tests/scripts/test-integration.sh -k se_basic + +# Via Makefile targets +make test-se # All SE scenarios +make test-pe # All PE scenarios +make test-recovery # Recovery regression tests +make test-integration # All integration tests +make test-all # Unit + integration + +# Run a single named scenario +make test-scenario SCENARIO=se_fastp + +# Keep output directories for debugging (default: cleaned up after each class) +KEEP_TEST_OUTPUTS=true RUN_INTEGRATION_TESTS=true pytest tests/test_integration.py -v -k se_basic +``` + +--- + +## Test Scenarios + +| Scenario | Read type | Protocol | Adapter | Trimmer | Dedup | Notes | +|----------|-----------|----------|---------|---------|-------|-------| +| `se_basic` | SE | PRO-seq | cutadapt | seqtk | — | Baseline SE run | +| `pe_basic` | PE | PRO-seq | cutadapt | seqtk | — | Baseline PE run | +| `se_groseq` | SE | GRO-seq | cutadapt | seqtk | — | GRO-seq protocol | +| `se_umi` | SE | PRO-seq | cutadapt | seqtk | seqkit | 8-nt UMI deduplication | +| `pe_umi` | PE | PRO-seq | cutadapt | seqtk | seqkit | PE with UMI dedup | +| `se_fastp` | SE | PRO-seq | fastp | seqtk | — | fastp adapter trimming | +| `se_fastx` | SE | PRO-seq | cutadapt | fastx | — | fastx_trimmer | +| `se_fqdedup` | SE | PRO-seq | cutadapt | seqtk | fqdedup | fqdedup UMI dedup | +| `se_scale` | SE | PRO-seq | cutadapt | seqtk | — | `--scale` flag | +| `se_no_complexity` | SE | PRO-seq | cutadapt | seqtk | — | `--no-complexity` flag | +| `se_nofifo` | SE | PRO-seq | cutadapt | seqtk | — | `--noFIFO` flag | +| `se_coverage` | SE | PRO-seq | cutadapt | seqtk | — | `--coverage` flag | + +--- + +## Test Data + +The files in `tests/data/` are
derived from `examples/data/test_r1.fq.gz` (the existing pipeline example read file). They are small enough to commit to the repository (~1 MB each). + +To regenerate the test data files (requires `seqtk`): + +```bash +make test-data +# or +bash tests/scripts/generate_test_data.sh +``` + +--- + +## GitHub Actions + +Unit tests run automatically on every push and pull request targeting `master` or `dev`, across Python 3.9, 3.11, and 3.12. + +Integration tests are triggered manually via **workflow_dispatch** on a self-hosted runner: + +1. Go to **Actions** → **Tests** → **Run workflow** +2. Set "Run integration tests" to `true` +3. Click **Run workflow** + +See `.github/workflows/tests.yml` for the full configuration. diff --git a/tests/bulker_manifest.yaml b/tests/bulker_manifest.yaml new file mode 100644 index 0000000..a3c86f5 --- /dev/null +++ b/tests/bulker_manifest.yaml @@ -0,0 +1,57 @@ +manifest: + name: peppro + version: 1.1.0 + imports: + - bulker/alpine:default + - bulker/coreutils:default + host_commands: + - python3 + - perl + - fqdedup + commands: + - command: bedtools + docker_args: "-i" + docker_image: quay.io/biocontainers/bedtools:2.31.1--h13024bc_3 + - command: bowtie2 + docker_image: quay.io/biocontainers/bowtie2:2.5.4--he96a11b_7 + - command: cutadapt + docker_image: quay.io/biocontainers/cutadapt:4.9--py312hf67a6ed_2 + - command: fastp + docker_image: quay.io/biocontainers/fastp:0.23.4--hadf994f_3 + - command: fastq_pair + docker_image: quay.io/biocontainers/fastq-pair:1.0--h503566f_6 + - command: fastqc + docker_image: quay.io/biocontainers/fastqc:0.12.1--hdfd78af_0 + - command: fastx_trimmer + docker_image: quay.io/biocontainers/fastx_toolkit:0.0.14--he1b5a44_8 + - command: fastx_reverse_complement + docker_image: quay.io/biocontainers/fastx_toolkit:0.0.14--he1b5a44_8 + - command: flash + docker_image: databio/flashz + - command: java + docker_image: openjdk + docker_args: "-i" + - command: bigWigCat + docker_image: 
quay.io/biocontainers/ucsc-bigwigcat:377--h0b8a92a_2 + - command: wigToBigWig + docker_image: quay.io/biocontainers/ucsc-wigtobigwig:377--h0b8a92a_2 + - command: picard + docker_image: quay.io/biocontainers/picard:2.27.5--hdfd78af_0 + - command: pigz + docker_image: nsheff/pigz + - command: preseq + docker_image: quay.io/biocontainers/preseq:2.0.3--hf53bd2b_3 + - command: R + docker_image: databio/rpipe:0.3.1 + docker_command: R + - command: Rscript + docker_image: databio/rpipe:0.3.1 + - command: samtools + docker_args: "-i" + docker_image: quay.io/biocontainers/samtools:1.23--h96c455f_0 + - command: seqkit + docker_image: quay.io/biocontainers/seqkit:2.8.2--h9ee0642_1 + - command: seqOutBias + docker_image: databio/seqoutbias:0.0.1 + - command: seqtk + docker_image: quay.io/biocontainers/seqtk:1.4--he4a0461_2 diff --git a/tests/data/test_R1.fastq.gz b/tests/data/test_R1.fastq.gz new file mode 100644 index 0000000..72c8389 Binary files /dev/null and b/tests/data/test_R1.fastq.gz differ diff --git a/tests/data/test_R1_umi.fastq.gz b/tests/data/test_R1_umi.fastq.gz new file mode 100644 index 0000000..378acb1 Binary files /dev/null and b/tests/data/test_R1_umi.fastq.gz differ diff --git a/tests/data/test_R2.fastq.gz b/tests/data/test_R2.fastq.gz new file mode 100644 index 0000000..7827767 Binary files /dev/null and b/tests/data/test_R2.fastq.gz differ diff --git a/tests/looper_configs/.looper_pe_basic.yaml b/tests/looper_configs/.looper_pe_basic.yaml new file mode 100644 index 0000000..4c32d12 --- /dev/null +++ b/tests/looper_configs/.looper_pe_basic.yaml @@ -0,0 +1,10 @@ +pep_config: "../pep_configs/pe_basic.yaml" + +output_dir: "${HOME}/peppro_test_pe_basic" + +pipeline_interfaces: + - "../../sample_pipeline_interface.yaml" + - "../../project_pipeline_interface.yaml" + +pipestat: + results_file_path: "${HOME}/peppro_test_pe_basic/results_pipeline/stats.yaml" diff --git a/tests/looper_configs/.looper_pe_umi.yaml b/tests/looper_configs/.looper_pe_umi.yaml new file 
mode 100644 index 0000000..38aaa66 --- /dev/null +++ b/tests/looper_configs/.looper_pe_umi.yaml @@ -0,0 +1,10 @@ +pep_config: "../pep_configs/pe_umi.yaml" + +output_dir: "${HOME}/peppro_test_pe_umi" + +pipeline_interfaces: + - "../../sample_pipeline_interface.yaml" + - "../../project_pipeline_interface.yaml" + +pipestat: + results_file_path: "${HOME}/peppro_test_pe_umi/results_pipeline/stats.yaml" diff --git a/tests/looper_configs/.looper_se_basic.yaml b/tests/looper_configs/.looper_se_basic.yaml new file mode 100644 index 0000000..66c9f73 --- /dev/null +++ b/tests/looper_configs/.looper_se_basic.yaml @@ -0,0 +1,10 @@ +pep_config: "../pep_configs/se_basic.yaml" + +output_dir: "${HOME}/peppro_test_se_basic" + +pipeline_interfaces: + - "../../sample_pipeline_interface.yaml" + - "../../project_pipeline_interface.yaml" + +pipestat: + results_file_path: "${HOME}/peppro_test_se_basic/results_pipeline/stats.yaml" diff --git a/tests/looper_configs/.looper_se_coverage.yaml b/tests/looper_configs/.looper_se_coverage.yaml new file mode 100644 index 0000000..3038c09 --- /dev/null +++ b/tests/looper_configs/.looper_se_coverage.yaml @@ -0,0 +1,10 @@ +pep_config: "../pep_configs/se_coverage.yaml" + +output_dir: "${HOME}/peppro_test_se_coverage" + +pipeline_interfaces: + - "../../sample_pipeline_interface.yaml" + - "../../project_pipeline_interface.yaml" + +pipestat: + results_file_path: "${HOME}/peppro_test_se_coverage/results_pipeline/stats.yaml" diff --git a/tests/looper_configs/.looper_se_fastp.yaml b/tests/looper_configs/.looper_se_fastp.yaml new file mode 100644 index 0000000..509b773 --- /dev/null +++ b/tests/looper_configs/.looper_se_fastp.yaml @@ -0,0 +1,10 @@ +pep_config: "../pep_configs/se_fastp.yaml" + +output_dir: "${HOME}/peppro_test_se_fastp" + +pipeline_interfaces: + - "../../sample_pipeline_interface.yaml" + - "../../project_pipeline_interface.yaml" + +pipestat: + results_file_path: "${HOME}/peppro_test_se_fastp/results_pipeline/stats.yaml" diff --git 
a/tests/looper_configs/.looper_se_fastx.yaml b/tests/looper_configs/.looper_se_fastx.yaml new file mode 100644 index 0000000..dd20240 --- /dev/null +++ b/tests/looper_configs/.looper_se_fastx.yaml @@ -0,0 +1,10 @@ +pep_config: "../pep_configs/se_fastx.yaml" + +output_dir: "${HOME}/peppro_test_se_fastx" + +pipeline_interfaces: + - "../../sample_pipeline_interface.yaml" + - "../../project_pipeline_interface.yaml" + +pipestat: + results_file_path: "${HOME}/peppro_test_se_fastx/results_pipeline/stats.yaml" diff --git a/tests/looper_configs/.looper_se_fqdedup.yaml b/tests/looper_configs/.looper_se_fqdedup.yaml new file mode 100644 index 0000000..8d975cb --- /dev/null +++ b/tests/looper_configs/.looper_se_fqdedup.yaml @@ -0,0 +1,10 @@ +pep_config: "../pep_configs/se_fqdedup.yaml" + +output_dir: "${HOME}/peppro_test_se_fqdedup" + +pipeline_interfaces: + - "../../sample_pipeline_interface.yaml" + - "../../project_pipeline_interface.yaml" + +pipestat: + results_file_path: "${HOME}/peppro_test_se_fqdedup/results_pipeline/stats.yaml" diff --git a/tests/looper_configs/.looper_se_groseq.yaml b/tests/looper_configs/.looper_se_groseq.yaml new file mode 100644 index 0000000..f89297c --- /dev/null +++ b/tests/looper_configs/.looper_se_groseq.yaml @@ -0,0 +1,10 @@ +pep_config: "../pep_configs/se_groseq.yaml" + +output_dir: "${HOME}/peppro_test_se_groseq" + +pipeline_interfaces: + - "../../sample_pipeline_interface.yaml" + - "../../project_pipeline_interface.yaml" + +pipestat: + results_file_path: "${HOME}/peppro_test_se_groseq/results_pipeline/stats.yaml" diff --git a/tests/looper_configs/.looper_se_no_complexity.yaml b/tests/looper_configs/.looper_se_no_complexity.yaml new file mode 100644 index 0000000..7813dde --- /dev/null +++ b/tests/looper_configs/.looper_se_no_complexity.yaml @@ -0,0 +1,10 @@ +pep_config: "../pep_configs/se_no_complexity.yaml" + +output_dir: "${HOME}/peppro_test_se_no_complexity" + +pipeline_interfaces: + - "../../sample_pipeline_interface.yaml" + - 
"../../project_pipeline_interface.yaml" + +pipestat: + results_file_path: "${HOME}/peppro_test_se_no_complexity/results_pipeline/stats.yaml" diff --git a/tests/looper_configs/.looper_se_nofifo.yaml b/tests/looper_configs/.looper_se_nofifo.yaml new file mode 100644 index 0000000..8c902aa --- /dev/null +++ b/tests/looper_configs/.looper_se_nofifo.yaml @@ -0,0 +1,10 @@ +pep_config: "../pep_configs/se_nofifo.yaml" + +output_dir: "${HOME}/peppro_test_se_nofifo" + +pipeline_interfaces: + - "../../sample_pipeline_interface.yaml" + - "../../project_pipeline_interface.yaml" + +pipestat: + results_file_path: "${HOME}/peppro_test_se_nofifo/results_pipeline/stats.yaml" diff --git a/tests/looper_configs/.looper_se_recovery.yaml b/tests/looper_configs/.looper_se_recovery.yaml new file mode 100644 index 0000000..26df4a3 --- /dev/null +++ b/tests/looper_configs/.looper_se_recovery.yaml @@ -0,0 +1,10 @@ +pep_config: "../pep_configs/se_basic.yaml" + +output_dir: "${HOME}/peppro_test_se_recovery" + +pipeline_interfaces: + - "../../sample_pipeline_interface.yaml" + - "../../project_pipeline_interface.yaml" + +pipestat: + results_file_path: "${HOME}/peppro_test_se_recovery/results_pipeline/stats.yaml" diff --git a/tests/looper_configs/.looper_se_scale.yaml b/tests/looper_configs/.looper_se_scale.yaml new file mode 100644 index 0000000..0936419 --- /dev/null +++ b/tests/looper_configs/.looper_se_scale.yaml @@ -0,0 +1,10 @@ +pep_config: "../pep_configs/se_scale.yaml" + +output_dir: "${HOME}/peppro_test_se_scale" + +pipeline_interfaces: + - "../../sample_pipeline_interface.yaml" + - "../../project_pipeline_interface.yaml" + +pipestat: + results_file_path: "${HOME}/peppro_test_se_scale/results_pipeline/stats.yaml" diff --git a/tests/looper_configs/.looper_se_umi.yaml b/tests/looper_configs/.looper_se_umi.yaml new file mode 100644 index 0000000..b7e6083 --- /dev/null +++ b/tests/looper_configs/.looper_se_umi.yaml @@ -0,0 +1,10 @@ +pep_config: "../pep_configs/se_umi.yaml" + +output_dir: 
"${HOME}/peppro_test_se_umi" + +pipeline_interfaces: + - "../../sample_pipeline_interface.yaml" + - "../../project_pipeline_interface.yaml" + +pipestat: + results_file_path: "${HOME}/peppro_test_se_umi/results_pipeline/stats.yaml" diff --git a/tests/pep_configs/pe_basic.csv b/tests/pep_configs/pe_basic.csv new file mode 100644 index 0000000..d7d96e6 --- /dev/null +++ b/tests/pep_configs/pe_basic.csv @@ -0,0 +1,2 @@ +sample_name,organism,protocol,read_type,read1,read2 +pe_basic,human,PROSEQ,paired,R1,R2 diff --git a/tests/pep_configs/pe_basic.yaml b/tests/pep_configs/pe_basic.yaml new file mode 100644 index 0000000..9e36c29 --- /dev/null +++ b/tests/pep_configs/pe_basic.yaml @@ -0,0 +1,18 @@ +# PE PRO-seq, cutadapt + seqtk, no UMI — baseline paired-end test +name: peppro_test_pe_basic +pep_version: 2.0.0 +sample_table: pe_basic.csv + +sample_modifiers: + derive: + attributes: [read1, read2] + sources: + R1: "tests/data/test_R1.fastq.gz" + R2: "tests/data/test_R2.fastq.gz" + imply: + - if: + organism: ["human"] + then: + genome: hg38 + prealignment_names: ["human_rDNA"] + no_bw: "true" diff --git a/tests/pep_configs/pe_umi.csv b/tests/pep_configs/pe_umi.csv new file mode 100644 index 0000000..5714435 --- /dev/null +++ b/tests/pep_configs/pe_umi.csv @@ -0,0 +1,2 @@ +sample_name,organism,protocol,read_type,read1,read2 +pe_umi,human,PROSEQ,paired,R1_umi,R2 diff --git a/tests/pep_configs/pe_umi.yaml b/tests/pep_configs/pe_umi.yaml new file mode 100644 index 0000000..3bd68ad --- /dev/null +++ b/tests/pep_configs/pe_umi.yaml @@ -0,0 +1,20 @@ +# PE PRO-seq with 8-nt UMI, dedup with seqkit +name: peppro_test_pe_umi +pep_version: 2.0.0 +sample_table: pe_umi.csv + +sample_modifiers: + derive: + attributes: [read1, read2] + sources: + R1_umi: "tests/data/test_R1_umi.fastq.gz" + R2: "tests/data/test_R2.fastq.gz" + append: + umi_len: "8" + imply: + - if: + organism: ["human"] + then: + genome: hg38 + prealignment_names: ["human_rDNA"] + no_bw: "true" diff --git 
a/tests/pep_configs/se_basic.csv b/tests/pep_configs/se_basic.csv new file mode 100644 index 0000000..309d16c --- /dev/null +++ b/tests/pep_configs/se_basic.csv @@ -0,0 +1,2 @@ +sample_name,organism,protocol,read_type,read1 +se_basic,human,PROSEQ,single,R1 diff --git a/tests/pep_configs/se_basic.yaml b/tests/pep_configs/se_basic.yaml new file mode 100644 index 0000000..9f5aa6c --- /dev/null +++ b/tests/pep_configs/se_basic.yaml @@ -0,0 +1,17 @@ +# SE PRO-seq, cutadapt + seqtk, no UMI — baseline single-end test +name: peppro_test_se_basic +pep_version: 2.0.0 +sample_table: se_basic.csv + +sample_modifiers: + derive: + attributes: [read1] + sources: + R1: "tests/data/test_R1.fastq.gz" + imply: + - if: + organism: ["human"] + then: + genome: hg38 + prealignment_names: ["human_rDNA"] + no_bw: "true" diff --git a/tests/pep_configs/se_coverage.csv b/tests/pep_configs/se_coverage.csv new file mode 100644 index 0000000..a6a1142 --- /dev/null +++ b/tests/pep_configs/se_coverage.csv @@ -0,0 +1,2 @@ +sample_name,organism,protocol,read_type,read1,coverage +se_coverage,human,PROSEQ,single,R1,true diff --git a/tests/pep_configs/se_coverage.yaml b/tests/pep_configs/se_coverage.yaml new file mode 100644 index 0000000..885b15d --- /dev/null +++ b/tests/pep_configs/se_coverage.yaml @@ -0,0 +1,17 @@ +# SE PRO-seq with --coverage (coverage-based library complexity) +name: peppro_test_se_coverage +pep_version: 2.0.0 +sample_table: se_coverage.csv + +sample_modifiers: + derive: + attributes: [read1] + sources: + R1: "tests/data/test_R1.fastq.gz" + imply: + - if: + organism: ["human"] + then: + genome: hg38 + prealignment_names: ["human_rDNA"] + no_bw: "true" diff --git a/tests/pep_configs/se_fastp.csv b/tests/pep_configs/se_fastp.csv new file mode 100644 index 0000000..a45c3be --- /dev/null +++ b/tests/pep_configs/se_fastp.csv @@ -0,0 +1,2 @@ +sample_name,organism,protocol,read_type,read1,adapter +se_fastp,human,PROSEQ,single,R1,fastp diff --git a/tests/pep_configs/se_fastp.yaml 
b/tests/pep_configs/se_fastp.yaml new file mode 100644 index 0000000..e196536 --- /dev/null +++ b/tests/pep_configs/se_fastp.yaml @@ -0,0 +1,17 @@ +# SE PRO-seq with fastp for adapter removal +name: peppro_test_se_fastp +pep_version: 2.0.0 +sample_table: se_fastp.csv + +sample_modifiers: + derive: + attributes: [read1] + sources: + R1: "tests/data/test_R1.fastq.gz" + imply: + - if: + organism: ["human"] + then: + genome: hg38 + prealignment_names: ["human_rDNA"] + no_bw: "true" diff --git a/tests/pep_configs/se_fastx.csv b/tests/pep_configs/se_fastx.csv new file mode 100644 index 0000000..ff13035 --- /dev/null +++ b/tests/pep_configs/se_fastx.csv @@ -0,0 +1,2 @@ +sample_name,organism,protocol,read_type,read1,trimmer +se_fastx,human,PROSEQ,single,R1,fastx diff --git a/tests/pep_configs/se_fastx.yaml b/tests/pep_configs/se_fastx.yaml new file mode 100644 index 0000000..9cd5697 --- /dev/null +++ b/tests/pep_configs/se_fastx.yaml @@ -0,0 +1,17 @@ +# SE PRO-seq with fastx trimmer +name: peppro_test_se_fastx +pep_version: 2.0.0 +sample_table: se_fastx.csv + +sample_modifiers: + derive: + attributes: [read1] + sources: + R1: "tests/data/test_R1.fastq.gz" + imply: + - if: + organism: ["human"] + then: + genome: hg38 + prealignment_names: ["human_rDNA"] + no_bw: "true" diff --git a/tests/pep_configs/se_fqdedup.csv b/tests/pep_configs/se_fqdedup.csv new file mode 100644 index 0000000..25be628 --- /dev/null +++ b/tests/pep_configs/se_fqdedup.csv @@ -0,0 +1,2 @@ +sample_name,organism,protocol,read_type,read1,dedup +se_fqdedup,human,PROSEQ,single,R1_umi,fqdedup diff --git a/tests/pep_configs/se_fqdedup.yaml b/tests/pep_configs/se_fqdedup.yaml new file mode 100644 index 0000000..d906181 --- /dev/null +++ b/tests/pep_configs/se_fqdedup.yaml @@ -0,0 +1,19 @@ +# SE PRO-seq with 8-nt UMI, dedup with fqdedup +name: peppro_test_se_fqdedup +pep_version: 2.0.0 +sample_table: se_fqdedup.csv + +sample_modifiers: + derive: + attributes: [read1] + sources: + R1_umi: 
"tests/data/test_R1_umi.fastq.gz" + append: + umi_len: "8" + imply: + - if: + organism: ["human"] + then: + genome: hg38 + prealignment_names: ["human_rDNA"] + no_bw: "true" diff --git a/tests/pep_configs/se_groseq.csv b/tests/pep_configs/se_groseq.csv new file mode 100644 index 0000000..d24a75b --- /dev/null +++ b/tests/pep_configs/se_groseq.csv @@ -0,0 +1,2 @@ +sample_name,organism,protocol,read_type,read1 +se_groseq,human,GROSEQ,single,R1 diff --git a/tests/pep_configs/se_groseq.yaml b/tests/pep_configs/se_groseq.yaml new file mode 100644 index 0000000..95ee731 --- /dev/null +++ b/tests/pep_configs/se_groseq.yaml @@ -0,0 +1,17 @@ +# SE GRO-seq protocol +name: peppro_test_se_groseq +pep_version: 2.0.0 +sample_table: se_groseq.csv + +sample_modifiers: + derive: + attributes: [read1] + sources: + R1: "tests/data/test_R1.fastq.gz" + imply: + - if: + organism: ["human"] + then: + genome: hg38 + prealignment_names: ["human_rDNA"] + no_bw: "true" diff --git a/tests/pep_configs/se_no_complexity.csv b/tests/pep_configs/se_no_complexity.csv new file mode 100644 index 0000000..be4e851 --- /dev/null +++ b/tests/pep_configs/se_no_complexity.csv @@ -0,0 +1,2 @@ +sample_name,organism,protocol,read_type,read1,complexity +se_no_complexity,human,PROSEQ,single,R1,true diff --git a/tests/pep_configs/se_no_complexity.yaml b/tests/pep_configs/se_no_complexity.yaml new file mode 100644 index 0000000..b833a58 --- /dev/null +++ b/tests/pep_configs/se_no_complexity.yaml @@ -0,0 +1,17 @@ +# SE PRO-seq with --no-complexity (skip library complexity calculation) +name: peppro_test_se_no_complexity +pep_version: 2.0.0 +sample_table: se_no_complexity.csv + +sample_modifiers: + derive: + attributes: [read1] + sources: + R1: "tests/data/test_R1.fastq.gz" + imply: + - if: + organism: ["human"] + then: + genome: hg38 + prealignment_names: ["human_rDNA"] + no_bw: "true" diff --git a/tests/pep_configs/se_nofifo.csv b/tests/pep_configs/se_nofifo.csv new file mode 100644 index 0000000..469dd28 --- 
/dev/null +++ b/tests/pep_configs/se_nofifo.csv @@ -0,0 +1,2 @@ +sample_name,organism,protocol,read_type,read1,no_fifo +se_nofifo,human,PROSEQ,single,R1,true diff --git a/tests/pep_configs/se_nofifo.yaml b/tests/pep_configs/se_nofifo.yaml new file mode 100644 index 0000000..314d83d --- /dev/null +++ b/tests/pep_configs/se_nofifo.yaml @@ -0,0 +1,17 @@ +# SE PRO-seq with --noFIFO (disable named pipes during prealignments) +name: peppro_test_se_nofifo +pep_version: 2.0.0 +sample_table: se_nofifo.csv + +sample_modifiers: + derive: + attributes: [read1] + sources: + R1: "tests/data/test_R1.fastq.gz" + imply: + - if: + organism: ["human"] + then: + genome: hg38 + prealignment_names: ["human_rDNA"] + no_bw: "true" diff --git a/tests/pep_configs/se_scale.csv b/tests/pep_configs/se_scale.csv new file mode 100644 index 0000000..eccac23 --- /dev/null +++ b/tests/pep_configs/se_scale.csv @@ -0,0 +1,2 @@ +sample_name,organism,protocol,read_type,read1,scale +se_scale,human,PROSEQ,single,R1,true diff --git a/tests/pep_configs/se_scale.yaml b/tests/pep_configs/se_scale.yaml new file mode 100644 index 0000000..08cdcd2 --- /dev/null +++ b/tests/pep_configs/se_scale.yaml @@ -0,0 +1,17 @@ +# SE PRO-seq with --scale flag (scaled bigWig output) +name: peppro_test_se_scale +pep_version: 2.0.0 +sample_table: se_scale.csv + +sample_modifiers: + derive: + attributes: [read1] + sources: + R1: "tests/data/test_R1.fastq.gz" + imply: + - if: + organism: ["human"] + then: + genome: hg38 + prealignment_names: ["human_rDNA"] + no_bw: "true" diff --git a/tests/pep_configs/se_umi.csv b/tests/pep_configs/se_umi.csv new file mode 100644 index 0000000..c723292 --- /dev/null +++ b/tests/pep_configs/se_umi.csv @@ -0,0 +1,2 @@ +sample_name,organism,protocol,read_type,read1 +se_umi,human,PROSEQ,single,R1_umi diff --git a/tests/pep_configs/se_umi.yaml b/tests/pep_configs/se_umi.yaml new file mode 100644 index 0000000..e2a729f --- /dev/null +++ b/tests/pep_configs/se_umi.yaml @@ -0,0 +1,19 @@ +# SE PRO-seq 
with 8-nt UMI, dedup with seqkit +name: peppro_test_se_umi +pep_version: 2.0.0 +sample_table: se_umi.csv + +sample_modifiers: + derive: + attributes: [read1] + sources: + R1_umi: "tests/data/test_R1_umi.fastq.gz" + append: + umi_len: "8" + imply: + - if: + organism: ["human"] + then: + genome: hg38 + prealignment_names: ["human_rDNA"] + no_bw: "true" diff --git a/tests/scripts/generate_test_data.sh b/tests/scripts/generate_test_data.sh new file mode 100644 index 0000000..d8abeb9 --- /dev/null +++ b/tests/scripts/generate_test_data.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +# Regenerate small test FASTQ files from the existing SE test read. +# Requires: seqtk, awk, gzip (or run generate_test_data.py for a Python-only version) +# Run from the repository root: bash tests/scripts/generate_test_data.sh + +set -euo pipefail + +SRC="examples/data/test_r1.fq.gz" +OUTDIR="tests/data" + +echo "Generating test data from ${SRC} ..." + +# R1 copy (rename convention to fastq.gz) +cp "${SRC}" "${OUTDIR}/test_R1.fastq.gz" + +# R2: reverse complement of R1 +seqtk seq -r "${SRC}" | gzip > "${OUTDIR}/test_R2.fastq.gz" + +# UMI R1: prepend 8-nt UMI (ACGTACGT / IIIIIIII) to every read +zcat "${SRC}" | awk ' + NR%4 == 1 { print; next } + NR%4 == 2 { print "ACGTACGT" $0; next } + NR%4 == 3 { print; next } + NR%4 == 0 { print "IIIIIIII" $0 } +' | gzip > "${OUTDIR}/test_R1_umi.fastq.gz" + +echo "Done. Files written to ${OUTDIR}/" +ls -lh "${OUTDIR}/" diff --git a/tests/scripts/test-integration.sh b/tests/scripts/test-integration.sh new file mode 100755 index 0000000..de6afac --- /dev/null +++ b/tests/scripts/test-integration.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# Integration Test Runner for PEPPRO +# Uses bulker exec to run pytest with bioinformatics tools available +# via containerized commands. 
+# +# Usage: +# ./tests/scripts/test-integration.sh # Run all integration tests +# ./tests/scripts/test-integration.sh -k "test_se" # Run specific tests +# ./tests/scripts/test-integration.sh --keep-test-outputs # Preserve outputs + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" +TESTS_DIR="$SCRIPT_DIR/.." + +BULKER_CRATE="${PEPPRO_TEST_BULKER_CRATE:-databio/peppro:1.1.0}" + +# Colors for output +GREEN='\033[0;32m' +RED='\033[0;31m' +NC='\033[0m' + +echo -e "${GREEN}=== PEPPRO Integration Tests ===${NC}" +echo -e "\n${GREEN}Running integration tests via bulker exec ${BULKER_CRATE}...${NC}" +echo "" + +cd "$PROJECT_ROOT" + +set +e +RUN_INTEGRATION_TESTS=true bulker exec "${BULKER_CRATE}" -- \ + python3 -m pytest "$TESTS_DIR/test_integration.py" -v "$@" +PYTEST_EXIT=$? +set -e + +if [ $PYTEST_EXIT -eq 0 ]; then + echo -e "\n${GREEN}Integration tests completed successfully!${NC}" +else + echo -e "\n${RED}Integration tests failed (exit code: ${PYTEST_EXIT})${NC}" +fi +exit $PYTEST_EXIT diff --git a/tests/test_integration.py b/tests/test_integration.py new file mode 100644 index 0000000..0be7aed --- /dev/null +++ b/tests/test_integration.py @@ -0,0 +1,432 @@ +""" +Integration tests for PEPPRO — runs the full pipeline for each test scenario. + +IMPORTANT: Use the wrapper script, which activates bulker for bioinformatics tools: + + bash tests/scripts/test-integration.sh + +Or manually activate bulker first, then run pytest: + + bulker activate databio/peppro:1.1.0 + RUN_INTEGRATION_TESTS=true pytest tests/test_integration.py -v + bulker deactivate + +NOTE: The pypiper PyPI package is "piper" (NOT "pypiper", which is unrelated). + +Prerequisites: + - $REFGENIE set to a refgenie config with hg38 and human_rDNA assets + - bulker crate databio/peppro:1.1.0 (provides samtools, bowtie2, etc.) 
+ - RUN_INTEGRATION_TESTS=true environment variable set + +To run a single scenario: + bash tests/scripts/test-integration.sh -k se_basic + +Keep output directories for debugging: + bash tests/scripts/test-integration.sh --keep-test-outputs +""" + +import glob +import os +import shutil +import subprocess +import yaml +import pytest + +# --------------------------------------------------------------------------- +# Gate: skip all integration tests unless explicitly enabled +# --------------------------------------------------------------------------- + +INTEGRATION_ENABLED = os.environ.get("RUN_INTEGRATION_TESTS", "").lower() in ( + "1", "true", "yes" +) +KEEP_TEST_OUTPUTS = os.environ.get("KEEP_TEST_OUTPUTS", "").lower() in ( + "1", "true", "yes" +) + +pytestmark = pytest.mark.skipif( + not INTEGRATION_ENABLED, + reason="Set RUN_INTEGRATION_TESTS=true to run integration tests. " + "Use: bash tests/scripts/test-integration.sh (activates bulker for bio tools)", +) + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +LOOPER_CFG_DIR = os.path.join(REPO_ROOT, "tests", "looper_configs") + +CORE_STATS = [ + "Raw_reads", + "Trimmed_reads_R1", + "Aligned_reads", + "Alignment_rate", + "Mapped_reads", + "NRF", + "PBC1", + "PBC2", +] + + +def run_looper(looper_cfg, recover=False): + """ + Run looper for a given looper config using the local compute package + so the pipeline executes inline (not submitted to a job scheduler). + Returns a CompletedProcess instance. + """ + cmd = ["looper", "run", "-c", looper_cfg, "-p", "local"] + if recover: + # looper 2.1+ removed --recover; pass pypiper's -R flag via --command-extra. + # Use --command-extra=-R (not -x -R) so argparse doesn't mistake -R for a flag. 
+ cmd.append("--command-extra=-R") + return subprocess.run( + cmd, + capture_output=True, + text=True, + cwd=REPO_ROOT, + ) + + +def assert_pipeline_succeeded(result, sample_dir): + """Assert looper exited 0 and the pipeline actually completed.""" + assert result.returncode == 0, ( + f"looper exited {result.returncode}\n" + f"STDOUT:\n{result.stdout[-3000:]}\n" + f"STDERR:\n{result.stderr[-3000:]}" + ) + + # Check that the pipeline log exists and does not indicate failure + log = os.path.join(sample_dir, "PEPPRO_log.md") + assert os.path.exists(log), ( + f"Pipeline log not found at {log}\n" + f"STDOUT:\n{result.stdout[-3000:]}\n" + f"STDERR:\n{result.stderr[-3000:]}" + ) + log_content = open(log).read() + assert "Pipeline failed" not in log_content, ( + f"Pipeline log indicates failure:\n{log_content[-2000:]}" + ) + + # Check that looper/pipestat did not report errors in stdout/stderr + combined_output = result.stdout + result.stderr + assert "Traceback (most recent call last)" not in combined_output, ( + f"Traceback found in looper output:\n{combined_output[-3000:]}" + ) + + +def load_stats(output_dir, sample_name): + """Load pipestat stats.yaml for a completed sample run. + + pipestat writes results to a single flat file: + /stats.yaml + with structure: PEPPRO.sample.. + Returns the flat metrics dict for the sample. 
+ """ + path = os.path.join(output_dir, "stats.yaml") + if not os.path.exists(path): + return {} + with open(path) as f: + data = yaml.safe_load(f) or {} + # Navigate PEPPRO -> sample -> + try: + return data["PEPPRO"]["sample"][sample_name] + except (KeyError, TypeError): + return data + + +def assert_stats_keys(stats, expected_keys): + """Assert that all expected metric keys are present in stats.yaml.""" + missing = [k for k in expected_keys if k not in stats] + assert not missing, f"Missing stats keys: {missing}" + + +def scenario_output_dir(scenario): + """Return the output_dir for a scenario (matches looper config's output_dir).""" + return os.path.expandvars(f"$HOME/peppro_test_{scenario}/results_pipeline") + + +# =========================================================================== +# Base class: runs looper once per test class, shared by all test methods +# =========================================================================== + +class PepproIntegrationBase: + """ + Base class for single-scenario integration tests. + + setup_class runs looper once; all test methods in the class share the + resulting output directory and CompletedProcess (self.result, self.sample_dir). + teardown_class removes the output tree unless KEEP_TEST_OUTPUTS=true. + """ + + SCENARIO = None # override in subclass, e.g. "se_basic" + SAMPLE = None # sample_name in the PEP CSV, e.g. "se_basic" + + @classmethod + def setup_class(cls): + cls.output_dir = scenario_output_dir(cls.SCENARIO) + cls.sample_dir = os.path.join(cls.output_dir, cls.SAMPLE) + # Clean any stale output from previous runs, then create fresh. 
+ parent = os.path.dirname(cls.output_dir) + if os.path.exists(parent): + shutil.rmtree(parent) + os.makedirs(cls.output_dir, exist_ok=True) + cfg = os.path.join(LOOPER_CFG_DIR, f".looper_{cls.SCENARIO}.yaml") + cls.result = run_looper(cfg) + + @classmethod + def teardown_class(cls): + if not KEEP_TEST_OUTPUTS: + # Remove ${HOME}/peppro_test_ 0) { + opt_pos <- pos[1] + break + } + } + + # Option not found + if (is.na(opt_pos)) { + if (required) { + stop(paste0("Required option not found: ", + paste(flags, collapse = " or "))) + } + return(default) + } + + # Get the value(s) following the option + if (is.null(n) || is.na(n) || n == 1) { + # Check if there's a next argument + if (opt_pos + 1 > length(args)) { + # No next argument - if default is boolean, treat as flag + if (is.logical(default)) { + return(TRUE) + } + if (required) { + stop(paste0("Option ", args[opt_pos], " requires a value")) + } + return(default) + } + + next_arg <- args[opt_pos + 1] + + # If next arg is another flag, treat this as a boolean flag + if (grepl("^-", next_arg)) { + if (is.logical(default)) { + return(TRUE) + } + if (required) { + stop(paste0("Option ", args[opt_pos], " requires a value")) + } + return(default) + } + + return(next_arg) + } else { + # Multiple values (n > 1) + values <- character(0) + pos <- opt_pos + 1 + count <- 0 + + while (pos <= length(args) && count < n) { + # Stop if we hit another option flag + if (grepl("^-", args[pos])) { + break + } + values <- c(values, args[pos]) + pos <- pos + 1 + count <- count + 1 + } + + if (length(values) == 0) { + if (required) { + stop(paste0("Option ", args[opt_pos], " requires value(s)")) + } + return(default) + } + return(values) + } +} + +############################################################################### ##### Load dependencies ##### required_libraries <- c("PEPPROr") @@ -61,17 +196,17 @@ if (is.na(subcmd) || grepl("/R", subcmd)) { "Command: preseq \t plot preseq complexity curves\n\n", " -i, --input\t\t Input files 
generated by preseq.\n", " -c, --coverage\t\t Use coverage on axes instead of read counts. ", - "Enter number of base pairs of reference.\n", + "Enter number of base pairs of reference.\n", " -l, --read_length\t Sequence read length, for use in coverage ", - "calculations.\n", + "calculations.\n", " -r, --real_counts\t File name for file with three columns - ", - "preseq filename, total number reads, number", - " of unique reads (unique optional, ", - "whitespace delimited)\n", + "preseq filename, total number reads, number", + " of unique reads (unique optional, ", + "whitespace delimited)\n", " -u, --ignore_unique\t Ignore any information about unique read ", - "counts found in --real_counts file.\n", + "counts found in --real_counts file.\n", " -o, --output_name\t Output name (.png/.pdf will be automatically ", - "added). Default: 'complexity_curves'.\n", + "added). Default: 'complexity_curves'.\n", " -m, --x_min\t\t Lower x-limit (default 0).\n", " -x, --x_max\t\t Upper x-limit (default 500 million).\n" ) diff --git a/tools/PEPPRO_summarizer.R b/tools/PEPPRO_summarizer.R index 587c8d8..acee0b3 100644 --- a/tools/PEPPRO_summarizer.R +++ b/tools/PEPPRO_summarizer.R @@ -86,7 +86,7 @@ pep <- argv$config # Load the project prj <- invisible(suppressWarnings(pepr::Project(pep))) # Convenience -project_name <- config(prj)$name +project_name <- pepr::config(prj)$name # Set the output directory summary_dir <- suppressMessages(file.path(argv$output, "summary")) diff --git a/tools/bamSitesToWig.py b/tools/bamSitesToWig.py index 358f929..c1dd0cf 100755 --- a/tools/bamSitesToWig.py +++ b/tools/bamSitesToWig.py @@ -3,7 +3,7 @@ __author__ = ["Nathan C. 
Sheffield", "Jason Smith"] __credits__ = [] __license__ = "BSD2" -__version__ = "0.3.1" +__version__ = "0.4.0" __email__ = "nathan@code.databio.org" from argparse import ArgumentParser @@ -102,41 +102,26 @@ def __call__(self, chrom): cutsToWig = os.path.join(os.path.dirname(__file__), "cutsToWig.pl") - cmd1 = ("sort -n | perl " + cutsToWig + " " + str(chrom_size) + - " " + str(self.variable_step) + " " + str(self.scale)) - cmd2 = ("wigToBigWig -clip -fixedSummaries -keepAllChromosomes stdin " + - self.chrom_sizes_file + " " + chromOutFileBw) - _LOGGER.debug(" cutsToWigProcess: " + cmd1) - _LOGGER.debug(" wigToBigWigProcess: " + cmd2) - if self.exactbw: + tmpWigFile = chromOutFile + "_exact.wig" + cmd1 = ("sort -n | perl " + cutsToWig + " " + str(chrom_size) + + " " + str(self.variable_step) + " " + str(self.scale) + + " > " + tmpWigFile) cutsToWigProcess = subprocess.Popen(cmd1, shell=True, - stdin=subprocess.PIPE, stdout=subprocess.PIPE) - wigToBigWigProcess = subprocess.Popen( - ['wigToBigWig', '-clip', '-fixedSummaries', - '-keepAllChromosomes', 'stdin', - self.chrom_sizes_file, chromOutFileBw], - stdin=cutsToWigProcess.stdout) + stdin=subprocess.PIPE) if self.smoothbw: cutsToWigSm = os.path.join(os.path.dirname(__file__), "smoothWig.pl") chromOutFileBwSm = chromOutFile + "_smooth.bw" tmpFile = chromOutFile + "_cuts.txt" + tmpWigFileSm = chromOutFile + "_smooth.wig" cmd1 = ("sort -n | tee " + tmpFile + " | perl " + cutsToWigSm + " " + str(chrom_size) + " " + str(self.smooth_length) + " " + str(self.step_size) + " " + str(self.variable_step) + - " " + str(self.scale)) - cmd2 = ("wigToBigWig -clip -fixedSummaries " + - "-keepAllChromosomes stdin " + self.chrom_sizes_file + - " " + chromOutFileBwSm) + " " + str(self.scale) + " > " + tmpWigFileSm) cutsToWigProcessSm = subprocess.Popen(cmd1, shell=True, - stdin=subprocess.PIPE, stdout=subprocess.PIPE) - wigToBigWigProcessSm = subprocess.Popen( - ['wigToBigWig', '-clip', '-fixedSummaries', - '-keepAllChromosomes', 
'stdin', - self.chrom_sizes_file, chromOutFileBwSm], - stdin=cutsToWigProcessSm.stdout) + stdin=subprocess.PIPE) if self.bedout: chromOutFileBed = chromOutFile + ".bed" @@ -218,10 +203,10 @@ def get_shifted_pos(read, shift_factor): shifted_pos = get_shifted_pos(read, shift_factor) if self.exactbw and shifted_pos: - cutsToWigProcess.stdin.write((str(shifted_pos) + "\n").encode('utf-8')) + cutsToWigProcess.stdin.write((str(shifted_pos + 1) + "\n").encode('utf-8')) if self.smoothbw and shifted_pos: - cutsToWigProcessSm.stdin.write((str(shifted_pos) + "\n").encode('utf-8')) + cutsToWigProcessSm.stdin.write((str(shifted_pos + 1) + "\n").encode('utf-8')) if self.bedout and shifted_pos: strand = "-" if read.is_reverse else "+" @@ -240,18 +225,26 @@ def get_shifted_pos(read, shift_factor): # Clean up processes if self.exactbw: cutsToWigProcess.stdin.close() - _LOGGER.debug("Encoding exact bigwig for " + chrom + - " (last read position:" + str(read.pos) + ")...") - wigToBigWigProcess.communicate() + cutsToWigProcess.wait() + _LOGGER.debug("Encoding exact bigwig for " + chrom + "...") + subprocess.call(['wigToBigWig', '-clip', '-fixedSummaries', + '-keepAllChromosomes', tmpWigFile, + self.chrom_sizes_file, chromOutFileBw]) + if os.path.exists(tmpWigFile): + os.remove(tmpWigFile) if self.bedout: bedOut.close() if self.smoothbw: cutsToWigProcessSm.stdin.close() - _LOGGER.debug("Encoding smooth bigwig for " + chrom + - " (last read position:" + str(read.pos) + ")...") - wigToBigWigProcessSm.communicate() + cutsToWigProcessSm.wait() + _LOGGER.debug("Encoding smooth bigwig for " + chrom + "...") + subprocess.call(['wigToBigWig', '-clip', '-fixedSummaries', + '-keepAllChromosomes', tmpWigFileSm, + self.chrom_sizes_file, chromOutFileBwSm]) + if os.path.exists(tmpWigFileSm): + os.remove(tmpWigFileSm) except StopIteration as e: print("StopIteration error for chrom ", chrom, ": ", e) diff --git a/tools/smoothWig.pl b/tools/smoothWig.pl index f04d7e3..48b9eec 100755 --- 
a/tools/smoothWig.pl +++ b/tools/smoothWig.pl @@ -1,6 +1,14 @@ #! /usr/bin/env perl # By Nathan Sheffield, University of Virginia, 2018 +# Version 2.0.0 - 2026-02-17 +# +# Changes in v2.0.0: +# - Fix off-by-one errors in smoothing window boundaries +# - Fix end site calculation to use original cut position (not clamped start) +# - Count duplicate cuts at the same position (previously skipped) +# - Default scale to 1 to prevent division by zero +# - Handle empty input gracefully # This is an incredibly fast Perl utility that converts cut sites # (coordinates) into a wiggle-like output. @@ -29,7 +37,7 @@ my $smoothSize = shift; # Smooth size is 2nd argument my $stepSize = shift; # Step size my $variableStep = shift; # Fourth argument is whether to use variable or fixed -my $scale = shift; # Fifth argument is scaling factor +my $scale = shift || 1; # Fifth argument is scaling factor (default 1 to avoid division by zero) $countIndex = 1; $currentCount = 0; @@ -54,27 +62,28 @@ my @closers; $cutSite = <>; # Grab the first cut +exit 0 if !defined($cutSite); # No cuts, nothing to do $cutSite -= $smoothSize; -$endSite = $cutSite + $smoothSize*2; +$endSite = $cutSite + $smoothSize*2 + 1; +$currentCount++; if ($variableStep) { - # Cycle to the first cut - while ($countIndex < $cutSite) { - $countIndex += $stepSize; + # Set countIndex to the first print position (multiple of stepSize) at or after cutSite + # This avoids the cycling bug where we overshoot valid positions + if ($cutSite > 0) { + # Calculate: ceiling of cutSite to nearest stepSize + $countIndex = int(($cutSite + $stepSize - 1) / $stepSize) * $stepSize; } $previousCut = $cutSite; # Loop through cuts, converting to wiggle format while($cutSite = <>) { $cutSite -= $smoothSize; - $currentCount++; - push @closers, $cutSite + $smoothSize*2; chomp($cutSite); - # if it's a duplicate read... 
- if ($cutSite == $previousCut) { - next; # skip to next read - } + # Duplicate reads at the same position are counted (not skipped) + # Each read represents a separate biological observation + # Print positions up to this cut with the OLD count while ($countIndex < $cutSite) { while ($endSite == $countIndex) { $currentCount--; @@ -88,6 +97,14 @@ $countIndex++; } + # THEN increment for the new cut + $currentCount++; + my $newEndSite = $cutSite + $smoothSize*2 + 1; + if (!defined($endSite)) { + $endSite = $newEndSite; + } else { + push @closers, $newEndSite; + } $previousCut = $cutSite; } # end while @@ -105,24 +122,28 @@ } } else { # Use fixedStep wiggle format # Print out 0s until the first cut - while ($countIndex < $cutSite) { + # Calculate the first print position at or after cutSite + my $firstPrintPos = int(($cutSite + $stepSize - 1) / $stepSize) * $stepSize; + while ($countIndex < $firstPrintPos) { print "0\n"; - $countIndex += $stepSize; + $countIndex += $stepSize; } $previousCut = $cutSite; # Loop through cuts, converting to wiggle format while($cutSite = <>) { $cutSite -= $smoothSize; - $currentCount++; - push @closers, $cutSite + $smoothSize*2; chomp($cutSite); - # if it's a duplicate read... 
+ # Handle duplicate cuts at the same position if ($cutSite == $previousCut) { - next; # skip to next read + # Increment count and add closer, but don't print (we're at same position) + $currentCount++; + my $newEndSite = $cutSite + $smoothSize*2 + 1; + push @closers, $newEndSite; + next; } - # and print out all 0s between them + # Print positions up to this cut with the OLD count while ($countIndex < $cutSite) { # print ":".$countIndex.":".$endSite.":"; while ($endSite == $countIndex) { @@ -136,6 +157,14 @@ $countIndex++; } + # THEN increment for the new cut + $currentCount++; + my $newEndSite = $cutSite + $smoothSize*2 + 1; + if (!defined($endSite)) { + $endSite = $newEndSite; + } else { + push @closers, $newEndSite; + } $previousCut = $cutSite; } # end while diff --git a/usage.txt b/usage.txt index b049ed7..fe35cef 100644 --- a/usage.txt +++ b/usage.txt @@ -1,9 +1,11 @@ +/project/gomezlab/code/.conda/envs/peppro/lib/python3.13/site-packages/refgenconf/refgenconf.py:24: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81. + from pkg_resources import iter_entry_points usage: peppro.py [-h] [-R] [-N] [-D] [-F] [-T] [--silent] [--verbosity V] [--logdev] [-C CONFIG_FILE] -O PARENT_OUTPUT_FOLDER - [-M MEMORY_LIMIT] [-P NUMBER_OF_CORES] -S SAMPLE_NAME -I - INPUT_FILES [INPUT_FILES ...] - [-I2 [INPUT_FILES2 [INPUT_FILES2 ...]]] -G GENOME_ASSEMBLY - [-Q SINGLE_OR_PAIRED] + [-M MEMORY_LIMIT] [-P NUMBER_OF_CORES] + [--pipeline-name PIPELINE_NAME] -S SAMPLE_NAME + -I INPUT_FILES [INPUT_FILES ...] 
[-I2 [INPUT_FILES2 ...]] + -G GENOME_ASSEMBLY [-Q SINGLE_OR_PAIRED] [--protocol {PRO,pro,PRO-SEQ,PRO-seq,proseq,PROSEQ,GRO,gro,groseq,GROSEQ,GRO-SEQ,GRO-seq}] [--adapter-tool {cutadapt,fastp}] [--dedup-tool {seqkit,fqdedup}] @@ -11,17 +13,17 @@ usage: peppro.py [-h] [-R] [-N] [-D] [-F] [-T] [--silent] [--verbosity V] [--max-len MAX_LEN] [--sob] [--scale] [--prealignment-names PREALIGNMENT_NAMES [PREALIGNMENT_NAMES ...]] [--prealignment-index PREALIGNMENT_INDEX [PREALIGNMENT_INDEX ...]] - --genome-index GENOME_INDEX [--fasta FASTA] --chrom-sizes - CHROM_SIZES [--TSS-name TSS_NAME] [--pi-tss PI_TSS] - [--pi-body PI_BODY] [--pre-name PRE_NAME] + --genome-index GENOME_INDEX [--fasta FASTA] + --chrom-sizes CHROM_SIZES [--TSS-name TSS_NAME] + [--pi-tss PI_TSS] [--pi-body PI_BODY] [--pre-name PRE_NAME] [--anno-name ANNO_NAME] [--exon-name EXON_NAME] [--intron-name INTRON_NAME] [--search-file SEARCH_FILE] [--coverage] [--keep] [--keep-mito] [--noFIFO] [--no-complexity] [--prioritize] [-V] -PEPPRO version 0.10.2 +PEPPRO version 0.12.0 -optional arguments: +options: -h, --help show this help message and exit -R, --recover Overwrite locks to recover from previous failed run -N, --new-start Overwrite all results to start a fresh run @@ -31,18 +33,20 @@ optional arguments: --silent Silence logging. Overrides verbosity. --verbosity V Set logging level (1-5 or logging module level name) --logdev Expand content of logging message format. - -C CONFIG_FILE, --config CONFIG_FILE + -C, --config CONFIG_FILE Pipeline configuration file (YAML). Relative paths are with respect to the pipeline script. - -M MEMORY_LIMIT, --mem MEMORY_LIMIT + -M, --mem MEMORY_LIMIT Memory limit for processes accepting such. Default units are megabytes unless specified using the suffix [K|M|G|T]. 
- -P NUMBER_OF_CORES, --cores NUMBER_OF_CORES + -P, --cores NUMBER_OF_CORES Number of cores for parallelized processes - -I2 [INPUT_FILES2 [INPUT_FILES2 ...]], --input2 [INPUT_FILES2 [INPUT_FILES2 ...]] + --pipeline-name PIPELINE_NAME + Name of the pipeline + -I2, --input2 [INPUT_FILES2 ...] Secondary input files, such as read2 - -Q SINGLE_OR_PAIRED, --single-or-paired SINGLE_OR_PAIRED + -Q, --single-or-paired SINGLE_OR_PAIRED Single- or paired-end sequencing protocol --protocol {PRO,pro,PRO-SEQ,PRO-seq,proseq,PROSEQ,GRO,gro,groseq,GROSEQ,GRO-SEQ,GRO-seq} Run on sequencing type. @@ -101,11 +105,11 @@ optional arguments: -V, --version show program's version number and exit required named arguments: - -O PARENT_OUTPUT_FOLDER, --output-parent PARENT_OUTPUT_FOLDER + -O, --output-parent PARENT_OUTPUT_FOLDER Parent output directory of project - -S SAMPLE_NAME, --sample-name SAMPLE_NAME + -S, --sample-name SAMPLE_NAME Name for sample to run - -I INPUT_FILES [INPUT_FILES ...], --input INPUT_FILES [INPUT_FILES ...] + -I, --input INPUT_FILES [INPUT_FILES ...] One or more primary input files - -G GENOME_ASSEMBLY, --genome GENOME_ASSEMBLY + -G, --genome GENOME_ASSEMBLY Identifier for genome assembly