Add submit scripts for some biobakery tools
npavlovikj committed Nov 30, 2022
1 parent f3362c5 commit 0940935
Showing 10 changed files with 3,340 additions and 0 deletions.
7 changes: 7 additions & 0 deletions biobakery/README.md
@@ -0,0 +1,7 @@
# Using bioBakery tools on HCC Clusters

These scripts give examples of running common [bioBakery tools](https://github.com/biobakery/biobakery) on our clusters, such as:

- [KneadData](https://github.com/biobakery/kneaddata)
- [MetaPhlAn](https://github.com/biobakery/MetaPhlAn/wiki/MetaPhlAn-4)
- [HUMAnN](https://github.com/biobakery/humann)
- [MaAsLin2](https://huttenhower.sph.harvard.edu/maaslin/)
1,596 changes: 1,596 additions & 0 deletions biobakery/data/HMP2_metadata.tsv
1,596 changes: 1,596 additions & 0 deletions biobakery/data/HMP2_taxonomy.tsv
15 changes: 15 additions & 0 deletions biobakery/data/README.md
@@ -0,0 +1,15 @@
The test `demo1` dataset is downloaded from: https://github.com/biobakery/biobakery_workflows/tree/master/examples/wmgx/paired
with:
```
wget https://github.com/biobakery/biobakery_workflows/raw/master/examples/wmgx/paired/demo1.R1.fastq.gz
wget https://github.com/biobakery/biobakery_workflows/raw/master/examples/wmgx/paired/demo1.R2.fastq.gz
```
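After downloading, a quick sanity check is to confirm that both mates of a pair hold the same number of reads (a FASTQ record is always 4 lines). The sketch below builds a tiny made-up pair (`demo.R1.fastq`/`demo.R2.fastq` are illustrative, not the real files):

```shell
# Build a tiny illustrative paired FASTQ set (2 reads per mate)
printf '@r1\nACGT\n+\nIIII\n@r2\nTTGA\n+\nIIII\n' > demo.R1.fastq
printf '@r1\nTGCA\n+\nIIII\n@r2\nTCAA\n+\nIIII\n' > demo.R2.fastq

# A FASTQ record is 4 lines, so reads = lines / 4
r1_reads=$(( $(wc -l < demo.R1.fastq) / 4 ))
r2_reads=$(( $(wc -l < demo.R2.fastq) / 4 ))
echo "R1: $r1_reads reads, R2: $r2_reads reads"
```

For the real gzipped inputs, count lines with `zcat demo1.R1.fastq.gz | wc -l` instead of `wc -l <`.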



The test input files for MaAsLin2 are downloaded from: https://github.com/biobakery/Maaslin2/tree/master/inst/extdata
with:
```
wget https://github.com/biobakery/Maaslin2/raw/master/inst/extdata/HMP2_metadata.tsv
wget https://github.com/biobakery/Maaslin2/raw/master/inst/extdata/HMP2_taxonomy.tsv
```
Binary file added biobakery/data/demo1.R1.fastq.gz
Binary file added biobakery/data/demo1.R2.fastq.gz
48 changes: 48 additions & 0 deletions biobakery/humann.submit
@@ -0,0 +1,48 @@
#!/bin/bash
#SBATCH --mem=20gb
#SBATCH --time=168:00:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8
#SBATCH --job-name=humann
#SBATCH --error=humann.%J.err
#SBATCH --output=humann.%J.out
#SBATCH --partition=batch

module purge
module load humann/3.6
module load biodata/1.0

# The CHOCOPhlAn and UniRef databases are already downloaded on our clusters
# These databases can be accessed with the variable $HUMANN2 once the biodata module is loaded

# Run HUMAnN

# Here, input paired-end reads are the output reads generated with KneadData
# Paired-end reads need to be concatenated before running HUMAnN:
cat ./kneaddata_output_demo1/demo1.R1_kneaddata_paired_1.fastq ./kneaddata_output_demo1/demo1.R1_kneaddata_paired_2.fastq >> ./kneaddata_output_demo1/demo1_pair_1_2_cat.fastq

# To save computational time when working with multiple samples,
# only one sample needs to be run with the taxonomic profile generated by MetaPhlAn;
# the remaining samples can then be run against the bowtie2-indexed custom ChocoPhlAn
# database generated during that first run.
# More information on this can be found here: https://github.com/biobakery/humann#joint-taxonomic-profile

# Run HUMAnN on one sample
# Make sure the "Run HUMAnN on the remaining samples" section below is commented out when running this
humann -i ./kneaddata_output_demo1/demo1_pair_1_2_cat.fastq \
--input-format fastq --threads 8 \
--taxonomic-profile all_metaphlan_output.txt \
-o demo1_humann_output

# Run HUMAnN on the remaining samples
# Make sure the "Run HUMAnN on one sample" section above is commented out when running this
humann -i ./kneaddata_output_demo2/demo2_pair_1_2_cat.fastq \
--input-format fastq --threads 8 \
--nucleotide-database ./demo1_humann_output/demo1_pair_1_2_cat_humann_temp/ \
--bypass-nucleotide-index \
-o demo2_humann_output

# If needed, output files from multiple samples can be merged with:
# humann_join_tables -i *_humann_output/ --file_name "merged_genefamilies.tsv" --output all_merged_genefamilies.tsv
# humann_join_tables -i *_humann_output/ --file_name "merged_pathabundance.tsv" --output all_merged_pathabundance.tsv
# humann_join_tables -i *_humann_output/ --file_name "merged_pathcoverage.tsv" --output all_merged_pathcoverage.tsv
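`humann_join_tables` merges per-sample tables on their shared feature column. The idea can be sketched with coreutils `join` on two tiny made-up gene-family tables (feature IDs and abundances here are illustrative):

```shell
# Two per-sample gene-family tables sharing a feature-ID column
printf '# Gene Family\tdemo1_Abundance\nUniRef90_A\t10\nUniRef90_B\t5\n' > demo1_gf.tsv
printf '# Gene Family\tdemo2_Abundance\nUniRef90_A\t2\nUniRef90_C\t7\n'  > demo2_gf.tsv

# Sort on the feature column (headers skipped), then join on shared feature IDs
join -t "$(printf '\t')" \
    <(tail -n +2 demo1_gf.tsv | sort) \
    <(tail -n +2 demo2_gf.tsv | sort) > joined_gf.tsv
cat joined_gf.tsv
```

Note this sketch keeps only features present in both samples; the real `humann_join_tables` keeps every feature and fills in zeros for missing samples.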
24 changes: 24 additions & 0 deletions biobakery/kneaddata.submit
@@ -0,0 +1,24 @@
#!/bin/bash
#SBATCH --mem=20gb
#SBATCH --time=168:00:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8
#SBATCH --job-name=kneaddata
#SBATCH --error=kneaddata.%J.err
#SBATCH --output=kneaddata.%J.out
#SBATCH --partition=batch

module purge
module load kneaddata/0.12

# If needed, download the human contaminant database (or any other reference database) as a one-time setup step
mkdir -p kneaddata_db
kneaddata_database --download human_genome bowtie2 ./kneaddata_db

# Run KneadData
# Input paired-end reads are in ./data in these examples
# Contaminant database is in ./kneaddata_db in these examples
# Please modify these paths according to your data location
kneaddata --input1 ./data/demo1.R1.fastq.gz --input2 ./data/demo1.R2.fastq.gz \
-db ./kneaddata_db/hg37dec_v0.1 --output kneaddata_output_demo1 -t 8 -p 8 \
--trimmomatic=/util/opt/anaconda/deployed-conda-envs/packages/kneaddata/envs/kneaddata-0.12.0/share/trimmomatic-0.39-2/
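The command above handles one sample. A loop sketch like the following derives the per-sample commands from the `*.R1.fastq.gz`/`*.R2.fastq.gz` naming scheme (file names here are illustrative, and the `kneaddata` invocation is only collected as text, not run):

```shell
# Create an illustrative data directory with two paired samples
mkdir -p demo_data
touch demo_data/demo1.R1.fastq.gz demo_data/demo1.R2.fastq.gz
touch demo_data/demo2.R1.fastq.gz demo_data/demo2.R2.fastq.gz

# Derive one KneadData command per R1/R2 pair
cmds=""
for r1 in demo_data/*.R1.fastq.gz; do
    sample=$(basename "$r1" .R1.fastq.gz)      # demo1, demo2, ...
    r2="demo_data/${sample}.R2.fastq.gz"
    cmds="${cmds}kneaddata --input1 $r1 --input2 $r2 --output kneaddata_output_${sample}\n"
done
printf "%b" "$cmds"
```

On the cluster, the loop body would run the full `kneaddata` command from the script above (with `-db`, `-t`, `-p`, and `--trimmomatic`) instead of collecting it into a string.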
24 changes: 24 additions & 0 deletions biobakery/maaslin2.submit
@@ -0,0 +1,24 @@
#!/bin/bash
#SBATCH --mem=20gb
#SBATCH --time=168:00:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8
#SBATCH --job-name=maaslin2
#SBATCH --error=maaslin2.%J.err
#SBATCH --output=maaslin2.%J.out
#SBATCH --partition=batch

module purge
module load maaslin2/0.3

# Running the demo example from https://github.com/biobakery/Maaslin2#run-a-demo
# MaAsLin2 uses metadata and output from HUMAnN as input
# If needed, please normalize your HUMAnN output using humann_renorm_table

# Input .tsv files are in ./data in these examples
# Please modify these paths according to your data location
# Please modify the arguments, as well as --fixed_effects and --random_effects, based on your data
Maaslin2.R --fixed_effects="diagnosis,dysbiosis" \
--random_effects="site,subject" \
--standardize=FALSE \
./data/HMP2_taxonomy.tsv ./data/HMP2_metadata.tsv test_maaslin2_output
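Both inputs are TSV files with samples as rows, and the sample IDs in the feature (taxonomy) table must match those in the metadata table. A sketch with tiny made-up tables (the column names echo the `--fixed_effects` above; the values are invented):

```shell
# Minimal illustrative MaAsLin2-style inputs with matching sample IDs
printf 'ID\tdiagnosis\tdysbiosis\nSample1\tCD\tFALSE\nSample2\tUC\tTRUE\n' > meta_demo.tsv
printf 'ID\tTaxonA\tTaxonB\nSample1\t0.12\t0.03\nSample2\t0.40\t0.00\n'   > taxa_demo.tsv

# Sanity check: count sample IDs present in both tables (headers skipped)
shared=$(comm -12 <(tail -n +2 taxa_demo.tsv | cut -f1 | sort) \
                  <(tail -n +2 meta_demo.tsv | cut -f1 | sort) | wc -l)
echo "samples shared by both inputs: $shared"
```

If the shared count is lower than the number of samples in the feature table, MaAsLin2 will drop the unmatched samples.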
30 changes: 30 additions & 0 deletions biobakery/metaphlan.submit
@@ -0,0 +1,30 @@
#!/bin/bash
#SBATCH --mem=20gb
#SBATCH --time=168:00:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8
#SBATCH --job-name=metaphlan
#SBATCH --error=metaphlan.%J.err
#SBATCH --output=metaphlan.%J.out
#SBATCH --partition=batch

module purge
module load metaphlan/4.0

# The indexed CHOCOPhlAnSGB database is already downloaded on our clusters
# This database can be accessed with the variable $METAPHLAN_BOWTIE2_DB once the MetaPhlAn module is loaded

# Run MetaPhlAn
# Here, input paired-end reads are the output reads generated with KneadData
# Please modify these paths according to your data location
metaphlan ./kneaddata_output_demo1/demo1.R1_kneaddata_paired_1.fastq,./kneaddata_output_demo1/demo1.R1_kneaddata_paired_2.fastq \
--input_type fastq --nproc 8 \
--bowtie2out demo1_metagenome_bowtie2.bz2 \
-s demo1_metagenome_sam.bz2 \
-o demo1_profiled_metagenome_marker_counts.txt

# Zipped *_sam.bz2 files can be unzipped with:
# bzip2 -d demo1_metagenome_sam.bz2

# If needed, output files from multiple samples can be merged with:
# merge_metaphlan_tables.py *_profiled_metagenome_marker_counts.txt > all_metaphlan_output.txt
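MetaPhlAn clade names carry rank prefixes (`k__`, `p__`, ..., `s__` for species, `t__` for strain-level SGBs), so species-level rows can be pulled out of a profile with `grep`. A sketch on a tiny made-up profile (the species name is invented for illustration):

```shell
# Tiny illustrative MetaPhlAn-style profile: clade name <tab> relative abundance
printf '%s\n' \
  $'k__Bacteria\t100.0' \
  $'k__Bacteria|p__Firmicutes\t60.0' \
  $'k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Streptococcaceae|g__Streptococcus|s__Streptococcus_demo\t12.5' \
  > demo_profile.txt

# Keep species-level rows only: contain s__ but not the strain marker t__
grep "s__" demo_profile.txt | grep -v "t__" > demo_species.txt
cat demo_species.txt
```

The same filter works on the real `*_profiled_metagenome_marker_counts.txt` output or on the merged table from `merge_metaphlan_tables.py`.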
