Commit

Use pathlib (#31)
* Use pathlib in Snakefile

* Add logdir config param. Getting tired because Snakemake doesn't support Path objects as input or log files.

* Use pathlib for all paths. Add version printout to Snakefile

* Add info about pathlib use

* Add details on branching structure to CONTRIBUTING.md

* Bump docs version
boulund authored Apr 30, 2018
1 parent 1e2828f commit 56e700d
Showing 14 changed files with 299 additions and 239 deletions.
48 changes: 38 additions & 10 deletions CONTRIBUTING.md
@@ -5,18 +5,41 @@ We use the issue tracker in Github. Submit issues for things such as
bug reports, feature requests, or general improvement discussion topics.

## Submitting changes
The typical procedure to develop new features or fix bugs in StaG-mwc looks
something like this:
The master branch of StaG-mwc should always be stable and reliable. All
development is based on the develop branch: please create new feature
branches from develop. The develop branch is merged into the master branch
when enough improvements have accrued. The typical procedure to develop new
features or fix bugs in StaG-mwc looks something like this:

1. Fork or clone the repository.
2. Create a branch with a descriptive name based on your intended changes using
dashes to separate words, e.g. `branch-to-add-megahit-assembly-step`.
3. Insert your code into the respective folders, i.e. scripts, rules and envs.
Define the entry point of the workflow in the Snakefile and the main
configuration in the config.yaml file.
2. Checkout the develop branch and create a new feature branch from there.
Use a descriptive name and use dashes to separate words:
```
git checkout develop
git checkout -b add-megahit-assembly-step
```
3. Write or modify code in the scripts, rules and envs folders. Define the
entry point of the workflow in the Snakefile and the main configuration in the
config.yaml file.
4. If a new feature has been added, document it in the Sphinx documentation.
4. Commit changes to your fork/clone.
5. Create a pull request (PR) with some motivation behind the work you have
done and possibly some explanations for tricky bits.
5. Create a pull request (PR) with a description of the work you have done
and explanations for any potentially tricky bits.
6. When the feature is considered complete, we bump the version number and
merge the PR back into the develop branch.

### Releases
New releases are made whenever enough new features have accrued on the develop
branch. Before creating a new release, ensure the following things have been
taken care of:

* All pending features that should be included in the upcoming release are
merged into the develop branch.
* Double-check that documentation is up-to-date for implemented features.
* Check that the version number in the documentation matches the Snakefile.

Then, merge the develop branch into master, squashing all commits, and tag
the new release.


## Code organization
@@ -76,7 +99,12 @@ designed to allow some inclusion logic in the main Snakefile, so components can
be turned on or off without too much trouble. Output should typically be in a
subfolder inside the overall `outdir` folder. `outdir` is available as a string
in all rule files, as it is defined in the main Snakefile based on the value
set in `config.yaml`.
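
As an illustration, the inclusion logic typically looks something like the
sketch below (a sketch only, assuming the `antibiotic_resistance` flag and
rule file used elsewhere in this commit):

```
# Only pull in a component's rules if it is enabled in config.yaml
if config["antibiotic_resistance"]:
    include: "rules/antibiotic_resistance/megares.smk"
```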

Declare paths to input, output and log files using the pathlib Path objects
INPUTDIR, OUTDIR, and LOGDIR. Note that Snakemake is not yet fully
pathlib-compatible, so convert Path objects to strings inside `expand`
statements and in log file declarations.
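
For example, a minimal rule sketch following this convention might look like
the following (the rule name and file names are hypothetical):

```
# expand() needs str(); input/output accept Path objects directly
example_outputs = expand(str(OUTDIR/"example/{sample}.counts.txt"),
                         sample=SAMPLES)
all_outputs.extend(example_outputs)

rule count_reads_example:
    """Illustrative only: count lines in each input file."""
    input:
        INPUTDIR/"{sample}_R1.fastq.gz"
    output:
        OUTDIR/"example/{sample}.counts.txt"
    log:
        str(LOGDIR/"example/{sample}.log")
    shell:
        """
        zcat {input} 2> {log} | wc -l > {output}
        """
```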

Tools that require databases or other reference material to work can be
confusing or annoying to users of the workflow. To minimize the amount of
27 changes: 19 additions & 8 deletions Snakefile
@@ -1,21 +1,32 @@
# vim: syntax=python expandtab
#
# StaG
# mwc - Metagenomic Workflow Collaboration
# StaG Metagenomic Workflow Collaboration
# StaG-mwc
# Copyright (c) 2018 Authors
#
# Running snakemake -n in a clone of this repository should successfully
# execute a test dry-run of the workflow.
# Running snakemake --use-conda -n in a clone of this repository should
# successfully execute a test dry run of the workflow.
from pathlib import Path

from snakemake.exceptions import WorkflowError
from snakemake.utils import min_version
min_version("4.8.1") # TODO: Bump version when Snakemake is pathlib compatible

from sys import exit
import os.path
stag_version = "0.1.1-dev"
print("="*60)
print("StaG Metagenomic Workflow Collaboration".center(60))
print("StaG-mwc".center(60))
print(stag_version.center(60))
print("="*60)

configfile: "config.yaml"
outdir = config["outdir"]
INPUTDIR = Path(config["inputdir"])
OUTDIR = Path(config["outdir"])
LOGDIR = Path(config["logdir"])
DBDIR = Path(config["dbdir"])
all_outputs = []

SAMPLES = set(glob_wildcards(config["inputdir"]+"/"+config["input_fn_pattern"]).sample)
SAMPLES = set(glob_wildcards(INPUTDIR/config["input_fn_pattern"]).sample)

#############################
# Pre-processing
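
For illustration, the new `SAMPLES` assignment in the Snakefile above works
roughly as follows (the sample file names are hypothetical):

```
# With inputdir: "input" and input_fn_pattern: "{sample}_R{readpair}.fastq.gz",
# INPUTDIR/config["input_fn_pattern"] is the Path
# input/{sample}_R{readpair}.fastq.gz. Files such as
# input/sampleA_R1.fastq.gz and input/sampleB_R1.fastq.gz then yield
# SAMPLES == {"sampleA", "sampleB"}.
SAMPLES = set(glob_wildcards(INPUTDIR/config["input_fn_pattern"]).sample)
```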
1 change: 1 addition & 0 deletions config.yaml
@@ -15,6 +15,7 @@
inputdir: "input"
input_fn_pattern: "{sample}_R{readpair}.fastq.gz"
outdir: "output_dir"
logdir: "output_dir/logs"
dbdir: "databases" # Databases will be downloaded to this dir, if requested


4 changes: 2 additions & 2 deletions docs/source/conf.py
@@ -56,9 +56,9 @@
# built documents.
#
# The short X.Y version.
version = '0.1.0'
version = '0.1.1'
# The full version, including alpha/beta/rc tags.
release = '0.1.0-dev'
release = '0.1.1-dev'

# reStructuredText prolog contains a string of reStructuredText that will be
# included at the beginning of every source file that is read.
66 changes: 37 additions & 29 deletions rules/antibiotic_resistance/megares.smk
@@ -1,84 +1,92 @@
# Generic rules for detection of antibiotic resistance genes using MEGARes
# TODO: Remove superfluous str conversions when Snakemake is pathlib compatible.
from pathlib import Path
from snakemake.exceptions import WorkflowError
import os.path

localrules:
download_megares

if not os.path.isdir(os.path.join(config["megares"]["db_path"], "ref")):
err_message = "No MEGARes database found at: '{}'!\n".format(config["megares"]["db_path"])
megares_db_path = Path(config["megares"]["db_path"])
if not Path(megares_db_path/"ref").exists():
err_message = "No MEGARes database found at: '{}'!\n".format(megares_db_path)
err_message += "Specify the DB path in the megares section of config.yaml.\n"
err_message += "Run 'snakemake create_megares_index' to download and build a BBMap index in '{dbdir}/megares'\n".format(dbdir=config["dbdir"])
err_message += "Run 'snakemake create_megares_index' to download and build a BBMap index in '{dbdir}'\n".format(dbdir=DBDIR/"megares")
err_message += "If you do not want to map reads against MEGARes for antibiotic resistance gene detection, set antibiotic_resistance: False in config.yaml"
raise WorkflowError(err_message)

megares_outputs = expand("{outdir}/megares/{sample}.{output_type}",
outdir=outdir,
megares_outputs = expand(str(OUTDIR/"megares/{sample}.{output_type}"),
sample=SAMPLES,
output_type=("sam.gz", "mapped_reads.fq.gz", "mhist.txt", "covstats.txt", "rpkm.txt"))
all_outputs.extend(megares_outputs)

rule download_megares:
"""Download MEGARes database."""
output:
config["dbdir"]+"/megares/megares_annotations_v1.01.csv",
config["dbdir"]+"/megares/megares_database_v1.01.fasta",
config["dbdir"]+"/megares/megares_to_external_header_mappings_v1.01.tsv",
DBDIR/"megares/megares_annotations_v1.01.csv",
DBDIR/"megares/megares_database_v1.01.fasta",
DBDIR/"megares/megares_to_external_header_mappings_v1.01.tsv",
log:
str(LOGDIR/"megares/megares.download.log")
shadow:
"shallow"
params:
dbdir=config["dbdir"]+"/megares"
dbdir=DBDIR/"megares"
shell:
"""
cd {params.dbdir}
wget http://megares.meglab.org/download/megares_v1.01.zip \
> {log} \
&& \
unzip megares_v1.01.zip \
>> {log} \
&& \
mv megares_v1.01/* . \
&& \
rm -rfv megares_v1.01 megares_v1.01.zip
rm -rfv megares_v1.01 megares_v1.01.zip \
>> {log}
"""


rule create_megares_index:
"""Create BBMap index for MEGARes."""
input:
fasta=config["dbdir"]+"/megares/megares_database_v1.01.fasta"
fasta=DBDIR/"megares/megares_database_v1.01.fasta"
output:
config["dbdir"]+"/megares/ref/genome/1/chr1.chrom.gz",
config["dbdir"]+"/megares/ref/genome/1/info.txt",
config["dbdir"]+"/megares/ref/genome/1/scaffolds.txt.gz",
config["dbdir"]+"/megares/ref/genome/1/summary.txt",
config["dbdir"]+"/megares/ref/index/1/chr1_index_k13_c8_b1.block",
config["dbdir"]+"/megares/ref/index/1/chr1_index_k13_c8_b1.block2.gz",
DBDIR/"megares/ref/genome/1/chr1.chrom.gz",
DBDIR/"megares/ref/genome/1/info.txt",
DBDIR/"megares/ref/genome/1/scaffolds.txt.gz",
DBDIR/"megares/ref/genome/1/summary.txt",
DBDIR/"megares/ref/index/1/chr1_index_k13_c8_b1.block",
DBDIR/"megares/ref/index/1/chr1_index_k13_c8_b1.block2.gz",
log:
str(LOGDIR/"megares/megares.bbmap_index.log")
shadow:
"shallow"
conda:
"../../envs/stag-mwc.yaml"
params:
dbdir=config["dbdir"]+"/megares"
dbdir=DBDIR/"megares"
shell:
"""
bbmap.sh ref={input} path={params.dbdir}
bbmap.sh ref={input} path={params.dbdir} > {log}
"""


megares_config = config["megares"]
rule bbmap_to_megares:
"""BBMap to MEGARes."""
input:
read1=config["outdir"]+"/filtered_human/{sample}_R1.filtered_human.fq.gz",
read2=config["outdir"]+"/filtered_human/{sample}_R2.filtered_human.fq.gz",
read1=OUTDIR/"filtered_human/{sample}_R1.filtered_human.fq.gz",
read2=OUTDIR/"filtered_human/{sample}_R2.filtered_human.fq.gz",
output:
sam=config["outdir"]+"/megares/{sample}.sam.gz",
mapped_reads=config["outdir"]+"/megares/{sample}.mapped_reads.fq.gz",
covstats=config["outdir"]+"/megares/{sample}.covstats.txt",
rpkm=config["outdir"]+"/megares/{sample}.rpkm.txt",
mhist=config["outdir"]+"/megares/{sample}.mhist.txt",
sam=OUTDIR/"megares/{sample}.sam.gz",
mapped_reads=OUTDIR/"megares/{sample}.mapped_reads.fq.gz",
covstats=OUTDIR/"megares/{sample}.covstats.txt",
rpkm=OUTDIR/"megares/{sample}.rpkm.txt",
mhist=OUTDIR/"megares/{sample}.mhist.txt",
log:
stdout=config["outdir"]+"/logs/megares/{sample}.bbmap.stdout.log",
stderr=config["outdir"]+"/logs/megares/{sample}.bbmap.statsfile.txt"
stdout=str(LOGDIR/"megares/{sample}.bbmap.stdout.log"),
stderr=str(LOGDIR/"megares/{sample}.bbmap.statsfile.txt"),
shadow:
"shallow"
conda:
54 changes: 27 additions & 27 deletions rules/mappers/bbmap.smk
@@ -1,63 +1,63 @@
# Rules for generic read mapping using BBMap
# TODO: Remove superfluous str conversions when Snakemake is pathlib compatible.
from pathlib import Path

from snakemake.exceptions import WorkflowError
import os.path

localrules:
bbmap_counts_table
bbmap_featureCounts

if not os.path.isdir(os.path.join(config["bbmap"]["db_path"], "ref")):
err_message = "BBMap index not found at: '{}'\n".format(config["bbmap"]["db_path"])
db_path = Path(config["bbmap"]["db_path"])
if not Path(db_path/"ref").exists():
err_message = "BBMap index not found at: '{}'\n".format(db_path)
err_message += "Check path in config setting 'bbmap:db_path'.\n"
err_message += "If you want to skip mapping with BBMap, set mappers:bbmap:False in config.yaml."
raise WorkflowError(err_message)

# Add final output files from this module to 'all_outputs' from the main
# Snakefile scope. SAMPLES is also from the main Snakefile scope.
bbmap_alignments = expand("{outdir}/bbmap/{db_name}/{sample}.{output_type}",
outdir=config["outdir"],
bbmap_alignments = expand(str(OUTDIR/"bbmap/{db_name}/{sample}.{output_type}"),
db_name=config["bbmap"]["db_name"],
sample=SAMPLES,
output_type=("sam.gz", "covstats.txt", "rpkm.txt"))
counts_table = expand("{outdir}/bbmap/{db_name}/all_samples.counts_table.tab",
outdir=config["outdir"],
counts_table = expand(str(OUTDIR/"bbmap/{db_name}/all_samples.counts_table.tab"),
db_name=config["bbmap"]["db_name"],
sample=SAMPLES)
featureCounts = expand("{outdir}/bbmap/{db_name}/all_samples.featureCounts{output_type}",
outdir=config["outdir"],
featureCounts = expand(str(OUTDIR/"bbmap/{db_name}/all_samples.featureCounts{output_type}"),
db_name=config["bbmap"]["db_name"],
sample=SAMPLES,
output_type=["", ".summary", ".table.tsv"])
all_outputs.extend(bbmap_alignments)
if config["bbmap"]["counts_table"]["annotations"]:
if not os.path.isfile(config["bbmap"]["counts_table"]["annotations"]):
if not Path(config["bbmap"]["counts_table"]["annotations"]).exists():
err_message = "BBMap counts table annotations not found at: '{}'\n".format(config["bbmap"]["counts_table"]["annotations"])
err_message += "Check path in config setting 'bbmap:counts_table:annotations'.\n"
err_message += "If you want to skip read counts summary for BBMap, set bbmap:counts_table:annotations to '' in config.yaml."
raise WorkflowError(err_message)
all_outputs.extend(counts_table)
if config["bbmap"]["featureCounts"]["annotations"]:
if not os.path.isfile(config["bbmap"]["featureCounts"]["annotations"]):
if not Path(config["bbmap"]["featureCounts"]["annotations"]).exists():
err_message = "BBMap featureCounts annotations not found at: '{}'\n".format(config["bbmap"]["featureCounts"]["annotations"])
err_message += "Check path in config setting 'bbmap:featureCounts:annotations'.\n"
err_message += "If you want to skip mapping with BBMap, set mappers:bbmap:False in config.yaml."
raise WorkflowError(err_message)
all_outputs.extend(featureCounts)

bbmap_config = config["bbmap"]
bbmap_output_folder = config["outdir"]+"/bbmap/{db_name}/".format(db_name=bbmap_config["db_name"])
bbmap_output_folder = OUTDIR/"bbmap/{db_name}".format(db_name=bbmap_config["db_name"])
rule bbmap:
"""BBMap"""
input:
read1=config["outdir"]+"/filtered_human/{sample}_R1.filtered_human.fq.gz",
read2=config["outdir"]+"/filtered_human/{sample}_R2.filtered_human.fq.gz",
read1=OUTDIR/"filtered_human/{sample}_R1.filtered_human.fq.gz",
read2=OUTDIR/"filtered_human/{sample}_R2.filtered_human.fq.gz",
output:
sam=bbmap_output_folder+"{sample}.sam.gz",
covstats=bbmap_output_folder+"{sample}.covstats.txt",
rpkm=bbmap_output_folder+"{sample}.rpkm.txt",
sam=bbmap_output_folder/"{sample}.sam.gz",
covstats=bbmap_output_folder/"{sample}.covstats.txt",
rpkm=bbmap_output_folder/"{sample}.rpkm.txt",
log:
stdout=config["outdir"]+"/logs/bbmap/{sample}.bbmap.stdout.log",
stderr=config["outdir"]+"/logs/bbmap/{sample}.bbmap.statsfile.txt"
stdout=str(LOGDIR/"bbmap/{sample}.bbmap.stdout.log"),
stderr=str(LOGDIR/"bbmap/{sample}.bbmap.statsfile.txt"),
shadow:
"shallow"
conda:
@@ -87,13 +87,13 @@ rule bbmap:

rule bbmap_counts_table:
input:
rpkms=expand(config["outdir"]+"/bbmap/{dbname}/{sample}.rpkm.txt",
rpkms=expand(str(OUTDIR/"bbmap/{dbname}/{sample}.rpkm.txt"),
dbname=bbmap_config["db_name"],
sample=SAMPLES)
output:
counts=config["outdir"]+"/bbmap/{dbname}/all_samples.counts_table.tab".format(dbname=bbmap_config["db_name"]),
counts=OUTDIR/"bbmap/{dbname}/all_samples.counts_table.tab".format(dbname=bbmap_config["db_name"]),
log:
config["outdir"]+"/logs/bbmap/{dbname}/all_samples.counts_table.log".format(dbname=bbmap_config["db_name"])
str(LOGDIR/"bbmap/{dbname}/all_samples.counts_table.log".format(dbname=bbmap_config["db_name"]))
shadow:
"shallow"
conda:
@@ -115,15 +115,15 @@ rule bbmap_counts_table:
fc_config = bbmap_config["featureCounts"]
rule bbmap_featureCounts:
input:
bams=expand(config["outdir"]+"/bbmap/{dbname}/{sample}.sam.gz",
bams=expand(str(OUTDIR/"bbmap/{dbname}/{sample}.sam.gz"),
dbname=bbmap_config["db_name"],
sample=SAMPLES)
output:
counts=config["outdir"]+"/bbmap/{dbname}/all_samples.featureCounts".format(dbname=bbmap_config["db_name"]),
counts_table=config["outdir"]+"/bbmap/{dbname}/all_samples.featureCounts.table.tsv".format(dbname=bbmap_config["db_name"]),
summary=config["outdir"]+"/bbmap/{dbname}/all_samples.featureCounts.summary".format(dbname=bbmap_config["db_name"]),
counts=OUTDIR/"bbmap/{dbname}/all_samples.featureCounts".format(dbname=bbmap_config["db_name"]),
counts_table=OUTDIR/"bbmap/{dbname}/all_samples.featureCounts.table.tsv".format(dbname=bbmap_config["db_name"]),
summary=OUTDIR/"bbmap/{dbname}/all_samples.featureCounts.summary".format(dbname=bbmap_config["db_name"]),
log:
config["outdir"]+"/logs/bbmap/{dbname}/all_samples.featureCounts.log".format(dbname=bbmap_config["db_name"])
str(LOGDIR/"bbmap/{dbname}/all_samples.featureCounts.log".format(dbname=bbmap_config["db_name"]))
shadow:
"shallow"
conda: