Skip to content

Commit

Permalink
[MRG] use sourmash prefetch from sourmash v4.1.0 (#68)
Browse files Browse the repository at this point in the history
* use sourmash prefetch

* adjust sourmash versions etc etc

* add fastp; output interleaved

* clean up

* change length required from 31 to 25 to match previous trimmomatic params

* need to gzip interleaved output

* fix location for fastp reports

* update dependencies

* make tempdir location a configurable list to try; fix possible collisions

* default to config tempdir

* fix sourmash to be conda installed, not pip installed

* ignore errors in removing temp dir

* remove unneeded tools

Co-authored-by: N Tessa Pierce <[email protected]>
  • Loading branch information
ctb and bluegenes authored May 18, 2021
1 parent 8042591 commit 8b9f26d
Show file tree
Hide file tree
Showing 6 changed files with 33 additions and 111 deletions.
5 changes: 1 addition & 4 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,5 @@ channels:
- defaults
dependencies:
- mamba
- minimap2=2.17
- samtools=1.10
- screed>=1.0.5,<2
- pip:
- sourmash>=4,<5
- sourmash>=4.1,<5
31 changes: 25 additions & 6 deletions genome_grist/conf/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
# - build_consensus
# - make_sgc_conf

import glob, os, csv
import glob, os, csv, tempfile, shutil, sys

SAMPLES=config['sample']
print(f'sample: {SAMPLES}')
Expand All @@ -19,6 +19,19 @@ outdir = config.get('outdir', 'outputs/')
outdir = outdir.rstrip('/')
print('outdir:', outdir)

# Choose a scratch directory for this run: walk the configured 'tempdir'
# locations in order and create a uniquely named subdirectory (mkdtemp) in
# the first one that works.
base_tempdir = None
try_temp_locations = config.get('tempdir', []) or []
if isinstance(try_temp_locations, str):
    # A bare string in the config means one location, not a sequence of
    # single-character paths.
    try_temp_locations = [try_temp_locations]

for temp_loc in try_temp_locations:
    try:
        base_tempdir = tempfile.mkdtemp(dir=temp_loc)
    except OSError:
        # Location is missing, is not a directory, or is not writable
        # (FileNotFoundError / NotADirectoryError / PermissionError are all
        # OSError subclasses); fall through to the next candidate.
        continue
    # Stop at the first usable location so earlier entries take priority and
    # no extra directories are created (and then never removed) in later ones.
    break

if not base_tempdir:
    print(f"Could not create a temporary directory in any of {try_temp_locations}", file=sys.stderr)
    print("Please set 'tempdir' in the config.", file=sys.stderr)
    sys.exit(-1)

ABUNDTRIM_MEMORY = float(config.get('metagenome_trim_memory', '1e9'))

sourmash_db_pattern = config.get('sourmash_database_glob_pattern', 'MUST SPECIFY IN CONFIG')
Expand Down Expand Up @@ -50,6 +63,12 @@ except:

###

# Workflow-level `onsuccess` handler: remove the per-run temporary directory
# (base_tempdir). ignore_errors=True keeps a failed removal from raising.
onsuccess:
shutil.rmtree(base_tempdir, ignore_errors=True)

# Workflow-level `onerror` handler: perform the same best-effort removal of
# base_tempdir so a failed run does not leave the directory behind.
onerror:
shutil.rmtree(base_tempdir, ignore_errors=True)

wildcard_constraints:
size="\d+",
sample='[a-zA-Z0-9._-]+' # should be everything but /
Expand Down Expand Up @@ -192,7 +211,7 @@ rule download_sra_wc:
r1 = protected(outdir + "/raw/{sample}_1.fastq.gz"),
r2 = protected(outdir + "/raw/{sample}_2.fastq.gz"),
unp = protected(outdir + "/raw/{sample}_unpaired.fastq.gz"),
temp_dir = temp(directory("/scratch/tmp.{sample}")),
temp_dir = temp(directory(f"{base_tempdir}/{{sample}}.d")),
conda: "env/sra.yml"
resources:
mem_mb=40000,
Expand Down Expand Up @@ -573,13 +592,13 @@ rule sourmash_prefetch_gather_wc:
conda: "env/sourmash.yml"
params:
ksize = SOURMASH_DB_KSIZE,
moltype = SOURMASH_COMPUTE_TYPE,
moltype = f"--{SOURMASH_COMPUTE_TYPE.lower()}",
threshold_bp = SOURMASH_DATABASE_THRESHOLD_BP,
shell: """
echo "DB is {input.db}"
python -Werror -m genome_grist.prefetch_gather --query {input.sig} \
--db {input.db} --save-matches {output.matches} -k {params.ksize} \
--threshold-bp={params.threshold_bp} --moltype {params.moltype}
sourmash prefetch {input.sig} {input.db} \
--save-matches {output.matches} -k {params.ksize} \
--threshold-bp={params.threshold_bp} {params.moltype}
"""

# run sourmash search x genbank and find anything matching.
Expand Down
3 changes: 3 additions & 0 deletions genome_grist/conf/defaults.conf
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,6 @@ sourmash_compute_ksizes:
- 51
sourmash_scaled: 1000

# by default, create temporary working directories under /tmp
tempdir:
- /tmp
2 changes: 1 addition & 1 deletion genome_grist/conf/env/sourmash.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ channels:
dependencies:
- python=3.7
- screed
- sourmash>=4.1,<5
- pip
- pip:
- git+https://github.com/dib-lab/genome-grist.git#egg=genome-grist
- sourmash>=4,<5
3 changes: 3 additions & 0 deletions genome_grist/conf/system.conf
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
sourmash_database_glob_pattern: /home/irber/sourmash_databases/outputs/sbt/genbank-*x1e5*k31*
sourmash_database_ksize: 31
tempdir:
- /scratch
- /tmp
100 changes: 0 additions & 100 deletions genome_grist/prefetch_gather.py

This file was deleted.

0 comments on commit 8b9f26d

Please sign in to comment.