Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] update to sourmash 4.1.0 #171

Merged
merged 35 commits into from
May 24, 2021
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
9a1054b
update to sourmash 4.1.0
taylorreiter May 18, 2021
f172ab2
Update env-sourmash.yml
taylorreiter May 18, 2021
7f409c0
Update env-reporting.yml
taylorreiter May 18, 2021
f4f7d84
start switch to 4.1. Prefect, zip of matches, load_file_as_signatures
taylorreiter May 19, 2021
be1704c
make gather stop when there aren't any more mh in the sig
taylorreiter May 19, 2021
fc634b6
rm div 0 in f_major calc, change match sig to zip, fix float point
taylorreiter May 19, 2021
39d2980
fix IndexError when empty sigs
taylorreiter May 19, 2021
18aa62c
change reporting env to >=4.1.0<5
taylorreiter May 19, 2021
304eda9
readd reporting file
taylorreiter May 19, 2021
cf8377b
switch test to load_file_as_signatures
taylorreiter May 19, 2021
38b2d8d
update snakemake test to matches.zip
taylorreiter May 19, 2021
475b4ce
add 2.fa to matches.zip with sourmash sig cat
taylorreiter May 19, 2021
41fd43d
tests/test_contigs_search.py
taylorreiter May 19, 2021
272e661
start switching to zip, still needs debug
taylorreiter May 19, 2021
4accd34
add test dataset
taylorreiter May 19, 2021
cc4144e
woops -- only break if len(mh) == 0
taylorreiter May 19, 2021
fc3d748
replace .get_mins() with .hashes
taylorreiter May 19, 2021
a0a322f
fix ValueError from threshold-bp
taylorreiter May 19, 2021
0e5bf65
woops rm thresholdbp print statement
taylorreiter May 19, 2021
2d01632
update to sourmash 1537 instead of 1537
taylorreiter May 20, 2021
8ca6da4
range for sourmash version
taylorreiter May 20, 2021
fe7997e
switch from gather to prefetch
taylorreiter May 20, 2021
3cbe754
add except for assertion error from sourmash assert db on empty sigs
taylorreiter May 20, 2021
ba291d9
refactor to not have pass block
taylorreiter May 20, 2021
bfc347a
refactor sum_ident
taylorreiter May 20, 2021
8d0b631
update threshold bp for prefetch to 3*scaled
taylorreiter May 20, 2021
21df7da
fix tests
ctb May 21, 2021
66f2a02
wups, remove test debugging
ctb May 21, 2021
c2086b5
remove sourmash 4.1.1 code
ctb May 21, 2021
9386006
remove --no-linear in prefetch bc its default
taylorreiter May 21, 2021
7a92847
fix load sig with file handle
taylorreiter May 21, 2021
716f02a
simplify while 1
taylorreiter May 21, 2021
9d1a895
Merge pull request #172 from dib-lab/update/tr_sourmash_by_ctb
taylorreiter May 21, 2021
cef2195
update to sourmash v4.1.1
ctb May 22, 2021
16bc5bc
Merge pull request #176 from dib-lab/ctb/update_sourmash_411
taylorreiter May 24, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 11 additions & 9 deletions charcoal/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -287,22 +287,23 @@ rule contigs_sig:
"""

# run a search, query.x.database.
rule search_all:
rule prefetch_all:
input:
query = stage1_dir + '/{filename}.sig',
databases = config['gather_db']
output:
csv = stage1_dir + '/{filename}.matches.csv',
matches = stage1_dir + '/{filename}.matches.sig',
matches = stage1_dir + '/{filename}.matches.zip',
txt = stage1_dir + '/{filename}.matches.txt'
params:
moltype = "--{}".format(config['moltype'].lower()),
gather_scaled = config['gather_scaled']
gather_scaled = config['gather_scaled'],
threshold_bp = config['gather_scaled']*3
conda: 'conf/env-sourmash.yml'
shell: """
sourmash search {input.query} {input.databases} -o {output.csv} \
--containment {params.moltype} --scaled {params.gather_scaled} \
--save-matches {output.matches} --threshold=0.001 >& {output.txt}
sourmash prefetch --no-linear {input.query} {input.databases} -o {output.csv} \
taylorreiter marked this conversation as resolved.
Show resolved Hide resolved
{params.moltype} --scaled {params.gather_scaled} \
--save-matches {output.matches} --threshold-bp {params.threshold_bp} >& {output.txt}
cat {output.txt}
touch {output.csv} {output.matches}
"""
Expand All @@ -312,7 +313,8 @@ rule make_contigs_taxonomy_json:
input:
genome = genome_dir + '/{f}',
genome_sig = stage1_dir + '/{f}.sig',
matches = stage1_dir + '/{f}.matches.sig',
matches = stage1_dir + '/{f}.matches.zip',

lineages = config['lineages_csv']
output:
json = stage1_dir + '/{f}.contigs-tax.json',
Expand All @@ -333,7 +335,7 @@ rule make_contigs_taxonomy_json:
rule compare_taxonomy_single:
input:
json = stage1_dir + '/{g}.contigs-tax.json',
sig = stage1_dir + '/{g}.matches.sig',
sig = stage1_dir + '/{g}.matches.zip',
lineages = config['lineages_csv'],
provided_lineages = provided_lineages_file,
genome_list = genome_list_file
Expand Down Expand Up @@ -443,7 +445,7 @@ checkpoint hitlist_make_contigs_matches:
input:
genome = genome_dir + '/{g}',
genome_sig = stage1_dir + '/{g}.sig',
matches = stage1_dir + '/{g}.matches.sig',
matches = stage1_dir + '/{g}.matches.zip',
lineages = config['lineages_csv'],
hitlist = output_dir + '/stage1_hitlist.csv'
output:
Expand Down
26 changes: 15 additions & 11 deletions charcoal/compare_taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,12 @@ def guess_tax_by_gather(entire_mh, lca_db, lin_db, match_rank, report_fp):
first_count = count

sum_ident += count

f_ident = sum_ident / len(entire_mh)
f_major = first_count / sum_ident

f_ident = 0
f_major = 0
if sum_ident:
f_ident = sum_ident / len(entire_mh)
f_major = first_count / sum_ident

return first_lin, f_ident, f_major

Expand Down Expand Up @@ -119,11 +122,12 @@ def choose_genome_lineage(guessed_genome_lineage, provided_lineage, match_rank,

def get_genome_taxonomy(matches_filename, genome_sig_filename, provided_lineage,
tax_assign, match_rank, min_f_ident, min_f_major):
with open(matches_filename, 'rt') as fp:
try:
siglist = list(sourmash.load_signatures(fp, do_raise=True, quiet=True))
except sourmash.exceptions.SourmashError:
siglist = None
try:
siglist = list(sourmash.load_file_as_signatures(matches_filename))
# TR for empty signatures; update when sourmash#1537 is fixed.
# TR AssertionError covers "assert db" from sourmash _load_database() in sourmash_args.py
except (IndexError, AssertionError) as e:
siglist = None

if not siglist:
comment = 'no matches for this genome.'
Expand Down Expand Up @@ -233,7 +237,7 @@ def main(args):
detected_contam = {}

summary_d = {}
matches_filename = os.path.join(dirname, genome_name + '.matches.sig')
matches_filename = os.path.join(dirname, genome_name + '.matches.zip')
genome_sig = os.path.join(dirname, genome_name + '.sig')
lineage = provided_lineages.get(genome_name, '')
contigs_json = os.path.join(dirname, genome_name + '.contigs-tax.json')
Expand Down Expand Up @@ -367,8 +371,8 @@ def main(args):
vals['bad_order_bp'],
vals['bad_family_bp'],
vals['bad_genus_bp'],
f'{vals["f_ident"]:.03}',
f'{vals["f_major"]:.03}',
f'{vals["f_ident"]:.03f}',
f'{vals["f_major"]:.03f}',
vals["lineage"],
vals["comment"]])

Expand Down
2 changes: 1 addition & 1 deletion charcoal/conf/env-reporting.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,5 @@ dependencies:
- pip
- pip:
- git+https://github.com/dib-lab/charcoal.git
- sourmash==4.0.0a3
- sourmash>=4.1.0,<5
- pyinterval
2 changes: 1 addition & 1 deletion charcoal/conf/env-sourmash.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,5 @@ dependencies:
- pip
- pip:
- git+https://github.com/dib-lab/charcoal.git
- sourmash==4.0.0a3
- sourmash>=4.1.0,<5
- pyinterval
18 changes: 11 additions & 7 deletions charcoal/contigs_list_contaminants.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,11 @@ def get_matches(mh, lca_db, lin_db, match_rank, threshold_bp):

# do the gather:
while 1:
results = lca_db.gather(query_sig, threshold_bp=threshold_bp)
try:
results = lca_db.gather(query_sig, threshold_bp=threshold_bp)
except ValueError:
break

if not results:
break

Expand Down Expand Up @@ -78,12 +82,11 @@ def main(args):
genome_sig = sourmash.load_one_signature(args.genome_sig)

# load all of the matches from search --containment in the database
with open(args.matches_sig, 'rt') as fp:
try:
siglist = list(sourmash.load_signatures(fp, do_raise=True,
quiet=False))
except sourmash.exceptions.SourmashError:
siglist = []
try:
siglist = list(sourmash.load_file_as_signatures(args.matches_sig))
# TR for empty signatures; update when sourmash#1537 is fixed.
except IndexError:
siglist = []
print(f"loaded {len(siglist)} matches from '{args.matches_sig}'")

# Hack for examining members of our search database: remove exact matches.
Expand Down Expand Up @@ -132,6 +135,7 @@ def main(args):

screed_iter = screed.open(args.genome)
threshold_bp = empty_mh.scaled * GATHER_MIN_MATCHES

n = - 1
for n, record in enumerate(screed_iter):
# look at each contig individually
Expand Down
24 changes: 12 additions & 12 deletions charcoal/contigs_search_taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,13 @@ def main(args):
genome_sig = sourmash.load_one_signature(args.genome_sig)

# load all of the matches from search --containment in the database
with open(args.matches_sig, 'rt') as fp:
try:
siglist = list(sourmash.load_signatures(fp, do_raise=True,
quiet=False))
except sourmash.exceptions.SourmashError:
siglist = []
#with open(args.matches_sig, 'rt') as fp:
try:
siglist = list(sourmash.load_file_as_signatures(args.matches_sig))
# TR for cases with empty sigs that return IndexError; update when sourmash#1537 is fixed
# TR AssertionError covers "assert db" from sourmash _load_database() in sourmash_args.py
except (IndexError, AssertionError) as e:
siglist = []
print(f"loaded {len(siglist)} matches from '{args.matches_sig}'")

# Hack for examining members of our search database: remove exact matches.
Expand Down Expand Up @@ -88,14 +89,13 @@ def main(args):
# look at each contig individually
mh = empty_mh.copy_and_clear()
mh.add_sequence(record.sequence, force=True)

# collect all the gather results at genus level, together w/counts;
# here, results is a list of (lineage, count) tuples.
results = list(gather_at_rank(mh, lca_db, lin_db, match_rank))

# store together with size of sequence.
info = ContigGatherInfo(len(record.sequence), len(mh), results)
contigs_tax[record.name] = info
if len(mh):
results = list(gather_at_rank(mh, lca_db, lin_db, match_rank))
# store together with size of sequence.
info = ContigGatherInfo(len(record.sequence), len(mh), results)
contigs_tax[record.name] = info

print(f"Processed {len(contigs_tax)} contigs.")

Expand Down
2 changes: 1 addition & 1 deletion charcoal/just_taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,7 @@ def main(args):
print(f'loaded {len(tax_assign)} tax assignments.')

with open(args.matches_sig, 'rt') as fp:
siglist = list(sourmash.load_signatures(fp))
siglist = list(sourmash.load_file_as_signatures(fp))
taylorreiter marked this conversation as resolved.
Show resolved Hide resolved

if not siglist:
print('no matches for this genome, exiting.')
Expand Down
5 changes: 5 additions & 0 deletions charcoal/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,12 @@ def gather_at_rank(mh, lca_db, lin_db, match_rank):
# do the gather:
counts = Counter()
while 1:
taylorreiter marked this conversation as resolved.
Show resolved Hide resolved

if not query_sig.minhash:
break

results = lca_db.gather(query_sig, threshold_bp=0)

if not results:
break

Expand Down
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,4 @@ dependencies:
- click
- pip
- pip:
- sourmash==4.0.0a3
- sourmash>=4.1.0,<5
Binary file added tests/test-data/2.fa.gz.gather-matches.zip
Binary file not shown.
Binary file not shown.
2 changes: 1 addition & 1 deletion tests/test_compare_taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def test_basic(location):
args.genome = 'loomba'

shutil.copyfile(utils.relative_file("tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.sig"), os.path.join(location, "loomba.sig"))
shutil.copyfile(utils.relative_file("tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.matches.sig"), os.path.join(location, "loomba.matches.sig"))
shutil.copyfile(utils.relative_file("tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.matches.zip"), os.path.join(location, "loomba.matches.zip"))
shutil.copyfile(utils.relative_file("tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.contigs-tax.json"), os.path.join(location, "loomba.contigs-tax.json"))

status = compare_taxonomy.main(args)
Expand Down
4 changes: 2 additions & 2 deletions tests/test_contigs_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def test_1(location):
args = utils.Args()
args.genome = utils.relative_file("tests/test-data/genomes/2.fa.gz")
args.genome_sig = utils.relative_file("tests/test-data/genomes/2.fa.gz.sig")
args.matches_sig = utils.relative_file("tests/test-data/2.fa.gz.gather-matches.sig.gz")
args.matches_sig = utils.relative_file("tests/test-data/2.fa.gz.gather-matches.zip")
args.lineages_csv = utils.relative_file("tests/test-data/test-match-lineages.csv")
args.json_out = os.path.join(location, 'tax.json')
args.match_rank = 'genus'
Expand All @@ -32,7 +32,7 @@ def test_2_loomba(location):
args = utils.Args()
args.genome = utils.relative_file("demo/genomes/LoombaR_2017__SID1050_bax__bin.11.fa.gz")
args.genome_sig = utils.relative_file("tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.sig")
args.matches_sig = utils.relative_file("tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.matches.sig")
args.matches_sig = utils.relative_file("tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.matches.zip")
args.lineages_csv = utils.relative_file("tests/test-data/test-match-lineages.csv")
args.json_out = os.path.join(location, 'tax.json')
args.match_rank = 'genus'
Expand Down
2 changes: 1 addition & 1 deletion tests/test_decontam.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def make_lca_and_lineages(match_files, lineages_csv, scaled, ksize,
# load matches into a list
siglist = []
for filename in match_files:
siglist += list(sourmash.load_signatures(filename))
siglist += list(sourmash.load_file_as_signatures(filename))

# pull in taxonomic assignments
tax_assign, _ = load_taxonomy_assignments(lineages_csv,
Expand Down
2 changes: 1 addition & 1 deletion tests/test_snakemake.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def test_make_sig(genome_file):
@pytest.mark.parametrize("genome_file", demo_genomes)
def test_make_gather_matches(request, genome_file):
depends(request, [f"test_make_sig[{g}]" for g in demo_genomes])
target = f'stage1/{genome_file}.matches.sig'
target = f'stage1/{genome_file}.matches.zip'
status, out, err = _run_snakemake_test('demo/demo.conf', target)

assert status == 0
Expand Down