From 0997834e012698bfb70359e6a28437d1a46b3b34 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 12 Jun 2021 07:17:44 -0700 Subject: [PATCH 01/41] various cleanups of sourmash_args --- src/sourmash/sourmash_args.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 11d4d304cc..294e795c6e 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -338,7 +338,7 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): return db -def load_file_as_index(filename, yield_all_files=False): +def load_file_as_index(filename, *, yield_all_files=False): """Load 'filename' as a database; generic database loader. If 'filename' contains an SBT or LCA indexed database, or a regular @@ -356,7 +356,7 @@ def load_file_as_index(filename, yield_all_files=False): return _load_database(filename, yield_all_files) -def load_file_as_signatures(filename, select_moltype=None, ksize=None, +def load_file_as_signatures(filename, *, select_moltype=None, ksize=None, yield_all_files=False, progress=None): """Load 'filename' as a collection of signatures. Return an iterable. @@ -382,7 +382,7 @@ def load_file_as_signatures(filename, select_moltype=None, ksize=None, db = db.select(moltype=select_moltype, ksize=ksize) loader = db.signatures() - if progress: + if progress is not None: return progress.start_file(filename, loader) else: return loader @@ -501,6 +501,9 @@ def __init__(self, reporting_interval=10): self.interval = reporting_interval self.screen_width = 79 + def __len__(self): + return self.n_sig + def short_notify(self, msg_template, *args, **kwargs): """Shorten the notification message so that it fits on one line. 
@@ -689,7 +692,8 @@ def _exists(self, name): return False def add(self, ss): - assert self.zf + if not self.zf: + raise ValueError("this output is not open") super().add(ss) md5 = ss.md5sum() From 66b059904abfe0f9ee9441a273c07f22e037a249 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 12 Jun 2021 07:23:13 -0700 Subject: [PATCH 02/41] cleanup flakes errors --- src/sourmash/sourmash_args.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 294e795c6e..4085fa5bea 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -3,28 +3,22 @@ """ import sys import os -import argparse -import itertools from enum import Enum import traceback import gzip import zipfile import screed +import sourmash from sourmash.sbtmh import load_sbt_index from sourmash.lca.lca_db import load_single_database import sourmash.exceptions -from . import signature from .logging import notify, error, debug_literal from .index import (LinearIndex, ZipFileLinearIndex, MultiIndex) -from . import signature as sig -from .sbt import SBT -from .sbtmh import SigLeaf -from .lca import LCA_Database -import sourmash +from . import signature as sigmod DEFAULT_LOAD_K = 31 @@ -304,7 +298,7 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): db = load_fn(filename, traverse_yield_all=traverse_yield_all, cache_size=cache_size) - except ValueError as exc: + except ValueError: debug_literal(f"_load_databases: FAIL on fn {desc}.") debug_literal(traceback.format_exc()) @@ -321,7 +315,7 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): # CTB: could be kind of time consuming for a big record, but at the # moment screed doesn't expose format detection cleanly. 
with screed.open(filename) as it: - record = next(iter(it)) + _ = next(iter(it)) successful_screed_load = True except: pass @@ -629,7 +623,7 @@ def add(self, ss): i += 1 with gzip.open(outname, "wb") as fp: - sig.save_signatures([ss], fp, compression=1) + sigmod.save_signatures([ss], fp, compression=1) class SaveSignatures_SigFile(_BaseSaveSignaturesToLocation): From 3a583a9d90ee765c019b27b335852ef16d738869 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 12 Jun 2021 07:36:48 -0700 Subject: [PATCH 03/41] clean up sourmash.sig submodule --- src/sourmash/sig/__main__.py | 51 ++++++++++-------------------------- 1 file changed, 14 insertions(+), 37 deletions(-) diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index 879c51cc2c..56cede698a 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -10,7 +10,7 @@ import sourmash from sourmash.sourmash_args import FileOutput -from sourmash.logging import set_quiet, error, notify, set_quiet, print_results, debug +from sourmash.logging import set_quiet, error, notify, print_results, debug from sourmash import sourmash_args from sourmash.minhash import _get_max_hash_for_scaled @@ -122,7 +122,6 @@ def split(args): progress = sourmash_args.SignatureLoadingProgress() - total = 0 for sigfile in args.signatures: # load signatures from input file: this_siglist = sourmash_args.load_file_as_signatures(sigfile, @@ -175,9 +174,8 @@ def split(args): notify('loaded {} signatures from {}...', n_signatures, sigfile, end='\r') - total += n_signatures - notify('loaded and split {} signatures total.', total) + notify(f'loaded and split {len(progress)} signatures total.') def describe(args): @@ -201,14 +199,11 @@ def describe(args): # load signatures and display info. 
progress = sourmash_args.SignatureLoadingProgress() - n_loaded = 0 for signature_file in args.signatures: try: loader = sourmash_args.load_file_as_signatures(signature_file, progress=progress) for sig in loader: - n_loaded += 1 - # extract info, write as appropriate. mh = sig.minhash ksize = mh.ksize @@ -245,7 +240,7 @@ def describe(args): error('(continuing)') raise - notify('loaded {} signatures total.', n_loaded) + notify(f'loaded {len(progress)} signatures total.') if csv_fp: csv_fp.close() @@ -377,7 +372,7 @@ def merge(args): if this_n: notify('loaded and merged {} signatures from {}...', this_n, sigfile, end='\r') - if not total_loaded: + if not len(progress): error("no signatures to merge!?") sys.exit(-1) @@ -386,7 +381,7 @@ def merge(args): with FileOutput(args.output, 'wt') as fp: sourmash.save_signatures([merged_sigobj], fp=fp) - notify('loaded and merged {} signatures', total_loaded) + notify(f'loaded and merged {len(progress)} signatures') def intersect(args): @@ -400,7 +395,6 @@ def intersect(args): first_sig = None mins = None - total_loaded = 0 progress = sourmash_args.SignatureLoadingProgress() @@ -419,10 +413,9 @@ def intersect(args): sys.exit(-1) mins.intersection_update(sigobj.minhash.hashes) - total_loaded += 1 notify('loaded and intersected signatures from {}...', sigfile, end='\r') - if total_loaded == 0: + if len(progress) == 0: error("no signatures to merge!?") sys.exit(-1) @@ -454,7 +447,7 @@ def intersect(args): with FileOutput(args.output, 'wt') as fp: sourmash.save_signatures([intersect_sigobj], fp=fp) - notify('loaded and intersected {} signatures', total_loaded) + notify(f'loaded and intersected {len(progress)} signatures') def subtract(args): @@ -478,7 +471,6 @@ def subtract(args): progress = sourmash_args.SignatureLoadingProgress() - total_loaded = 0 for sigfile in args.subtraction_sigs: for sigobj in sourmash_args.load_file_as_signatures(sigfile, ksize=args.ksize, @@ -495,9 +487,8 @@ def subtract(args): subtract_mins -= 
set(sigobj.minhash.hashes) notify('loaded and subtracted signatures from {}...', sigfile, end='\r') - total_loaded += 1 - if not total_loaded: + if not len(progress): error("no signatures to subtract!?") sys.exit(-1) @@ -510,7 +501,7 @@ def subtract(args): with FileOutput(args.output, 'wt') as fp: sourmash.save_signatures([subtract_sigobj], fp=fp) - notify('loaded and subtracted {} signatures', total_loaded) + notify(f'loaded and subtracted {len(progress)} signatures') def rename(args): @@ -538,7 +529,7 @@ def rename(args): save_sigs.close() - notify("set name to '{}' on {} signatures", args.name, len(save_sigs)) + notify(f"set name to '{args.name}' on {len(save_sigs)} signatures") def extract(args): @@ -553,7 +544,6 @@ def extract(args): save_sigs = sourmash_args.SaveSignaturesToLocation(args.output) save_sigs.open() - total_loaded = 0 for filename in args.signatures: siglist = sourmash_args.load_file_as_signatures(filename, ksize=args.ksize, @@ -561,8 +551,6 @@ def extract(args): progress=progress) siglist = list(siglist) - total_loaded += len(siglist) - # select! if args.md5 is not None: siglist = [ ss for ss in siglist if args.md5 in ss.md5sum() ] @@ -572,8 +560,7 @@ def extract(args): for ss in siglist: save_sigs.add(ss) - notify("loaded {} total that matched ksize & molecule type", - total_loaded) + notify(f"loaded {len(progress)} total that matched ksize & molecule type") if not save_sigs: error("no matching signatures!") sys.exit(-1) @@ -596,7 +583,6 @@ def filter(args): save_sigs = sourmash_args.SaveSignaturesToLocation(args.output) save_sigs.open() - total_loaded = 0 for filename in args.signatures: siglist = sourmash_args.load_file_as_signatures(filename, ksize=args.ksize, @@ -604,8 +590,6 @@ def filter(args): progress=progress) siglist = list(siglist) - total_loaded += len(siglist) - # select! 
if args.md5 is not None: siglist = [ ss for ss in siglist if args.md5 in ss.md5sum() ] @@ -636,8 +620,7 @@ def filter(args): save_sigs.close() - notify("loaded {} total that matched ksize & molecule type", - total_loaded) + notify(f"loaded {len(progress)} total that matched ksize & molecule type") notify("extracted {} signatures from {} file(s)", len(save_sigs), len(args.signatures)) @@ -654,7 +637,6 @@ def flatten(args): save_sigs = sourmash_args.SaveSignaturesToLocation(args.output) save_sigs.open() - total_loaded = 0 for filename in args.signatures: siglist = sourmash_args.load_file_as_signatures(filename, ksize=args.ksize, @@ -662,8 +644,6 @@ def flatten(args): progress=progress) siglist = list(siglist) - total_loaded += len(siglist) - # select! if args.md5 is not None: siglist = [ ss for ss in siglist if args.md5 in ss.md5sum() ] @@ -676,8 +656,7 @@ def flatten(args): save_sigs.close() - notify("loaded {} total that matched ksize & molecule type", - total_loaded) + notify(f"loaded {len(progress)} total that matched ksize & molecule type") notify("extracted {} signatures from {} file(s)", len(save_sigs), len(args.signatures)) @@ -702,7 +681,6 @@ def downsample(args): progress = sourmash_args.SignatureLoadingProgress() - total_loaded = 0 for sigfile in args.signatures: siglist = sourmash_args.load_file_as_signatures(sigfile, ksize=args.ksize, @@ -713,7 +691,6 @@ def downsample(args): mh = sigobj.minhash notify('loading and downsampling signature from {}...', sigfile, end='\r') - total_loaded += 1 if args.scaled: if mh.scaled: mh_new = mh.downsample(scaled=args.scaled) @@ -743,7 +720,7 @@ def downsample(args): save_sigs.close() - notify("loaded and downsampled {} signatures", total_loaded) + notify(f"loaded and downsampled {len(progress)} signatures") def sig_import(args): From bb794ec530f15f27265c45c3877762568ac57bec Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sat, 12 Jun 2021 08:38:24 -0700 Subject: [PATCH 04/41] initial picklist implementation --- src/sourmash/sig/__main__.py | 11 ++-- src/sourmash/sig/picklist.py | 115 +++++++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+), 4 deletions(-) create mode 100644 src/sourmash/sig/picklist.py diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index 56cede698a..e3f6613b5a 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -216,8 +216,10 @@ def describe(args): if mh.track_abundance: with_abundance = 1 md5 = sig.md5sum() - name = sig.name or "** no name **" - filename = sig.filename or "** no name **" + name = sig.name + p_name = name or "** no name **" + filename = sig.filename + p_filename = filename or "** no name **" license = sig.license if w: @@ -226,8 +228,8 @@ def describe(args): print_results('''\ --- signature filename: {signature_file} -signature: {name} -source file: {filename} +signature: {p_name} +source file: {p_filename} md5: {md5} k={ksize} molecule={moltype} num={num} scaled={scaled} seed={seed} track_abundance={with_abundance} size: {n_hashes} @@ -549,6 +551,7 @@ def extract(args): ksize=args.ksize, select_moltype=moltype, progress=progress) + # CTB: make streaming! siglist = list(siglist) # select! diff --git a/src/sourmash/sig/picklist.py b/src/sourmash/sig/picklist.py new file mode 100644 index 0000000000..2997af767b --- /dev/null +++ b/src/sourmash/sig/picklist.py @@ -0,0 +1,115 @@ +"Picklist code for extracting subsets of signatures." 
+import csv + + +# set up preprocessing functions for column stuff +preprocess = {} + +# exact matches +preprocess['name'] = lambda x: x +preprocess['md5'] = lambda x: x + +# identifier matches/prefix foo - space delimited identifiers +preprocess['ident'] = lambda x: x.split(' ')[0].split('.')[0] +preprocess['ident.'] = lambda x: x.split(' ')[0] + +# match 8 characters +preprocess['md5prefix8'] = lambda x: x[:8] + + +class SignaturePicklist: + """Picklist class for subsetting collections of signatures. + + Initialize using ``SignaturePicklist.from_picklist_args(argstr)``, + which takes an argument str like so: 'pickfile:column:coltype'. + + Here, 'pickfile' is the path to a CSV file; 'column' is the name of + the column to select from the CSV file; and 'coltype' is the type of + matching to do on that column. + + 'coltype's that are currently supported: + * 'name' - exact match to signature's name + * 'md5' - exact match to signature's md5sum + * 'md5prefix8' - match to 8-character prefix of signature's md5sum + * 'ident' - exact match to signature's identifier + * 'ident.' - match to signature's identifier, before '.' + + Identifiers are constructed by using the first space delimited word in + the signature name. 
+ """ + def __init__(self, pickfile, column_name, coltype): + self.pickfile = pickfile + self.column_name = column_name + self.coltype = coltype + + if coltype not in ('md5', 'md5prefix8', 'name', 'ident', 'ident.'): + raise ValueError(f"invalid picklist column type '{coltype}'") + + self.preprocess_fn = preprocess[coltype] + self.pickset = None + + @classmethod + def from_picklist_args(cls, argstr): + "load a picklist from an argument string 'pickfile:column:coltype'" + picklist = argstr.split(':') + if len(picklist) != 3: + raise ValueError(f"invalid picklist argument '{argstr}'") + + assert len(picklist) == 3 + pickfile, column, coltype = picklist + + return cls(pickfile, column, coltype) + + def _get_sig_attribute(self, ss): + "for a given SourmashSignature, return attribute for this picklist." + coltype = self.coltype + if coltype == 'md5': + q = ss.md5sum() + elif coltype == 'md5prefix8': + q = ss.md5sum() + elif coltype == 'name': + q = ss.name + elif coltype == 'ident': + q = ss.name + elif coltype == 'ident.': + q = ss.name + + return q + + def load(self): + "load pickset, return num empty vals, and set of duplicate vals." + pickset = set() + n_empty_val = 0 + dup_vals = set() + with open(self.pickfile, newline='') as csvfile: + r = csv.DictReader(csvfile) + + if self.column_name not in r.fieldnames: + raise ValueError("column '{self.column_name}' not in pickfile '{self.pickfile}'") + + for row in r: + # pick out values from column + col = row[self.column_name] + if not col: + n_empty_val += 1 + continue + + col = self.preprocess_fn(col) + + # look for duplicate values or empty values + if col in pickset: + dup_vals.add(col) + else: + pickset.add(col) + + self.pickset = pickset + return n_empty_val, dup_vals + + def __contains__(self, ss): + "does this signature match anything in the picklist?" 
+ q = self._get_sig_attribute(ss) + q = self.preprocess_fn(q) + + if q in self.pickset: + return True + return False From 3ecfb48032e6b79ee2a8076d8e9f273ce29691da Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 12 Jun 2021 08:55:28 -0700 Subject: [PATCH 05/41] integrate picklists into sourmash sig extract --- src/sourmash/cli/sig/extract.py | 4 +++ src/sourmash/sig/__main__.py | 52 ++++++++++++++++++++++++++------- src/sourmash/sig/picklist.py | 5 ++++ 3 files changed, 50 insertions(+), 11 deletions(-) diff --git a/src/sourmash/cli/sig/extract.py b/src/sourmash/cli/sig/extract.py index d2066e8bcc..d822ae9db7 100644 --- a/src/sourmash/cli/sig/extract.py +++ b/src/sourmash/cli/sig/extract.py @@ -25,6 +25,10 @@ def subparser(subparsers): '--name', default=None, help='select signatures whose name contains this substring' ) + subparser.add_argument( + '--picklist', default=None, + help="select signatures based on a picklist, i.e. 'file.csv:colname:coltype'" + ) add_ksize_arg(subparser, 31) add_moltype_args(subparser) diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index e3f6613b5a..d80ab9d09f 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -13,6 +13,7 @@ from sourmash.logging import set_quiet, error, notify, print_results, debug from sourmash import sourmash_args from sourmash.minhash import _get_max_hash_for_scaled +from .picklist import SignaturePicklist usage=''' sourmash signature [] - manipulate/work with signature files. @@ -188,6 +189,7 @@ def describe(args): w = None csv_fp = None if args.csv: + # CTB: might want to switch to sourmash_args.FileOutputCSV here? 
csv_fp = open(args.csv, 'w', newline='') w = csv.DictWriter(csv_fp, ['signature_file', 'md5', 'ksize', 'moltype', 'num', @@ -541,6 +543,43 @@ def extract(args): set_quiet(args.quiet) moltype = sourmash_args.calculate_moltype(args) + picklist = None + if args.picklist: + picklist = SignaturePicklist.from_picklist_args(args.picklist) + + notify(f"picking column '{picklist.column_name}' of type '{picklist.coltype}' from '{picklist.pickfile}'") + + n_empty_val, dup_vals = picklist.load() + + notify(f"loaded {len(picklist.pickset)} distinct values into picklist.") + if n_empty_val: + notify(f"WARNING: {n_empty_val} empty values in column '{picklist.column_name}' in CSV file") + if dup_vals: + notify(f"WARNING: {len(dup_vals)} values in column '{picklist.column_name}' were not distinct") + picklist_filter_fn = picklist.filter + else: + def picklist_filter_fn(it): + for ss in it: + yield ss + + # further filtering on md5 or name? + if args.md5 is not None or args.name is not None: + def filter_fn(it): + for ss in picklist_filter_fn(it): + # match? + keep = False + if args.name and args.name in str(ss): + keep = True + if args.md5 and args.md5 in ss.md5sum(): + keep = True + + if keep: + yield ss + else: + # whatever comes out of the picklist is fine + filter_fn = picklist_filter_fn + + # ok! filtering defined, let's go forward progress = sourmash_args.SignatureLoadingProgress() save_sigs = sourmash_args.SaveSignaturesToLocation(args.output) @@ -551,21 +590,12 @@ def extract(args): ksize=args.ksize, select_moltype=moltype, progress=progress) - # CTB: make streaming! - siglist = list(siglist) - - # select! 
- if args.md5 is not None: - siglist = [ ss for ss in siglist if args.md5 in ss.md5sum() ] - if args.name is not None: - siglist = [ ss for ss in siglist if args.name in str(ss) ] - - for ss in siglist: + for ss in filter_fn(siglist): save_sigs.add(ss) notify(f"loaded {len(progress)} total that matched ksize & molecule type") if not save_sigs: - error("no matching signatures!") + error("no matching signatures to save!") sys.exit(-1) save_sigs.close() diff --git a/src/sourmash/sig/picklist.py b/src/sourmash/sig/picklist.py index 2997af767b..a702e45ab4 100644 --- a/src/sourmash/sig/picklist.py +++ b/src/sourmash/sig/picklist.py @@ -113,3 +113,8 @@ def __contains__(self, ss): if q in self.pickset: return True return False + + def filter(self, it): + for ss in it: + if self.__contains__(ss): + yield ss From 505b04f847aff8c981b64d1447e4cf69ecd56c4b Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 12 Jun 2021 09:11:24 -0700 Subject: [PATCH 06/41] basic tests for picklist functionality --- src/sourmash/sig/picklist.py | 4 +- tests/test_cmd_signature.py | 155 +++++++++++++++++++++++++++++++++++ 2 files changed, 157 insertions(+), 2 deletions(-) diff --git a/src/sourmash/sig/picklist.py b/src/sourmash/sig/picklist.py index a702e45ab4..ef3622d03a 100644 --- a/src/sourmash/sig/picklist.py +++ b/src/sourmash/sig/picklist.py @@ -10,8 +10,8 @@ preprocess['md5'] = lambda x: x # identifier matches/prefix foo - space delimited identifiers -preprocess['ident'] = lambda x: x.split(' ')[0].split('.')[0] -preprocess['ident.'] = lambda x: x.split(' ')[0] +preprocess['ident.'] = lambda x: x.split(' ')[0].split('.')[0] +preprocess['ident'] = lambda x: x.split(' ')[0] # match 8 characters preprocess['md5prefix8'] = lambda x: x[:8] diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index 060f7f83ab..db71ec38d6 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -1110,6 +1110,161 @@ def test_sig_extract_7_no_ksize(c): assert 
len(siglist) == 3 +def test_sig_extract_8_picklist_md5(runtmp): + # extract 47 from 47, using a picklist w/full md5 + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + # select on any of these attributes + row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', + md5full='09a08691ce52952152f0e866a59f6261', + md5short='09a08691ce5295215', + fullIdent='NC_009665.1', + nodotIdent='NC_009665') + + # make picklist + picklist_csv = runtmp.output('pick.csv') + with open(picklist_csv, 'w', newline='') as csvfp: + w = csv.DictWriter(csvfp, fieldnames=row.keys()) + w.writeheader() + w.writerow(row) + + picklist_arg = f"{picklist_csv}:md5full:md5" + runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg) + + # stdout should be new signature + out = runtmp.last_result.out + + test_extract_sig = sourmash.load_one_signature(sig47) + actual_extract_sig = sourmash.load_one_signature(out) + + assert actual_extract_sig == test_extract_sig + + +def test_sig_extract_8_picklist_name(runtmp): + # extract 47 from 47, using a picklist w/full md5 + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + # select on any of these attributes + row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', + md5full='09a08691ce52952152f0e866a59f6261', + md5short='09a08691ce5295215', + fullIdent='NC_009665.1', + nodotIdent='NC_009665') + + # make picklist + picklist_csv = runtmp.output('pick.csv') + with open(picklist_csv, 'w', newline='') as csvfp: + w = csv.DictWriter(csvfp, fieldnames=row.keys()) + w.writeheader() + w.writerow(row) + + picklist_arg = f"{picklist_csv}:exactName:name" + runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg) + + # stdout should be new signature + out = runtmp.last_result.out + + test_extract_sig = sourmash.load_one_signature(sig47) + actual_extract_sig = sourmash.load_one_signature(out) + + assert actual_extract_sig 
== test_extract_sig + + +def test_sig_extract_8_picklist_ident(runtmp): + # extract 47 from 47, using a picklist w/full md5 + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + # select on any of these attributes + row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', + md5full='09a08691ce52952152f0e866a59f6261', + md5short='09a08691ce5295215', + fullIdent='NC_009665.1', + nodotIdent='NC_009665') + + # make picklist + picklist_csv = runtmp.output('pick.csv') + with open(picklist_csv, 'w', newline='') as csvfp: + w = csv.DictWriter(csvfp, fieldnames=row.keys()) + w.writeheader() + w.writerow(row) + + picklist_arg = f"{picklist_csv}:fullIdent:ident" + runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg) + + # stdout should be new signature + out = runtmp.last_result.out + + test_extract_sig = sourmash.load_one_signature(sig47) + actual_extract_sig = sourmash.load_one_signature(out) + + assert actual_extract_sig == test_extract_sig + + +def test_sig_extract_8_picklist_ident_dot(runtmp): + # extract 47 from 47, using a picklist w/full md5 + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + # select on any of these attributes + row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', + md5full='09a08691ce52952152f0e866a59f6261', + md5short='09a08691ce5295215', + fullIdent='NC_009665.1', + nodotIdent='NC_009665') + + # make picklist + picklist_csv = runtmp.output('pick.csv') + with open(picklist_csv, 'w', newline='') as csvfp: + w = csv.DictWriter(csvfp, fieldnames=row.keys()) + w.writeheader() + w.writerow(row) + + picklist_arg = f"{picklist_csv}:nodotIdent:ident." 
+ runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg) + + # stdout should be new signature + out = runtmp.last_result.out + + test_extract_sig = sourmash.load_one_signature(sig47) + actual_extract_sig = sourmash.load_one_signature(out) + + assert actual_extract_sig == test_extract_sig + + +def test_sig_extract_8_picklist_md5_short(runtmp): + # extract 47 from 47, using a picklist w/full md5 + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + # select on any of these attributes + row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', + md5full='09a08691ce52952152f0e866a59f6261', + md5short='09a08691ce5295215', + fullIdent='NC_009665.1', + nodotIdent='NC_009665') + + # make picklist + picklist_csv = runtmp.output('pick.csv') + with open(picklist_csv, 'w', newline='') as csvfp: + w = csv.DictWriter(csvfp, fieldnames=row.keys()) + w.writeheader() + w.writerow(row) + + picklist_arg = f"{picklist_csv}:md5short:md5prefix8" + runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg) + + # stdout should be new signature + out = runtmp.last_result.out + + test_extract_sig = sourmash.load_one_signature(sig47) + actual_extract_sig = sourmash.load_one_signature(out) + + assert actual_extract_sig == test_extract_sig + + @utils.in_tempdir def test_sig_flatten_1(c): # extract matches to several names from among several signatures & flatten From 74f31f50675d6b420cda234a558fb6e3ce495111 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sat, 12 Jun 2021 09:21:58 -0700 Subject: [PATCH 07/41] track found etc --- src/sourmash/sig/__main__.py | 5 +++++ src/sourmash/sig/picklist.py | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index d80ab9d09f..d75dc66685 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -602,6 +602,11 @@ def filter_fn(it): notify("extracted {} signatures from {} file(s)", len(save_sigs), len(args.signatures)) + if picklist: + notify(f"for given picklist, found {len(picklist.found)} matches of {len(picklist.pickset)} total") + n_missing = len(picklist.pickset - picklist.found) + if n_missing: + notify(f"WARNING: {n_missing} missing picklist values.") def filter(args): diff --git a/src/sourmash/sig/picklist.py b/src/sourmash/sig/picklist.py index ef3622d03a..287af2bfab 100644 --- a/src/sourmash/sig/picklist.py +++ b/src/sourmash/sig/picklist.py @@ -47,6 +47,8 @@ def __init__(self, pickfile, column_name, coltype): self.preprocess_fn = preprocess[coltype] self.pickset = None + self.found = set() + self.n_queries = 0 @classmethod def from_picklist_args(cls, argstr): @@ -110,7 +112,9 @@ def __contains__(self, ss): q = self._get_sig_attribute(ss) q = self.preprocess_fn(q) + self.n_queries += 1 if q in self.pickset: + self.found.add(q) return True return False From b1fc982a0d3cee78b14899d83d7b6fd203e7a6db Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sat, 12 Jun 2021 10:01:56 -0700 Subject: [PATCH 08/41] add picklists to selectors --- src/sourmash/index.py | 5 ++++- src/sourmash/lca/lca_db.py | 27 ++++++++++++++++++++++++--- src/sourmash/sbt.py | 26 +++++++++++++++++++++++--- src/sourmash/sig/__main__.py | 14 ++++++-------- src/sourmash/sourmash_args.py | 5 +++-- 5 files changed, 60 insertions(+), 17 deletions(-) diff --git a/src/sourmash/index.py b/src/sourmash/index.py index 2aec59283c..b344a3cabc 100644 --- a/src/sourmash/index.py +++ b/src/sourmash/index.py @@ -291,7 +291,7 @@ def select(self, ksize=None, moltype=None, scaled=None, num=None, def select_signature(ss, ksize=None, moltype=None, scaled=0, num=0, - containment=False): + containment=False, picklist=None): "Check that the given signature matches the specificed requirements." # ksize match? if ksize and ksize != ss.minhash.ksize: @@ -318,6 +318,9 @@ def select_signature(ss, ksize=None, moltype=None, scaled=0, num=0, if ss.minhash.scaled or num != ss.minhash.num: return False + if picklist is not None and ss not in picklist: + return False + return True diff --git a/src/sourmash/lca/lca_db.py b/src/sourmash/lca/lca_db.py index a3d90ffd5d..69b776aacc 100644 --- a/src/sourmash/lca/lca_db.py +++ b/src/sourmash/lca/lca_db.py @@ -71,6 +71,7 @@ def __init__(self, ksize, scaled, moltype='DNA'): self.lineage_to_lid = {} self.lid_to_lineage = {} self.hashval_to_idx = defaultdict(set) + self.picklists = [] @property def location(self): @@ -176,7 +177,7 @@ def signatures(self): yield v def select(self, ksize=None, moltype=None, num=0, scaled=0, - containment=False): + containment=False, picklist=None): """Make sure this database matches the requested requirements. 
As with SBTs, queries with higher scaled values than the database @@ -197,6 +198,9 @@ def select(self, ksize=None, moltype=None, num=0, scaled=0, if moltype is not None and moltype != self.moltype: raise ValueError(f"moltype on this database is {self.moltype}; this is different from requested moltype of {moltype}") + if picklist is not None: + self.picklists.append(picklist) + return self @classmethod @@ -416,7 +420,16 @@ def _signatures(self): for idx, mh in mhd.items(): ident = self.idx_to_ident[idx] name = self.ident_to_name[ident] - sigd[idx] = SourmashSignature(mh, name=name) + ss = SourmashSignature(mh, name=name) + + keep = True + for picklist in self.picklists: + if ss not in picklist: + keep = False + break + + if keep: + sigd[idx] = SourmashSignature(mh, name=name) debug('=> {} signatures!', len(sigd)) return sigd @@ -478,7 +491,15 @@ def find(self, search_fn, query, **kwargs): # signal that it is done, or something. if search_fn.passes(score): if search_fn.collect(score, subj): - yield IndexSearchResult(score, subj, self.location) + + # filter on picklists + keep = True + for picklist in self.picklists: + if subj not in picklist: + keep = False + + if keep: + yield IndexSearchResult(score, subj, self.location) @cached_property def lid_to_idx(self): diff --git a/src/sourmash/sbt.py b/src/sourmash/sbt.py index c31a689621..3cbc762e2d 100644 --- a/src/sourmash/sbt.py +++ b/src/sourmash/sbt.py @@ -148,6 +148,7 @@ def __init__(self, factory, *, d=2, storage=None, cache_size=None): cache_size = sys.maxsize self._nodescache = _NodesCache(maxsize=cache_size) self._location = None + self.picklists = [] @property def location(self): @@ -155,10 +156,17 @@ def location(self): def signatures(self): for k in self.leaves(): - yield k.data + ss = k.data + keep = True + for picklist in self.picklists: + if ss not in picklist: + keep = False + + if keep: + yield k.data def select(self, ksize=None, moltype=None, num=0, scaled=0, - containment=False): + containment=False, 
picklist=None): """Make sure this database matches the requested requirements. Will always raise ValueError if a requirement cannot be met. @@ -210,6 +218,9 @@ def select(self, ksize=None, moltype=None, num=0, scaled=0, if scaled > db_mh.scaled and not containment: raise ValueError(f"search scaled value {scaled} is less than database scaled value of {db_mh.scaled}") + if picklist is not None: + self.picklists.append(picklist) + return self def new_node_pos(self, node): @@ -450,7 +461,16 @@ def node_search(node, *args, **kwargs): # & execute! for n in self._find_nodes(node_search, **kwargs): - yield IndexSearchResult(results[n.data], n.data, self.location) + ss = n.data + + # filter on picklists + keep = True + for picklist in self.picklists: + if ss not in picklist: + keep = False + + if keep: + yield IndexSearchResult(results[ss], ss, self.location) def _rebuild_node(self, pos=0): """Recursively rebuilds an internal node (if it is not present). diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index d75dc66685..718c799744 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -556,16 +556,11 @@ def extract(args): notify(f"WARNING: {n_empty_val} empty values in column '{picklist.column_name}' in CSV file") if dup_vals: notify(f"WARNING: {len(dup_vals)} values in column '{picklist.column_name}' were not distinct") - picklist_filter_fn = picklist.filter - else: - def picklist_filter_fn(it): - for ss in it: - yield ss # further filtering on md5 or name? if args.md5 is not None or args.name is not None: def filter_fn(it): - for ss in picklist_filter_fn(it): + for ss in it: # match? keep = False if args.name and args.name in str(ss): @@ -576,8 +571,10 @@ def filter_fn(it): if keep: yield ss else: - # whatever comes out of the picklist is fine - filter_fn = picklist_filter_fn + # whatever comes out of the database is fine + def filter_fn(it): + for ss in it: + yield ss # ok! 
filtering defined, let's go forward progress = sourmash_args.SignatureLoadingProgress() @@ -589,6 +586,7 @@ def filter_fn(it): siglist = sourmash_args.load_file_as_signatures(filename, ksize=args.ksize, select_moltype=moltype, + picklist=picklist, progress=progress) for ss in filter_fn(siglist): save_sigs.add(ss) diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 4085fa5bea..40c7d35444 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -351,6 +351,7 @@ def load_file_as_index(filename, *, yield_all_files=False): def load_file_as_signatures(filename, *, select_moltype=None, ksize=None, + picklist=None, yield_all_files=False, progress=None): """Load 'filename' as a collection of signatures. Return an iterable. @@ -367,13 +368,13 @@ def load_file_as_signatures(filename, *, select_moltype=None, ksize=None, underneath this directory into a list of signatures. If yield_all_files=True, will attempt to load all files. - Applies selector function if select_moltype and/or ksize are given. + Applies selector function if select_moltype, ksize or picklist are given. """ if progress: progress.notify(filename) db = _load_database(filename, yield_all_files) - db = db.select(moltype=select_moltype, ksize=ksize) + db = db.select(moltype=select_moltype, ksize=ksize, picklist=picklist) loader = db.signatures() if progress is not None: From a8178436a0849485c88dc3576755010c436d249b Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sat, 12 Jun 2021 10:28:23 -0700 Subject: [PATCH 09/41] split pickfile out a little bit --- src/sourmash/sig/__main__.py | 2 +- src/sourmash/sig/picklist.py | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index 718c799744..8bfb87e316 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -549,7 +549,7 @@ def extract(args): notify(f"picking column '{picklist.column_name}' of type '{picklist.coltype}' from '{picklist.pickfile}'") - n_empty_val, dup_vals = picklist.load() + n_empty_val, dup_vals = picklist.load(picklist.pickfile) notify(f"loaded {len(picklist.pickset)} distinct values into picklist.") if n_empty_val: diff --git a/src/sourmash/sig/picklist.py b/src/sourmash/sig/picklist.py index 287af2bfab..1c6234d167 100644 --- a/src/sourmash/sig/picklist.py +++ b/src/sourmash/sig/picklist.py @@ -23,6 +23,7 @@ class SignaturePicklist: Initialize using ``SignaturePicklist.from_picklist_args(argstr)``, which takes an argument str like so: 'pickfile:column:coltype'. + # CTB pickfile or pickset? Here, 'pickfile' is the path to a CSV file; 'column' is the name of the column to select from the CSV file; and 'coltype' is the type of matching to do on that column. @@ -38,7 +39,7 @@ class SignaturePicklist: the signature name. """ def __init__(self, pickfile, column_name, coltype): - self.pickfile = pickfile + self.pickfile = pickfile # note: can be None self.column_name = column_name self.coltype = coltype @@ -78,16 +79,19 @@ def _get_sig_attribute(self, ss): return q - def load(self): + def load(self, pickfile): "load pickset, return num empty vals, and set of duplicate vals." 
- pickset = set() + pickset = self.pickset + if pickset is None: + pickset = set() + n_empty_val = 0 dup_vals = set() - with open(self.pickfile, newline='') as csvfile: + with open(pickfile, newline='') as csvfile: r = csv.DictReader(csvfile) if self.column_name not in r.fieldnames: - raise ValueError("column '{self.column_name}' not in pickfile '{self.pickfile}'") + raise ValueError("column '{self.column_name}' not in pickfile '{pickfile}'") for row in r: # pick out values from column From def1933e1ff12c4f2c3b4b2d9b3a95ad97c11078 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 12 Jun 2021 10:48:15 -0700 Subject: [PATCH 10/41] split column_type out of SignaturePicklist a bit --- src/sourmash/sig/__main__.py | 2 +- src/sourmash/sig/picklist.py | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index 8bfb87e316..36af8b6a9c 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -549,7 +549,7 @@ def extract(args): notify(f"picking column '{picklist.column_name}' of type '{picklist.coltype}' from '{picklist.pickfile}'") - n_empty_val, dup_vals = picklist.load(picklist.pickfile) + n_empty_val, dup_vals = picklist.load(picklist.pickfile, picklist.column_name) notify(f"loaded {len(picklist.pickset)} distinct values into picklist.") if n_empty_val: diff --git a/src/sourmash/sig/picklist.py b/src/sourmash/sig/picklist.py index 1c6234d167..579594ffd7 100644 --- a/src/sourmash/sig/picklist.py +++ b/src/sourmash/sig/picklist.py @@ -40,7 +40,7 @@ class SignaturePicklist: """ def __init__(self, pickfile, column_name, coltype): self.pickfile = pickfile # note: can be None - self.column_name = column_name + self.column_name = column_name # note: can be None self.coltype = coltype if coltype not in ('md5', 'md5prefix8', 'name', 'ident', 'ident.'): @@ -79,7 +79,12 @@ def _get_sig_attribute(self, ss): return q - def load(self, pickfile): + def init(self, 
values=[]): + if self.pickset is not None: + raise ValueError("already initialized?") + self.pickset = set(values) + + def load(self, pickfile, column_name): "load pickset, return num empty vals, and set of duplicate vals." pickset = self.pickset if pickset is None: @@ -90,12 +95,12 @@ def load(self, pickfile): with open(pickfile, newline='') as csvfile: r = csv.DictReader(csvfile) - if self.column_name not in r.fieldnames: - raise ValueError("column '{self.column_name}' not in pickfile '{pickfile}'") + if column_name not in r.fieldnames: + raise ValueError("column '{column_name}' not in pickfile '{pickfile}'") for row in r: # pick out values from column - col = row[self.column_name] + col = row[column_name] if not col: n_empty_val += 1 continue @@ -111,6 +116,9 @@ def load(self, pickfile): self.pickset = pickset return n_empty_val, dup_vals + def add(self, value): + self.pickset.add(value) + def __contains__(self, ss): "does this signature match anything in the picklist?" q = self._get_sig_attribute(ss) From de6fc063472d540661efb32ae72e1fd606cbc6d8 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sat, 12 Jun 2021 10:48:30 -0700 Subject: [PATCH 11/41] picklist tests for .signatures() methods on Index classes --- tests/test_index.py | 24 ++++++++++++++++++++++++ tests/test_lca.py | 21 +++++++++++++++++++++ tests/test_sbt.py | 27 +++++++++++++++++++++++++++ 3 files changed, 72 insertions(+) diff --git a/tests/test_index.py b/tests/test_index.py index 5e30c7c585..22551e6d88 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -634,6 +634,30 @@ def test_linear_index_moltype_select(): assert len(linear2) == 0 +def test_linear_index_picklist_select(): + # test select with a picklist + from sourmash.sig.picklist import SignaturePicklist + + # this loads three ksizes, 21/31/51 + sig2 = utils.get_test_data('2.fa.sig') + siglist = sourmash.load_file_as_signatures(sig2) + + linear = LinearIndex() + for ss in siglist: + linear.insert(ss) + + # construct a picklist... + picklist = SignaturePicklist(None, None, 'md5prefix8') + picklist.init(['f3a90d4e']) + + # select on picklist + linear2 = linear.select(picklist=picklist) + assert len(linear2) == 1 + ss = list(linear2.signatures())[0] + assert ss.minhash.ksize == 31 + assert ss.md5sum().startswith('f3a90d4e55') + + @utils.in_tempdir def test_index_same_md5sum_fsstorage(c): testdata1 = utils.get_test_data('img/2706795855.sig') diff --git a/tests/test_lca.py b/tests/test_lca.py index a6443dbc6c..a13f020870 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -460,6 +460,27 @@ def test_lca_index_select(): db.select(moltype='protein') +def test_lca_index_select_picklist(): + # test 'select' method from Index base class with a picklist. + from sourmash.sig.picklist import SignaturePicklist + + filename = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(filename) + + # construct a picklist... 
+ picklist = SignaturePicklist(None, None, 'md5prefix8') + picklist.init(['50a92740']) + + xx = db.select(picklist=picklist) + assert xx == db + + siglist = list(db.signatures()) + assert len(siglist) == 1 + ss = siglist[0] + assert ss.md5sum().startswith('50a92740') + assert ss.minhash.ksize == 31 + + def test_search_db_scaled_gt_sig_scaled(): dbfile = utils.get_test_data('lca/47+63.lca.json') db, ksize, scaled = lca_utils.load_single_database(dbfile) diff --git a/tests/test_sbt.py b/tests/test_sbt.py index 1678dbf177..29b8d15fdf 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -636,6 +636,33 @@ def test_sbt_as_index_select(): tree.select(moltype='protein') +def test_sbt_as_index_select_picklist(): + # test 'select' method from Index base class with a picklist + from sourmash.sig.picklist import SignaturePicklist + + factory = GraphFactory(31, 1e5, 4) + tree = SBT(factory, d=2) + + sig47 = load_one_signature(utils.get_test_data('47.fa.sig')) + sig63 = load_one_signature(utils.get_test_data('63.fa.sig')) + + tree.insert(sig47) + tree.insert(sig63) + + # construct a picklist... + picklist = SignaturePicklist(None, None, 'md5prefix8') + picklist.init(['09a08691']) + + # select on picklist + tree = tree.select(picklist=picklist) + siglist = list(tree.signatures()) + assert len(siglist) == 1 + + ss = siglist[0] + assert ss.minhash.ksize == 31 + assert ss.md5sum().startswith('09a08691c') + + def test_sbt_as_index_signatures(): # test 'signatures' method from Index base class. factory = GraphFactory(31, 1e5, 4) From 1bdf88e87ce3e4a0f67fc071c601416c75fb65c2 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sat, 12 Jun 2021 10:28:23 -0700 Subject: [PATCH 12/41] split pickfile out a little bit --- src/sourmash/sig/__main__.py | 2 +- src/sourmash/sig/picklist.py | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index d75dc66685..d3bef25052 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -549,7 +549,7 @@ def extract(args): notify(f"picking column '{picklist.column_name}' of type '{picklist.coltype}' from '{picklist.pickfile}'") - n_empty_val, dup_vals = picklist.load() + n_empty_val, dup_vals = picklist.load(picklist.pickfile) notify(f"loaded {len(picklist.pickset)} distinct values into picklist.") if n_empty_val: diff --git a/src/sourmash/sig/picklist.py b/src/sourmash/sig/picklist.py index 287af2bfab..1c6234d167 100644 --- a/src/sourmash/sig/picklist.py +++ b/src/sourmash/sig/picklist.py @@ -23,6 +23,7 @@ class SignaturePicklist: Initialize using ``SignaturePicklist.from_picklist_args(argstr)``, which takes an argument str like so: 'pickfile:column:coltype'. + # CTB pickfile or pickset? Here, 'pickfile' is the path to a CSV file; 'column' is the name of the column to select from the CSV file; and 'coltype' is the type of matching to do on that column. @@ -38,7 +39,7 @@ class SignaturePicklist: the signature name. """ def __init__(self, pickfile, column_name, coltype): - self.pickfile = pickfile + self.pickfile = pickfile # note: can be None self.column_name = column_name self.coltype = coltype @@ -78,16 +79,19 @@ def _get_sig_attribute(self, ss): return q - def load(self): + def load(self, pickfile): "load pickset, return num empty vals, and set of duplicate vals." 
- pickset = set() + pickset = self.pickset + if pickset is None: + pickset = set() + n_empty_val = 0 dup_vals = set() - with open(self.pickfile, newline='') as csvfile: + with open(pickfile, newline='') as csvfile: r = csv.DictReader(csvfile) if self.column_name not in r.fieldnames: - raise ValueError("column '{self.column_name}' not in pickfile '{self.pickfile}'") + raise ValueError("column '{self.column_name}' not in pickfile '{pickfile}'") for row in r: # pick out values from column From 3c05f95bc31952046fb1bb47a7476351454a9463 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 12 Jun 2021 10:48:15 -0700 Subject: [PATCH 13/41] split column_type out of SignaturePicklist a bit --- src/sourmash/sig/__main__.py | 2 +- src/sourmash/sig/picklist.py | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index d3bef25052..dc4d97138b 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -549,7 +549,7 @@ def extract(args): notify(f"picking column '{picklist.column_name}' of type '{picklist.coltype}' from '{picklist.pickfile}'") - n_empty_val, dup_vals = picklist.load(picklist.pickfile) + n_empty_val, dup_vals = picklist.load(picklist.pickfile, picklist.column_name) notify(f"loaded {len(picklist.pickset)} distinct values into picklist.") if n_empty_val: diff --git a/src/sourmash/sig/picklist.py b/src/sourmash/sig/picklist.py index 1c6234d167..579594ffd7 100644 --- a/src/sourmash/sig/picklist.py +++ b/src/sourmash/sig/picklist.py @@ -40,7 +40,7 @@ class SignaturePicklist: """ def __init__(self, pickfile, column_name, coltype): self.pickfile = pickfile # note: can be None - self.column_name = column_name + self.column_name = column_name # note: can be None self.coltype = coltype if coltype not in ('md5', 'md5prefix8', 'name', 'ident', 'ident.'): @@ -79,7 +79,12 @@ def _get_sig_attribute(self, ss): return q - def load(self, pickfile): + def init(self, 
values=[]): + if self.pickset is not None: + raise ValueError("already initialized?") + self.pickset = set(values) + + def load(self, pickfile, column_name): "load pickset, return num empty vals, and set of duplicate vals." pickset = self.pickset if pickset is None: @@ -90,12 +95,12 @@ def load(self, pickfile): with open(pickfile, newline='') as csvfile: r = csv.DictReader(csvfile) - if self.column_name not in r.fieldnames: - raise ValueError("column '{self.column_name}' not in pickfile '{pickfile}'") + if column_name not in r.fieldnames: + raise ValueError("column '{column_name}' not in pickfile '{pickfile}'") for row in r: # pick out values from column - col = row[self.column_name] + col = row[column_name] if not col: n_empty_val += 1 continue @@ -111,6 +116,9 @@ def load(self, pickfile): self.pickset = pickset return n_empty_val, dup_vals + def add(self, value): + self.pickset.add(value) + def __contains__(self, ss): "does this signature match anything in the picklist?" q = self._get_sig_attribute(ss) From 54407a3c47170be178731af36cafb96afafbe387 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 12 Jun 2021 11:16:02 -0700 Subject: [PATCH 14/41] test 'Index.find' on picklists for SBTs and LCAs --- tests/test_lca.py | 36 ++++++++++++++++++++++++++++++++++++ tests/test_sbt.py | 41 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 75 insertions(+), 2 deletions(-) diff --git a/tests/test_lca.py b/tests/test_lca.py index a13f020870..2c36f13c62 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -11,6 +11,7 @@ import sourmash from sourmash import load_one_signature, SourmashSignature +from sourmash.search import make_jaccard_search_query from sourmash.lca import lca_utils from sourmash.lca.lca_utils import LineagePair @@ -31,6 +32,41 @@ def test_api_create_search(): assert match.minhash == ss.minhash +def test_api_find_picklist_select(): + # does 'find' respect picklists? 
+ from sourmash.sig.picklist import SignaturePicklist + + sig47 = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), + ksize=31) + sig63 = sourmash.load_one_signature(utils.get_test_data('63.fa.sig'), + ksize=31) + + lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) + lca_db.insert(sig47) + lca_db.insert(sig63) + + # construct a picklist... + picklist = SignaturePicklist(None, None, 'md5prefix8') + picklist.init(['09a08691']) + + # run a 'find' with sig63, should find 47 and 63 both. + search_obj = make_jaccard_search_query(do_containment=True, threshold=0.0) + results = list(lca_db.find(search_obj, sig63)) + print(results) + assert len(results) == 2 + + # now, select on picklist and do another find... + lca_db = lca_db.select(picklist=picklist) + results = list(lca_db.find(search_obj, sig63)) + print(results) + assert len(results) == 1 + + # and check that it is the expected one! + ss = results[0].signature + assert ss.minhash.ksize == 31 + assert ss.md5sum().startswith('09a08691c') + + def test_api_create_insert(): # test some internal implementation stuff: create & then insert a sig. ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), diff --git a/tests/test_sbt.py b/tests/test_sbt.py index 29b8d15fdf..9e3c96698b 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -231,10 +231,12 @@ def test_search_minhashes(): # this fails if 'search_obj' is calc containment and not similarity. 
search_obj = make_jaccard_search_query(threshold=0.08) results = tree.find(search_obj, to_search.data) - for sr in results: + + n = 0 + for n, sr in enumerate(results): assert to_search.data.jaccard(sr.signature) >= 0.08 - print(results) + assert n == 1 def test_binary_nary_tree(): @@ -663,6 +665,41 @@ def test_sbt_as_index_select_picklist(): assert ss.md5sum().startswith('09a08691c') +def test_sbt_as_index_find_picklist(): + # test 'select' method from Index base class with a picklist + from sourmash.sig.picklist import SignaturePicklist + + factory = GraphFactory(31, 1e5, 4) + tree = SBT(factory, d=2) + + sig47 = load_one_signature(utils.get_test_data('47.fa.sig')) + sig63 = load_one_signature(utils.get_test_data('63.fa.sig')) + + tree.insert(sig47) + tree.insert(sig63) + + # construct a picklist... + picklist = SignaturePicklist(None, None, 'md5prefix8') + picklist.init(['09a08691']) + + # run a 'find' with sig63, should find 47 and 63 both. + search_obj = make_jaccard_search_query(do_containment=True, threshold=0.0) + results = list(tree.find(search_obj, sig63)) + print(results) + assert len(results) == 2 + + # now, select on picklist and do another find... + tree = tree.select(picklist=picklist) + results = list(tree.find(search_obj, sig63)) + print(results) + assert len(results) == 1 + + # and check that it is the expected one! + ss = results[0].signature + assert ss.minhash.ksize == 31 + assert ss.md5sum().startswith('09a08691c') + + def test_sbt_as_index_signatures(): # test 'signatures' method from Index base class. factory = GraphFactory(31, 1e5, 4) From a88b66dd366bf66776d489f2336fc0acfd8d11b0 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sun, 13 Jun 2021 06:32:38 -0700 Subject: [PATCH 15/41] factor out picklist checks to 'passes_all_picklists' fn --- src/sourmash/lca/lca_db.py | 19 ++++--------------- src/sourmash/sbt.py | 19 ++++++------------- src/sourmash/sig/picklist.py | 8 ++++++++ 3 files changed, 18 insertions(+), 28 deletions(-) diff --git a/src/sourmash/lca/lca_db.py b/src/sourmash/lca/lca_db.py index 69b776aacc..ce8498b31d 100644 --- a/src/sourmash/lca/lca_db.py +++ b/src/sourmash/lca/lca_db.py @@ -375,6 +375,7 @@ def get_lineage_assignments(self, hashval): @cached_property def _signatures(self): "Create a _signatures member dictionary that contains {idx: sigobj}." + from sourmash.sig.picklist import passes_all_picklists from sourmash import MinHash, SourmashSignature is_protein = False @@ -422,13 +423,7 @@ def _signatures(self): name = self.ident_to_name[ident] ss = SourmashSignature(mh, name=name) - keep = True - for picklist in self.picklists: - if ss not in picklist: - keep = False - break - - if keep: + if passes_all_picklists(ss, self.picklists): sigd[idx] = SourmashSignature(mh, name=name) debug('=> {} signatures!', len(sigd)) @@ -444,6 +439,7 @@ def find(self, search_fn, query, **kwargs): can still be used for containment search, but not for similarity search. See SBT.select(...) for details. """ + from sourmash.sig.picklist import passes_all_picklists search_fn.check_is_compatible(query) # make sure we're looking at the same scaled value as database @@ -491,14 +487,7 @@ def find(self, search_fn, query, **kwargs): # signal that it is done, or something. 
if search_fn.passes(score): if search_fn.collect(score, subj): - - # filter on picklists - keep = True - for picklist in self.picklists: - if subj not in picklist: - keep = False - - if keep: + if passes_all_picklists(subj, self.picklists): yield IndexSearchResult(score, subj, self.location) @cached_property diff --git a/src/sourmash/sbt.py b/src/sourmash/sbt.py index 3cbc762e2d..a684bfe207 100644 --- a/src/sourmash/sbt.py +++ b/src/sourmash/sbt.py @@ -155,15 +155,12 @@ def location(self): return self._location def signatures(self): + from .sig.picklist import passes_all_picklists + for k in self.leaves(): ss = k.data - keep = True - for picklist in self.picklists: - if ss not in picklist: - keep = False - - if keep: - yield k.data + if passes_all_picklists(ss, self.picklists): + yield ss def select(self, ksize=None, moltype=None, num=0, scaled=0, containment=False, picklist=None): @@ -372,6 +369,7 @@ def find(self, search_fn, query, **kwargs): search. See SBT.select(...) for details. """ from .sbtmh import SigLeaf + from .sig.picklist import passes_all_picklists search_fn.check_is_compatible(query) @@ -464,12 +462,7 @@ def node_search(node, *args, **kwargs): ss = n.data # filter on picklists - keep = True - for picklist in self.picklists: - if ss not in picklist: - keep = False - - if keep: + if passes_all_picklists(ss, self.picklists): yield IndexSearchResult(results[ss], ss, self.location) def _rebuild_node(self, pos=0): diff --git a/src/sourmash/sig/picklist.py b/src/sourmash/sig/picklist.py index 579594ffd7..3593998e81 100644 --- a/src/sourmash/sig/picklist.py +++ b/src/sourmash/sig/picklist.py @@ -134,3 +134,11 @@ def filter(self, it): for ss in it: if self.__contains__(ss): yield ss + + +def passes_all_picklists(ss, picklists): + "does the signature 'ss' pass all of the picklists?" + for picklist in picklists: + if ss not in picklist: + return False + return True From aaa45485d9c180bb9b64c1e186a0f2aa946aead1 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Wed, 16 Jun 2021 09:18:08 -0700 Subject: [PATCH 16/41] update comments, constructor, etc. --- src/sourmash/sig/picklist.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/src/sourmash/sig/picklist.py b/src/sourmash/sig/picklist.py index 579594ffd7..03895031d5 100644 --- a/src/sourmash/sig/picklist.py +++ b/src/sourmash/sig/picklist.py @@ -1,7 +1,6 @@ "Picklist code for extracting subsets of signatures." import csv - # set up preprocessing functions for column stuff preprocess = {} @@ -15,6 +14,7 @@ # match 8 characters preprocess['md5prefix8'] = lambda x: x[:8] +preprocess['md5short'] = lambda x: x[:8] class SignaturePicklist: @@ -32,18 +32,23 @@ class SignaturePicklist: * 'name' - exact match to signature's name * 'md5' - exact match to signature's md5sum * 'md5prefix8' - match to 8-character prefix of signature's md5sum + * 'md5short' - same as md5prefix8 * 'ident' - exact match to signature's identifier - * 'ident.' - match to signature's identifier, before '.' + * 'identprefix' - match to signature's identifier, before '.' Identifiers are constructed by using the first space delimited word in the signature name. """ - def __init__(self, pickfile, column_name, coltype): - self.pickfile = pickfile # note: can be None - self.column_name = column_name # note: can be None + supported_coltypes = ('md5', 'md5prefix8', 'md5short', + 'name', 'ident', 'identprefix') + + def __init__(self, coltype, *, pickfile=None, column_name=None): + "create a picklist of column type 'coltype'." 
self.coltype = coltype + self.pickfile = pickfile + self.column_name = column_name - if coltype not in ('md5', 'md5prefix8', 'name', 'ident', 'ident.'): + if coltype not in self.supported_coltypes: raise ValueError(f"invalid picklist column type '{coltype}'") self.preprocess_fn = preprocess[coltype] @@ -61,7 +66,7 @@ def from_picklist_args(cls, argstr): assert len(picklist) == 3 pickfile, column, coltype = picklist - return cls(pickfile, column, coltype) + return cls(coltype, pickfile=pickfile, column_name=column) def _get_sig_attribute(self, ss): "for a given SourmashSignature, return attribute for this picklist." @@ -80,6 +85,7 @@ def _get_sig_attribute(self, ss): return q def init(self, values=[]): + "initialize a Picklist object with given values." if self.pickset is not None: raise ValueError("already initialized?") self.pickset = set(values) @@ -117,20 +123,28 @@ def load(self, pickfile, column_name): return n_empty_val, dup_vals def add(self, value): + "Add a value to this picklist." self.pickset.add(value) def __contains__(self, ss): "does this signature match anything in the picklist?" + # pull out the relevant signature attribute q = self._get_sig_attribute(ss) + + # mangle into the kinds of values we support here q = self.preprocess_fn(q) + # add to the number of queries performed, self.n_queries += 1 + + # determine if ok or not. if q in self.pickset: self.found.add(q) return True return False def filter(self, it): + "yield all signatures in the given iterator that are in the picklist" for ss in it: if self.__contains__(ss): yield ss From 9b50748e85a876d143a8c2f05d94143ae5e8c81d Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Wed, 16 Jun 2021 09:19:40 -0700 Subject: [PATCH 17/41] fix tests :) --- src/sourmash/sig/picklist.py | 6 ++++-- tests/test_cmd_signature.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/sourmash/sig/picklist.py b/src/sourmash/sig/picklist.py index 03895031d5..d59cfee658 100644 --- a/src/sourmash/sig/picklist.py +++ b/src/sourmash/sig/picklist.py @@ -9,7 +9,7 @@ preprocess['md5'] = lambda x: x # identifier matches/prefix foo - space delimited identifiers -preprocess['ident.'] = lambda x: x.split(' ')[0].split('.')[0] +preprocess['identprefix'] = lambda x: x.split(' ')[0].split('.')[0] preprocess['ident'] = lambda x: x.split(' ')[0] # match 8 characters @@ -79,8 +79,10 @@ def _get_sig_attribute(self, ss): q = ss.name elif coltype == 'ident': q = ss.name - elif coltype == 'ident.': + elif coltype == 'identprefix': q = ss.name + else: + assert 0 return q diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index db71ec38d6..07a7c02b45 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -1222,7 +1222,7 @@ def test_sig_extract_8_picklist_ident_dot(runtmp): w.writeheader() w.writerow(row) - picklist_arg = f"{picklist_csv}:nodotIdent:ident." + picklist_arg = f"{picklist_csv}:nodotIdent:identprefix" runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg) # stdout should be new signature From 207a813ab5083c15e9f70c4942dbf08d5cb963cc Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Wed, 16 Jun 2021 09:31:22 -0700 Subject: [PATCH 18/41] more picklist tests --- src/sourmash/sig/picklist.py | 10 +---- tests/test_cmd_signature.py | 81 ++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 8 deletions(-) diff --git a/src/sourmash/sig/picklist.py b/src/sourmash/sig/picklist.py index d59cfee658..6c2594c100 100644 --- a/src/sourmash/sig/picklist.py +++ b/src/sourmash/sig/picklist.py @@ -71,15 +71,9 @@ def from_picklist_args(cls, argstr): def _get_sig_attribute(self, ss): "for a given SourmashSignature, return attribute for this picklist." coltype = self.coltype - if coltype == 'md5': + if coltype in ('md5', 'md5prefix8', 'md5short'): q = ss.md5sum() - elif coltype == 'md5prefix8': - q = ss.md5sum() - elif coltype == 'name': - q = ss.name - elif coltype == 'ident': - q = ss.name - elif coltype == 'identprefix': + elif coltype in ('name', 'ident', 'identprefix'): q = ss.name else: assert 0 diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index 07a7c02b45..5930e8eb7e 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -1265,6 +1265,87 @@ def test_sig_extract_8_picklist_md5_short(runtmp): assert actual_extract_sig == test_extract_sig +def test_sig_extract_8_picklist_md5_short_alias(runtmp): + # extract 47 from 47, using a picklist w/full md5 + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + # select on any of these attributes + row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', + md5full='09a08691ce52952152f0e866a59f6261', + md5short='09a08691ce5295215', + fullIdent='NC_009665.1', + nodotIdent='NC_009665') + + # make picklist + picklist_csv = runtmp.output('pick.csv') + with open(picklist_csv, 'w', newline='') as csvfp: + w = csv.DictWriter(csvfp, fieldnames=row.keys()) + w.writeheader() + w.writerow(row) + + picklist_arg = f"{picklist_csv}:md5short:md5short" + runtmp.sourmash('sig', 'extract', sig47, 
sig63, '--picklist', picklist_arg) + + # stdout should be new signature + out = runtmp.last_result.out + + test_extract_sig = sourmash.load_one_signature(sig47) + actual_extract_sig = sourmash.load_one_signature(out) + + assert actual_extract_sig == test_extract_sig + + +def test_sig_extract_8_picklist_md5_nomatch(runtmp): + # use an empty picklist => no match + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + # make picklist + picklist_csv = runtmp.output('pick.csv') + with open(picklist_csv, 'w', newline='') as csvfp: + w = csv.DictWriter(csvfp, fieldnames=['md5short']) + w.writeheader() + + picklist_arg = f"{picklist_csv}:md5short:md5prefix8" + + with pytest.raises(ValueError): + runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', + picklist_arg) + + # stdout should be new signature + out = runtmp.last_result.out + print(out) + err = runtmp.last_result.err + print(err) + assert "no matching signatures to save!" in err + assert runtmp.last_result.status != 0 + + +def test_sig_extract_9_picklist_md5_ksize_hp_select(runtmp): + # test with -k and moltype selector + sigdir = utils.get_test_data('prot/') + + # make picklist + picklist_csv = runtmp.output('pick.csv') + with open(picklist_csv, 'w', newline='') as csvfp: + w = csv.DictWriter(csvfp, fieldnames=['md5']) + w.writeheader() + w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) + + picklist_arg = f"{picklist_csv}:md5:md5" + + runtmp.sourmash('sig', 'extract', sigdir, '--picklist', + picklist_arg, '-k', '19', '--hp') + + # stdout should be new signature + out = runtmp.last_result.out + actual_extract_sig = sourmash.load_one_signature(out) + + assert actual_extract_sig.minhash.ksize == 19 + assert actual_extract_sig.minhash.moltype == 'hp' + + @utils.in_tempdir def test_sig_flatten_1(c): # extract matches to several names from among several signatures & flatten From 14a88a76f0b6cc6176dc8002a4cb454121dd5de2 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Wed, 16 Jun 2021 09:34:43 -0700 Subject: [PATCH 19/41] verify output --- src/sourmash/sig/__main__.py | 2 +- tests/test_cmd_signature.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index dc4d97138b..0483963287 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -603,7 +603,7 @@ def filter_fn(it): notify("extracted {} signatures from {} file(s)", len(save_sigs), len(args.signatures)) if picklist: - notify(f"for given picklist, found {len(picklist.found)} matches of {len(picklist.pickset)} total") + notify(f"for given picklist, found {len(picklist.found)} matches to {len(picklist.pickset)} distinct values") n_missing = len(picklist.pickset - picklist.found) if n_missing: notify(f"WARNING: {n_missing} missing picklist values.") diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index 5930e8eb7e..1dda2390a1 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -1140,6 +1140,14 @@ def test_sig_extract_8_picklist_md5(runtmp): assert actual_extract_sig == test_extract_sig + err = runtmp.last_result.err + + print(err) + assert "loaded 1 distinct values into picklist." in err + assert "loaded 2 total that matched ksize & molecule type" in err + assert "extracted 1 signatures from 2 file(s)" in err + assert "for given picklist, found 1 matches to 1 distinct values" in err + def test_sig_extract_8_picklist_name(runtmp): # extract 47 from 47, using a picklist w/full md5 From 3d23d87d9e4de345fc3f3f3cda690a134dfb7ecf Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Wed, 16 Jun 2021 09:39:59 -0700 Subject: [PATCH 20/41] add --picklist-require-all &c --- src/sourmash/cli/sig/extract.py | 4 +++ src/sourmash/sig/__main__.py | 3 +++ tests/test_cmd_signature.py | 47 +++++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+) diff --git a/src/sourmash/cli/sig/extract.py b/src/sourmash/cli/sig/extract.py index d822ae9db7..81bc788a93 100644 --- a/src/sourmash/cli/sig/extract.py +++ b/src/sourmash/cli/sig/extract.py @@ -29,6 +29,10 @@ def subparser(subparsers): '--picklist', default=None, help="select signatures based on a picklist, i.e. 'file.csv:colname:coltype'" ) + subparser.add_argument( + '--picklist-require-all', default=False, action='store_true', + help="require that all picklist values be found or else fail" + ) add_ksize_arg(subparser, 31) add_moltype_args(subparser) diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index 0483963287..e342f14470 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -607,6 +607,9 @@ def filter_fn(it): n_missing = len(picklist.pickset - picklist.found) if n_missing: notify(f"WARNING: {n_missing} missing picklist values.") + if args.picklist_require_all: + error("ERROR: failing because --picklist-require-all was set") + sys.exit(-1) def filter(args): diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index 1dda2390a1..f62ab58a3a 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -1149,6 +1149,53 @@ def test_sig_extract_8_picklist_md5(runtmp): assert "for given picklist, found 1 matches to 1 distinct values" in err +def test_sig_extract_8_picklist_md5_require_all(runtmp): + # extract 47 from 47, using a picklist w/full md5; + # confirm that check missing picklist val errors out on --picklist-require + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + # select on any of these attributes + row = dict(exactName='NC_009665.1 Shewanella 
baltica OS185, complete genome', + md5full='09a08691ce52952152f0e866a59f6261', + md5short='09a08691ce5295215', + fullIdent='NC_009665.1', + nodotIdent='NC_009665') + + # make picklist + picklist_csv = runtmp.output('pick.csv') + with open(picklist_csv, 'w', newline='') as csvfp: + w = csv.DictWriter(csvfp, fieldnames=row.keys()) + w.writeheader() + w.writerow(row) + w.writerow(dict(exactName='', md5full='BAD MD5', + md5short='', fullIdent='', nodotIdent='')) + + picklist_arg = f"{picklist_csv}:md5full:md5" + with pytest.raises(ValueError): + runtmp.sourmash('sig', 'extract', sig47, sig63, + '--picklist', picklist_arg, + '--picklist-require-all') + + # stdout should be new signature + out = runtmp.last_result.out + + test_extract_sig = sourmash.load_one_signature(sig47) + actual_extract_sig = sourmash.load_one_signature(out) + + assert actual_extract_sig == test_extract_sig + + err = runtmp.last_result.err + + print(err) + assert "loaded 2 distinct values into picklist." in err + assert "loaded 2 total that matched ksize & molecule type" in err + assert "extracted 1 signatures from 2 file(s)" in err + assert "for given picklist, found 1 matches to 2 distinct values" in err + assert 'WARNING: 1 missing picklist values.' in err + assert 'ERROR: failing because --picklist-require-all was set' in err + + def test_sig_extract_8_picklist_name(runtmp): # extract 47 from 47, using a picklist w/full md5 sig47 = utils.get_test_data('47.fa.sig') From 9d60e3221e186989eb16b65fcccf3ca254c81306 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Wed, 16 Jun 2021 10:15:06 -0700 Subject: [PATCH 21/41] documentation --- doc/command-line.md | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/doc/command-line.md b/doc/command-line.md index 6489b3167f..051134b98f 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -818,6 +818,40 @@ sourmash signature extract tests/test-data/*.fa.sig --name NC_009665 will extract the same signature, which has an accession number of `NC_009665.1`. +#### Using picklists with `sourmash sig extract` + +As of sourmash 4.2.0, `extract` also supports picklists, a feature by +which you can select signatures based on values in a CSV file. + +For example, +``` +sourmash sig extract --picklist list.csv:md5sum:md5 <signatures> +``` +will extract only the signatures that have md5sums matching the +column `md5sum` in the CSV file `list.csv`. + +The `--picklist` argument string must be of the format +`pickfile:colname:coltype`, where `pickfile` is the path to a CSV +file, `colname` is the name of the column to select from the CSV +file (based on the headers in the first line of the CSV file), +and `coltype` is the type of match. + +The following `coltype`s are currently supported by `sourmash sig extract`: + +* `name` - exact match to signature's name +* `md5` - exact match to signature's md5sum +* `md5prefix8` - match to 8-character prefix of signature's md5sum +* `md5short` - same as `md5prefix8` +* `ident` - exact match to signature's identifier +* `identprefix` - match to signature's identifier, before '.' + +Identifiers are constructed by using the first space delimited word in +the signature name. + +One way to build a picklist is to use `sourmash sig describe --csv +out.csv <signatures>` to construct an initial CSV file that you can +then edit further.
+ ### `sourmash signature flatten` - remove abundance information from signatures Flatten the specified signature(s), removing abundances and setting From 8f65f223c726e8a83c348deda5abd51ff3bb09bf Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 16 Jun 2021 10:17:50 -0700 Subject: [PATCH 22/41] test with --md5 selector --- tests/test_cmd_signature.py | 61 +++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index f62ab58a3a..c15bd4d724 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -1351,6 +1351,67 @@ def test_sig_extract_8_picklist_md5_short_alias(runtmp): assert actual_extract_sig == test_extract_sig +def test_sig_extract_8_picklist_md5_short_alias_with_md5_selector_nomatch(runtmp): + # extract 47 from 47, using a picklist w/full md5 and also md5 selector + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + # select on any of these attributes + row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', + md5full='09a08691ce52952152f0e866a59f6261', + md5short='09a08691ce5295215', + fullIdent='NC_009665.1', + nodotIdent='NC_009665') + + # make picklist + picklist_csv = runtmp.output('pick.csv') + with open(picklist_csv, 'w', newline='') as csvfp: + w = csv.DictWriter(csvfp, fieldnames=row.keys()) + w.writeheader() + w.writerow(row) + + picklist_arg = f"{picklist_csv}:md5short:md5short" + with pytest.raises(ValueError): + runtmp.sourmash('sig', 'extract', sig47, sig63, + '--picklist', picklist_arg, + '--md5', 'XXX') # no match to md5 selector here + + err = runtmp.last_result.err + assert "no matching signatures to save!" 
in err + + +def test_sig_extract_8_picklist_md5_short_alias_with_md5_selector(runtmp): + # extract 47 from 47, using a picklist w/full md5 and also md5 selector + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + # select on any of these attributes + row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', + md5full='09a08691ce52952152f0e866a59f6261', + md5short='09a08691ce5295215', + fullIdent='NC_009665.1', + nodotIdent='NC_009665') + + # make picklist + picklist_csv = runtmp.output('pick.csv') + with open(picklist_csv, 'w', newline='') as csvfp: + w = csv.DictWriter(csvfp, fieldnames=row.keys()) + w.writeheader() + w.writerow(row) + + picklist_arg = f"{picklist_csv}:md5short:md5short" + runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg, + '--md5', '09a08691ce5295215') + + # stdout should be new signature + out = runtmp.last_result.out + + test_extract_sig = sourmash.load_one_signature(sig47) + actual_extract_sig = sourmash.load_one_signature(out) + + assert actual_extract_sig == test_extract_sig + + def test_sig_extract_8_picklist_md5_nomatch(runtmp): # use an empty picklist => no match sig47 = utils.get_test_data('47.fa.sig') From 4f8e20c88a3fde5792351dd83a6dece08245dc97 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Wed, 16 Jun 2021 11:16:49 -0700 Subject: [PATCH 23/41] cover untested code with tests --- src/sourmash/sig/__main__.py | 4 +- src/sourmash/sig/picklist.py | 10 ++-- tests/test_cmd_signature.py | 98 ++++++++++++++++++++++++++++++++++++ 3 files changed, 104 insertions(+), 8 deletions(-) diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index e342f14470..eab898b39b 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -553,9 +553,9 @@ def extract(args): notify(f"loaded {len(picklist.pickset)} distinct values into picklist.") if n_empty_val: - notify(f"WARNING: {n_empty_val} empty values in column '{picklist.column_name}' in CSV file") + notify(f"WARNING: {n_empty_val} empty values in column '{picklist.column_name}' in picklist file") if dup_vals: - notify(f"WARNING: {len(dup_vals)} values in column '{picklist.column_name}' were not distinct") + notify(f"WARNING: {len(dup_vals)} values in picklist column '{picklist.column_name}' were not distinct") picklist_filter_fn = picklist.filter else: def picklist_filter_fn(it): diff --git a/src/sourmash/sig/picklist.py b/src/sourmash/sig/picklist.py index 6c2594c100..4d07f309dc 100644 --- a/src/sourmash/sig/picklist.py +++ b/src/sourmash/sig/picklist.py @@ -85,12 +85,11 @@ def init(self, values=[]): if self.pickset is not None: raise ValueError("already initialized?") self.pickset = set(values) + return self.pickset def load(self, pickfile, column_name): "load pickset, return num empty vals, and set of duplicate vals." 
- pickset = self.pickset - if pickset is None: - pickset = set() + pickset = self.init() n_empty_val = 0 dup_vals = set() @@ -98,7 +97,7 @@ def load(self, pickfile, column_name): r = csv.DictReader(csvfile) if column_name not in r.fieldnames: - raise ValueError("column '{column_name}' not in pickfile '{pickfile}'") + raise ValueError(f"column '{column_name}' not in pickfile '{pickfile}'") for row in r: # pick out values from column @@ -113,9 +112,8 @@ def load(self, pickfile, column_name): if col in pickset: dup_vals.add(col) else: - pickset.add(col) + self.add(col) - self.pickset = pickset return n_empty_val, dup_vals def add(self, value): diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index c15bd4d724..e243234de0 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -1462,6 +1462,104 @@ def test_sig_extract_9_picklist_md5_ksize_hp_select(runtmp): assert actual_extract_sig.minhash.moltype == 'hp' +def test_sig_extract_10_picklist_md5_dups_and_empty(runtmp): + # test empty picklist values, and duplicate picklist values + sigdir = utils.get_test_data('prot/') + + # make picklist + picklist_csv = runtmp.output('pick.csv') + with open(picklist_csv, 'w', newline='') as csvfp: + w = csv.DictWriter(csvfp, fieldnames=['md5']) + w.writeheader() + w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) + w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) + w.writerow(dict(md5='')) + + picklist_arg = f"{picklist_csv}:md5:md5" + + runtmp.sourmash('sig', 'extract', sigdir, '--picklist', + picklist_arg, '-k', '19', '--hp') + + # stdout should be new signature + out = runtmp.last_result.out + actual_extract_sig = sourmash.load_one_signature(out) + + assert actual_extract_sig.minhash.ksize == 19 + assert actual_extract_sig.minhash.moltype == 'hp' + + err = runtmp.last_result.err + print(err) + + assert "WARNING: 1 empty values in column 'md5' in picklist file" in err + assert "WARNING: 1 values in picklist column 'md5' were 
not distinct" in err + + +def test_sig_extract_11_picklist_bad_coltype(runtmp): + # test with invalid picklist coltype + sigdir = utils.get_test_data('prot/') + + # make picklist + picklist_csv = runtmp.output('pick.csv') + with open(picklist_csv, 'w', newline='') as csvfp: + w = csv.DictWriter(csvfp, fieldnames=['md5']) + w.writeheader() + w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) + + picklist_arg = f"{picklist_csv}:md5:BADCOLTYPE" + + with pytest.raises(ValueError): + runtmp.sourmash('sig', 'extract', sigdir, '--picklist', + picklist_arg, '-k', '19', '--hp') + + err = runtmp.last_result.err + print(err) + assert "ValueError: invalid picklist column type 'BADCOLTYPE'" in err + + +def test_sig_extract_12_picklist_bad_argstr(runtmp): + # test with invalid argument format to --picklist + sigdir = utils.get_test_data('prot/') + + # make picklist + picklist_csv = runtmp.output('pick.csv') + with open(picklist_csv, 'w', newline='') as csvfp: + w = csv.DictWriter(csvfp, fieldnames=['md5']) + w.writeheader() + w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) + + picklist_arg = f"{picklist_csv}" + + with pytest.raises(ValueError): + runtmp.sourmash('sig', 'extract', sigdir, '--picklist', + picklist_arg, '-k', '19', '--hp') + + err = runtmp.last_result.err + print(err) + assert "invalid picklist argument" in err + + +def test_sig_extract_12_picklist_bad_colname(runtmp): + # test with invalid picklist colname + sigdir = utils.get_test_data('prot/') + + # make picklist + picklist_csv = runtmp.output('pick.csv') + with open(picklist_csv, 'w', newline='') as csvfp: + w = csv.DictWriter(csvfp, fieldnames=['md5']) + w.writeheader() + w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) + + picklist_arg = f"{picklist_csv}:BADCOLNAME:md5" + + with pytest.raises(ValueError): + runtmp.sourmash('sig', 'extract', sigdir, '--picklist', + picklist_arg, '-k', '19', '--hp') + + err = runtmp.last_result.err + print(err) + assert "ValueError: column 'BADCOLNAME' 
not in pickfile" in err + + @utils.in_tempdir def test_sig_flatten_1(c): # extract matches to several names from among several signatures & flatten From 14b87d415b8d6eff062b9bf284bb26f9de1305df Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 16 Jun 2021 11:18:32 -0700 Subject: [PATCH 24/41] trap errors and be nice to users --- src/sourmash/sig/__main__.py | 7 ++++++- tests/test_cmd_signature.py | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index eab898b39b..90ffa78951 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -545,7 +545,12 @@ def extract(args): picklist = None if args.picklist: - picklist = SignaturePicklist.from_picklist_args(args.picklist) + try: + picklist = SignaturePicklist.from_picklist_args(args.picklist) + except ValueError as exc: + error("ERROR: could not load picklist.") + error(str(exc)) + sys.exit(-1) notify(f"picking column '{picklist.column_name}' of type '{picklist.coltype}' from '{picklist.pickfile}'") diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index e243234de0..9de8e89267 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -1513,7 +1513,7 @@ def test_sig_extract_11_picklist_bad_coltype(runtmp): err = runtmp.last_result.err print(err) - assert "ValueError: invalid picklist column type 'BADCOLTYPE'" in err + assert "invalid picklist column type 'BADCOLTYPE'" in err def test_sig_extract_12_picklist_bad_argstr(runtmp): @@ -1557,7 +1557,7 @@ def test_sig_extract_12_picklist_bad_colname(runtmp): err = runtmp.last_result.err print(err) - assert "ValueError: column 'BADCOLNAME' not in pickfile" in err + assert "column 'BADCOLNAME' not in pickfile" in err @utils.in_tempdir From 04c209cdfd6a3bd5163faf4a104bf99f46b3355d Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Wed, 16 Jun 2021 11:27:21 -0700 Subject: [PATCH 25/41] remove comment --- src/sourmash/sig/picklist.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/sourmash/sig/picklist.py b/src/sourmash/sig/picklist.py index 4d07f309dc..61d5ccc656 100644 --- a/src/sourmash/sig/picklist.py +++ b/src/sourmash/sig/picklist.py @@ -23,7 +23,6 @@ class SignaturePicklist: Initialize using ``SignaturePicklist.from_picklist_args(argstr)``, which takes an argument str like so: 'pickfile:column:coltype'. - # CTB pickfile or pickset? Here, 'pickfile' is the path to a CSV file; 'column' is the name of the column to select from the CSV file; and 'coltype' is the type of matching to do on that column. From 21ce4b72623dd5aba4348625642abb19d6f98455 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 16 Jun 2021 15:41:51 -0700 Subject: [PATCH 26/41] fix tests for new SignaturePicklist --- tests/test_index.py | 2 +- tests/test_lca.py | 4 ++-- tests/test_sbt.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_index.py b/tests/test_index.py index 22551e6d88..97738f21fc 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -647,7 +647,7 @@ def test_linear_index_picklist_select(): linear.insert(ss) # construct a picklist... - picklist = SignaturePicklist(None, None, 'md5prefix8') + picklist = SignaturePicklist('md5prefix8') picklist.init(['f3a90d4e']) # select on picklist diff --git a/tests/test_lca.py b/tests/test_lca.py index 2c36f13c62..d5e1d751ac 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -46,7 +46,7 @@ def test_api_find_picklist_select(): lca_db.insert(sig63) # construct a picklist... - picklist = SignaturePicklist(None, None, 'md5prefix8') + picklist = SignaturePicklist('md5prefix8') picklist.init(['09a08691']) # run a 'find' with sig63, should find 47 and 63 both. @@ -504,7 +504,7 @@ def test_lca_index_select_picklist(): db, ksize, scaled = lca_utils.load_single_database(filename) # construct a picklist... 
- picklist = SignaturePicklist(None, None, 'md5prefix8') + picklist = SignaturePicklist('md5prefix8') picklist.init(['50a92740']) xx = db.select(picklist=picklist) diff --git a/tests/test_sbt.py b/tests/test_sbt.py index 9e3c96698b..526893c5ae 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -652,7 +652,7 @@ def test_sbt_as_index_select_picklist(): tree.insert(sig63) # construct a picklist... - picklist = SignaturePicklist(None, None, 'md5prefix8') + picklist = SignaturePicklist('md5prefix8') picklist.init(['09a08691']) # select on picklist @@ -679,7 +679,7 @@ def test_sbt_as_index_find_picklist(): tree.insert(sig63) # construct a picklist... - picklist = SignaturePicklist(None, None, 'md5prefix8') + picklist = SignaturePicklist('md5prefix8') picklist.init(['09a08691']) # run a 'find' with sig63, should find 47 and 63 both. From b3c6bb9ee9a8ffb5095da89ba213a58c54700691 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 17 Jun 2021 06:09:40 -0700 Subject: [PATCH 27/41] move picklist.py from sourmash.sig into sourmash --- src/sourmash/lca/lca_db.py | 3 +-- src/sourmash/{sig => }/picklist.py | 0 src/sourmash/sbt.py | 4 +--- src/sourmash/sig/__main__.py | 2 +- tests/test_index.py | 2 +- tests/test_lca.py | 3 +-- tests/test_sbt.py | 3 +-- 7 files changed, 6 insertions(+), 11 deletions(-) rename src/sourmash/{sig => }/picklist.py (100%) diff --git a/src/sourmash/lca/lca_db.py b/src/sourmash/lca/lca_db.py index ce8498b31d..9446414108 100644 --- a/src/sourmash/lca/lca_db.py +++ b/src/sourmash/lca/lca_db.py @@ -9,6 +9,7 @@ from sourmash.minhash import _get_max_hash_for_scaled from sourmash.logging import notify, error, debug from sourmash.index import Index, IndexSearchResult +from sourmash.picklist import passes_all_picklists def cached_property(fun): @@ -375,7 +376,6 @@ def get_lineage_assignments(self, hashval): @cached_property def _signatures(self): "Create a _signatures member dictionary that contains {idx: sigobj}." 
- from sourmash.sig.picklist import passes_all_picklists from sourmash import MinHash, SourmashSignature is_protein = False @@ -439,7 +439,6 @@ def find(self, search_fn, query, **kwargs): can still be used for containment search, but not for similarity search. See SBT.select(...) for details. """ - from sourmash.sig.picklist import passes_all_picklists search_fn.check_is_compatible(query) # make sure we're looking at the same scaled value as database diff --git a/src/sourmash/sig/picklist.py b/src/sourmash/picklist.py similarity index 100% rename from src/sourmash/sig/picklist.py rename to src/sourmash/picklist.py diff --git a/src/sourmash/sbt.py b/src/sourmash/sbt.py index a684bfe207..498365dbd6 100644 --- a/src/sourmash/sbt.py +++ b/src/sourmash/sbt.py @@ -20,6 +20,7 @@ from .sbt_storage import FSStorage, IPFSStorage, RedisStorage, ZipStorage from .logging import error, notify, debug from .index import Index, IndexSearchResult +from .picklist import passes_all_picklists from .nodegraph import Nodegraph, extract_nodegraph_info, calc_expected_collisions @@ -155,8 +156,6 @@ def location(self): return self._location def signatures(self): - from .sig.picklist import passes_all_picklists - for k in self.leaves(): ss = k.data if passes_all_picklists(ss, self.picklists): @@ -369,7 +368,6 @@ def find(self, search_fn, query, **kwargs): search. See SBT.select(...) for details. 
""" from .sbtmh import SigLeaf - from .sig.picklist import passes_all_picklists search_fn.check_is_compatible(query) diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index 8bf37b436b..fa31ba66b7 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -13,7 +13,7 @@ from sourmash.logging import set_quiet, error, notify, print_results, debug from sourmash import sourmash_args from sourmash.minhash import _get_max_hash_for_scaled -from .picklist import SignaturePicklist +from sourmash.picklist import SignaturePicklist usage=''' sourmash signature [] - manipulate/work with signature files. diff --git a/tests/test_index.py b/tests/test_index.py index 97738f21fc..2da959410a 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -17,6 +17,7 @@ from sourmash.sbtmh import SigLeaf from sourmash import sourmash_args from sourmash.search import JaccardSearch, SearchType +from sourmash.picklist import SignaturePicklist import sourmash_tst_utils as utils @@ -636,7 +637,6 @@ def test_linear_index_moltype_select(): def test_linear_index_picklist_select(): # test select with a picklist - from sourmash.sig.picklist import SignaturePicklist # this loads three ksizes, 21/31/51 sig2 = utils.get_test_data('2.fa.sig') diff --git a/tests/test_lca.py b/tests/test_lca.py index d5e1d751ac..cdb1983a2f 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -14,6 +14,7 @@ from sourmash.search import make_jaccard_search_query from sourmash.lca import lca_utils from sourmash.lca.lca_utils import LineagePair +from sourmash.picklist import SignaturePicklist def test_api_create_search(): @@ -34,7 +35,6 @@ def test_api_create_search(): def test_api_find_picklist_select(): # does 'find' respect picklists? 
- from sourmash.sig.picklist import SignaturePicklist sig47 = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), ksize=31) @@ -498,7 +498,6 @@ def test_lca_index_select(): def test_lca_index_select_picklist(): # test 'select' method from Index base class with a picklist. - from sourmash.sig.picklist import SignaturePicklist filename = utils.get_test_data('lca/47+63.lca.json') db, ksize, scaled = lca_utils.load_single_database(filename) diff --git a/tests/test_sbt.py b/tests/test_sbt.py index 526893c5ae..f1980d1015 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -14,6 +14,7 @@ from sourmash.sbt_storage import (FSStorage, RedisStorage, IPFSStorage, ZipStorage) from sourmash.search import make_jaccard_search_query +from sourmash.picklist import SignaturePicklist import sourmash_tst_utils as utils @@ -640,7 +641,6 @@ def test_sbt_as_index_select(): def test_sbt_as_index_select_picklist(): # test 'select' method from Index base class with a picklist - from sourmash.sig.picklist import SignaturePicklist factory = GraphFactory(31, 1e5, 4) tree = SBT(factory, d=2) @@ -667,7 +667,6 @@ def test_sbt_as_index_select_picklist(): def test_sbt_as_index_find_picklist(): # test 'select' method from Index base class with a picklist - from sourmash.sig.picklist import SignaturePicklist factory = GraphFactory(31, 1e5, 4) tree = SBT(factory, d=2) From fddf1413263db2d6c9e57af64299760722f23c88 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Thu, 17 Jun 2021 06:14:09 -0700 Subject: [PATCH 28/41] move picklist reporting into sourmash_args --- src/sourmash/sig/__main__.py | 29 ++-------------------------- src/sourmash/sourmash_args.py | 36 +++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 27 deletions(-) diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index fa31ba66b7..22bb54c0e6 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -542,25 +542,7 @@ def extract(args): """ set_quiet(args.quiet) moltype = sourmash_args.calculate_moltype(args) - - picklist = None - if args.picklist: - try: - picklist = SignaturePicklist.from_picklist_args(args.picklist) - except ValueError as exc: - error("ERROR: could not load picklist.") - error(str(exc)) - sys.exit(-1) - - notify(f"picking column '{picklist.column_name}' of type '{picklist.coltype}' from '{picklist.pickfile}'") - - n_empty_val, dup_vals = picklist.load(picklist.pickfile, picklist.column_name) - - notify(f"loaded {len(picklist.pickset)} distinct values into picklist.") - if n_empty_val: - notify(f"WARNING: {n_empty_val} empty values in column '{picklist.column_name}' in picklist file") - if dup_vals: - notify(f"WARNING: {len(dup_vals)} values in picklist column '{picklist.column_name}' were not distinct") + picklist = sourmash_args.load_picklist(args) # further filtering on md5 or name? 
if args.md5 is not None or args.name is not None: @@ -606,14 +588,7 @@ def filter_fn(it): notify("extracted {} signatures from {} file(s)", len(save_sigs), len(args.signatures)) if picklist: - notify(f"for given picklist, found {len(picklist.found)} matches to {len(picklist.pickset)} distinct values") - n_missing = len(picklist.pickset - picklist.found) - if n_missing: - notify(f"WARNING: {n_missing} missing picklist values.") - if args.picklist_require_all: - error("ERROR: failing because --picklist-require-all was set") - sys.exit(-1) - + sourmash_args.report_picklist(args, picklist) def filter(args): """ diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 40c7d35444..e044c27ce3 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -19,6 +19,8 @@ from .index import (LinearIndex, ZipFileLinearIndex, MultiIndex) from . import signature as sigmod +from .picklist import SignaturePicklist + DEFAULT_LOAD_K = 31 @@ -57,6 +59,40 @@ def calculate_moltype(args, default=None): return moltype +def load_picklist(args): + "Load a SignaturePicklist from --picklist arguments." 
+ picklist = None + if args.picklist: + try: + picklist = SignaturePicklist.from_picklist_args(args.picklist) + except ValueError as exc: + error("ERROR: could not load picklist.") + error(str(exc)) + sys.exit(-1) + + notify(f"picking column '{picklist.column_name}' of type '{picklist.coltype}' from '{picklist.pickfile}'") + + n_empty_val, dup_vals = picklist.load(picklist.pickfile, picklist.column_name) + + notify(f"loaded {len(picklist.pickset)} distinct values into picklist.") + if n_empty_val: + notify(f"WARNING: {n_empty_val} empty values in column '{picklist.column_name}' in picklist file") + if dup_vals: + notify(f"WARNING: {len(dup_vals)} values in picklist column '{picklist.column_name}' were not distinct") + + return picklist + + +def report_picklist(args, picklist): + notify(f"for given picklist, found {len(picklist.found)} matches to {len(picklist.pickset)} distinct values") + n_missing = len(picklist.pickset - picklist.found) + if n_missing: + notify(f"WARNING: {n_missing} missing picklist values.") + if args.picklist_require_all: + error("ERROR: failing because --picklist-require-all was set") + sys.exit(-1) + + def load_query_signature(filename, ksize, select_moltype, select_md5=None): """Load a single signature to use as a query. From 984a557899edec1af9b5a2a514d4912ea6cb3f83 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 17 Jun 2021 06:14:23 -0700 Subject: [PATCH 29/41] fix space --- src/sourmash/sig/__main__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index 22bb54c0e6..e15cd9840d 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -590,6 +590,7 @@ def filter_fn(it): if picklist: sourmash_args.report_picklist(args, picklist) + def filter(args): """ filter hashes by abundance in all of the signatures From ced72d2d22b34d4b1d640c403e0e4d3794a2a0eb Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Thu, 17 Jun 2021 06:25:01 -0700 Subject: [PATCH 30/41] add picklist args throughout, eek. --- src/sourmash/cli/compare.py | 4 +++- src/sourmash/cli/gather.py | 9 ++++++--- src/sourmash/cli/index.py | 6 ++++-- src/sourmash/cli/lca/index.py | 9 ++++++--- src/sourmash/cli/prefetch.py | 4 +++- src/sourmash/cli/search.py | 4 +++- src/sourmash/cli/sig/extract.py | 12 +++--------- src/sourmash/cli/utils.py | 10 ++++++++++ src/sourmash/commands.py | 7 ++++++- src/sourmash/sourmash_args.py | 6 +++++- 10 files changed, 49 insertions(+), 22 deletions(-) diff --git a/src/sourmash/cli/compare.py b/src/sourmash/cli/compare.py index dcec015bd5..f7387f68b2 100644 --- a/src/sourmash/cli/compare.py +++ b/src/sourmash/cli/compare.py @@ -1,6 +1,7 @@ """compare sequence signatures made by compute""" -from sourmash.cli.utils import add_ksize_arg, add_moltype_args +from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, + add_picklist_args) def subparser(subparsers): @@ -47,6 +48,7 @@ def subparser(subparsers): subparser.add_argument( '-p', '--processes', metavar='N', type=int, default=None, help='Number of processes to use to calculate similarity') + add_picklist_args(subparser) def main(args): diff --git a/src/sourmash/cli/gather.py b/src/sourmash/cli/gather.py index 3d2e6d1a24..6e0addd427 100644 --- a/src/sourmash/cli/gather.py +++ b/src/sourmash/cli/gather.py @@ -1,6 +1,7 @@ """search a metagenome signature against dbs""" -from sourmash.cli.utils import add_ksize_arg, add_moltype_args +from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, + add_picklist_args) def subparser(subparsers): @@ -60,8 +61,6 @@ def subparser(subparsers): '--cache-size', default=0, type=int, metavar='N', help='number of internal SBT nodes to cache in memory (default: 0, cache all nodes)' ) - add_ksize_arg(subparser, 31) - add_moltype_args(subparser) # advanced parameters subparser.add_argument( @@ -80,6 +79,10 @@ def subparser(subparsers): help="use prefetch before 
gather; see documentation", ) + add_ksize_arg(subparser, 31) + add_moltype_args(subparser) + add_picklist_args(subparser) + def main(args): import sourmash diff --git a/src/sourmash/cli/index.py b/src/sourmash/cli/index.py index 1be7f06690..334b394bfe 100644 --- a/src/sourmash/cli/index.py +++ b/src/sourmash/cli/index.py @@ -25,7 +25,8 @@ --- """ -from sourmash.cli.utils import add_moltype_args, add_ksize_arg +from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, + add_picklist_args) def subparser(subparsers): @@ -44,7 +45,6 @@ def subparser(subparsers): '-q', '--quiet', action='store_true', help='suppress non-error output' ) - add_ksize_arg(subparser, 31) subparser.add_argument( '-d', '--n_children', metavar='D', type=int, default=2, help='number of children for internal nodes; default=2' @@ -70,7 +70,9 @@ def subparser(subparsers): '--scaled', metavar='FLOAT', type=float, default=0, help='downsample signatures to the specified scaled factor' ) + add_ksize_arg(subparser, 31) add_moltype_args(subparser) + add_picklist_args(subparser) def main(args): diff --git a/src/sourmash/cli/lca/index.py b/src/sourmash/cli/lca/index.py index 581ff63dcd..09bc7f75fb 100644 --- a/src/sourmash/cli/lca/index.py +++ b/src/sourmash/cli/lca/index.py @@ -1,6 +1,7 @@ """create LCA database""" -from sourmash.cli.utils import add_ksize_arg, add_moltype_args +from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, + add_picklist_args) def subparser(subparsers): @@ -18,8 +19,6 @@ def subparser(subparsers): subparser.add_argument( '--scaled', metavar='S', default=10000, type=float ) - add_ksize_arg(subparser, 31) - add_moltype_args(subparser) subparser.add_argument( '-q', '--quiet', action='store_true', help='suppress non-error output' @@ -53,6 +52,10 @@ def subparser(subparsers): help='ignore signatures with no taxonomy entry' ) + add_ksize_arg(subparser, 31) + add_moltype_args(subparser) + add_picklist_args(subparser) + def main(args): import sourmash diff --git 
a/src/sourmash/cli/prefetch.py b/src/sourmash/cli/prefetch.py index 27a254c68e..e04c537193 100644 --- a/src/sourmash/cli/prefetch.py +++ b/src/sourmash/cli/prefetch.py @@ -1,6 +1,7 @@ """search a signature against dbs, find all overlaps""" -from sourmash.cli.utils import add_ksize_arg, add_moltype_args +from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, + add_picklist_args) def subparser(subparsers): @@ -63,6 +64,7 @@ def subparser(subparsers): ) add_ksize_arg(subparser, 31) add_moltype_args(subparser) + add_picklist_args(subparser) def main(args): diff --git a/src/sourmash/cli/search.py b/src/sourmash/cli/search.py index 9ff4ab9985..c4e1d41323 100644 --- a/src/sourmash/cli/search.py +++ b/src/sourmash/cli/search.py @@ -1,6 +1,7 @@ """search a signature against other signatures""" -from sourmash.cli.utils import add_ksize_arg, add_moltype_args +from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, + add_picklist_args) def subparser(subparsers): @@ -59,6 +60,7 @@ def subparser(subparsers): ) add_ksize_arg(subparser, 31) add_moltype_args(subparser) + add_picklist_args(subparser) def main(args): diff --git a/src/sourmash/cli/sig/extract.py b/src/sourmash/cli/sig/extract.py index 81bc788a93..9ea71eb229 100644 --- a/src/sourmash/cli/sig/extract.py +++ b/src/sourmash/cli/sig/extract.py @@ -2,7 +2,8 @@ import sys -from sourmash.cli.utils import add_moltype_args, add_ksize_arg +from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, + add_picklist_args) def subparser(subparsers): @@ -25,16 +26,9 @@ def subparser(subparsers): '--name', default=None, help='select signatures whose name contains this substring' ) - subparser.add_argument( - '--picklist', default=None, - help="select signatures based on a picklist, i.e. 
'file.csv:colname:coltype'" - ) - subparser.add_argument( - '--picklist-require-all', default=False, action='store_true', - help="require that all picklist values be found or else fail" - ) add_ksize_arg(subparser, 31) add_moltype_args(subparser) + add_picklist_args(subparser) def main(args): diff --git a/src/sourmash/cli/utils.py b/src/sourmash/cli/utils.py index 4bb918643a..1409ed6225 100644 --- a/src/sourmash/cli/utils.py +++ b/src/sourmash/cli/utils.py @@ -50,6 +50,16 @@ def add_ksize_arg(parser, default=31): help='k-mer size; default={d}'.format(d=default) ) +def add_picklist_args(parser): + parser.add_argument( + '--picklist', default=None, + help="select signatures based on a picklist, i.e. 'file.csv:colname:coltype'" + ) + parser.add_argument( + '--picklist-require-all', default=False, action='store_true', + help="require that all picklist values be found or else fail" + ) + def opfilter(path): return not path.startswith('__') and path not in ['utils'] diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 0d8a9f2266..2a7d26b050 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -429,6 +429,7 @@ def search(args): set_quiet(args.quiet) moltype = sourmash_args.calculate_moltype(args) + picklist = sourmash_args.load_picklist(args) # set up the query. query = sourmash_args.load_query_signature(args.query, @@ -458,7 +459,8 @@ def search(args): sys.exit(-1) databases = sourmash_args.load_dbs_and_sigs(args.databases, query, - not is_containment) + not is_containment, + picklist=picklist) if not len(databases): error('Nothing found to search!') @@ -531,6 +533,9 @@ def search(args): for sr in results: save_sig.add(sr.match) + if picklist: + sourmash_args.report_picklist(args, picklist) + def categorize(args): "Use a database to find the best match to many signatures." 
diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index e044c27ce3..60850469c3 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -173,7 +173,8 @@ def traverse_find_sigs(filenames, yield_all_files=False): yield fullname -def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None): +def load_dbs_and_sigs(filenames, query, is_similarity_query, *, + cache_size=None, picklist=None): """ Load one or more SBTs, LCAs, and/or collections of signatures. @@ -215,6 +216,9 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None) notify(f"no compatible signatures found in '{filename}'") sys.exit(-1) + if picklist: + db = db.select(picklist=picklist) + databases.append(db) # calc num loaded info. From 7a30b2076284ea7a5cb709bc94be39f0264131e6 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 17 Jun 2021 07:12:17 -0700 Subject: [PATCH 31/41] add picklists and tests for search, gather, index --- src/sourmash/commands.py | 20 +++++- src/sourmash/index.py | 1 + tests/test-data/gather/all-picklist.csv | 37 ++++++++++ tests/test-data/gather/campy-picklist.csv | 4 ++ .../test-data/gather/salmonella-picklist.csv | 25 +++++++ .../test-data/gather/thermotoga-picklist.csv | 10 +++ tests/test_sourmash.py | 71 +++++++++++++++++++ 7 files changed, 166 insertions(+), 2 deletions(-) create mode 100644 tests/test-data/gather/all-picklist.csv create mode 100644 tests/test-data/gather/campy-picklist.csv create mode 100644 tests/test-data/gather/salmonella-picklist.csv create mode 100644 tests/test-data/gather/thermotoga-picklist.csv diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 2a7d26b050..ae1fb8e57e 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -336,6 +336,7 @@ def index(args): """ set_quiet(args.quiet) moltype = sourmash_args.calculate_moltype(args) + picklist = sourmash_args.load_picklist(args) if args.append: tree = 
load_sbt_index(args.sbt_name) @@ -372,6 +373,7 @@ def index(args): ksize=args.ksize, select_moltype=moltype, yield_all_files=args.force, + picklist=picklist, progress=progress) # load all matching signatures in this file @@ -417,6 +419,9 @@ def index(args): error('no signatures found to load into tree!? failing.') sys.exit(-1) + if picklist: + sourmash_args.report_picklist(args, picklist) + notify('loaded {} sigs; saving SBT under "{}"', n, args.sbt_name) tree.save(args.sbt_name, sparseness=args.sparseness) if tree.storage: @@ -620,6 +625,7 @@ def gather(args): set_quiet(args.quiet, args.debug) moltype = sourmash_args.calculate_moltype(args) + picklist = sourmash_args.load_picklist(args) # load the query signature & figure out all the things query = sourmash_args.load_query_signature(args.query, @@ -651,7 +657,8 @@ def gather(args): if args.cache_size == 0: cache_size = None databases = sourmash_args.load_dbs_and_sigs(args.databases, query, False, - cache_size=cache_size) + cache_size=cache_size, + picklist=picklist) if not len(databases): error('Nothing found to search!') @@ -669,7 +676,12 @@ def gather(args): counters = [] for db in databases: - counter = db.counter_gather(prefetch_query, args.threshold_bp) + try: + counter = db.counter_gather(prefetch_query, args.threshold_bp) + except ValueError: + if picklist: + # catch "no signatures to search" ValueError... + continue save_prefetch.add_many(counter.siglist) counters.append(counter) @@ -774,6 +786,10 @@ def gather(args): with FileOutput(args.output_unassigned, 'wt') as fp: sig.save_signatures([ next_query ], fp) + + if picklist: + sourmash_args.report_picklist(args, picklist) + # DONE w/gather function. diff --git a/src/sourmash/index.py b/src/sourmash/index.py index b344a3cabc..477f141849 100644 --- a/src/sourmash/index.py +++ b/src/sourmash/index.py @@ -393,6 +393,7 @@ def signatures(self): "Return the selected signatures." 
db = self.db.select(**self.selection_dict) for ss in db.signatures(): + print('MATCH!', ss) yield ss def signatures_with_location(self): diff --git a/tests/test-data/gather/all-picklist.csv b/tests/test-data/gather/all-picklist.csv new file mode 100644 index 0000000000..b9ebbaa522 --- /dev/null +++ b/tests/test-data/gather/all-picklist.csv @@ -0,0 +1,37 @@ +signature_file,md5,ksize,moltype,num,scaled,n_hashes,seed,with_abundance,name,filename,license +GCF_000006945.2_ASM694v2_genomic.fna.gz.sig,323c1a1712b0949268dd6fb93be63ae2,11,DNA,0,10000,150,42,0,"NC_003197.2 Salmonella enterica subsp. enterica serovar Typhimurium str. LT2, complete genome",../fasta/GCF_000006945.2_ASM694v2_genomic.fna.gz,CC0 +GCF_000006945.2_ASM694v2_genomic.fna.gz.sig,263c2de20b597d6e33b81ec91d8672b5,21,DNA,0,10000,485,42,0,"NC_003197.2 Salmonella enterica subsp. enterica serovar Typhimurium str. LT2, complete genome",../fasta/GCF_000006945.2_ASM694v2_genomic.fna.gz,CC0 +GCF_000006945.2_ASM694v2_genomic.fna.gz.sig,dc12a6d8fd63122aa68f78facf9bed94,31,DNA,0,10000,490,42,0,"NC_003197.2 Salmonella enterica subsp. enterica serovar Typhimurium str. LT2, complete genome",../fasta/GCF_000006945.2_ASM694v2_genomic.fna.gz,CC0 +GCF_000007545.1_ASM754v1_genomic.fna.gz.sig,df24140b1c6cad16b30abeaf03019eb5,11,DNA,0,10000,158,42,0,"NC_004631.1 Salmonella enterica subsp. enterica serovar Typhi Ty2, complete genome",../fasta/GCF_000007545.1_ASM754v1_genomic.fna.gz,CC0 +GCF_000007545.1_ASM754v1_genomic.fna.gz.sig,fd958e3b5649bc03890517ff239970ea,21,DNA,0,10000,445,42,0,"NC_004631.1 Salmonella enterica subsp. enterica serovar Typhi Ty2, complete genome",../fasta/GCF_000007545.1_ASM754v1_genomic.fna.gz,CC0 +GCF_000007545.1_ASM754v1_genomic.fna.gz.sig,8c22dff88a2239607762da00f7fd1725,31,DNA,0,10000,472,42,0,"NC_004631.1 Salmonella enterica subsp. 
enterica serovar Typhi Ty2, complete genome",../fasta/GCF_000007545.1_ASM754v1_genomic.fna.gz,CC0 +GCF_000008105.1_ASM810v1_genomic.fna.gz.sig,9db6efc92a041e11713ccfa8597edae5,11,DNA,0,10000,150,42,0,"NC_006905.1 Salmonella enterica subsp. enterica serovar Choleraesuis str. SC-B67, complete genome",../fasta/GCF_000008105.1_ASM810v1_genomic.fna.gz,CC0 +GCF_000008105.1_ASM810v1_genomic.fna.gz.sig,8996699a05d3e5a05fa3fe94bfa41431,21,DNA,0,10000,472,42,0,"NC_006905.1 Salmonella enterica subsp. enterica serovar Choleraesuis str. SC-B67, complete genome",../fasta/GCF_000008105.1_ASM810v1_genomic.fna.gz,CC0 +GCF_000008105.1_ASM810v1_genomic.fna.gz.sig,85c3aeec6457c0b1d210472ddeb67714,31,DNA,0,10000,468,42,0,"NC_006905.1 Salmonella enterica subsp. enterica serovar Choleraesuis str. SC-B67, complete genome",../fasta/GCF_000008105.1_ASM810v1_genomic.fna.gz,CC0 +GCF_000008545.1_ASM854v1_genomic.fna.gz.sig,74b928d3db1f7f033c0dcca6c6e52aea,11,DNA,0,10000,84,42,0,"NC_000853.1 Thermotoga maritima MSB8 chromosome, complete genome",../fasta/GCF_000008545.1_ASM854v1_genomic.fna.gz,CC0 +GCF_000008545.1_ASM854v1_genomic.fna.gz.sig,ba9947e078cab29e20bc7d31bc1b9f0d,21,DNA,0,10000,192,42,0,"NC_000853.1 Thermotoga maritima MSB8 chromosome, complete genome",../fasta/GCF_000008545.1_ASM854v1_genomic.fna.gz,CC0 +GCF_000008545.1_ASM854v1_genomic.fna.gz.sig,1bfe96d76ec9cdb60779a1a9223c424e,31,DNA,0,10000,187,42,0,"NC_000853.1 Thermotoga maritima MSB8 chromosome, complete genome",../fasta/GCF_000008545.1_ASM854v1_genomic.fna.gz,CC0 +GCF_000009085.1_ASM908v1_genomic.fna.gz.sig,752280e9969ce750e2c80477c1b7b0e7,11,DNA,0,10000,61,42,0,"NC_002163.1 Campylobacter jejuni subsp. jejuni NCTC 11168 = ATCC 700819 chromosome, complete genome",../fasta/GCF_000009085.1_ASM908v1_genomic.fna.gz,CC0 +GCF_000009085.1_ASM908v1_genomic.fna.gz.sig,eba0eb3ce984cc53c36f134a752c52c5,21,DNA,0,10000,157,42,0,"NC_002163.1 Campylobacter jejuni subsp. 
jejuni NCTC 11168 = ATCC 700819 chromosome, complete genome",../fasta/GCF_000009085.1_ASM908v1_genomic.fna.gz,CC0 +GCF_000009085.1_ASM908v1_genomic.fna.gz.sig,953156e9f4da8cf22e7e0b4b88261fae,31,DNA,0,10000,167,42,0,"NC_002163.1 Campylobacter jejuni subsp. jejuni NCTC 11168 = ATCC 700819 chromosome, complete genome",../fasta/GCF_000009085.1_ASM908v1_genomic.fna.gz,CC0 +GCF_000009505.1_ASM950v1_genomic.fna.gz.sig,0f35aeadda1532ed450bd6de1e73545d,11,DNA,0,10000,148,42,0,NC_011294.1 Salmonella enterica subsp. enterica serovar Enteritidis str. P125109 complete genome,../fasta/GCF_000009505.1_ASM950v1_genomic.fna.gz,CC0 +GCF_000009505.1_ASM950v1_genomic.fna.gz.sig,405ae3300f28ca5fe5c223cbf7e28734,21,DNA,0,10000,471,42,0,NC_011294.1 Salmonella enterica subsp. enterica serovar Enteritidis str. P125109 complete genome,../fasta/GCF_000009505.1_ASM950v1_genomic.fna.gz,CC0 +GCF_000009505.1_ASM950v1_genomic.fna.gz.sig,0842f7edb426fc4fa2701c107e678279,31,DNA,0,10000,461,42,0,NC_011294.1 Salmonella enterica subsp. enterica serovar Enteritidis str. P125109 complete genome,../fasta/GCF_000009505.1_ASM950v1_genomic.fna.gz,CC0 +GCF_000009525.1_ASM952v1_genomic.fna.gz.sig,d883538a0c983a863fa4b6e5fcd19612,11,DNA,0,10000,148,42,0,NC_011274.1 Salmonella enterica subsp. enterica serovar Gallinarum str. 287/91 complete genome,../fasta/GCF_000009525.1_ASM952v1_genomic.fna.gz,CC0 +GCF_000009525.1_ASM952v1_genomic.fna.gz.sig,9133bd71b86628b38c665ab7e5eb8712,21,DNA,0,10000,457,42,0,NC_011274.1 Salmonella enterica subsp. enterica serovar Gallinarum str. 287/91 complete genome,../fasta/GCF_000009525.1_ASM952v1_genomic.fna.gz,CC0 +GCF_000009525.1_ASM952v1_genomic.fna.gz.sig,afadabf39aec247929e84a29fd797117,31,DNA,0,10000,461,42,0,NC_011274.1 Salmonella enterica subsp. enterica serovar Gallinarum str. 
287/91 complete genome,../fasta/GCF_000009525.1_ASM952v1_genomic.fna.gz,CC0 +GCF_000011885.1_ASM1188v1_genomic.fna.gz.sig,feef9e4d39fecd3d9292b76c0cc72b81,11,DNA,0,10000,155,42,0,"NC_006511.1 Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150, complete genome",../fasta/GCF_000011885.1_ASM1188v1_genomic.fna.gz,CC0 +GCF_000011885.1_ASM1188v1_genomic.fna.gz.sig,cc80cb247b195ca3dfa0756257d882b6,21,DNA,0,10000,427,42,0,"NC_006511.1 Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150, complete genome",../fasta/GCF_000011885.1_ASM1188v1_genomic.fna.gz,CC0 +GCF_000011885.1_ASM1188v1_genomic.fna.gz.sig,bb365606acbf08d183399f139af80c32,31,DNA,0,10000,459,42,0,"NC_006511.1 Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150, complete genome",../fasta/GCF_000011885.1_ASM1188v1_genomic.fna.gz,CC0 +GCF_000016045.1_ASM1604v1_genomic.fna.gz.sig,4cec832176c4831239faed42c0616ef4,11,DNA,0,10000,155,42,0,"NC_011080.1 Salmonella enterica subsp. enterica serovar Newport str. SL254, complete genome",../fasta/GCF_000016045.1_ASM1604v1_genomic.fna.gz,CC0 +GCF_000016045.1_ASM1604v1_genomic.fna.gz.sig,43a9d80a4cd995779c7538a32088dd0e,21,DNA,0,10000,469,42,0,"NC_011080.1 Salmonella enterica subsp. enterica serovar Newport str. SL254, complete genome",../fasta/GCF_000016045.1_ASM1604v1_genomic.fna.gz,CC0 +GCF_000016045.1_ASM1604v1_genomic.fna.gz.sig,d0cfbe22579f98fd5de2d41203589964,31,DNA,0,10000,480,42,0,"NC_011080.1 Salmonella enterica subsp. enterica serovar Newport str. 
SL254, complete genome",../fasta/GCF_000016045.1_ASM1604v1_genomic.fna.gz,CC0 +GCF_000016785.1_ASM1678v1_genomic.fna.gz.sig,328f7b0643bdb6c76135292b5afc8fa7,11,DNA,0,10000,82,42,0,"NC_009486.1 Thermotoga petrophila RKU-1, complete genome",../fasta/GCF_000016785.1_ASM1678v1_genomic.fna.gz,CC0 +GCF_000016785.1_ASM1678v1_genomic.fna.gz.sig,a77789e831fcd2436c3b9e4e22fb173e,21,DNA,0,10000,190,42,0,"NC_009486.1 Thermotoga petrophila RKU-1, complete genome",../fasta/GCF_000016785.1_ASM1678v1_genomic.fna.gz,CC0 +GCF_000016785.1_ASM1678v1_genomic.fna.gz.sig,50d8efd580ff2000cb38d1f8cc9cf9b4,31,DNA,0,10000,185,42,0,"NC_009486.1 Thermotoga petrophila RKU-1, complete genome",../fasta/GCF_000016785.1_ASM1678v1_genomic.fna.gz,CC0 +GCF_000018945.1_ASM1894v1_genomic.fna.gz.sig,989f88420b193ef39c4dbe3b268e0049,11,DNA,0,10000,90,42,0,"NC_011978.1 Thermotoga neapolitana DSM 4359, complete genome",../fasta/GCF_000018945.1_ASM1894v1_genomic.fna.gz,CC0 +GCF_000018945.1_ASM1894v1_genomic.fna.gz.sig,bebcd0dcc0ed3b120ad16c4e15805370,21,DNA,0,10000,188,42,0,"NC_011978.1 Thermotoga neapolitana DSM 4359, complete genome",../fasta/GCF_000018945.1_ASM1894v1_genomic.fna.gz,CC0 +GCF_000018945.1_ASM1894v1_genomic.fna.gz.sig,4289d4241be8573145282352215ca3c4,31,DNA,0,10000,198,42,0,"NC_011978.1 Thermotoga neapolitana DSM 4359, complete genome",../fasta/GCF_000018945.1_ASM1894v1_genomic.fna.gz,CC0 +GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig,40df36a7eb411022be4b1d6a7af05496,11,DNA,0,10000,161,42,0,"NC_003198.1 Salmonella enterica subsp. enterica serovar Typhi str. CT18, complete genome",../fasta/GCF_000195995.1_ASM19599v1_genomic.fna.gz,CC0 +GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig,ffa92983f7e67454c407499cbfbabf88,21,DNA,0,10000,487,42,0,"NC_003198.1 Salmonella enterica subsp. enterica serovar Typhi str. 
CT18, complete genome",../fasta/GCF_000195995.1_ASM19599v1_genomic.fna.gz,CC0 +GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig,cb26db5716a213c9a6614021e7176c1d,31,DNA,0,10000,512,42,0,"NC_003198.1 Salmonella enterica subsp. enterica serovar Typhi str. CT18, complete genome",../fasta/GCF_000195995.1_ASM19599v1_genomic.fna.gz,CC0 diff --git a/tests/test-data/gather/campy-picklist.csv b/tests/test-data/gather/campy-picklist.csv new file mode 100644 index 0000000000..5490c2de61 --- /dev/null +++ b/tests/test-data/gather/campy-picklist.csv @@ -0,0 +1,4 @@ +signature_file,md5,ksize,moltype,num,scaled,n_hashes,seed,with_abundance,name,filename,license +GCF_000009085.1_ASM908v1_genomic.fna.gz.sig,752280e9969ce750e2c80477c1b7b0e7,11,DNA,0,10000,61,42,0,"NC_002163.1 Campylobacter jejuni subsp. jejuni NCTC 11168 = ATCC 700819 chromosome, complete genome",../fasta/GCF_000009085.1_ASM908v1_genomic.fna.gz,CC0 +GCF_000009085.1_ASM908v1_genomic.fna.gz.sig,eba0eb3ce984cc53c36f134a752c52c5,21,DNA,0,10000,157,42,0,"NC_002163.1 Campylobacter jejuni subsp. jejuni NCTC 11168 = ATCC 700819 chromosome, complete genome",../fasta/GCF_000009085.1_ASM908v1_genomic.fna.gz,CC0 +GCF_000009085.1_ASM908v1_genomic.fna.gz.sig,953156e9f4da8cf22e7e0b4b88261fae,31,DNA,0,10000,167,42,0,"NC_002163.1 Campylobacter jejuni subsp. jejuni NCTC 11168 = ATCC 700819 chromosome, complete genome",../fasta/GCF_000009085.1_ASM908v1_genomic.fna.gz,CC0 diff --git a/tests/test-data/gather/salmonella-picklist.csv b/tests/test-data/gather/salmonella-picklist.csv new file mode 100644 index 0000000000..ae048b99d4 --- /dev/null +++ b/tests/test-data/gather/salmonella-picklist.csv @@ -0,0 +1,25 @@ +signature_file,md5,ksize,moltype,num,scaled,n_hashes,seed,with_abundance,name,filename,license +GCF_000006945.2_ASM694v2_genomic.fna.gz.sig,323c1a1712b0949268dd6fb93be63ae2,11,DNA,0,10000,150,42,0,"NC_003197.2 Salmonella enterica subsp. enterica serovar Typhimurium str. 
LT2, complete genome",../fasta/GCF_000006945.2_ASM694v2_genomic.fna.gz,CC0 +GCF_000006945.2_ASM694v2_genomic.fna.gz.sig,263c2de20b597d6e33b81ec91d8672b5,21,DNA,0,10000,485,42,0,"NC_003197.2 Salmonella enterica subsp. enterica serovar Typhimurium str. LT2, complete genome",../fasta/GCF_000006945.2_ASM694v2_genomic.fna.gz,CC0 +GCF_000006945.2_ASM694v2_genomic.fna.gz.sig,dc12a6d8fd63122aa68f78facf9bed94,31,DNA,0,10000,490,42,0,"NC_003197.2 Salmonella enterica subsp. enterica serovar Typhimurium str. LT2, complete genome",../fasta/GCF_000006945.2_ASM694v2_genomic.fna.gz,CC0 +GCF_000007545.1_ASM754v1_genomic.fna.gz.sig,df24140b1c6cad16b30abeaf03019eb5,11,DNA,0,10000,158,42,0,"NC_004631.1 Salmonella enterica subsp. enterica serovar Typhi Ty2, complete genome",../fasta/GCF_000007545.1_ASM754v1_genomic.fna.gz,CC0 +GCF_000007545.1_ASM754v1_genomic.fna.gz.sig,fd958e3b5649bc03890517ff239970ea,21,DNA,0,10000,445,42,0,"NC_004631.1 Salmonella enterica subsp. enterica serovar Typhi Ty2, complete genome",../fasta/GCF_000007545.1_ASM754v1_genomic.fna.gz,CC0 +GCF_000007545.1_ASM754v1_genomic.fna.gz.sig,8c22dff88a2239607762da00f7fd1725,31,DNA,0,10000,472,42,0,"NC_004631.1 Salmonella enterica subsp. enterica serovar Typhi Ty2, complete genome",../fasta/GCF_000007545.1_ASM754v1_genomic.fna.gz,CC0 +GCF_000008105.1_ASM810v1_genomic.fna.gz.sig,9db6efc92a041e11713ccfa8597edae5,11,DNA,0,10000,150,42,0,"NC_006905.1 Salmonella enterica subsp. enterica serovar Choleraesuis str. SC-B67, complete genome",../fasta/GCF_000008105.1_ASM810v1_genomic.fna.gz,CC0 +GCF_000008105.1_ASM810v1_genomic.fna.gz.sig,8996699a05d3e5a05fa3fe94bfa41431,21,DNA,0,10000,472,42,0,"NC_006905.1 Salmonella enterica subsp. enterica serovar Choleraesuis str. SC-B67, complete genome",../fasta/GCF_000008105.1_ASM810v1_genomic.fna.gz,CC0 +GCF_000008105.1_ASM810v1_genomic.fna.gz.sig,85c3aeec6457c0b1d210472ddeb67714,31,DNA,0,10000,468,42,0,"NC_006905.1 Salmonella enterica subsp. enterica serovar Choleraesuis str. 
SC-B67, complete genome",../fasta/GCF_000008105.1_ASM810v1_genomic.fna.gz,CC0 +GCF_000009505.1_ASM950v1_genomic.fna.gz.sig,0f35aeadda1532ed450bd6de1e73545d,11,DNA,0,10000,148,42,0,NC_011294.1 Salmonella enterica subsp. enterica serovar Enteritidis str. P125109 complete genome,../fasta/GCF_000009505.1_ASM950v1_genomic.fna.gz,CC0 +GCF_000009505.1_ASM950v1_genomic.fna.gz.sig,405ae3300f28ca5fe5c223cbf7e28734,21,DNA,0,10000,471,42,0,NC_011294.1 Salmonella enterica subsp. enterica serovar Enteritidis str. P125109 complete genome,../fasta/GCF_000009505.1_ASM950v1_genomic.fna.gz,CC0 +GCF_000009505.1_ASM950v1_genomic.fna.gz.sig,0842f7edb426fc4fa2701c107e678279,31,DNA,0,10000,461,42,0,NC_011294.1 Salmonella enterica subsp. enterica serovar Enteritidis str. P125109 complete genome,../fasta/GCF_000009505.1_ASM950v1_genomic.fna.gz,CC0 +GCF_000009525.1_ASM952v1_genomic.fna.gz.sig,d883538a0c983a863fa4b6e5fcd19612,11,DNA,0,10000,148,42,0,NC_011274.1 Salmonella enterica subsp. enterica serovar Gallinarum str. 287/91 complete genome,../fasta/GCF_000009525.1_ASM952v1_genomic.fna.gz,CC0 +GCF_000009525.1_ASM952v1_genomic.fna.gz.sig,9133bd71b86628b38c665ab7e5eb8712,21,DNA,0,10000,457,42,0,NC_011274.1 Salmonella enterica subsp. enterica serovar Gallinarum str. 287/91 complete genome,../fasta/GCF_000009525.1_ASM952v1_genomic.fna.gz,CC0 +GCF_000009525.1_ASM952v1_genomic.fna.gz.sig,afadabf39aec247929e84a29fd797117,31,DNA,0,10000,461,42,0,NC_011274.1 Salmonella enterica subsp. enterica serovar Gallinarum str. 287/91 complete genome,../fasta/GCF_000009525.1_ASM952v1_genomic.fna.gz,CC0 +GCF_000011885.1_ASM1188v1_genomic.fna.gz.sig,feef9e4d39fecd3d9292b76c0cc72b81,11,DNA,0,10000,155,42,0,"NC_006511.1 Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150, complete genome",../fasta/GCF_000011885.1_ASM1188v1_genomic.fna.gz,CC0 +GCF_000011885.1_ASM1188v1_genomic.fna.gz.sig,cc80cb247b195ca3dfa0756257d882b6,21,DNA,0,10000,427,42,0,"NC_006511.1 Salmonella enterica subsp. 
enterica serovar Paratyphi A str. ATCC 9150, complete genome",../fasta/GCF_000011885.1_ASM1188v1_genomic.fna.gz,CC0 +GCF_000011885.1_ASM1188v1_genomic.fna.gz.sig,bb365606acbf08d183399f139af80c32,31,DNA,0,10000,459,42,0,"NC_006511.1 Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150, complete genome",../fasta/GCF_000011885.1_ASM1188v1_genomic.fna.gz,CC0 +GCF_000016045.1_ASM1604v1_genomic.fna.gz.sig,4cec832176c4831239faed42c0616ef4,11,DNA,0,10000,155,42,0,"NC_011080.1 Salmonella enterica subsp. enterica serovar Newport str. SL254, complete genome",../fasta/GCF_000016045.1_ASM1604v1_genomic.fna.gz,CC0 +GCF_000016045.1_ASM1604v1_genomic.fna.gz.sig,43a9d80a4cd995779c7538a32088dd0e,21,DNA,0,10000,469,42,0,"NC_011080.1 Salmonella enterica subsp. enterica serovar Newport str. SL254, complete genome",../fasta/GCF_000016045.1_ASM1604v1_genomic.fna.gz,CC0 +GCF_000016045.1_ASM1604v1_genomic.fna.gz.sig,d0cfbe22579f98fd5de2d41203589964,31,DNA,0,10000,480,42,0,"NC_011080.1 Salmonella enterica subsp. enterica serovar Newport str. SL254, complete genome",../fasta/GCF_000016045.1_ASM1604v1_genomic.fna.gz,CC0 +GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig,40df36a7eb411022be4b1d6a7af05496,11,DNA,0,10000,161,42,0,"NC_003198.1 Salmonella enterica subsp. enterica serovar Typhi str. CT18, complete genome",../fasta/GCF_000195995.1_ASM19599v1_genomic.fna.gz,CC0 +GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig,ffa92983f7e67454c407499cbfbabf88,21,DNA,0,10000,487,42,0,"NC_003198.1 Salmonella enterica subsp. enterica serovar Typhi str. CT18, complete genome",../fasta/GCF_000195995.1_ASM19599v1_genomic.fna.gz,CC0 +GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig,cb26db5716a213c9a6614021e7176c1d,31,DNA,0,10000,512,42,0,"NC_003198.1 Salmonella enterica subsp. enterica serovar Typhi str. 
CT18, complete genome",../fasta/GCF_000195995.1_ASM19599v1_genomic.fna.gz,CC0 diff --git a/tests/test-data/gather/thermotoga-picklist.csv b/tests/test-data/gather/thermotoga-picklist.csv new file mode 100644 index 0000000000..4606e0ca47 --- /dev/null +++ b/tests/test-data/gather/thermotoga-picklist.csv @@ -0,0 +1,10 @@ +signature_file,md5,ksize,moltype,num,scaled,n_hashes,seed,with_abundance,name,filename,license +GCF_000008545.1_ASM854v1_genomic.fna.gz.sig,74b928d3db1f7f033c0dcca6c6e52aea,11,DNA,0,10000,84,42,0,"NC_000853.1 Thermotoga maritima MSB8 chromosome, complete genome",../fasta/GCF_000008545.1_ASM854v1_genomic.fna.gz,CC0 +GCF_000008545.1_ASM854v1_genomic.fna.gz.sig,ba9947e078cab29e20bc7d31bc1b9f0d,21,DNA,0,10000,192,42,0,"NC_000853.1 Thermotoga maritima MSB8 chromosome, complete genome",../fasta/GCF_000008545.1_ASM854v1_genomic.fna.gz,CC0 +GCF_000008545.1_ASM854v1_genomic.fna.gz.sig,1bfe96d76ec9cdb60779a1a9223c424e,31,DNA,0,10000,187,42,0,"NC_000853.1 Thermotoga maritima MSB8 chromosome, complete genome",../fasta/GCF_000008545.1_ASM854v1_genomic.fna.gz,CC0 +GCF_000016785.1_ASM1678v1_genomic.fna.gz.sig,328f7b0643bdb6c76135292b5afc8fa7,11,DNA,0,10000,82,42,0,"NC_009486.1 Thermotoga petrophila RKU-1, complete genome",../fasta/GCF_000016785.1_ASM1678v1_genomic.fna.gz,CC0 +GCF_000016785.1_ASM1678v1_genomic.fna.gz.sig,a77789e831fcd2436c3b9e4e22fb173e,21,DNA,0,10000,190,42,0,"NC_009486.1 Thermotoga petrophila RKU-1, complete genome",../fasta/GCF_000016785.1_ASM1678v1_genomic.fna.gz,CC0 +GCF_000016785.1_ASM1678v1_genomic.fna.gz.sig,50d8efd580ff2000cb38d1f8cc9cf9b4,31,DNA,0,10000,185,42,0,"NC_009486.1 Thermotoga petrophila RKU-1, complete genome",../fasta/GCF_000016785.1_ASM1678v1_genomic.fna.gz,CC0 +GCF_000018945.1_ASM1894v1_genomic.fna.gz.sig,989f88420b193ef39c4dbe3b268e0049,11,DNA,0,10000,90,42,0,"NC_011978.1 Thermotoga neapolitana DSM 4359, complete genome",../fasta/GCF_000018945.1_ASM1894v1_genomic.fna.gz,CC0 
+GCF_000018945.1_ASM1894v1_genomic.fna.gz.sig,bebcd0dcc0ed3b120ad16c4e15805370,21,DNA,0,10000,188,42,0,"NC_011978.1 Thermotoga neapolitana DSM 4359, complete genome",../fasta/GCF_000018945.1_ASM1894v1_genomic.fna.gz,CC0 +GCF_000018945.1_ASM1894v1_genomic.fna.gz.sig,4289d4241be8573145282352215ca3c4,31,DNA,0,10000,198,42,0,"NC_011978.1 Thermotoga neapolitana DSM 4359, complete genome",../fasta/GCF_000018945.1_ASM1894v1_genomic.fna.gz,CC0 diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 6474df077c..7fd815a5a2 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -2023,6 +2023,29 @@ def test_search_metagenome_downsample_index(c): assert '12 matches; showing first 3:' in str(c) +def test_search_with_picklist(runtmp): + # test 'sourmash search' with picklists + gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) + metag_sig = utils.get_test_data('gather/combined.sig') + picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + + runtmp.sourmash('search', metag_sig, *gcf_sigs, '--containment', + '-k', '21', '--picklist', f"{picklist}:md5:md5") + + err = runtmp.last_result.err + print(err) + assert "for given picklist, found 3 matches to 9 distinct values" in err + # these are the different ksizes + assert "WARNING: 6 missing picklist values." 
in err + + out = runtmp.last_result.out + print(out) + assert "3 matches:" in out + assert "13.1% NC_000853.1 Thermotoga" in out + assert "13.0% NC_009486.1 Thermotoga" in out + assert "12.8% NC_011978.1 Thermotoga" in out + + def test_mash_csv_to_sig(): with utils.TempDirectory() as location: testdata1 = utils.get_test_data('short.fa.msh.dump') @@ -3965,6 +3988,30 @@ def test_gather_query_downsample_explicit(linear_gather, prefetch_gather): 'NC_003197.2' in out)) +def test_gather_with_picklist(runtmp, linear_gather, prefetch_gather): + # test 'sourmash gather' with picklists + gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) + metag_sig = utils.get_test_data('gather/combined.sig') + picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + + runtmp.sourmash('gather', metag_sig, *gcf_sigs, '--threshold-bp=0', + '-k', '21', '--picklist', f"{picklist}:md5:md5", + linear_gather, prefetch_gather) + + err = runtmp.last_result.err + print(err) + assert "for given picklist, found 3 matches to 9 distinct values" in err + # these are the different ksizes + assert "WARNING: 6 missing picklist values." 
in err + + out = runtmp.last_result.out + print(out) + assert "found 3 matches total;" in out + assert "1.9 Mbp 13.1% 100.0% NC_000853.1 Thermotoga" in out + assert "1.9 Mbp 11.5% 89.9% NC_011978.1 Thermotoga" in out + assert "1.9 Mbp 6.3% 48.4% NC_009486.1 Thermotoga" in out + + def test_gather_save_matches(linear_gather, prefetch_gather): with utils.TempDirectory() as location: testdata_glob = utils.get_test_data('gather/GCF*.sig') @@ -4805,3 +4852,27 @@ def test_do_sourmash_index_zipfile_append(c): sbts = [c for c in content if c.endswith(".sbt.json")] assert len(sbts) == 1 assert sbts[0] == "zzz.sbt.json" + + +def test_index_with_picklist(runtmp): + # test 'sourmash index' with picklists + gcf_sig_dir = utils.get_test_data('gather/') + picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + + output_db = runtmp.output('thermo.sbt.zip') + + runtmp.sourmash('index', output_db, gcf_sig_dir, + '-k', '31', '--picklist', f"{picklist}:md5:md5") + + err = runtmp.last_result.err + print(err) + assert "for given picklist, found 3 matches to 9 distinct values" in err + + # these are the different ksizes + assert "WARNING: 6 missing picklist values." in err + + # verify: + siglist = list(sourmash.load_file_as_signatures(output_db)) + assert len(siglist) == 3 + for ss in siglist: + assert 'Thermotoga' in ss.name From c0e57816c4eef83fcb2bee113f5ea4ef3adaff95 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Thu, 17 Jun 2021 07:25:59 -0700 Subject: [PATCH 32/41] add picklists to prefetch --- src/sourmash/commands.py | 8 ++++++-- tests/test_prefetch.py | 24 ++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index ae1fb8e57e..26ee39c1b7 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -1073,6 +1073,7 @@ def prefetch(args): # figure out what k-mer size and molecule type we're looking for here ksize = args.ksize moltype = sourmash_args.calculate_moltype(args) + picklist = sourmash_args.load_picklist(args) # load the query signature & figure out all the things query = sourmash_args.load_query_signature(args.query, @@ -1141,7 +1142,8 @@ def prefetch(args): db = LazyLinearIndex(db) db = db.select(ksize=ksize, moltype=moltype, - containment=True, scaled=True) + containment=True, scaled=True, + picklist=picklist) if not db: notify(f"...no compatible signatures in '{dbfilename}'; skipping") @@ -1206,5 +1208,7 @@ def prefetch(args): with open(filename, "wt") as fp: sig.save_signatures([ss], fp) + if picklist: + sourmash_args.report_picklist(args, picklist) + return 0 - diff --git a/tests/test_prefetch.py b/tests/test_prefetch.py index da37559d2b..72556f5135 100644 --- a/tests/test_prefetch.py +++ b/tests/test_prefetch.py @@ -4,6 +4,7 @@ import os import csv import pytest +import glob import sourmash_tst_utils as utils import sourmash @@ -444,3 +445,26 @@ def test_prefetch_basic_many_sigs(runtmp, linear_gather): assert "total of 10 matching signatures." in c.last_result.err assert "of 5177 distinct query hashes, 5177 were found in matches above threshold." in c.last_result.err assert "a total of 0 query hashes remain unmatched." 
in c.last_result.err + + +def test_prefetch_with_picklist(runtmp): + # test 'sourmash prefetch' with picklists + gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) + metag_sig = utils.get_test_data('gather/combined.sig') + picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + + runtmp.sourmash('prefetch', metag_sig, *gcf_sigs, + '-k', '21', '--picklist', f"{picklist}:md5:md5") + + err = runtmp.last_result.err + print(err) + assert "for given picklist, found 3 matches to 9 distinct values" in err + # these are the different ksizes + assert "WARNING: 6 missing picklist values." in err + + out = runtmp.last_result.out + print(out) + + assert "total of 3 matching signatures." in err + assert "of 1466 distinct query hashes, 453 were found in matches above threshold." in err + assert "a total of 1013 query hashes remain unmatched." in err From a0335a31426a3eaba33e12e090795434de78def3 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 17 Jun 2021 07:32:10 -0700 Subject: [PATCH 33/41] add picklists to sourmash compare --- src/sourmash/commands.py | 7 ++++++- tests/test_sourmash.py | 21 +++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 26ee39c1b7..ec18dfd0e9 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -28,6 +28,7 @@ def compare(args): set_quiet(args.quiet) moltype = sourmash_args.calculate_moltype(args) + picklist = sourmash_args.load_picklist(args) inp_files = list(args.signatures) if args.from_file: @@ -45,11 +46,12 @@ def compare(args): loaded = sourmash_args.load_file_as_signatures(filename, ksize=args.ksize, select_moltype=moltype, + picklist=picklist, yield_all_files=args.force, progress=progress) loaded = list(loaded) if not loaded: - notify('\nwarning: no signatures loaded at given ksize/molecule type from {}', filename) + notify('\nwarning: no signatures loaded at given ksize/molecule type/picklist from {}', 
filename) siglist.extend(loaded) # track ksizes/moltypes @@ -79,6 +81,9 @@ def compare(args): notify(' '*79, end='\r') notify('loaded {} signatures total.'.format(len(siglist))) + if picklist: + sourmash_args.report_picklist(args, picklist) + # check to make sure they're potentially compatible - either using # scaled, or not. scaled_sigs = [s.minhash.scaled for s in siglist] diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 7fd815a5a2..f15eaa4fcd 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -2909,6 +2909,27 @@ def test_compare_with_abundance_3(): assert '70.5%' in out +def test_compare_with_picklist(runtmp): + # test 'sourmash compare' with picklists + gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) + picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + + runtmp.sourmash('compare', *gcf_sigs, + '-k', '21', '--picklist', f"{picklist}:md5:md5") + + err = runtmp.last_result.err + out = runtmp.last_result.out + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + assert "for given picklist, found 3 matches to 9 distinct values" in err + assert "WARNING: 6 missing picklist values." in err + + assert "0-NC_009486.1 The...\t[1. 0.331 0.036]" in out + assert "1-NC_000853.1 The...\t[0.331 1. 0.053]" in out + assert "2-NC_011978.1 The...\t[0.036 0.053 1. ]" in out + + def test_gather(linear_gather, prefetch_gather): with utils.TempDirectory() as location: testdata1 = utils.get_test_data('short.fa') From a074127b78cddb5db128109194a451280e2663e1 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Thu, 17 Jun 2021 07:38:45 -0700 Subject: [PATCH 34/41] add picklists to lca index --- src/sourmash/lca/command_index.py | 5 +++++ tests/test_lca.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/src/sourmash/lca/command_index.py b/src/sourmash/lca/command_index.py index 883d861a75..8f7aace1f4 100644 --- a/src/sourmash/lca/command_index.py +++ b/src/sourmash/lca/command_index.py @@ -145,6 +145,7 @@ def index(args): args.ksize = DEFAULT_LOAD_K moltype = sourmash_args.calculate_moltype(args, default='DNA') + picklist = sourmash_args.load_picklist(args) notify('Building LCA database with ksize={} scaled={} moltype={}.', args.ksize, args.scaled, moltype) @@ -190,6 +191,7 @@ def index(args): n += 1 it = load_file_as_signatures(filename, ksize=args.ksize, select_moltype=moltype, + picklist=picklist, yield_all_files=args.force) for sig in it: notify(u'\r\033[K', end=u'') @@ -265,6 +267,9 @@ def index(args): notify('loaded {} hashes at ksize={} scaled={}', len(db.hashval_to_idx), args.ksize, args.scaled) + if picklist: + sourmash_args.report_picklist(args, picklist) + # summarize: notify('{} assigned lineages out of {} distinct lineages in spreadsheet.', len(record_used_lineages), len(set(assignments.values()))) diff --git a/tests/test_lca.py b/tests/test_lca.py index cdb1983a2f..89a2ebf6e8 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -2494,3 +2494,31 @@ def test_lca_db_dayhoff_command_search(c): c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0') assert 'found 1 matches total' in c.last_result.out assert 'the recovered matches hit 100.0% of the query' in c.last_result.out + + +def test_lca_index_with_picklist(runtmp): + gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) + outdb = runtmp.output('gcf.lca.json') + picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + + # create an empty spreadsheet + with open(runtmp.output('empty.csv'), 'wt') as fp: + 
fp.write('accession,superkingdom,phylum,class,order,family,genus,species,strain') + + runtmp.sourmash('lca', 'index', 'empty.csv', outdb, *gcf_sigs, + '-k', '21', '--picklist', f"{picklist}:md5:md5") + + out = runtmp.last_result.out + err = runtmp.last_result.err + + print(out) + print(err) + + assert "for given picklist, found 3 matches to 9 distinct values" in err + assert "WARNING: 6 missing picklist values." + assert "WARNING: no lineage provided for 3 signatures" in err + + siglist = list(sourmash.load_file_as_signatures(outdb)) + assert len(siglist) == 3 + for ss in siglist: + assert 'Thermotoga' in ss.name From ba5c8bcb13b627aca3b1de28e97dda08efbc5cef Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 17 Jun 2021 07:47:13 -0700 Subject: [PATCH 35/41] block multiple picklists on SBTs and LCAs, for now --- src/sourmash/lca/lca_db.py | 2 ++ src/sourmash/sbt.py | 2 ++ tests/test_lca.py | 20 ++++++++++++++++++++ tests/test_sbt.py | 30 ++++++++++++++++++++++++++++++ 4 files changed, 54 insertions(+) diff --git a/src/sourmash/lca/lca_db.py b/src/sourmash/lca/lca_db.py index 9446414108..19cfdfb11d 100644 --- a/src/sourmash/lca/lca_db.py +++ b/src/sourmash/lca/lca_db.py @@ -201,6 +201,8 @@ def select(self, ksize=None, moltype=None, num=0, scaled=0, if picklist is not None: self.picklists.append(picklist) + if len(self.picklists) > 1: + raise ValueError("we do not (yet) support multiple picklists for LCA databases") return self diff --git a/src/sourmash/sbt.py b/src/sourmash/sbt.py index 498365dbd6..5206af1229 100644 --- a/src/sourmash/sbt.py +++ b/src/sourmash/sbt.py @@ -216,6 +216,8 @@ def select(self, ksize=None, moltype=None, num=0, scaled=0, if picklist is not None: self.picklists.append(picklist) + if len(self.picklists) > 1: + raise ValueError("we do not (yet) support multiple picklists for SBTs") return self diff --git a/tests/test_lca.py b/tests/test_lca.py index 89a2ebf6e8..eb66562a3e 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -516,6 
+516,26 @@ def test_lca_index_select_picklist(): assert ss.minhash.ksize == 31 +def test_lca_index_select_picklist_twice(): + # test 'select' method from Index base class with a picklist. + + filename = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(filename) + + # construct a picklist... + picklist = SignaturePicklist('md5prefix8') + picklist.init(['50a92740']) + + xx = db.select(picklist=picklist) + assert xx == db + + with pytest.raises(ValueError) as exc: + xx = db.select(picklist=picklist) + + assert "we do not (yet) support multiple picklists for LCA databases" in str(exc) + + + def test_search_db_scaled_gt_sig_scaled(): dbfile = utils.get_test_data('lca/47+63.lca.json') db, ksize, scaled = lca_utils.load_single_database(dbfile) diff --git a/tests/test_sbt.py b/tests/test_sbt.py index f1980d1015..cf853d1b6c 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -699,6 +699,36 @@ def test_sbt_as_index_find_picklist(): assert ss.md5sum().startswith('09a08691c') +def test_sbt_as_index_find_picklist_twice(): + # test 'select' method from Index base class with a picklist + + factory = GraphFactory(31, 1e5, 4) + tree = SBT(factory, d=2) + + sig47 = load_one_signature(utils.get_test_data('47.fa.sig')) + sig63 = load_one_signature(utils.get_test_data('63.fa.sig')) + + tree.insert(sig47) + tree.insert(sig63) + + # construct a picklist... + picklist = SignaturePicklist('md5prefix8') + picklist.init(['09a08691']) + + # run a 'find' with sig63, should find 47 and 63 both. + search_obj = make_jaccard_search_query(do_containment=True, threshold=0.0) + results = list(tree.find(search_obj, sig63)) + print(results) + assert len(results) == 2 + + # now, select twice on picklists... 
+ tree = tree.select(picklist=picklist) + + with pytest.raises(ValueError): + tree = tree.select(picklist=picklist) + assert "we do not (yet) support multiple picklists for SBT databases" in str(exc) + + def test_sbt_as_index_signatures(): # test 'signatures' method from Index base class. factory = GraphFactory(31, 1e5, 4) From ca6ea4f5958c1b976f560fb6263821fe2dbdbff1 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 17 Jun 2021 08:34:08 -0700 Subject: [PATCH 36/41] add picklist test that checks indexing-and-then-search == index --- tests/test_sourmash.py | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index f15eaa4fcd..60d7597aff 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -4897,3 +4897,44 @@ def test_index_with_picklist(runtmp): assert len(siglist) == 3 for ss in siglist: assert 'Thermotoga' in ss.name + + +def test_index_matches_search_with_picklist(runtmp): + # test 'sourmash index' with picklists + gcf_sig_dir = utils.get_test_data('gather/') + gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) + picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + metag_sig = utils.get_test_data('gather/combined.sig') + + output_db = runtmp.output('thermo.sbt.zip') + + runtmp.sourmash('index', output_db, gcf_sig_dir, '-k', '21') + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + # verify: + siglist = list(sourmash.load_file_as_signatures(output_db)) + assert len(siglist) > 3 # all signatures included... 
+ + n_thermo = 0 + for ss in siglist: + if 'Thermotoga' in ss.name: + n_thermo += 1 + + assert n_thermo == 3 + + runtmp.sourmash('search', metag_sig, output_db, '--containment', + '-k', '21', '--picklist', f"{picklist}:md5:md5") + + err = runtmp.last_result.err + print(err) + assert "for given picklist, found 3 matches to 9 distinct values" in err + # these are the different ksizes + assert "WARNING: 6 missing picklist values." in err + + out = runtmp.last_result.out + print(out) + assert "3 matches:" in out + assert "13.1% NC_000853.1 Thermotoga" in out + assert "13.0% NC_009486.1 Thermotoga" in out + assert "12.8% NC_011978.1 Thermotoga" in out From c965648f217675537dae74522069be23eae1e687 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 17 Jun 2021 08:44:32 -0700 Subject: [PATCH 37/41] add a test for using prefetch CSV as picklist --- tests/test_sourmash.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 60d7597aff..37dd6ca75d 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -4938,3 +4938,39 @@ def test_index_matches_search_with_picklist(runtmp): assert "13.1% NC_000853.1 Thermotoga" in out assert "13.0% NC_009486.1 Thermotoga" in out assert "12.8% NC_011978.1 Thermotoga" in out + + +def test_gather_with_prefetch_picklist(runtmp, linear_gather): + # test 'gather' using a picklist taken from 'sourmash prefetch' output + gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) + metag_sig = utils.get_test_data('gather/combined.sig') + prefetch_csv = runtmp.output('prefetch-out.csv') + + runtmp.sourmash('prefetch', metag_sig, *gcf_sigs, + '-k', '21', '-o', prefetch_csv) + + err = runtmp.last_result.err + print(err) + + out = runtmp.last_result.out + print(out) + + assert "total of 12 matching signatures." in err + assert "of 1466 distinct query hashes, 1466 were found in matches above threshold." 
in err + + # now, do a gather with the results + runtmp.sourmash('gather', metag_sig, *gcf_sigs, linear_gather, + '-k', '21', '--picklist', + f'{prefetch_csv}:match_md5:md5short') + + err = runtmp.last_result.err + print(err) + + out = runtmp.last_result.out + print(out) + + assert "found 11 matches total;" in out + assert "the recovered matches hit 99.9% of the query" in out + + assert "4.9 Mbp 33.2% 100.0% NC_003198.1 " in out + assert "1.9 Mbp 13.1% 100.0% NC_000853.1 " in out From ab286cfc3631cd0cab617136e9fd910facc42d27 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 17 Jun 2021 08:50:57 -0700 Subject: [PATCH 38/41] remove debugging print --- src/sourmash/index.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/sourmash/index.py b/src/sourmash/index.py index 477f141849..b344a3cabc 100644 --- a/src/sourmash/index.py +++ b/src/sourmash/index.py @@ -393,7 +393,6 @@ def signatures(self): "Return the selected signatures." db = self.db.select(**self.selection_dict) for ss in db.signatures(): - print('MATCH!', ss) yield ss def signatures_with_location(self): From 4d156e939c27273691aa65c120ae772e70520765 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 17 Jun 2021 09:13:52 -0700 Subject: [PATCH 39/41] add docs --- doc/command-line.md | 99 ++++++++++++++++++++++++++++----------------- 1 file changed, 61 insertions(+), 38 deletions(-) diff --git a/doc/command-line.md b/doc/command-line.md index 051134b98f..6e7ceda09a 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -177,15 +177,14 @@ sourmash compare file1.sig [ file2.sig ... ] ``` Options: -``` ---output -- save the distance matrix to this file (as a numpy binary matrix) ---ksize -- do the comparisons at this k-mer size. ---containment -- calculate containment instead of similarity. - C(i, j) = size(i intersection j) / size(i). 
---from-file -- append the list of files in this text file to the input + +* `--output` -- save the distance matrix to this file (as a numpy binary matrix) +* `--ksize` -- do the comparisons at this k-mer size. +* `--containment` -- calculate containment instead of similarity; `C(i, j) = size(i intersection j) / size(i)` +* `--from-file` -- append the list of files in this text file to the input signatures. ---ignore-abundance -- ignore abundances in signatures. -``` +* `--ignore-abundance` -- ignore abundances in signatures. +* `--picklist` -- select a subset of signatures with [a picklist](#using-picklists-to-subset-large-collections-of-signatures) **Note:** compare by default produces a symmetric similarity matrix that can be used as an input to clustering. With `--containment`, however, this matrix is no longer symmetric and cannot formally be used for clustering. @@ -249,6 +248,9 @@ similarity match ... ``` +Note, as of sourmash 4.2.0, `search` supports `--picklist`, to +[select a subset of signatures based on a CSV file](#using-picklists-to-subset-large-collections-of-signatures). + ### `sourmash gather` - find metagenome members The `gather` subcommand selects the best reference genomes to use for @@ -289,6 +291,9 @@ which matches are no longer reported; by default, this is set to 50kb. see the Appendix in [Classifying Signatures](classifying-signatures.md) for details. +As of sourmash 4.2.0, `gather` supports `--picklist`, to +[select a subset of signatures based on a CSV file](#using-picklists-to-subset-large-collections-of-signatures). + Note: Use `sourmash gather` to classify a metagenome against a collection of @@ -350,6 +355,9 @@ containing a list of file names to index; you can also provide individual signature files, directories full of signatures, or other sourmash databases. +As of sourmash 4.2.0, `index` supports `--picklist`, to +[select a subset of signatures based on a CSV file](#using-picklists-to-subset-large-collections-of-signatures). 
+ ### `sourmash prefetch` - select subsets of very large databases for more processing The `prefetch` subcommand searches a collection of scaled signatures @@ -375,6 +383,7 @@ Other options include: * `--threshold-bp` to require a minimum estimated bp overlap for output; * `--scaled` for downsampling; * `--force` to continue past survivable errors; +* `--picklist` to select a subset of signatures with [a picklist](#using-picklists-to-subset-large-collections-of-signatures) ### Alternative search mode for low-memory (but slow) search: `--linear` @@ -589,6 +598,9 @@ see You can use `--from-file` to pass `lca index` a text file containing a list of file names to index. +As of sourmash 4.2.0, `lca index` supports `--picklist`, to +[select a subset of signatures based on a CSV file](#using-picklists-to-subset-large-collections-of-signatures). + ### `sourmash lca rankinfo` - examine an LCA database The `sourmash lca rankinfo` command displays k-mer specificity @@ -821,36 +833,8 @@ will extract the same signature, which has an accession number of #### Using picklists with `sourmash sig extract` As of sourmash 4.2.0, `extract` also supports picklists, a feature by -which you can select signatures based on values in a CSV file. - -For example, -``` -sourmash sig extract --picklist list.csv:md5:md5sum -``` -will extract only the signatures that have md5sums matching the -column `md5sum` in the CSV file `list.csv`. - -The `--picklist` argument string must be of the format -`pickfile:colname:coltype`, where `pickfile` is the path to a CSV -file, `colname` is the name of the column to select from the CSV -file (based on the headers in the first line of the CSV file), -and `coltype` is the type of match.
- -The following `coltype`s are currently supported by `sourmash sig extract`: - -* `name` - exact match to signature's name -* `md5` - exact match to signature's md5sum -* `md5prefix8` - match to 8-character prefix of signature's md5sum -* `md5short` - same as `md5prefix8` -* `ident` - exact match to signature's identifier -* `identprefix` - match to signature's identifier, before '.' - -Identifiers are constructed by using the first space delimited word in -the signature name. - -One way to build a picklist is to use `sourmash sig describe --csv -out.csv ` to construct an initial CSV file that you can -then edit further. +which you can select signatures based on values in a CSV file. See +[Using picklists to subset large collections of signatures](#using-picklists-to-subset-large-collections-of-signatures), below. ### `sourmash signature flatten` - remove abundance information from signatures @@ -963,6 +947,45 @@ signatures with multiple ksizes or moltypes at the same time; you need to pick the ksize and moltype to use for your search. Where possible, scaled values will be made compatible. +### Using picklists to subset large collections of signatures + +As of sourmash 4.2.0, many commands support *picklists*, a feature by +which you can select or "pick out" signatures based on values in a CSV +file. + +For example, +``` +sourmash sig extract --picklist list.csv:md5:md5sum +``` +will extract only the signatures that have md5sums matching the +column `md5sum` in the CSV file `list.csv`. + +The `--picklist` argument string must be of the format +`pickfile:colname:coltype`, where `pickfile` is the path to a CSV +file, `colname` is the name of the column to select from the CSV +file (based on the headers in the first line of the CSV file), +and `coltype` is the type of match. 
+ +The following `coltype`s are currently supported by `sourmash sig extract`: + +* `name` - exact match to signature's name +* `md5` - exact match to signature's md5sum +* `md5prefix8` - match to 8-character prefix of signature's md5sum +* `md5short` - same as `md5prefix8` +* `ident` - exact match to signature's identifier +* `identprefix` - match to signature's identifier, before '.' + +Identifiers are constructed by using the first space delimited word in +the signature name. + +One way to build a picklist is to use `sourmash sig describe --csv +out.csv <signatures>` to construct an initial CSV file that you can +then edit further. + +In addition to `sig extract`, the following commands support +`--picklist` selection: `index`, `search`, `gather`, `prefetch`, +`compare`, and `lca index`. + ### Storing (and searching) signatures Backing up a little, there are many ways to store and search From de6f3c47ac230f0d5b469e542cf985d1513ec82b Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 17 Jun 2021 11:38:35 -0700 Subject: [PATCH 40/41] remove order dependence from test --- tests/test_sourmash.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 37dd6ca75d..9969f833a6 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -2925,9 +2925,9 @@ def test_compare_with_picklist(runtmp): assert "for given picklist, found 3 matches to 9 distinct values" in err assert "WARNING: 6 missing picklist values." in err - assert "0-NC_009486.1 The...\t[1. 0.331 0.036]" in out - assert "1-NC_000853.1 The...\t[0.331 1. 0.053]" in out - assert "2-NC_011978.1 The...\t[0.036 0.053 1. ]" in out + assert "0-NC_009486.1 The..." in out + assert "1-NC_000853.1 The..." in out + assert "2-NC_011978.1 The..." in out def test_gather(linear_gather, prefetch_gather): From 8812142330ce74e107a2dd2ad9af3ee0a3bd2379 Mon Sep 17 00:00:00 2001 From: "C.
Titus Brown" Date: Thu, 17 Jun 2021 12:20:45 -0700 Subject: [PATCH 41/41] further attempt to fix test --- tests/test_sourmash.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 9969f833a6..c6abad69e4 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -2925,9 +2925,9 @@ def test_compare_with_picklist(runtmp): assert "for given picklist, found 3 matches to 9 distinct values" in err assert "WARNING: 6 missing picklist values." in err - assert "0-NC_009486.1 The..." in out - assert "1-NC_000853.1 The..." in out - assert "2-NC_011978.1 The..." in out + assert "NC_009486.1 The..." in out + assert "NC_000853.1 The..." in out + assert "NC_011978.1 The..." in out def test_gather(linear_gather, prefetch_gather):