From 92e5fdc62272de4183427d919d9ed1663f76da48 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 6 Mar 2021 15:50:50 -0800 Subject: [PATCH 01/46] add an IndexOfIndexes class --- src/sourmash/index.py | 56 +++++++++++++++++++++++++++++ tests/test_index.py | 84 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 139 insertions(+), 1 deletion(-) diff --git a/src/sourmash/index.py b/src/sourmash/index.py index 8dd4069f04..5945f0c662 100644 --- a/src/sourmash/index.py +++ b/src/sourmash/index.py @@ -159,3 +159,59 @@ def select_sigs(siglist, ksize, moltype): siglist=select_sigs(self._signatures, ksize, moltype) return LinearIndex(siglist, self.filename) + + +class IndexOfIndexes(Index): + def __init__(self, index_list, source_list): + self.index_list = list(index_list) + self.source_list = list(source_list) + assert len(index_list) == len(source_list) + + def signatures(self): + for idx in self.index_list: + for ss in idx.signatures(): + yield ss + + def insert(self, *args): + raise NotImplementedError + + @classmethod + def load(self, *args): + raise NotImplementedError + + def save(self, *args): + raise NotImplementedError + + def select(self, ksize=None, moltype=None): + new_idx_list = [] + new_src_list = [] + for idx, src in zip(self.index_list, self.source_list): + idx = idx.select(ksize=ksize, moltype=moltype) + new_idx_list.append(idx) + new_src_list.append(src) + + return IndexOfIndexes(new_idx_list, new_src_list) + + + def search(self, query, *args, **kwargs): + # do the actual search: + matches = [] + for idx, src in zip(self.index_list, self.source_list): + for (score, ss, filename) in idx.search(query, *args, **kwargs): + matches.append((score, ss, src)) + + # sort! + matches.sort(key=lambda x: -x[0]) + return matches + + def gather(self, query, *args, **kwargs): + "Return the match with the best Jaccard containment in the Index." + # actually do search! + results = [] + for idx, src in zip(self.index_list, self.source_list): + for (score, ss, filename) in idx.gather(query, *args, **kwargs): + results.append((score, ss, src)) + + results.sort(reverse=True, key=lambda x: (x[0], x[1].md5sum())) + + return results diff --git a/tests/test_index.py b/tests/test_index.py index 1313162a2e..ceea3559e5 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -4,7 +4,7 @@ import sourmash from sourmash import load_one_signature, SourmashSignature -from sourmash.index import LinearIndex +from sourmash.index import LinearIndex, IndexOfIndexes from sourmash.sbt import SBT, GraphFactory, Leaf import sourmash_tst_utils as utils @@ -393,3 +393,85 @@ def test_index_same_md5sum_zipstorage(c): # should have 3 files, 1 internal and two sigs. We check for 4 because the # directory also shows in namelist() assert len([f for f in zout.namelist() if f.startswith(".sbt.zzz/")]) == 4 + + +def test_indexindex_search(): + sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + ss2 = sourmash.load_one_signature(sig2, ksize=31) + ss47 = sourmash.load_one_signature(sig47) + ss63 = sourmash.load_one_signature(sig63) + + lidx1 = LinearIndex() + lidx1.insert(ss2) + lidx2 = LinearIndex() + lidx2.insert(ss47) + lidx3 = LinearIndex() + lidx3.insert(ss63) + + lidx = IndexOfIndexes([lidx1, lidx2, lidx3], [sig2, sig47, sig63]) + + # now, search for sig2 + sr = lidx.search(ss2, threshold=1.0) + print([s[1].name for s in sr]) + assert len(sr) == 1 + assert sr[0][1] == ss2 + + # search for sig47 with lower threshold; search order not guaranteed. + sr = lidx.search(ss47, threshold=0.1) + print([s[1].name for s in sr]) + assert len(sr) == 2 + sr.sort(key=lambda x: -x[0]) + assert sr[0][1] == ss47 + assert sr[1][1] == ss63 + + # search for sig63 with lower threshold; search order not guaranteed. + sr = lidx.search(ss63, threshold=0.1) + print([s[1].name for s in sr]) + assert len(sr) == 2 + sr.sort(key=lambda x: -x[0]) + assert sr[0][1] == ss63 + assert sr[1][1] == ss47 + + # search for sig63 with high threshold => 1 match + sr = lidx.search(ss63, threshold=0.8) + print([s[1].name for s in sr]) + assert len(sr) == 1 + sr.sort(key=lambda x: -x[0]) + assert sr[0][1] == ss63 + + +def test_indexindex_gather(): + sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + ss2 = sourmash.load_one_signature(sig2, ksize=31) + ss47 = sourmash.load_one_signature(sig47) + ss63 = sourmash.load_one_signature(sig63) + + + lidx1 = LinearIndex() + lidx1.insert(ss2) + lidx2 = LinearIndex() + lidx2.insert(ss47) + lidx3 = LinearIndex() + lidx3.insert(ss63) + + lidx = IndexOfIndexes([lidx1, lidx2, lidx3], [sig2, sig47, sig63]) + + matches = lidx.gather(ss2) + assert len(matches) == 1 + assert matches[0][0] == 1.0 + assert matches[0][1] == ss2 + + matches = lidx.gather(ss47) + assert len(matches) == 2 + assert matches[0][0] == 1.0 + assert matches[0][1] == ss47 + assert round(matches[1][0], 2) == 0.49 + assert matches[1][1] == ss63 + + From 5c71e11d47f78f43bf9953351d0c32d84b7b9a07 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 6 Mar 2021 16:41:59 -0800 Subject: [PATCH 02/46] rename to MultiIndex --- src/sourmash/index.py | 3 ++- tests/test_index.py | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/sourmash/index.py b/src/sourmash/index.py index 5945f0c662..ed6b2be399 100644 --- a/src/sourmash/index.py +++ b/src/sourmash/index.py @@ -161,7 +161,8 @@ def select_sigs(siglist, ksize, moltype): return LinearIndex(siglist, self.filename) -class IndexOfIndexes(Index): +class MultiIndex(Index): + "An Index class that wraps other Index classes." def __init__(self, index_list, source_list): self.index_list = list(index_list) self.source_list = list(source_list) diff --git a/tests/test_index.py b/tests/test_index.py index ceea3559e5..11b46b6e6c 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -4,7 +4,7 @@ import sourmash from sourmash import load_one_signature, SourmashSignature -from sourmash.index import LinearIndex, IndexOfIndexes +from sourmash.index import LinearIndex, MultiIndex from sourmash.sbt import SBT, GraphFactory, Leaf import sourmash_tst_utils as utils @@ -411,7 +411,7 @@ def test_indexindex_search(): lidx3 = LinearIndex() lidx3.insert(ss63) - lidx = IndexOfIndexes([lidx1, lidx2, lidx3], [sig2, sig47, sig63]) + lidx = MultiIndex([lidx1, lidx2, lidx3], [sig2, sig47, sig63]) # now, search for sig2 sr = lidx.search(ss2, threshold=1.0) @@ -460,7 +460,7 @@ def test_indexindex_gather(): lidx3 = LinearIndex() lidx3.insert(ss63) - lidx = IndexOfIndexes([lidx1, lidx2, lidx3], [sig2, sig47, sig63]) + lidx = MultiIndex([lidx1, lidx2, lidx3], [sig2, sig47, sig63]) matches = lidx.gather(ss2) assert len(matches) == 1 From 85efdaf70eea915c3c47538be01a44ff22cacf75 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 6 Mar 2021 20:22:30 -0800 Subject: [PATCH 03/46] switch to using MultiIndex for loading from a directory --- src/sourmash/index.py | 3 +-- src/sourmash/sourmash_args.py | 27 +++++++++++++-------------- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/src/sourmash/index.py b/src/sourmash/index.py index ed6b2be399..d755b743d4 100644 --- a/src/sourmash/index.py +++ b/src/sourmash/index.py @@ -191,8 +191,7 @@ def select(self, ksize=None, moltype=None): new_idx_list.append(idx) new_src_list.append(src) - return IndexOfIndexes(new_idx_list, new_src_list) - + return MultiIndex(new_idx_list, new_src_list) def search(self, query, *args, **kwargs): # do the actual search: diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 258b99cee0..64a2825773 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -16,7 +16,7 @@ from . import signature from .logging import notify, error -from .index import LinearIndex +from .index import LinearIndex, MultiIndex from . import signature as sig from .sbt import SBT from .sbtmh import SigLeaf @@ -278,12 +278,10 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None) if os.path.isdir(filename): assert dbtype == DatabaseType.SIGLIST - siglist = _select_sigs(db, moltype=query_moltype, ksize=query_ksize) - siglist = filter_compatible_signatures(query, siglist, 1) - linear = LinearIndex(siglist, filename=filename) - databases.append((linear, filename, False)) + db = db.select(moltype=query_moltype, ksize=query_ksize) + databases.append((db, filename, False)) - n_signatures += len(linear) + n_signatures += 1 # @CTB # SBT elif dbtype == DatabaseType.SBT: @@ -372,22 +370,23 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): # load signatures from directory if not loaded and os.path.isdir(filename): - all_sigs = [] + index_list = [] + source_list = [] for thisfile in traverse_find_sigs([filename], traverse_yield_all): try: - with open(thisfile, 'rt') as fp: - x = signature.load_signatures(fp, do_raise=True) - siglist = list(x) - all_sigs.extend(siglist) + idx = LinearIndex.load(thisfile) + index_list.append(idx) + source_list.append(thisfile) except (IOError, sourmash.exceptions.SourmashError): if traverse_yield_all: continue else: raise - loaded=True - db = all_sigs - dbtype = DatabaseType.SIGLIST + if index_list: + loaded=True + db = MultiIndex(index_list, source_list) + dbtype = DatabaseType.SIGLIST # load signatures from single file try: From 04f9de17d67191cb45e08971aca63936761d87a7 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 7 Mar 2021 06:46:29 -0800 Subject: [PATCH 04/46] some more MultiIndex tests --- src/sourmash/index.py | 7 +++++-- tests/test_index.py | 41 +++++++++++++++++++++++------------------ 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/src/sourmash/index.py b/src/sourmash/index.py index d755b743d4..a8503fe455 100644 --- a/src/sourmash/index.py +++ b/src/sourmash/index.py @@ -122,6 +122,7 @@ def select(self, ksize=None, moltype=None): "" class LinearIndex(Index): + "An Index for a collection of signatures. Can load from a .sig file." def __init__(self, _signatures=None, filename=None): self._signatures = [] if _signatures: @@ -198,7 +199,8 @@ def search(self, query, *args, **kwargs): matches = [] for idx, src in zip(self.index_list, self.source_list): for (score, ss, filename) in idx.search(query, *args, **kwargs): - matches.append((score, ss, src)) + best_src = src or filename # override if src provided + matches.append((score, ss, best_src)) # sort! matches.sort(key=lambda x: -x[0]) @@ -210,7 +212,8 @@ def gather(self, query, *args, **kwargs): results = [] for idx, src in zip(self.index_list, self.source_list): for (score, ss, filename) in idx.gather(query, *args, **kwargs): - results.append((score, ss, src)) + best_src = src or filename # override if src provided + results.append((score, ss, best_src)) results.sort(reverse=True, key=lambda x: (x[0], x[1].md5sum())) diff --git a/tests/test_index.py b/tests/test_index.py index 11b46b6e6c..05e1a49750 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -395,7 +395,7 @@ def test_index_same_md5sum_zipstorage(c): assert len([f for f in zout.namelist() if f.startswith(".sbt.zzz/")]) == 4 -def test_indexindex_search(): +def test_multi_index_search(): sig2 = utils.get_test_data('2.fa.sig') sig47 = utils.get_test_data('47.fa.sig') sig63 = utils.get_test_data('63.fa.sig') @@ -404,20 +404,20 @@ def test_indexindex_search(): ss47 = sourmash.load_one_signature(sig47) ss63 = sourmash.load_one_signature(sig63) - lidx1 = LinearIndex() - lidx1.insert(ss2) - lidx2 = LinearIndex() - lidx2.insert(ss47) - lidx3 = LinearIndex() - lidx3.insert(ss63) + lidx1 = LinearIndex.load(sig2) + lidx2 = LinearIndex.load(sig47) + lidx3 = LinearIndex.load(sig63) - lidx = MultiIndex([lidx1, lidx2, lidx3], [sig2, sig47, sig63]) + # create MultiIindex with source location override + lidx = MultiIndex([lidx1, lidx2, lidx3], ['A', None, 'C']) + lidx = lidx.select(ksize=31) # now, search for sig2 sr = lidx.search(ss2, threshold=1.0) print([s[1].name for s in sr]) assert len(sr) == 1 assert sr[0][1] == ss2 + assert sr[0][2] == 'A' # source override # search for sig47 with lower threshold; search order not guaranteed. sr = lidx.search(ss47, threshold=0.1) @@ -425,7 +425,9 @@ def test_indexindex_search(): assert len(sr) == 2 sr.sort(key=lambda x: -x[0]) assert sr[0][1] == ss47 + assert sr[0][2] == sig47 # source was set to None, so no override assert sr[1][1] == ss63 + assert sr[1][2] == 'C' # source override # search for sig63 with lower threshold; search order not guaranteed. sr = lidx.search(ss63, threshold=0.1) @@ -433,7 +435,9 @@ def test_indexindex_search(): assert len(sr) == 2 sr.sort(key=lambda x: -x[0]) assert sr[0][1] == ss63 + assert sr[0][2] == 'C' # source override assert sr[1][1] == ss47 + assert sr[1][2] == sig47 # source was set to None, so no override # search for sig63 with high threshold => 1 match sr = lidx.search(ss63, threshold=0.8) @@ -441,9 +445,10 @@ def test_indexindex_search(): assert len(sr) == 1 sr.sort(key=lambda x: -x[0]) assert sr[0][1] == ss63 + assert sr[0][2] == 'C' # source override -def test_indexindex_gather(): +def test_multi_index_gather(): sig2 = utils.get_test_data('2.fa.sig') sig47 = utils.get_test_data('47.fa.sig') sig63 = utils.get_test_data('63.fa.sig') @@ -452,26 +457,26 @@ def test_indexindex_gather(): ss47 = sourmash.load_one_signature(sig47) ss63 = sourmash.load_one_signature(sig63) + lidx1 = LinearIndex.load(sig2) + lidx2 = LinearIndex.load(sig47) + lidx3 = LinearIndex.load(sig63) - lidx1 = LinearIndex() - lidx1.insert(ss2) - lidx2 = LinearIndex() - lidx2.insert(ss47) - lidx3 = LinearIndex() - lidx3.insert(ss63) - - lidx = MultiIndex([lidx1, lidx2, lidx3], [sig2, sig47, sig63]) + # create MultiIindex with source location override + lidx = MultiIndex([lidx1, lidx2, lidx3], ['A', None, 'C']) + lidx = lidx.select(ksize=31) matches = lidx.gather(ss2) assert len(matches) == 1 assert matches[0][0] == 1.0 - assert matches[0][1] == ss2 + assert matches[0][2] == 'A' matches = lidx.gather(ss47) assert len(matches) == 2 assert matches[0][0] == 1.0 assert matches[0][1] == ss47 + assert matches[0][2] == sig47 # no source override assert round(matches[1][0], 2) == 0.49 assert matches[1][1] == ss63 + assert matches[1][2] == 'C' # source override From 201a89a0f29c61191a069ba3f6b01ce670e1dcde Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 7 Mar 2021 06:55:18 -0800 Subject: [PATCH 05/46] add test of MultiIndex.signatures --- tests/test_index.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/test_index.py b/tests/test_index.py index 05e1a49750..1b9ae93402 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -480,3 +480,25 @@ def test_multi_index_gather(): assert matches[1][2] == 'C' # source override +def test_multi_index_signatures(): + sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + ss2 = sourmash.load_one_signature(sig2, ksize=31) + ss47 = sourmash.load_one_signature(sig47) + ss63 = sourmash.load_one_signature(sig63) + + lidx1 = LinearIndex.load(sig2) + lidx2 = LinearIndex.load(sig47) + lidx3 = LinearIndex.load(sig63) + + # create MultiIindex with source location override + lidx = MultiIndex([lidx1, lidx2, lidx3], ['A', None, 'C']) + lidx = lidx.select(ksize=31) + + siglist = list(lidx.signatures()) + assert len(siglist) == 3 + assert ss2 in siglist + assert ss47 in siglist + assert ss63 in siglist From 07d2c321bd7cf219377d01db28a44a24c6364aa0 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 7 Mar 2021 06:55:35 -0800 Subject: [PATCH 06/46] add docstring for MultiIndex --- src/sourmash/index.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/sourmash/index.py b/src/sourmash/index.py index a8503fe455..4160730da4 100644 --- a/src/sourmash/index.py +++ b/src/sourmash/index.py @@ -163,7 +163,16 @@ def select_sigs(siglist, ksize, moltype): class MultiIndex(Index): - "An Index class that wraps other Index classes." + """An Index class that wraps other Index classes. + + The MultiIndex constructor takes two arguments: a list of Index + objects, and a matching list of sources (filenames, etc.) If the + source is not None, then it will be used to override the 'filename' + in the triple that is returned by search and gather. + + One specific use for this is when loading signatures from a directory; + MultiIndex will properly record which files provided which signatures. + """ def __init__(self, index_list, source_list): self.index_list = list(index_list) self.source_list = list(source_list) From 61d15c324f09975f81c58e8abbe7f5f663cf3ef5 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 7 Mar 2021 07:13:57 -0800 Subject: [PATCH 07/46] stop special-casing SIGLISTs --- src/sourmash/sourmash_args.py | 57 ++++++++++++++++------------------- 1 file changed, 26 insertions(+), 31 deletions(-) diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 64a2825773..a818cc1ea9 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -306,7 +306,8 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None) # signature file elif dbtype == DatabaseType.SIGLIST: - siglist = _select_sigs(db, moltype=query_moltype, ksize=query_ksize) + db = db.select(moltype=query_moltype, ksize=query_ksize) + siglist = db.signatures() siglist = filter_compatible_signatures(query, siglist, False) siglist = list(siglist) if not siglist: @@ -388,32 +389,33 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): db = MultiIndex(index_list, source_list) dbtype = DatabaseType.SIGLIST - # load signatures from single file - try: - # CTB: could make this a generator, with some trickery; but for - # now, just force into list. - with open(filename, 'rt') as fp: - db = signature.load_signatures(fp, do_raise=True) - db = list(db) - - loaded = True - dbtype = DatabaseType.SIGLIST - except Exception as exc: - pass + # load signatures from single signature file + if not loaded: + try: + with open(filename, 'rt') as fp: + db = LinearIndex.load(fp) + dbtype = DatabaseType.SIGLIST + loaded = True + except Exception as exc: + pass # try load signatures from single file (list of signature paths) if not loaded: try: - db = [] - with open(filename, 'rt') as fp: - for line in fp: - line = line.strip() - if line: - sigs = load_file_as_signatures(line) - db += list(sigs) + idx_list = [] + src_list = [] - loaded = True + file_list = load_file_list_of_signatures(filename) + for fname in file_list: + idx = load_file_as_index(fname) + src = fname + + idx_list.append(idx) + src_list.append(src) + + db = MultiIndex(idx_list, src_list) dbtype = DatabaseType.SIGLIST + loaded = True except Exception as exc: pass @@ -451,7 +453,7 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): raise OSError("Error while reading signatures from '{}' - got sequences instead! Is this a FASTA/FASTQ file?".format(filename)) if not loaded: - raise OSError("Error while reading signatures from '{}'.".format(filename)) + raise OSError(f"Error while reading signatures from '{filename}'.") return db, dbtype @@ -509,15 +511,8 @@ def load_file_as_signatures(filename, select_moltype=None, ksize=None, progress.notify(filename) db, dbtype = _load_database(filename, yield_all_files) - - loader = None - if dbtype in (DatabaseType.LCA, DatabaseType.SBT): - db = db.select(moltype=select_moltype, ksize=ksize) - loader = db.signatures() - elif dbtype == DatabaseType.SIGLIST: - loader = _select_sigs(db, moltype=select_moltype, ksize=ksize) - else: - assert 0 # unknown enum!? + db = db.select(moltype=select_moltype, ksize=ksize) + loader = db.signatures() if progress: return progress.start_file(filename, loader) From 16f9ee2c1d73e822f219d696e661626dd838c897 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 7 Mar 2021 07:15:58 -0800 Subject: [PATCH 08/46] fix test to match more informative error message --- tests/test_sourmash.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 2f144e32ed..47e05d57fd 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -3447,7 +3447,7 @@ def test_gather_error_no_sigs_traverse(c): err = c.last_result.err print(err) - assert '** ERROR: no signatures or databases loaded?' in err + assert f"Error while reading signatures from '{emptydir}'" in err assert not 'found 0 matches total;' in err From c6bf3148b1627b72aaff834bcca41bf0d62d0b63 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 7 Mar 2021 07:22:43 -0800 Subject: [PATCH 09/46] switch to using LinearIndex.load for stdin, too --- src/sourmash/sourmash_args.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index a818cc1ea9..43b4c77805 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -364,10 +364,9 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): # special case stdin if not loaded and filename == '-': - db = signature.load_signatures(sys.stdin, do_raise=True) - db = list(db) - loaded = True + db = LinearIndex.load(sys.stdin) dbtype = DatabaseType.SIGLIST + loaded = True # load signatures from directory if not loaded and os.path.isdir(filename): @@ -480,14 +479,7 @@ def load_file_as_index(filename, yield_all_files=False): attempt to load all files. """ db, dbtype = _load_database(filename, yield_all_files) - if dbtype in (DatabaseType.LCA, DatabaseType.SBT): - return db # already an index! - elif dbtype == DatabaseType.SIGLIST: - # turn siglist into a LinearIndex - idx = LinearIndex(db, filename) - return idx - else: - assert 0 # unknown enum!? + return db def load_file_as_signatures(filename, select_moltype=None, ksize=None, From dd0f3b8d264d71e7c559b3a8af815f4834d9cb99 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Mon, 8 Mar 2021 06:38:23 -0800 Subject: [PATCH 10/46] add __len__ to MultiIndex --- src/sourmash/index.py | 3 +++ src/sourmash/sourmash_args.py | 9 +++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/sourmash/index.py b/src/sourmash/index.py index 4160730da4..a8cc322d55 100644 --- a/src/sourmash/index.py +++ b/src/sourmash/index.py @@ -183,6 +183,9 @@ def signatures(self): for ss in idx.signatures(): yield ss + def __len__(self): + return sum([ len(idx) for idx in self.index_list ]) + def insert(self, *args): raise NotImplementedError diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 43b4c77805..37a52aa6b4 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -273,15 +273,17 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None) sys.exit(-1) # are we collecting signatures from a directory/path? - # NOTE: error messages about loading will now be attributed to - # directory, not individual file. if os.path.isdir(filename): + # CTB: combine with SIGLIST below? assert dbtype == DatabaseType.SIGLIST db = db.select(moltype=query_moltype, ksize=query_ksize) + if not db: + notify("no compatible signatures found in '{}'", filename) + sys.exit(-1) databases.append((db, filename, False)) - n_signatures += 1 # @CTB + n_signatures += len(db) # SBT elif dbtype == DatabaseType.SBT: @@ -297,7 +299,6 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None) elif dbtype == DatabaseType.LCA: if not check_lca_db_is_compatible(filename, db, query): sys.exit(-1) - query_scaled = query.minhash.scaled notify('loaded LCA {}', filename, end='\r') n_databases += 1 From 9211a74e7731fd0aa4bb2511c3465a10af125b7e Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Mon, 8 Mar 2021 06:53:33 -0800 Subject: [PATCH 11/46] add check_csv to check for appropriate filename loading info --- tests/test_sourmash.py | 67 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 47e05d57fd..cd3b669176 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -1570,6 +1570,33 @@ def test_search_metagenome_traverse(): assert '13 matches; showing first 3:' in out +def test_search_metagenome_traverse_check_csv(): + with utils.TempDirectory() as location: + testdata_dir = utils.get_test_data('gather') + + query_sig = utils.get_test_data('gather/combined.sig') + out_csv = os.path.join(location, 'out.csv') + + cmd = f'search {query_sig} {testdata_dir} -k 21 -o {out_csv}' + status, out, err = utils.runscript('sourmash', cmd.split(' '), + in_directory=location) + + print(out) + print(err) + + with open(out_csv, 'rt') as fp: + prefix_len = len(testdata_dir) + r = csv.DictReader(fp) + for row in r: + filename = row['filename'] + assert filename.startswith(testdata_dir) + # should have full path to file sig was loaded from + assert len(filename) > prefix_len + + assert ' 33.2% NC_003198.1 Salmonella enterica subsp. enterica serovar T...' in out + assert '13 matches; showing first 3:' in out + + # explanation: you cannot downsample a scaled SBT to match a scaled # signature, so make sure that when you try such a search, it fails! # (you *can* downsample a signature to match an SBT.) @@ -3246,6 +3273,46 @@ def test_gather_metagenome_traverse(): 'NC_011294.1 Salmonella enterica subsp...' in out)) +def test_gather_metagenome_traverse_check_csv(): + with utils.TempDirectory() as location: + # set up a directory $location/gather that contains + # everything in the 'tests/test-data/gather' directory + # *except* the query sequence, which is 'combined.sig'. + testdata_dir = utils.get_test_data('gather') + copy_testdata = os.path.join(location, 'somesigs') + shutil.copytree(testdata_dir, copy_testdata) + os.unlink(os.path.join(copy_testdata, 'combined.sig')) + + query_sig = utils.get_test_data('gather/combined.sig') + out_csv = os.path.join(location, 'out.csv') + + # now, feed in the new directory -- + cmd = f'gather {query_sig} {copy_testdata} -k 21 --threshold-bp=0' + cmd += f' -o {out_csv}' + status, out, err = utils.runscript('sourmash', cmd.split(' '), + in_directory=location) + + print(cmd) + print(out) + print(err) + + with open(out_csv, 'rt') as fp: + prefix_len = len(copy_testdata) + r = csv.DictReader(fp) + for row in r: + filename = row['filename'] + assert filename.startswith(copy_testdata) + # should have full path to file sig was loaded from + assert len(filename) > prefix_len + + assert 'found 12 matches total' in out + assert 'the recovered matches hit 100.0% of the query' in out + assert all(('4.9 Mbp 33.2% 100.0%' in out, + 'NC_003198.1 Salmonella enterica subsp...' in out)) + assert all(('4.7 Mbp 0.5% 1.5%' in out, + 'NC_011294.1 Salmonella enterica subsp...' in out)) + + def test_gather_metagenome_output_unassigned(): with utils.TempDirectory() as location: testdata_glob = utils.get_test_data('gather/GCF_000195995*g') From 75069ff0ae43107b18f780a8298661c1b8b08bb4 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Mon, 8 Mar 2021 06:56:43 -0800 Subject: [PATCH 12/46] add comment --- tests/test_sourmash.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index cd3b669176..71b5ebec66 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -1571,6 +1571,9 @@ def test_search_metagenome_traverse(): def test_search_metagenome_traverse_check_csv(): + # this test confirms that the CSV 'filename' output for signatures loaded + # via directory traversal properly contains the actual path to the + # signature file from which the signature was loaded. with utils.TempDirectory() as location: testdata_dir = utils.get_test_data('gather') @@ -3274,6 +3277,9 @@ def test_gather_metagenome_traverse(): def test_gather_metagenome_traverse_check_csv(): + # this test confirms that the CSV 'filename' output for signatures loaded + # via directory traversal properly contains the actual path to the + # signature file from which the signature was loaded. with utils.TempDirectory() as location: # set up a directory $location/gather that contains # everything in the 'tests/test-data/gather' directory From 9f39623f04e928a6af2f1cf8d182d94d5e773976 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Mon, 8 Mar 2021 17:24:21 -0800 Subject: [PATCH 13/46] fix databases load --- src/sourmash/sourmash_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 906431cc67..b36cf3bbbd 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -283,7 +283,7 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None) if not db: notify("no compatible signatures found in '{}'", filename) sys.exit(-1) - databases.append((db, filename, False)) + databases.append(db) n_signatures += len(db) # SBT From ac63cf8fe7d3070c25691bce02a9e393da34bc40 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Mon, 8 Mar 2021 17:26:38 -0800 Subject: [PATCH 14/46] more tests needed --- src/sourmash/sourmash_args.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index b36cf3bbbd..0b826f92fe 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -280,6 +280,7 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None) assert dbtype == DatabaseType.SIGLIST db = db.select(moltype=query_moltype, ksize=query_ksize) + # @CTB filter compatible signatures here, too. write test. if not db: notify("no compatible signatures found in '{}'", filename) sys.exit(-1) From 5590d70ddf226cbcca691c5c308ef42ce431da69 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Tue, 9 Mar 2021 06:41:49 -0800 Subject: [PATCH 15/46] add tests for incompatible signatures --- src/sourmash/sourmash_args.py | 11 ++++++-- tests/test_sourmash.py | 52 +++++++++++++++++++++++++++++++++-- 2 files changed, 59 insertions(+), 4 deletions(-) diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 0b826f92fe..f0a8e5b3c5 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -311,8 +311,15 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None) elif dbtype == DatabaseType.SIGLIST: db = db.select(moltype=query_moltype, ksize=query_ksize) siglist = db.signatures() - siglist = filter_compatible_signatures(query, siglist, False) - siglist = list(siglist) + try: + # CTB: it's not clear to me that filter_compatible_signatures + # should fail here, on incompatible signatures; but that's + # what we have it doing currently. Revisit. + siglist = filter_compatible_signatures(query, siglist, False) + siglist = list(siglist) + except ValueError: + siglist = [] + if not siglist: notify("no compatible signatures found in '{}'", filename) sys.exit(-1) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 71b5ebec66..224602a06c 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -859,8 +859,8 @@ def test_compare_no_matching_sigs(c): query = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') with pytest.raises(ValueError) as exc: - c.last_result.status, c.last_result.out, c.last_result.err = c.run_sourmash('compare', '-k', '100', query, - fail_ok=True) + c.last_result.status, c.last_result.out, c.last_result.err = \ + c.run_sourmash('compare', '-k', '100', query, fail_ok=True) print(c.last_result.out) print(c.last_result.err) @@ -1600,6 +1600,37 @@ def test_search_metagenome_traverse_check_csv(): assert '13 matches; showing first 3:' in out +@utils.in_thisdir +def test_search_incompatible(c): + num_sig = utils.get_test_data('num/47.fa.sig') + scaled_sig = utils.get_test_data('47.fa.sig') + + with pytest.raises(ValueError) as exc: + c.run_sourmash("search", scaled_sig, num_sig, fail_ok=True) + assert c.last_result.status != 0 + print(c.last_result.out) + print(c.last_result.err) + assert 'incompatible - cannot compare.' in c.last_result.err + assert 'was calculated with --scaled,' in c.last_result.err + + +@utils.in_tempdir +def test_search_traverse_incompatible(c): + searchdir = c.output('searchme') + os.mkdir(searchdir) + + num_sig = utils.get_test_data('num/47.fa.sig') + scaled_sig = utils.get_test_data('47.fa.sig') + shutil.copyfile(num_sig, c.output('searchme/num.sig')) + shutil.copyfile(scaled_sig, c.output('searchme/scaled.sig')) + + c.run_sourmash("search", scaled_sig, c.output('searchme')) + print(c.last_result.out) + print(c.last_result.err) + assert 'incompatible - cannot compare.' in c.last_result.err + assert 'was calculated with --scaled,' in c.last_result.err + + # explanation: you cannot downsample a scaled SBT to match a scaled # signature, so make sure that when you try such a search, it fails! # (you *can* downsample a signature to match an SBT.) @@ -3319,6 +3350,23 @@ def test_gather_metagenome_traverse_check_csv(): 'NC_011294.1 Salmonella enterica subsp...' in out)) +@utils.in_tempdir +def test_gather_traverse_incompatible(c): + searchdir = c.output('searchme') + os.mkdir(searchdir) + + num_sig = utils.get_test_data('num/47.fa.sig') + scaled_sig = utils.get_test_data('47.fa.sig') + shutil.copyfile(num_sig, c.output('searchme/num.sig')) + shutil.copyfile(scaled_sig, c.output('searchme/scaled.sig')) + + c.run_sourmash("gather", scaled_sig, c.output('searchme')) + print(c.last_result.out) + print(c.last_result.err) + assert 'incompatible - cannot compare.' in c.last_result.err + assert 'was calculated with --scaled,' in c.last_result.err + + def test_gather_metagenome_output_unassigned(): with utils.TempDirectory() as location: testdata_glob = utils.get_test_data('gather/GCF_000195995*g') From 14891bd4af9a6da2fe8c0ea01bf32dfef4880017 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Tue, 9 Mar 2021 07:09:33 -0800 Subject: [PATCH 16/46] add filter to LinearIndex and MultiIndex --- src/sourmash/index.py | 28 ++++++++++++++++++++++------ src/sourmash/sourmash_args.py | 35 +++++++++++++---------------------- 2 files changed, 35 insertions(+), 28 deletions(-) diff --git a/src/sourmash/index.py b/src/sourmash/index.py index a8cc322d55..a4b287b90f 100644 --- a/src/sourmash/index.py +++ b/src/sourmash/index.py @@ -152,13 +152,19 @@ def load(cls, location): return lidx def select(self, ksize=None, moltype=None): - def select_sigs(siglist, ksize, moltype): - for ss in siglist: - if (ksize is None or ss.minhash.ksize == ksize) and \ - (moltype is None or ss.minhash.moltype == moltype): - yield ss + def select_sigs(ss, ksize=ksize, moltype=moltype): + if (ksize is None or ss.minhash.ksize == ksize) and \ + (moltype is None or ss.minhash.moltype == moltype): + return True + + return self.filter(select_sigs) + + def filter(self, filter_fn): + siglist = [] + for ss in self._signatures: + if filter_fn(ss): + siglist.append(ss) - siglist=select_sigs(self._signatures, ksize, moltype) return LinearIndex(siglist, self.filename) @@ -206,6 +212,16 @@ def select(self, ksize=None, moltype=None): return MultiIndex(new_idx_list, new_src_list) + def filter(self, filter_fn): + new_idx_list = [] + new_src_list = [] + for idx, src in zip(self.index_list, self.source_list): + idx = idx.filter(filter_fn) + new_idx_list.append(idx) + new_src_list.append(src) + + return MultiIndex(new_idx_list, new_src_list) + def search(self, query, *args, **kwargs): # do the actual search: matches = [] diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index f0a8e5b3c5..8f8b1f7624 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -181,16 +181,7 @@ def traverse_find_sigs(filenames, yield_all_files=False): yield fullname -def filter_compatible_signatures(query, siglist, force=False): - for ss in siglist: - if check_signatures_are_compatible(query, ss): - yield ss - else: - if not force: - raise ValueError("incompatible signature") - - -def check_signatures_are_compatible(query, subject): +def _check_signatures_are_compatible(query, subject): # is one scaled, and the other not? cannot do search if query.minhash.scaled and not subject.minhash.scaled or \ not query.minhash.scaled and subject.minhash.scaled: @@ -280,7 +271,8 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None) assert dbtype == DatabaseType.SIGLIST db = db.select(moltype=query_moltype, ksize=query_ksize) - # @CTB filter compatible signatures here, too. write test. + filter_fn = lambda s: _check_signatures_are_compatible(query, s) + db = db.filter(filter_fn) if not db: notify("no compatible signatures found in '{}'", filename) sys.exit(-1) @@ -315,21 +307,20 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None) # CTB: it's not clear to me that filter_compatible_signatures # should fail here, on incompatible signatures; but that's # what we have it doing currently. Revisit. - siglist = filter_compatible_signatures(query, siglist, False) - siglist = list(siglist) + filter_fn = lambda s: _check_signatures_are_compatible(query, + s) + db = db.filter(filter_fn) except ValueError: - siglist = [] + db = None - if not siglist: - notify("no compatible signatures found in '{}'", filename) + if not db: + notify(f"no compatible signatures found in '{filename}'") sys.exit(-1) - linear = LinearIndex(siglist, filename=filename) - databases.append(linear) + databases.append(db) - notify('loaded {} signatures from {}', len(linear), - filename, end='\r') - n_signatures += len(linear) + notify(f'loaded {len(db)} signatures from {filename}', end='\r') + n_signatures += len(db) # unknown!? else: @@ -402,7 +393,7 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): if not loaded: try: with open(filename, 'rt') as fp: - db = LinearIndex.load(fp) + db = LinearIndex.load(filename) dbtype = DatabaseType.SIGLIST loaded = True except Exception as exc: From 40395ffe85e82ff92ed5c350bc7fc53bb9140294 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Tue, 9 Mar 2021 07:17:36 -0800 Subject: [PATCH 17/46] clean up sourmash_args some more --- src/sourmash/sourmash_args.py | 39 ++++++----------------------------- 1 file changed, 6 insertions(+), 33 deletions(-) diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 8f8b1f7624..4766fe7664 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -266,21 +266,7 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None) sys.exit(-1) # are we collecting signatures from a directory/path? - if os.path.isdir(filename): - # CTB: combine with SIGLIST below? - assert dbtype == DatabaseType.SIGLIST - - db = db.select(moltype=query_moltype, ksize=query_ksize) - filter_fn = lambda s: _check_signatures_are_compatible(query, s) - db = db.filter(filter_fn) - if not db: - notify("no compatible signatures found in '{}'", filename) - sys.exit(-1) - databases.append(db) - n_signatures += len(db) - - # SBT - elif dbtype == DatabaseType.SBT: + if dbtype == DatabaseType.SBT: if not check_tree_is_compatible(filename, db, query, is_similarity_query): sys.exit(-1) @@ -303,15 +289,8 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None) elif dbtype == DatabaseType.SIGLIST: db = db.select(moltype=query_moltype, ksize=query_ksize) siglist = db.signatures() - try: - # CTB: it's not clear to me that filter_compatible_signatures - # should fail here, on incompatible signatures; but that's - # what we have it doing currently. Revisit. - filter_fn = lambda s: _check_signatures_are_compatible(query, - s) - db = db.filter(filter_fn) - except ValueError: - db = None + filter_fn = lambda s: _check_signatures_are_compatible(query, s) + db = db.filter(filter_fn) if not db: notify(f"no compatible signatures found in '{filename}'") @@ -369,7 +348,7 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): dbtype = DatabaseType.SIGLIST loaded = True - # load signatures from directory + # load signatures from directory, using MultiIndex to preserve source. if not loaded and os.path.isdir(filename): index_list = [] source_list = [] @@ -400,6 +379,7 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): pass # try load signatures from single file (list of signature paths) + # use MultiIndex to preserve source filenames. if not loaded: try: idx_list = [] @@ -458,14 +438,6 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): return db, dbtype -# note: dup from index.py internal function. -def _select_sigs(siglist, ksize, moltype): - for ss in siglist: - if (ksize is None or ss.minhash.ksize == ksize) and \ - (moltype is None or ss.minhash.moltype == moltype): - yield ss - - def load_file_as_index(filename, yield_all_files=False): """Load 'filename' as a database; generic database loader. @@ -512,6 +484,7 @@ def load_file_as_signatures(filename, select_moltype=None, ksize=None, else: return loader + def load_file_list_of_signatures(filename): "Load a list-of-files text file." try: From f377dc4764e85d89011a460aa4c54b84f6a85569 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 24 Mar 2021 08:28:00 -0700 Subject: [PATCH 18/46] shift loading over to Index classes --- src/sourmash/index.py | 42 +++++++++++++++++++++++++++++++++ src/sourmash/sourmash_args.py | 44 +++++++++++------------------------ 2 files changed, 55 insertions(+), 31 deletions(-) diff --git a/src/sourmash/index.py b/src/sourmash/index.py index 07e9c21b6a..3d2ff67c21 100644 --- a/src/sourmash/index.py +++ b/src/sourmash/index.py @@ -2,6 +2,7 @@ from abc import abstractmethod, ABC from collections import namedtuple +import os class Index(ABC): @@ -203,6 +204,47 @@ def insert(self, *args): def load(self, *args): raise NotImplementedError + @classmethod + def load_from_directory(cls, dirname, traverse_yield_all): + from .sourmash_args import traverse_find_sigs + if not os.path.isdir(dirname): + raise ValueError(f"'{dirname}' must be a directory") + + index_list = [] + source_list = [] + for thisfile in traverse_find_sigs([dirname], traverse_yield_all): + try: + idx = LinearIndex.load(thisfile) + index_list.append(idx) + source_list.append(thisfile) + except (IOError, sourmash.exceptions.SourmashError): + if traverse_yield_all: + continue + else: + raise + + db = None + if index_list: + db = cls(index_list, source_list) + + return db + + @classmethod + def load_from_file_list(cls, filename): + idx_list = [] + src_list = [] + + file_list = load_file_list_of_signatures(filename) + for fname in file_list: + idx = load_file_as_index(fname) + src = fname + + idx_list.append(idx) + src_list.append(src) + + db = MultiIndex(idx_list, src_list) + return db + def save(self, *args): raise NotImplementedError diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 5a3358783c..c6ad9125e3 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -73,6 +73,7 @@ def load_query_signature(filename, ksize, select_moltype, select_md5=None): sl = list(sl) except (OSError, ValueError): error("Cannot open file '{}'", filename) + raise sys.exit(-1) if len(sl) and select_md5: @@ -342,6 +343,8 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): loaded = False dbtype = None + print('XXX', filename) + # special case stdin if not loaded and filename == '-': db = LinearIndex.load(sys.stdin) @@ -349,30 +352,20 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): loaded = True # load signatures from directory, using MultiIndex to preserve source. - if not loaded and os.path.isdir(filename): - index_list = [] - source_list = [] - for thisfile in traverse_find_sigs([filename], traverse_yield_all): - try: - idx = LinearIndex.load(thisfile) - index_list.append(idx) - source_list.append(thisfile) - except (IOError, sourmash.exceptions.SourmashError): - if traverse_yield_all: - continue - else: - raise - - if index_list: - loaded=True - db = MultiIndex(index_list, source_list) + if not loaded: + try: + db = MultiIndex.load_from_directory(filename, traverse_yield_all) dbtype = DatabaseType.SIGLIST + loaded = True + except Exception as exc: + import traceback + traceback.print_exc() + pass # load signatures from single signature file if not loaded: try: - with open(filename, 'rt') as fp: - db = LinearIndex.load(filename) + db = LinearIndex.load(filename) dbtype = DatabaseType.SIGLIST loaded = True except Exception as exc: @@ -382,18 +375,7 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): # use MultiIndex to preserve source filenames. if not loaded: try: - idx_list = [] - src_list = [] - - file_list = load_file_list_of_signatures(filename) - for fname in file_list: - idx = load_file_as_index(fname) - src = fname - - idx_list.append(idx) - src_list.append(src) - - db = MultiIndex(idx_list, src_list) + db = MultiIndex.load_from_file_list(filename) dbtype = DatabaseType.SIGLIST loaded = True except Exception as exc: From 250c49a0b04972e36f4d2dc5ab9e57875234160b Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 24 Mar 2021 08:43:38 -0700 Subject: [PATCH 19/46] refactor, fix tests --- src/sourmash/index.py | 4 ++++ src/sourmash/sig/__main__.py | 2 +- src/sourmash/sourmash_args.py | 8 +++++--- tests/test_cmd_signature.py | 3 +++ 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/sourmash/index.py b/src/sourmash/index.py index 3d2ff67c21..b17a51d6f3 100644 --- a/src/sourmash/index.py +++ b/src/sourmash/index.py @@ -226,11 +226,15 @@ def load_from_directory(cls, dirname, traverse_yield_all): db = None if index_list: db = cls(index_list, source_list) + else: + raise ValueError(f"no signatures to load under directory '{dirname}'") return db @classmethod def load_from_file_list(cls, filename): + from .sourmash_args import (load_file_list_of_signatures, + load_file_as_index) idx_list = [] src_list = [] diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index 75f738ab14..076a3412b1 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -73,10 +73,10 @@ def cat(args): siglist = [] for sigfile in args.signatures: this_siglist = [] + n_loaded = 0 try: loader = sourmash_args.load_file_as_signatures(sigfile, progress=progress) - n_loaded = 0 for sig in loader: n_loaded += 1 diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index c6ad9125e3..29c9c3bd6e 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -343,7 +343,8 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): loaded = False dbtype = None - print('XXX', filename) + # @CTB add debug/more informative error messages if we're going to + # catch all exceptions. Or, standardize on ValueError or something. # special case stdin if not loaded and filename == '-': @@ -358,8 +359,6 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): dbtype = DatabaseType.SIGLIST loaded = True except Exception as exc: - import traceback - traceback.print_exc() pass # load signatures from single signature file @@ -417,6 +416,9 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): if not loaded: raise OSError(f"Error while reading signatures from '{filename}'.") + if loaded: + assert db + return db, dbtype diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index 4152ceab33..07e343f707 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -778,6 +778,9 @@ def test_sig_cat_filelist_with_dbs(c): c.run_sourmash('sig', 'cat', filelist, '-o', 'out.sig') + print(c.last_result.out) + print(c.last_result.err) + # stdout should be same signatures out = c.output('out.sig') From 9a921f97e7ee75e5833d0c57d9c31554d94c1d2c Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 25 Mar 2021 10:06:23 -0700 Subject: [PATCH 20/46] switch to a list of loader functions --- src/sourmash/sourmash_args.py | 123 +++++++++++++++++++++------------- tests/test_api.py | 4 +- 2 files changed, 79 insertions(+), 48 deletions(-) diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 29c9c3bd6e..b4a8e80cf0 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -6,6 +6,7 @@ import argparse import itertools from enum import Enum +import traceback import screed @@ -14,7 +15,7 @@ import sourmash.exceptions from . import signature -from .logging import notify, error +from .logging import notify, error, debug from .index import LinearIndex, MultiIndex from . import signature as sig @@ -73,7 +74,7 @@ def load_query_signature(filename, ksize, select_moltype, select_md5=None): sl = list(sl) except (OSError, ValueError): error("Cannot open file '{}'", filename) - raise + raise # @CTB testme? sys.exit(-1) if len(sl) and select_md5: @@ -333,6 +334,68 @@ class DatabaseType(Enum): LCA = 3 +def _load_stdin(filename, **kwargs): + db = None + if filename == '-': + db = LinearIndex.load(sys.stdin) + + return (db, DatabaseType.SIGLIST) + + +def _multiindex_load_from_file_list(filename, **kwargs): + try: + db = MultiIndex.load_from_file_list(filename) + except UnicodeDecodeError as exc: + raise ValueError(exc) + + return (db, DatabaseType.SIGLIST) + + +def _multiindex_load_from_directory(filename, **kwargs): + traverse_yield_all = kwargs['traverse_yield_all'] + db = MultiIndex.load_from_directory(filename, traverse_yield_all) + + return (db, DatabaseType.SIGLIST) + +def _load_sigfile(filename, **kwargs): + try: + db = LinearIndex.load(filename) + except sourmash.exceptions.SerdeError as exc: + raise ValueError(exc) + except FileNotFoundError: + raise ValueError(f"Error while reading signatures from '{filename}'") + except Exception as exc: + raise ValueError(f"Error while reading signatures from '{filename}'") + #raise ValueError(exc) # load_signature line 255 raises general exc @CTB + return (db, DatabaseType.SIGLIST) + + +def _load_sbt(filename, **kwargs): + cache_size = kwargs.get('cache_size') + + try: + db = load_sbt_index(filename, cache_size=cache_size) + except FileNotFoundError as exc: + raise ValueError(exc) + + return (db, DatabaseType.SBT) + + +def _load_revindex(filename, **kwargs): + db, _, _ = load_single_database(filename) + return (db, DatabaseType.LCA) + + +_loader_functions = [ + ("load from stdin", _load_stdin), + ("load from directory", _multiindex_load_from_directory), + ("load from sig file", _load_sigfile), + ("load from file list", _multiindex_load_from_file_list), + ("load SBT", _load_sbt), + ("load revindex", _load_revindex), + ] + + def _load_database(filename, traverse_yield_all, *, cache_size=None): """Load file as a database - list of signatures, LCA, SBT, etc. @@ -346,55 +409,23 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): # @CTB add debug/more informative error messages if we're going to # catch all exceptions. Or, standardize on ValueError or something. - # special case stdin - if not loaded and filename == '-': - db = LinearIndex.load(sys.stdin) - dbtype = DatabaseType.SIGLIST - loaded = True - - # load signatures from directory, using MultiIndex to preserve source. - if not loaded: + for (desc, load_fn) in _loader_functions: + #print(f"trying loader fn {desc} for {filename}", file=sys.stderr) try: - db = MultiIndex.load_from_directory(filename, traverse_yield_all) - dbtype = DatabaseType.SIGLIST - loaded = True - except Exception as exc: + db, dbtype = load_fn(filename, + traverse_yield_all=traverse_yield_all, + cache_size=cache_size) + except ValueError as exc: + #print(f"FAIL load {desc}", file=sys.stderr) pass - - # load signatures from single signature file - if not loaded: - try: - db = LinearIndex.load(filename) - dbtype = DatabaseType.SIGLIST - loaded = True except Exception as exc: - pass + #print(f"FAIL load {desc}", file=sys.stderr) + #print(traceback.format_exc(), file=sys.stderr) + raise - # try load signatures from single file (list of signature paths) - # use MultiIndex to preserve source filenames. - if not loaded: - try: - db = MultiIndex.load_from_file_list(filename) - dbtype = DatabaseType.SIGLIST + if db: loaded = True - except Exception as exc: - pass - - if not loaded: # try load as SBT - try: - db = load_sbt_index(filename, cache_size=cache_size) - loaded = True - dbtype = DatabaseType.SBT - except: - pass - - if not loaded: # try load as LCA - try: - db, _, _ = load_single_database(filename) - loaded = True - dbtype = DatabaseType.LCA - except: - pass + break # check to see if it's a FASTA/FASTQ record (i.e. screed loadable) # so we can provide a better error message to users. diff --git a/tests/test_api.py b/tests/test_api.py index a6c298d3c3..4e5930b193 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -20,9 +20,9 @@ def test_sourmash_signature_api(c): @utils.in_tempdir def test_load_index_0_no_file(c): - with pytest.raises(OSError) as exc: + with pytest.raises(Exception) as exc: # @CTB fix exception type idx = sourmash.load_file_as_index(c.output('does-not-exist')) - assert 'Error while reading signatures from ' in str(exc.value) + assert 'Error ' in str(exc.value) # @CTB fixme def test_load_index_1(): From 780fb716ba3b53ea7141f3c60a9fa0120cb3600a Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 26 Mar 2021 09:53:30 -0700 Subject: [PATCH 21/46] comments, docstrings, and tests passing --- src/sourmash/lca/lca_db.py | 5 ++++- src/sourmash/sig/__main__.py | 2 +- src/sourmash/sourmash_args.py | 23 +++++++++++++++++++---- tests/test_cmd_signature.py | 3 +++ 4 files changed, 27 insertions(+), 6 deletions(-) diff --git a/src/sourmash/lca/lca_db.py b/src/sourmash/lca/lca_db.py index 9c305c80b4..137f0fbd5c 100644 --- a/src/sourmash/lca/lca_db.py +++ b/src/sourmash/lca/lca_db.py @@ -1,5 +1,5 @@ "LCA database class and utilities." - +import os import json import gzip from collections import OrderedDict, defaultdict, Counter @@ -187,6 +187,9 @@ def load(cls, db_name): "Load LCA_Database from a JSON file." from .lca_utils import taxlist, LineagePair + if not os.path.isfile(db_name): + raise ValueError(f"'{db_name}' is not a file and cannot be loaded as an LCA database") + xopen = open if db_name.endswith('.gz'): xopen = gzip.open diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index 076a3412b1..e39f54d3a9 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -73,7 +73,7 @@ def cat(args): siglist = [] for sigfile in args.signatures: this_siglist = [] - n_loaded = 0 + n_loaded = 0 # @CTB testme. try: loader = sourmash_args.load_file_as_signatures(sigfile, progress=progress) diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index b4a8e80cf0..c70efd52ec 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -259,11 +259,12 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None) n_databases = 0 databases = [] for filename in filenames: - notify('loading from {}...', filename, end='\r') + notify(f'loading from {filename}...', end='\r') try: db, dbtype = _load_database(filename, False, cache_size=cache_size) except IOError as e: + raise notify(str(e)) sys.exit(-1) @@ -335,6 +336,7 @@ class DatabaseType(Enum): def _load_stdin(filename, **kwargs): + "Load collection from .sig file streamed in via stdin" db = None if filename == '-': db = LinearIndex.load(sys.stdin) @@ -343,6 +345,7 @@ def _load_stdin(filename, **kwargs): def _multiindex_load_from_file_list(filename, **kwargs): + "Load collection from a list of signature/database files" try: db = MultiIndex.load_from_file_list(filename) except UnicodeDecodeError as exc: @@ -352,12 +355,15 @@ def _multiindex_load_from_file_list(filename, **kwargs): def _multiindex_load_from_directory(filename, **kwargs): + "Load collection from a directory." traverse_yield_all = kwargs['traverse_yield_all'] db = MultiIndex.load_from_directory(filename, traverse_yield_all) return (db, DatabaseType.SIGLIST) + def _load_sigfile(filename, **kwargs): + "Load collection from a signature JSON file" try: db = LinearIndex.load(filename) except sourmash.exceptions.SerdeError as exc: @@ -371,6 +377,7 @@ def _load_sigfile(filename, **kwargs): def _load_sbt(filename, **kwargs): + "Load collection from an SBT." cache_size = kwargs.get('cache_size') try: @@ -382,10 +389,15 @@ def _load_sbt(filename, **kwargs): def _load_revindex(filename, **kwargs): - db, _, _ = load_single_database(filename) + "Load collection from an LCA database/reverse index." + try: + db, _, _ = load_single_database(filename) + except FileNotFoundError as exc: + raise ValueError(exc) return (db, DatabaseType.LCA) +# all loader functions, in order. _loader_functions = [ ("load from stdin", _load_stdin), ("load from directory", _multiindex_load_from_directory), @@ -505,10 +517,13 @@ def load_file_list_of_signatures(filename): try: with open(filename, 'rt') as fp: file_list = [ x.rstrip('\r\n') for x in fp ] + + if not os.path.exists(file_list[0]): + raise ValueError("first element of list-of-files does not exist") except OSError: - raise ValueError("cannot open file '{}'".format(filename)) + raise ValueError(f"cannot open file '{filename}'") except UnicodeDecodeError: - raise ValueError("cannot parse file '{}' as list of filenames".format(filename)) + raise ValueError(f"cannot parse file '{filename}' as list of filenames") return file_list diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index 07e343f707..2dd0cac078 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -625,6 +625,9 @@ def test_sig_rename_3_file_dne(c): with pytest.raises(ValueError) as e: c.run_sourmash('sig', 'rename', 'no-such-sig', 'fiz bar') + print('out', (c.last_result.out,)) + print('err', (c.last_result.err,)) + assert "Error while reading signatures from 'no-such-sig'" in c.last_result.err From d2619636b3eeecdc1cb02e3b8ea76700f4efc0a4 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 26 Mar 2021 10:19:23 -0700 Subject: [PATCH 22/46] update to use f strings throughout sourmash_args.py --- src/sourmash/sourmash_args.py | 59 ++++++++++++++--------------------- tests/test_api.py | 6 ++-- 2 files changed, 28 insertions(+), 37 deletions(-) diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index c70efd52ec..9012e7d66c 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -25,7 +25,6 @@ import sourmash DEFAULT_LOAD_K = 31 -DEFAULT_N = 500 def get_moltype(sig, require=False): @@ -84,8 +83,7 @@ def load_query_signature(filename, ksize, select_moltype, select_md5=None): if sig_md5.startswith(select_md5.lower()): # make sure we pick only one -- if found_sig is not None: - error("Error! Multiple signatures start with md5 '{}'", - select_md5) + error(f"Error! Multiple signatures start with md5 '{select_md5}'") error("Please use a longer --md5 selector.") sys.exit(-1) else: @@ -98,16 +96,16 @@ def load_query_signature(filename, ksize, select_moltype, select_md5=None): if len(ksizes) == 1: ksize = ksizes.pop() sl = [ ss for ss in sl if ss.minhash.ksize == ksize ] - notify('select query k={} automatically.', ksize) + notify(f'select query k={ksize} automatically.') elif DEFAULT_LOAD_K in ksizes: sl = [ ss for ss in sl if ss.minhash.ksize == DEFAULT_LOAD_K ] - notify('selecting default query k={}.', DEFAULT_LOAD_K) + notify(f'selecting default query k={DEFAULT_LOAD_K}.') elif ksize: - notify('selecting specified query k={}', ksize) + notify(f'selecting specified query k={ksize}') if len(sl) != 1: - error('When loading query from "{}"', filename) - error('{} signatures matching ksize and molecule type;', len(sl)) + error(f"When loading query from '{filename}'", filename) + error(f'{len(sl)} signatures matching ksize and molecule type;') error('need exactly one. Specify --ksize or --dna, --rna, or --protein.') sys.exit(-1) @@ -190,11 +188,9 @@ def _check_signatures_are_compatible(query, subject): error("signature {} and {} are incompatible - cannot compare.", query, subject) if query.minhash.scaled: - error("{} was calculated with --scaled, {} was not.", - query, subject) + error(f"{query} was calculated with --scaled, {subject} was not.") if subject.minhash.scaled: - error("{} was calculated with --scaled, {} was not.", - subject, query) + error(f"{subject} was calculated with --scaled, {query} was not.") return 0 return 1 @@ -208,15 +204,14 @@ def check_tree_is_compatible(treename, tree, query, is_similarity_query): query_mh = query.minhash if tree_mh.ksize != query_mh.ksize: - error("ksize on tree '{}' is {};", treename, tree_mh.ksize) - error('this is different from query ksize of {}.', query_mh.ksize) + error(f"ksize on tree '{treename}' is {tree_mh.ksize};") + error(f"this is different from query ksize of {query_mh.ksize}.") return 0 # is one scaled, and the other not? cannot do search. if (tree_mh.scaled and not query_mh.scaled) or \ (query_mh.scaled and not tree_mh.scaled): - error("for tree '{}', tree and query are incompatible for search.", - treename) + error(f"for tree '{treename}', tree and query are incompatible for search.") if tree_mh.scaled: error("tree was calculated with scaled, query was not.") else: @@ -226,9 +221,8 @@ def check_tree_is_compatible(treename, tree, query, is_similarity_query): # are the scaled values incompatible? cannot downsample tree for similarity if tree_mh.scaled and tree_mh.scaled < query_mh.scaled and \ is_similarity_query: - error("for tree '{}', scaled value is smaller than query.", treename) - error("tree scaled: {}; query scaled: {}. Cannot do similarity search.", - tree_mh.scaled, query_mh.scaled) + error(f"for tree '{treename}', scaled value is smaller than query.") + error(f"tree scaled: {tree_mh.scaled}; query scaled: {query_mh.scaled}. Cannot do similarity search.") return 0 return 1 @@ -237,8 +231,8 @@ def check_tree_is_compatible(treename, tree, query, is_similarity_query): def check_lca_db_is_compatible(filename, db, query): query_mh = query.minhash if db.ksize != query_mh.ksize: - error("ksize on db '{}' is {};", filename, db.ksize) - error('this is different from query ksize of {}.', query_mh.ksize) + error(f"ksize on db '{filename}' is {db.ksize};") + error(f"this is different from query ksize of {query_mh.ksize}.") return 0 return 1 @@ -275,7 +269,7 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None) sys.exit(-1) databases.append(db) - notify('loaded SBT {}', filename, end='\r') + notify(f'loaded SBT {filename}', end='\r') n_databases += 1 # LCA @@ -283,7 +277,7 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None) if not check_lca_db_is_compatible(filename, db, query): sys.exit(-1) - notify('loaded LCA {}', filename, end='\r') + notify('loaded LCA {filename}', end='\r') n_databases += 1 databases.append(db) @@ -306,19 +300,18 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None) # unknown!? else: - raise Exception("unknown dbtype {}".format(dbtype)) + raise ValueError("unknown dbtype {dbtype}") # END for loop notify(' '*79, end='\r') if n_signatures and n_databases: - notify('loaded {} signatures and {} databases total.', n_signatures, - n_databases) + notify(f'loaded {n_signatures} signatures and {n_databases} databases total.') elif n_signatures: - notify('loaded {} signatures.', n_signatures) + notify(f'loaded {n_signatures} signatures.') elif n_databases: - notify('loaded {} databases.', n_databases) + notify(f'loaded {n_databases} databases.') else: notify('** ERROR: no signatures or databases loaded?') sys.exit(-1) @@ -430,10 +423,6 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): except ValueError as exc: #print(f"FAIL load {desc}", file=sys.stderr) pass - except Exception as exc: - #print(f"FAIL load {desc}", file=sys.stderr) - #print(traceback.format_exc(), file=sys.stderr) - raise if db: loaded = True @@ -454,12 +443,12 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): pass if successful_screed_load: - raise OSError("Error while reading signatures from '{}' - got sequences instead! Is this a FASTA/FASTQ file?".format(filename)) + raise ValueError(f"Error while reading signatures from '{filename}' - got sequences instead! Is this a FASTA/FASTQ file?") if not loaded: - raise OSError(f"Error while reading signatures from '{filename}'.") + raise ValueError(f"Error while reading signatures from '{filename}'.") - if loaded: + if loaded: # this is a bit redundant but safe > sorry assert db return db, dbtype diff --git a/tests/test_api.py b/tests/test_api.py index 4e5930b193..b9f24fc385 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -53,7 +53,9 @@ def test_load_fasta_as_signature(): # try loading a fasta file - should fail with informative exception testfile = utils.get_test_data('short.fa') - with pytest.raises(OSError) as e: + with pytest.raises(ValueError) as exc: idx = sourmash.load_file_as_index(testfile) - assert "Error while reading signatures from '{}' - got sequences instead! Is this a FASTA/FASTQ file?".format(testfile) in str(e) + print(exc.value) + + assert f"Error while reading signatures from '{testfile}' - got sequences instead! Is this a FASTA/FASTQ file?" in str(exc.value) From 93fca04823d1ecd31ac965bc073572d309c8d010 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 26 Mar 2021 10:23:15 -0700 Subject: [PATCH 23/46] add docstrings --- src/sourmash/index.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/sourmash/index.py b/src/sourmash/index.py index b17a51d6f3..8b62d5c23b 100644 --- a/src/sourmash/index.py +++ b/src/sourmash/index.py @@ -206,6 +206,7 @@ def load(self, *args): @classmethod def load_from_directory(cls, dirname, traverse_yield_all): + "Create a MultiIndex from all files under a directory." from .sourmash_args import traverse_find_sigs if not os.path.isdir(dirname): raise ValueError(f"'{dirname}' must be a directory") @@ -233,6 +234,7 @@ def load_from_directory(cls, dirname, traverse_yield_all): @classmethod def load_from_file_list(cls, filename): + "Create a MultiIndex from all files listed in a text file." from .sourmash_args import (load_file_list_of_signatures, load_file_as_index) idx_list = [] From 020335775a7c71abf7343d97465b4695daca0426 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 26 Mar 2021 10:27:46 -0700 Subject: [PATCH 24/46] update comments --- src/sourmash/sourmash_args.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 9012e7d66c..fab04dd477 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -245,6 +245,8 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None) Check for compatibility with query. This is basically a user-focused wrapping of _load_databases. + + @CTB this can be refactored into a more generic function with 'filter'. """ query_ksize = query.minhash.ksize query_moltype = get_moltype(query) @@ -262,7 +264,7 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None) notify(str(e)) sys.exit(-1) - # are we collecting signatures from a directory/path? + # are we collecting signatures from an SBT? if dbtype == DatabaseType.SBT: if not check_tree_is_compatible(filename, db, query, is_similarity_query): @@ -272,7 +274,7 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None) notify(f'loaded SBT {filename}', end='\r') n_databases += 1 - # LCA + # or an LCA? elif dbtype == DatabaseType.LCA: if not check_lca_db_is_compatible(filename, db, query): sys.exit(-1) @@ -282,7 +284,7 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None) databases.append(db) - # signature file + # or a mixed collection of signatures? elif dbtype == DatabaseType.SIGLIST: db = db.select(moltype=query_moltype, ksize=query_ksize) siglist = db.signatures() @@ -300,7 +302,7 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None) # unknown!? else: - raise ValueError("unknown dbtype {dbtype}") + raise ValueError(f"unknown dbtype {dbtype}") # @CTB test? # END for loop From 8a0200a491ef2b6b4679592bd4e2c6dcec064233 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 26 Mar 2021 12:16:24 -0700 Subject: [PATCH 25/46] remove unnecessary changes --- tests/test_cmd_signature.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index 2dd0cac078..4152ceab33 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -625,9 +625,6 @@ def test_sig_rename_3_file_dne(c): with pytest.raises(ValueError) as e: c.run_sourmash('sig', 'rename', 'no-such-sig', 'fiz bar') - print('out', (c.last_result.out,)) - print('err', (c.last_result.err,)) - assert "Error while reading signatures from 'no-such-sig'" in c.last_result.err @@ -781,9 +778,6 @@ def test_sig_cat_filelist_with_dbs(c): c.run_sourmash('sig', 'cat', filelist, '-o', 'out.sig') - print(c.last_result.out) - print(c.last_result.err) - # stdout should be same signatures out = c.output('out.sig') From e9df90f0a49e47d71903603d78544907415688c0 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 26 Mar 2021 12:18:23 -0700 Subject: [PATCH 26/46] revert to original test --- tests/test_api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_api.py b/tests/test_api.py index b9f24fc385..b2f994751a 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -20,9 +20,9 @@ def test_sourmash_signature_api(c): @utils.in_tempdir def test_load_index_0_no_file(c): - with pytest.raises(Exception) as exc: # @CTB fix exception type + with pytest.raises(ValueError) as exc: # @CTB fix exception type idx = sourmash.load_file_as_index(c.output('does-not-exist')) - assert 'Error ' in str(exc.value) # @CTB fixme + assert 'Error while reading signatures from ' in str(exc.value) def test_load_index_1(): From 9e427e32778440393c3fe6ffa5cfee6c3f9b1eb3 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 26 Mar 2021 12:18:56 -0700 Subject: [PATCH 27/46] remove unneeded comment --- tests/test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_api.py b/tests/test_api.py index b2f994751a..3291c62165 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -20,7 +20,7 @@ def test_sourmash_signature_api(c): @utils.in_tempdir def test_load_index_0_no_file(c): - with pytest.raises(ValueError) as exc: # @CTB fix exception type + with pytest.raises(ValueError) as exc: idx = sourmash.load_file_as_index(c.output('does-not-exist')) assert 'Error while reading signatures from ' in str(exc.value) From 0dd390a92ea401a02518c54f236617078c69e82f Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 26 Mar 2021 12:21:42 -0700 Subject: [PATCH 28/46] clean up a bit --- src/sourmash/sourmash_args.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 5cf5ecc2bd..1a4c67fd9b 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -367,7 +367,7 @@ def _load_sigfile(filename, **kwargs): raise ValueError(f"Error while reading signatures from '{filename}'") except Exception as exc: raise ValueError(f"Error while reading signatures from '{filename}'") - #raise ValueError(exc) # load_signature line 255 raises general exc @CTB + #raise ValueError(exc) # load_signature line 255 raises general exc @CTB fixme return (db, DatabaseType.SIGLIST) @@ -413,17 +413,14 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): loaded = False dbtype = None - # @CTB add debug/more informative error messages if we're going to - # catch all exceptions. Or, standardize on ValueError or something. - + # iterate through loader functions, trying them all. Catch ValueError + # but nothing else. for (desc, load_fn) in _loader_functions: - #print(f"trying loader fn {desc} for {filename}", file=sys.stderr) try: db, dbtype = load_fn(filename, traverse_yield_all=traverse_yield_all, cache_size=cache_size) except ValueError as exc: - #print(f"FAIL load {desc}", file=sys.stderr) pass if db: From 2c0ee2986fa8f6eaaf8466f7793d252caed5e798 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 27 Mar 2021 07:25:48 -0700 Subject: [PATCH 29/46] debugging update --- src/sourmash/sourmash_args.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 1a4c67fd9b..b99601fff8 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -302,7 +302,7 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None) # unknown!? else: - raise ValueError(f"unknown dbtype {dbtype}") # @CTB test? + raise ValueError(f"unknown dbtype {dbtype}") # CTB check me. # END for loop @@ -417,11 +417,13 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): # but nothing else. for (desc, load_fn) in _loader_functions: try: + debug(f"_load_databases: trying loader fn {desc}") db, dbtype = load_fn(filename, traverse_yield_all=traverse_yield_all, cache_size=cache_size) except ValueError as exc: - pass + debug(f"_load_databases: FAIL on fn {desc}.") + debug(traceback.format_exc()) if db: loaded = True From edcb483b4b7cd4426fc5ecdffceda613087bc6a5 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 27 Mar 2021 07:30:06 -0700 Subject: [PATCH 30/46] better exception raising and capture for signature parsing --- src/sourmash/signature.py | 2 +- src/sourmash/sourmash_args.py | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/sourmash/signature.py b/src/sourmash/signature.py index 60bb2d5c90..e382e58311 100644 --- a/src/sourmash/signature.py +++ b/src/sourmash/signature.py @@ -252,7 +252,7 @@ def load_signatures( input_type = _detect_input_type(data) if input_type == SigInput.UNKNOWN: if do_raise: - raise Exception("Error in parsing signature; quitting. Cannot open file or invalid signature") + raise ValueError("Error in parsing signature; quitting. Cannot open file or invalid signature") return size = ffi.new("uintptr_t *") diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index b99601fff8..486a2a7ffa 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -246,7 +246,7 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None) This is basically a user-focused wrapping of _load_databases. - @CTB this can be refactored into a more generic function with 'filter'. + CTB: this can be refactored into a more generic function with 'filter'. """ query_ksize = query.minhash.ksize query_moltype = get_moltype(query) @@ -361,13 +361,11 @@ def _load_sigfile(filename, **kwargs): "Load collection from a signature JSON file" try: db = LinearIndex.load(filename) - except sourmash.exceptions.SerdeError as exc: + except sourmash.exceptions.SourmashError as exc: raise ValueError(exc) except FileNotFoundError: raise ValueError(f"Error while reading signatures from '{filename}'") - except Exception as exc: - raise ValueError(f"Error while reading signatures from '{filename}'") - #raise ValueError(exc) # load_signature line 255 raises general exc @CTB fixme + return (db, DatabaseType.SIGLIST) From 3f6c3f21c3311a12a1e1c658230b4390ecf0399d Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 27 Mar 2021 07:35:02 -0700 Subject: [PATCH 31/46] more specific error message --- src/sourmash/sourmash_args.py | 3 +-- tests/test_sourmash.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 486a2a7ffa..b0720a5f07 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -72,8 +72,7 @@ def load_query_signature(filename, ksize, select_moltype, select_md5=None): select_moltype=select_moltype) sl = list(sl) except (OSError, ValueError): - error("Cannot open file '{}'", filename) - raise # @CTB testme? + error(f"Cannot open query file '{filename}'") sys.exit(-1) if len(sl) and select_md5: diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index a1f2b55f6e..1a0d1d4d65 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -716,7 +716,7 @@ def test_search_query_sig_does_not_exist(c): print(c.last_result.status, c.last_result.out, c.last_result.err) assert c.last_result.status == -1 - assert "Cannot open file 'short2.fa.sig'" in c.last_result.err + assert "Cannot open query file 'short2.fa.sig'" in c.last_result.err assert len(c.last_result.err.split('\n\r')) < 5 From 78dbb1d07d10168b5c7ed2c07ad171aea3c7e06f Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 27 Mar 2021 07:41:58 -0700 Subject: [PATCH 32/46] revert change in favor of creating new issue --- src/sourmash/sig/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index e39f54d3a9..75f738ab14 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -73,10 +73,10 @@ def cat(args): siglist = [] for sigfile in args.signatures: this_siglist = [] - n_loaded = 0 # @CTB testme. try: loader = sourmash_args.load_file_as_signatures(sigfile, progress=progress) + n_loaded = 0 for sig in loader: n_loaded += 1 From 229b1d7f3e7c810ac7a9d8d43c3ad7c9d2fa0af1 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 28 Mar 2021 08:41:03 -0700 Subject: [PATCH 33/46] add commentary => TODO --- src/sourmash/commands.py | 1 + src/sourmash/lca/command_summarize.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index ff41e3e124..35b88d0610 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -534,6 +534,7 @@ def categorize(args): tree = load_sbt_index(args.sbt_name) # load query filenames + # @CTB replace with MultiIndex? inp_files = set(sourmash_args.traverse_find_sigs(args.queries)) inp_files = inp_files - already_names diff --git a/src/sourmash/lca/command_summarize.py b/src/sourmash/lca/command_summarize.py index d5d0765f03..e64cca8e01 100644 --- a/src/sourmash/lca/command_summarize.py +++ b/src/sourmash/lca/command_summarize.py @@ -67,6 +67,8 @@ def load_singletons_and_count(filenames, ksize, scaled, ignore_abundance): filenames = sourmash_args.traverse_find_sigs(filenames) filenames = list(filenames) + # @CTB replace with MultiIndex? + total_n = len(filenames) for query_filename in filenames: From 20ed9f0b91ecbb67f1e41fe141022e9ad5ac00d9 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 28 Mar 2021 09:08:01 -0700 Subject: [PATCH 34/46] add tests for MultiIndex.load_from_directory; fix traverse code --- src/sourmash/index.py | 12 +-- src/sourmash/sourmash_args.py | 40 +++++----- tests/test_index.py | 137 ++++++++++++++++++++++++++++++++++ 3 files changed, 165 insertions(+), 24 deletions(-) diff --git a/src/sourmash/index.py b/src/sourmash/index.py index 8b62d5c23b..e6dd779b91 100644 --- a/src/sourmash/index.py +++ b/src/sourmash/index.py @@ -1,5 +1,6 @@ "An Abstract Base Class for collections of signatures." +import sourmash from abc import abstractmethod, ABC from collections import namedtuple import os @@ -205,7 +206,7 @@ def load(self, *args): raise NotImplementedError @classmethod - def load_from_directory(cls, dirname, traverse_yield_all): + def load_from_directory(cls, dirname, force=False): "Create a MultiIndex from all files under a directory." from .sourmash_args import traverse_find_sigs if not os.path.isdir(dirname): @@ -213,16 +214,17 @@ def load_from_directory(cls, dirname, traverse_yield_all): index_list = [] source_list = [] - for thisfile in traverse_find_sigs([dirname], traverse_yield_all): + for thisfile in traverse_find_sigs([dirname], + yield_all_files=force): try: idx = LinearIndex.load(thisfile) index_list.append(idx) source_list.append(thisfile) except (IOError, sourmash.exceptions.SourmashError): - if traverse_yield_all: - continue + if force: + continue # ignore error else: - raise + raise # contine past error! db = None if index_list: diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index b0720a5f07..84a63c7bfa 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -152,32 +152,34 @@ def __iter__(self): yield filename, query, query_moltype, query_ksize +def _check_suffix(filename, endings): + for ending in endings: + if filename.endswith(ending): + return True + return False + + def traverse_find_sigs(filenames, yield_all_files=False): + """Find all .sig and .sig.gz files in & beneath 'filenames'. + + By default, this function returns files with .sig and .sig.gz extensions. + If 'yield_all_files' is True, this will return _all_ files + (but not directories). + """ endings = ('.sig', '.sig.gz') for filename in filenames: + # check for files in filenames: if os.path.isfile(filename): - yield_me = False - if yield_all_files: - yield_me = True - continue - else: - for ending in endings: - if filename.endswith(ending): - yield_me = True - break - - if yield_me: + if yield_all_files or _check_suffix(filename, endings): yield filename - continue - - # filename is a directory -- - dirname = filename - for root, dirs, files in os.walk(dirname): - for name in files: - if name.endswith('.sig') or yield_all_files: + # filename is a directory -- traverse beneath! + elif os.path.isdir(filename): + for root, dirs, files in os.walk(filename): + for name in files: fullname = os.path.join(root, name) - yield fullname + if yield_all_files or _check_suffix(fullname, endings): + yield fullname def _check_signatures_are_compatible(query, subject): diff --git a/tests/test_index.py b/tests/test_index.py index 1b9ae93402..6530243a8b 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -1,6 +1,11 @@ +""" +Tests for Index classes and subclasses. +""" +import pytest import glob import os import zipfile +import shutil import sourmash from sourmash import load_one_signature, SourmashSignature @@ -502,3 +507,135 @@ def test_multi_index_signatures(): assert ss2 in siglist assert ss47 in siglist assert ss63 in siglist + + +def test_multi_index_load_from_directory(): + dirname = utils.get_test_data('prot/protein') + mi = MultiIndex.load_from_directory(dirname, force=False) + + sigs = list(mi.signatures()) + assert len(sigs) == 2 + + +def test_multi_index_load_from_directory_2(): + # only load .sig files, currently; not the databases under that directory. + dirname = utils.get_test_data('prot') + mi = MultiIndex.load_from_directory(dirname, force=False) + + print(mi.index_list) + print(mi.source_list) + + sigs = list(mi.signatures()) + assert len(sigs) == 6 + + +@utils.in_tempdir +def test_multi_index_load_from_directory_3(c): + # check that force works ok on a directory + dirname = utils.get_test_data('prot') + + count = 0 + for root, dirs, files in os.walk(dirname): + for name in files: + print(f"at {name}") + fullname = os.path.join(root, name) + copyto = c.output(f"file{count}.sig") + shutil.copyfile(fullname, copyto) + count += 1 + + with pytest.raises(sourmash.exceptions.SourmashError): + mi = MultiIndex.load_from_directory(c.location, force=False) + + +@utils.in_tempdir +def test_multi_index_load_from_directory_3_yield_all_true(c): + # check that force works ok on a directory w/force=True + dirname = utils.get_test_data('prot') + + count = 0 + for root, dirs, files in os.walk(dirname): + for name in files: + print(f"at {name}") + fullname = os.path.join(root, name) + copyto = c.output(f"file{count}.something") + shutil.copyfile(fullname, copyto) + count += 1 + + mi = MultiIndex.load_from_directory(c.location, force=True) + + print(mi.index_list) + print(mi.source_list) + + sigs = list(mi.signatures()) + assert len(sigs) == 6 + + +@utils.in_tempdir +def test_multi_index_load_from_directory_3_yield_all_true_subdir(c): + # check that force works ok on subdirectories + dirname = utils.get_test_data('prot') + + target_dir = c.output("some_subdir") + os.mkdir(target_dir) + + count = 0 + for root, dirs, files in os.walk(dirname): + for name in files: + print(f"at {name}") + fullname = os.path.join(root, name) + copyto = os.path.join(target_dir, f"file{count}.something") + shutil.copyfile(fullname, copyto) + count += 1 + + mi = MultiIndex.load_from_directory(c.location, force=True) + + print(mi.index_list) + print(mi.source_list) + + sigs = list(mi.signatures()) + assert len(sigs) == 6 + + +@utils.in_tempdir +def test_multi_index_load_from_directory_3_sig_gz(c): + # check that we find .sig.gz files, too + dirname = utils.get_test_data('prot') + + count = 0 + for root, dirs, files in os.walk(dirname): + for name in files: + if not name.endswith('.sig'): # skip non .sig things + continue + print(f"at {name}") + fullname = os.path.join(root, name) + copyto = c.output(f"file{count}.sig.gz") + shutil.copyfile(fullname, copyto) + count += 1 + + mi = MultiIndex.load_from_directory(c.location, force=False) + + print(mi.index_list) + print(mi.source_list) + + sigs = list(mi.signatures()) + assert len(sigs) == 6 + + +@utils.in_tempdir +def test_multi_index_load_from_directory_3_check_traverse_fn(c): + # test the actual traverse function... eventually this test can be + # removed, probably? + from sourmash import sourmash_args + + dirname = utils.get_test_data('prot') + files = list(sourmash_args.traverse_find_sigs([dirname])) + assert len(files) == 6, files + + files = list(sourmash_args.traverse_find_sigs([dirname], True)) + assert len(files) == 14, files + + +def test_multi_index_load_from_directory_no_exist(): + dirname = utils.get_test_data('does-not-exist') + with pytest.raises(ValueError): + mi = MultiIndex.load_from_directory(dirname, force=True) From 16a119e33cf83f33f8bfdab92af0aec37f00ee40 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 28 Mar 2021 09:20:28 -0700 Subject: [PATCH 35/46] switch lca summarize over to usig MultiIndex --- src/sourmash/index.py | 18 +++++++++------- src/sourmash/lca/command_summarize.py | 22 +++++++------------- src/sourmash/sourmash_args.py | 6 +++--- tests/test_index.py | 30 +++++++++++++-------------- 4 files changed, 37 insertions(+), 39 deletions(-) diff --git a/src/sourmash/index.py b/src/sourmash/index.py index e6dd779b91..0d05a03726 100644 --- a/src/sourmash/index.py +++ b/src/sourmash/index.py @@ -195,6 +195,11 @@ def signatures(self): for ss in idx.signatures(): yield ss + def signatures_with_location(self): + for idx, loc in zip(self.index_list, self.source_list): + for ss in idx.signatures(): + yield ss, loc + def __len__(self): return sum([ len(idx) for idx in self.index_list ]) @@ -206,16 +211,15 @@ def load(self, *args): raise NotImplementedError @classmethod - def load_from_directory(cls, dirname, force=False): - "Create a MultiIndex from all files under a directory." + def load_from_path(cls, pathname, force=False): + "Create a MultiIndex from a path (filename or directory)." from .sourmash_args import traverse_find_sigs - if not os.path.isdir(dirname): - raise ValueError(f"'{dirname}' must be a directory") + if not os.path.exists(pathname): + raise ValueError(f"'{pathname}' must be a directory") index_list = [] source_list = [] - for thisfile in traverse_find_sigs([dirname], - yield_all_files=force): + for thisfile in traverse_find_sigs([pathname], yield_all_files=force): try: idx = LinearIndex.load(thisfile) index_list.append(idx) @@ -230,7 +234,7 @@ def load_from_directory(cls, dirname, force=False): if index_list: db = cls(index_list, source_list) else: - raise ValueError(f"no signatures to load under directory '{dirname}'") + raise ValueError(f"no signatures to load under directory '{pathname}'") return db diff --git a/src/sourmash/lca/command_summarize.py b/src/sourmash/lca/command_summarize.py index e64cca8e01..2d5ec5a375 100644 --- a/src/sourmash/lca/command_summarize.py +++ b/src/sourmash/lca/command_summarize.py @@ -10,6 +10,7 @@ from ..logging import notify, error, print_results, set_quiet, debug from . import lca_utils from .lca_utils import check_files_exist +from sourmash.index import MultiIndex DEFAULT_THRESHOLD=5 @@ -61,22 +62,15 @@ def load_singletons_and_count(filenames, ksize, scaled, ignore_abundance): total_count = 0 n = 0 - # in order to get the right reporting out of this function, we need - # to do our own traversal to expand the list of filenames, as opposed - # to using load_file_as_signatures(...) - filenames = sourmash_args.traverse_find_sigs(filenames) - filenames = list(filenames) - - # @CTB replace with MultiIndex? - total_n = len(filenames) - - for query_filename in filenames: + for filename in filenames: n += 1 - for query_sig in sourmash_args.load_file_as_signatures(query_filename, - ksize=ksize): + mi = MultiIndex.load_from_path(filename) + mi = mi.select(ksize=ksize) + + for query_sig, query_filename in mi.signatures_with_location(): notify(u'\r\033[K', end=u'') - notify('... loading {} (file {} of {})', query_sig, n, + notify(f'... loading {query_sig} (file {n} of {total_n})', total_n, end='\r') total_count += 1 @@ -89,7 +83,7 @@ def load_singletons_and_count(filenames, ksize, scaled, ignore_abundance): yield query_filename, query_sig, hashvals notify(u'\r\033[K', end=u'') - notify('loaded {} signatures from {} files total.', total_count, n) + notify(f'loaded {total_count} signatures from {n} files total.') def count_signature(sig, scaled, hashvals): diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 84a63c7bfa..3a1ee1dd6b 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -350,10 +350,10 @@ def _multiindex_load_from_file_list(filename, **kwargs): return (db, DatabaseType.SIGLIST) -def _multiindex_load_from_directory(filename, **kwargs): +def _multiindex_load_from_path(filename, **kwargs): "Load collection from a directory." traverse_yield_all = kwargs['traverse_yield_all'] - db = MultiIndex.load_from_directory(filename, traverse_yield_all) + db = MultiIndex.load_from_path(filename, traverse_yield_all) return (db, DatabaseType.SIGLIST) @@ -394,7 +394,7 @@ def _load_revindex(filename, **kwargs): # all loader functions, in order. _loader_functions = [ ("load from stdin", _load_stdin), - ("load from directory", _multiindex_load_from_directory), + ("load from directory", _multiindex_load_from_path), ("load from sig file", _load_sigfile), ("load from file list", _multiindex_load_from_file_list), ("load SBT", _load_sbt), diff --git a/tests/test_index.py b/tests/test_index.py index 6530243a8b..2b242e62c3 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -509,18 +509,18 @@ def test_multi_index_signatures(): assert ss63 in siglist -def test_multi_index_load_from_directory(): +def test_multi_index_load_from_path(): dirname = utils.get_test_data('prot/protein') - mi = MultiIndex.load_from_directory(dirname, force=False) + mi = MultiIndex.load_from_path(dirname, force=False) sigs = list(mi.signatures()) assert len(sigs) == 2 -def test_multi_index_load_from_directory_2(): +def test_multi_index_load_from_path_2(): # only load .sig files, currently; not the databases under that directory. dirname = utils.get_test_data('prot') - mi = MultiIndex.load_from_directory(dirname, force=False) + mi = MultiIndex.load_from_path(dirname, force=False) print(mi.index_list) print(mi.source_list) @@ -530,7 +530,7 @@ def test_multi_index_load_from_directory_2(): @utils.in_tempdir -def test_multi_index_load_from_directory_3(c): +def test_multi_index_load_from_path_3(c): # check that force works ok on a directory dirname = utils.get_test_data('prot') @@ -544,11 +544,11 @@ def test_multi_index_load_from_directory_3(c): count += 1 with pytest.raises(sourmash.exceptions.SourmashError): - mi = MultiIndex.load_from_directory(c.location, force=False) + mi = MultiIndex.load_from_path(c.location, force=False) @utils.in_tempdir -def test_multi_index_load_from_directory_3_yield_all_true(c): +def test_multi_index_load_from_path_3_yield_all_true(c): # check that force works ok on a directory w/force=True dirname = utils.get_test_data('prot') @@ -561,7 +561,7 @@ def test_multi_index_load_from_directory_3_yield_all_true(c): shutil.copyfile(fullname, copyto) count += 1 - mi = MultiIndex.load_from_directory(c.location, force=True) + mi = MultiIndex.load_from_path(c.location, force=True) print(mi.index_list) print(mi.source_list) @@ -571,7 +571,7 @@ def test_multi_index_load_from_directory_3_yield_all_true(c): @utils.in_tempdir -def test_multi_index_load_from_directory_3_yield_all_true_subdir(c): +def test_multi_index_load_from_path_3_yield_all_true_subdir(c): # check that force works ok on subdirectories dirname = utils.get_test_data('prot') @@ -587,7 +587,7 @@ def test_multi_index_load_from_directory_3_yield_all_true_subdir(c): shutil.copyfile(fullname, copyto) count += 1 - mi = MultiIndex.load_from_directory(c.location, force=True) + mi = MultiIndex.load_from_path(c.location, force=True) print(mi.index_list) print(mi.source_list) @@ -597,7 +597,7 @@ def test_multi_index_load_from_directory_3_yield_all_true_subdir(c): @utils.in_tempdir -def test_multi_index_load_from_directory_3_sig_gz(c): +def test_multi_index_load_from_path_3_sig_gz(c): # check that we find .sig.gz files, too dirname = utils.get_test_data('prot') @@ -612,7 +612,7 @@ def test_multi_index_load_from_directory_3_sig_gz(c): shutil.copyfile(fullname, copyto) count += 1 - mi = MultiIndex.load_from_directory(c.location, force=False) + mi = MultiIndex.load_from_path(c.location, force=False) print(mi.index_list) print(mi.source_list) @@ -622,7 +622,7 @@ def test_multi_index_load_from_directory_3_sig_gz(c): @utils.in_tempdir -def test_multi_index_load_from_directory_3_check_traverse_fn(c): +def test_multi_index_load_from_path_3_check_traverse_fn(c): # test the actual traverse function... eventually this test can be # removed, probably? from sourmash import sourmash_args @@ -635,7 +635,7 @@ def test_multi_index_load_from_directory_3_check_traverse_fn(c): assert len(files) == 14, files -def test_multi_index_load_from_directory_no_exist(): +def test_multi_index_load_from_path_no_exist(): dirname = utils.get_test_data('does-not-exist') with pytest.raises(ValueError): - mi = MultiIndex.load_from_directory(dirname, force=True) + mi = MultiIndex.load_from_path(dirname, force=True) From cb1e8a3c6c3e708d89501ef41aa3f6e0a75f842d Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 28 Mar 2021 09:28:50 -0700 Subject: [PATCH 36/46] switch to using MultiIndex in categorize --- src/sourmash/commands.py | 22 ++++++++++++---------- tests/test_sourmash.py | 8 ++------ 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 35b88d0610..4584284829 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -519,6 +519,8 @@ def search(args): def categorize(args): "Use a database to find the best match to many signatures." + from .index import MultiIndex + set_quiet(args.quiet) moltype = sourmash_args.calculate_moltype(args) @@ -540,8 +542,13 @@ def categorize(args): notify('found {} files to query', len(inp_files)) - loader = sourmash_args.LoadSingleSignatures(inp_files, - args.ksize, moltype) + # utility function to load & select relevant signatures. + def _yield_all_sigs(queries, ksize, moltype): + for filename in queries: + mi = MultiIndex.load_from_path(filename, False) + mi = mi.select(ksize=ksize, moltype=moltype) + for ss, loc in mi.signatures_with_location(): + yield ss, loc csv_w = None csv_fp = None @@ -549,9 +556,9 @@ def categorize(args): csv_fp = open(args.csv, 'w', newline='') csv_w = csv.writer(csv_fp) - for queryfile, query, query_moltype, query_ksize in loader: + for query, loc in _yield_all_sigs(args.queries, args.ksize, moltype): notify('loaded query: {}... (k={}, {})', str(query)[:30], - query_ksize, query_moltype) + query.minhash.ksize, query.minhash.moltype) results = [] search_fn = SearchMinHashesFindBest().search @@ -576,14 +583,9 @@ def categorize(args): notify('for {}, no match found', query) if csv_w: - csv_w.writerow([queryfile, query, best_hit_query_name, + csv_w.writerow([loc, query, best_hit_query_name, best_hit_sim]) - if loader.skipped_ignore: - notify('skipped/ignore: {}', loader.skipped_ignore) - if loader.skipped_nosig: - notify('skipped/nosig: {}', loader.skipped_nosig) - if csv_fp: csv_fp.close() diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 1a0d1d4d65..d690fd40ff 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -4238,8 +4238,7 @@ def test_sbt_categorize_already_done_traverse(): def test_sbt_categorize_multiple_ksizes_moltypes(): - # 'categorize' should fail when there are multiple ksizes or moltypes - # present + # 'categorize' works fine with multiple moltypes/ksizes with utils.TempDirectory() as location: testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') testdata2 = utils.get_test_data('genome-s11.fa.gz.sig') @@ -4255,10 +4254,7 @@ def test_sbt_categorize_multiple_ksizes_moltypes(): args = ['categorize', 'zzz', '.'] status, out, err = utils.runscript('sourmash', args, - in_directory=location, fail_ok=True) - - assert status != 0 - assert 'multiple k-mer sizes/molecule types present' in err + in_directory=location) @utils.in_tempdir From c9e176d695a1e2ee7b5b0019289a3050dd15ce4c Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 28 Mar 2021 09:30:04 -0700 Subject: [PATCH 37/46] remove LoadSingleSignatures --- src/sourmash/sourmash_args.py | 41 ----------------------------------- 1 file changed, 41 deletions(-) diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 3a1ee1dd6b..c7b71774ea 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -111,47 +111,6 @@ def load_query_signature(filename, ksize, select_moltype, select_md5=None): return sl[0] -class LoadSingleSignatures(object): - def __init__(self, filelist, ksize=None, select_moltype=None, - ignore_files=set()): - self.filelist = filelist - self.ksize = ksize - self.select_moltype = select_moltype - self.ignore_files = ignore_files - - self.skipped_ignore = 0 - self.skipped_nosig = 0 - self.ksizes = set() - self.moltypes = set() - - def __iter__(self): - for filename in self.filelist: - if filename in self.ignore_files: - self.skipped_ignore += 1 - continue - - sl = signature.load_signatures(filename, - ksize=self.ksize, - select_moltype=self.select_moltype) - sl = list(sl) - if len(sl) == 0: - self.skipped_nosig += 1 - continue - - for query in sl: - query_moltype = get_moltype(query) - query_ksize = query.minhash.ksize - - self.ksizes.add(query_ksize) - self.moltypes.add(query_moltype) - - if len(self.ksizes) > 1 or len(self.moltypes) > 1: - raise ValueError('multiple k-mer sizes/molecule types present') - - for query in sl: - yield filename, query, query_moltype, query_ksize - - def _check_suffix(filename, endings): for ending in endings: if filename.endswith(ending): From 8f914f17fed37bcd4377ccf2d97e5b30b1a43de1 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 28 Mar 2021 09:34:22 -0700 Subject: [PATCH 38/46] test errors in lca database loading --- tests/test_lca.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/test_lca.py b/tests/test_lca.py index b4c0c8e990..980c3655c3 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -394,6 +394,18 @@ def test_databases(): assert scaled == 10000 +def test_databases_load_fail_on_dir(): + filename1 = utils.get_test_data('lca') + with pytest.raises(ValueError) as exc: + dblist, ksize, scaled = lca_utils.load_databases([filename1]) + + +def test_databases_load_fail_on_not_exist(): + filename1 = utils.get_test_data('does-not-exist') + with pytest.raises(ValueError) as exc: + dblist, ksize, scaled = lca_utils.load_databases([filename1]) + + def test_db_repr(): filename = utils.get_test_data('lca/delmont-1.lca.json') db, ksize, scaled = lca_utils.load_single_database(filename) From a43b011bb211cd545841387165f611f0a60e5d12 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 28 Mar 2021 10:07:54 -0700 Subject: [PATCH 39/46] remove unneeded categorize code --- src/sourmash/commands.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 4584284829..5f98a65fdd 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -535,13 +535,6 @@ def categorize(args): # load search database tree = load_sbt_index(args.sbt_name) - # load query filenames - # @CTB replace with MultiIndex? - inp_files = set(sourmash_args.traverse_find_sigs(args.queries)) - inp_files = inp_files - already_names - - notify('found {} files to query', len(inp_files)) - # utility function to load & select relevant signatures. def _yield_all_sigs(queries, ksize, moltype): for filename in queries: @@ -557,6 +550,10 @@ def _yield_all_sigs(queries, ksize, moltype): csv_w = csv.writer(csv_fp) for query, loc in _yield_all_sigs(args.queries, args.ksize, moltype): + # skip if we've already done signatures from this file. + if loc in already_names: + continue + notify('loaded query: {}... (k={}, {})', str(query)[:30], query.minhash.ksize, query.minhash.moltype) From 15328aec469e1e2a88a036fe4ba7879c6298776e Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 28 Mar 2021 10:12:00 -0700 Subject: [PATCH 40/46] add testme info --- src/sourmash/sourmash_args.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index c7b71774ea..e8aa1eff66 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -220,8 +220,7 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None) try: db, dbtype = _load_database(filename, False, cache_size=cache_size) except IOError as e: - raise - notify(str(e)) + notify(str(e)) # @CTB test me sys.exit(-1) # are we collecting signatures from an SBT? @@ -304,7 +303,7 @@ def _multiindex_load_from_file_list(filename, **kwargs): try: db = MultiIndex.load_from_file_list(filename) except UnicodeDecodeError as exc: - raise ValueError(exc) + raise ValueError(exc) # @CTB test me return (db, DatabaseType.SIGLIST) @@ -323,7 +322,7 @@ def _load_sigfile(filename, **kwargs): db = LinearIndex.load(filename) except sourmash.exceptions.SourmashError as exc: raise ValueError(exc) - except FileNotFoundError: + except FileNotFoundError: # @CTB test me raise ValueError(f"Error while reading signatures from '{filename}'") return (db, DatabaseType.SIGLIST) @@ -343,10 +342,7 @@ def _load_sbt(filename, **kwargs): def _load_revindex(filename, **kwargs): "Load collection from an LCA database/reverse index." - try: - db, _, _ = load_single_database(filename) - except FileNotFoundError as exc: - raise ValueError(exc) + db, _, _ = load_single_database(filename) return (db, DatabaseType.LCA) @@ -393,7 +389,7 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): successful_screed_load = False it = None try: - # CTB: could be kind of time consuming for big record, but at the + # CTB: could be kind of time consuming for a big record, but at the # moment screed doesn't expose format detection cleanly. with screed.open(filename) as it: record = next(iter(it)) From f674232a57aab40263a01a137829e7564eca95d5 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 28 Mar 2021 12:16:53 -0700 Subject: [PATCH 41/46] verified that this was tested --- src/sourmash/sourmash_args.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index e8aa1eff66..f23b318a12 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -219,8 +219,8 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None) try: db, dbtype = _load_database(filename, False, cache_size=cache_size) - except IOError as e: - notify(str(e)) # @CTB test me + except Exception as e: + notify(str(e)) sys.exit(-1) # are we collecting signatures from an SBT? From 01c54c0fe3a737694586d90de18cc311a3921769 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 28 Mar 2021 12:22:23 -0700 Subject: [PATCH 42/46] remove testme comments --- src/sourmash/sourmash_args.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index f23b318a12..58bf3ff3af 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -300,10 +300,7 @@ def _load_stdin(filename, **kwargs): def _multiindex_load_from_file_list(filename, **kwargs): "Load collection from a list of signature/database files" - try: - db = MultiIndex.load_from_file_list(filename) - except UnicodeDecodeError as exc: - raise ValueError(exc) # @CTB test me + db = MultiIndex.load_from_file_list(filename) return (db, DatabaseType.SIGLIST) @@ -322,8 +319,6 @@ def _load_sigfile(filename, **kwargs): db = LinearIndex.load(filename) except sourmash.exceptions.SourmashError as exc: raise ValueError(exc) - except FileNotFoundError: # @CTB test me - raise ValueError(f"Error while reading signatures from '{filename}'") return (db, DatabaseType.SIGLIST) From ae3f66de1c42c0578eaecfa370ceb8ce021e87b1 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 28 Mar 2021 12:29:23 -0700 Subject: [PATCH 43/46] add tests for MultiIndex.load_from_file_list --- tests/test_index.py | 43 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/tests/test_index.py b/tests/test_index.py index 2b242e62c3..f26cc3fc0a 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -11,6 +11,7 @@ from sourmash import load_one_signature, SourmashSignature from sourmash.index import LinearIndex, MultiIndex from sourmash.sbt import SBT, GraphFactory, Leaf +from sourmash import sourmash_args import sourmash_tst_utils as utils @@ -624,9 +625,8 @@ def test_multi_index_load_from_path_3_sig_gz(c): @utils.in_tempdir def test_multi_index_load_from_path_3_check_traverse_fn(c): # test the actual traverse function... eventually this test can be - # removed, probably? - from sourmash import sourmash_args - + # removed, probably, as we consolidate functionality and test MultiIndex + # better. dirname = utils.get_test_data('prot') files = list(sourmash_args.traverse_find_sigs([dirname])) assert len(files) == 6, files @@ -639,3 +639,40 @@ def test_multi_index_load_from_path_no_exist(): dirname = utils.get_test_data('does-not-exist') with pytest.raises(ValueError): mi = MultiIndex.load_from_path(dirname, force=True) + + +def test_multi_index_load_from_file_list_no_exist(): + dirname = utils.get_test_data('does-not-exist') + with pytest.raises(ValueError): + mi = MultiIndex.load_from_file_list(dirname) + + +@utils.in_tempdir +def test_multi_index_load_from_file_list_1(c): + dirname = utils.get_test_data('prot') + files = list(sourmash_args.traverse_find_sigs([dirname])) + assert len(files) == 6, files + + file_list = c.output('filelist.txt') + + with open(file_list, 'wt') as fp: + print("\n".join(files), file=fp) + mi = MultiIndex.load_from_file_list(file_list) + + sigs = list(mi.signatures()) + assert len(sigs) == 6 + + +@utils.in_tempdir +def test_multi_index_load_from_file_list_2(c): + dirname = utils.get_test_data('prot') + files = list(sourmash_args.traverse_find_sigs([dirname], True)) + assert len(files) == 14, files + + file_list = c.output('filelist.txt') + + with open(file_list, 'wt') as fp: + print("\n".join(files), file=fp) + + with pytest.raises(ValueError): + mi = MultiIndex.load_from_file_list(file_list) From e4e20de6b3a019f7aa9537fea5a126571595409d Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 1 Apr 2021 15:23:26 -0700 Subject: [PATCH 44/46] Expand signature selection and compatibility checking in database loading code (#1420) * refactor select, add scaled/num/abund * fix scaled check for LCA database * add debug_literal * fix scaled check for SBT * fix LCA database ksize message & test * add 'containment' to 'select' * added 'is_database' flag for nicer UX * remove overly broad exception catching * document downsampling foo --- src/sourmash/index.py | 73 +++++++++++--- src/sourmash/lca/lca_db.py | 37 +++++-- src/sourmash/logging.py | 11 ++ src/sourmash/sbt.py | 62 ++++++++++-- src/sourmash/sourmash_args.py | 184 ++++++++++------------------------ tests/test_lca.py | 4 +- tests/test_sourmash.py | 51 +++++++--- 7 files changed, 240 insertions(+), 182 deletions(-) diff --git a/src/sourmash/index.py b/src/sourmash/index.py index 0d05a03726..44e108cc1f 100644 --- a/src/sourmash/index.py +++ b/src/sourmash/index.py @@ -7,6 +7,8 @@ class Index(ABC): + is_database = False + @abstractmethod def signatures(self): "Return an iterator over all signatures in the Index object." @@ -124,8 +126,55 @@ def gather(self, query, *args, **kwargs): return results @abstractmethod - def select(self, ksize=None, moltype=None): - "" + def select(self, ksize=None, moltype=None, scaled=None, num=None, + abund=None, containment=None): + """Return Index containing only signatures that match requirements. + + Current arguments can be any or all of: + * ksize + * moltype + * scaled + * num + * containment + + 'select' will raise ValueError if the requirements are incompatible + with the Index subclass. + + 'select' may return an empty object or None if no matches can be + found. + """ + + +def select_signature(ss, ksize=None, moltype=None, scaled=0, num=0, + containment=False): + "Check that the given signature matches the specificed requirements." + # ksize match? + if ksize and ksize != ss.minhash.ksize: + return False + + # moltype match? + if moltype and moltype != ss.minhash.moltype: + return False + + # containment requires scaled; similarity does not. + if containment: + if not scaled: + raise ValueError("'containment' requires 'scaled' in Index.select'") + if not ss.minhash.scaled: + return False + + # 'scaled' and 'num' are incompatible + if scaled: + if ss.minhash.num: + return False + if num: + # note, here we check if 'num' is identical; this can be + # changed later. + if ss.minhash.scaled or num != ss.minhash.num: + return False + + return True + class LinearIndex(Index): "An Index for a collection of signatures. Can load from a .sig file." @@ -157,18 +206,17 @@ def load(cls, location): lidx = LinearIndex(si, filename=location) return lidx - def select(self, ksize=None, moltype=None): - def select_sigs(ss, ksize=ksize, moltype=moltype): - if (ksize is None or ss.minhash.ksize == ksize) and \ - (moltype is None or ss.minhash.moltype == moltype): - return True + def select(self, **kwargs): + """Return new LinearIndex containing only signatures that match req's. - return self.filter(select_sigs) + Does not raise ValueError, but may return an empty Index. + """ + # eliminate things from kwargs with None or zero value + kw = { k : v for (k, v) in kwargs.items() if v } - def filter(self, filter_fn): siglist = [] for ss in self._signatures: - if filter_fn(ss): + if select_signature(ss, **kwargs): siglist.append(ss) return LinearIndex(siglist, self.filename) @@ -260,11 +308,12 @@ def load_from_file_list(cls, filename): def save(self, *args): raise NotImplementedError - def select(self, ksize=None, moltype=None): + def select(self, **kwargs): + "Run 'select' on all indices within this MultiIndex." new_idx_list = [] new_src_list = [] for idx, src in zip(self.index_list, self.source_list): - idx = idx.select(ksize=ksize, moltype=moltype) + idx = idx.select(**kwargs) new_idx_list.append(idx) new_src_list.append(src) diff --git a/src/sourmash/lca/lca_db.py b/src/sourmash/lca/lca_db.py index 137f0fbd5c..03e22dc8a2 100644 --- a/src/sourmash/lca/lca_db.py +++ b/src/sourmash/lca/lca_db.py @@ -55,6 +55,8 @@ class LCA_Database(Index): `hashval_to_idx` is a dictionary from individual hash values to sets of `idx`. """ + is_database = True + def __init__(self, ksize, scaled, moltype='DNA'): self.ksize = int(ksize) self.scaled = int(scaled) @@ -169,18 +171,29 @@ def signatures(self): for v in self._signatures.values(): yield v - def select(self, ksize=None, moltype=None): - "Selector interface - make sure this database matches requirements." - ok = True + def select(self, ksize=None, moltype=None, num=0, scaled=0, + containment=False): + """Make sure this database matches the requested requirements. + + As with SBTs, queries with higher scaled values than the database + can still be used for containment search, but not for similarity + search. See SBT.select(...) for details, and _find_signatures for + implementation. + + Will always raise ValueError if a requirement cannot be met. + """ + if num: + raise ValueError("cannot use 'num' MinHashes to search LCA database") + + if scaled > self.scaled and not containment: + raise ValueError(f"cannot use scaled={scaled} on this database (scaled={self.scaled})") + if ksize is not None and self.ksize != ksize: - ok = False + raise ValueError(f"ksize on this database is {self.ksize}; this is different from requested ksize of {ksize}") if moltype is not None and moltype != self.moltype: - ok = False + raise ValueError(f"moltype on this database is {self.moltype}; this is different from requested moltype of {moltype}") - if ok: - return self - - raise ValueError("cannot select LCA on ksize {} / moltype {}".format(ksize, moltype)) + return self @classmethod def load(cls, db_name): @@ -467,12 +480,16 @@ def _find_signatures(self, minhash, threshold, containment=False, This is essentially a fast implementation of find that collects all the signatures with overlapping hash values. Note that similarity searches (containment=False) will not be returned in sorted order. + + As with SBTs, queries with higher scaled values than the database + can still be used for containment search, but not for similarity + search. See SBT.select(...) for details. """ # make sure we're looking at the same scaled value as database if self.scaled > minhash.scaled: minhash = minhash.downsample(scaled=self.scaled) elif self.scaled < minhash.scaled and not ignore_scaled: - # note that containment cannot be calculated w/o matching scaled. + # note that similarity cannot be calculated w/o matching scaled. raise ValueError("lca db scaled is {} vs query {}; must downsample".format(self.scaled, minhash.scaled)) query_mins = set(minhash.hashes) diff --git a/src/sourmash/logging.py b/src/sourmash/logging.py index 49c3dc26b3..2915c43f78 100644 --- a/src/sourmash/logging.py +++ b/src/sourmash/logging.py @@ -41,6 +41,17 @@ def debug(s, *args, **kwargs): sys.stderr.flush() +def debug_literal(s, *args, **kwargs): + "A debug logging function => stderr." + if _quiet or not _debug: + return + + print(u'\r\033[K', end=u'', file=sys.stderr) + print(s, file=sys.stderr, end=kwargs.get('end', u'\n')) + if kwargs.get('flush'): + sys.stderr.flush() + + def error(s, *args, **kwargs): "A simple error logging function => stderr." print(u'\r\033[K', end=u'', file=sys.stderr) diff --git a/src/sourmash/sbt.py b/src/sourmash/sbt.py index c499f50e2c..fa22507961 100644 --- a/src/sourmash/sbt.py +++ b/src/sourmash/sbt.py @@ -171,6 +171,7 @@ class SBT(Index): We use two dicts to store the tree structure: One for the internal nodes, and another for the leaves (datasets). """ + is_database = True def __init__(self, factory, *, d=2, storage=None, cache_size=None): self.factory = factory @@ -189,19 +190,60 @@ def signatures(self): for k in self.leaves(): yield k.data - def select(self, ksize=None, moltype=None): - first_sig = next(iter(self.signatures())) + def select(self, ksize=None, moltype=None, num=0, scaled=0, + containment=False): + """Make sure this database matches the requested requirements. - ok = True - if ksize is not None and first_sig.minhash.ksize != ksize: - ok = False - if moltype is not None and first_sig.minhash.moltype != moltype: - ok = False + Will always raise ValueError if a requirement cannot be met. - if ok: - return self + The only tricky bit here is around downsampling: if the scaled + value being requested is higher than the signatures in the + SBT, we can use the SBT for containment but not for + similarity. This is because: - raise ValueError("cannot select SBT on ksize {} / moltype {}".format(ksize, moltype)) + * if we are doing containment searches, the intermediate nodes + can still be used for calculating containment of signatures + with higher scaled values. This is because only hashes that match + in the higher range are used for containment scores. + * however, for similarity, _all_ hashes are used, and we cannot + implicitly downsample or necessarily estimate similarity if + the scaled values differ. + """ + # pull out a signature from this collection - + first_sig = next(iter(self.signatures())) + db_mh = first_sig.minhash + + # check ksize. + if ksize is not None and db_mh.ksize != ksize: + raise ValueError(f"search ksize {ksize} is different from database ksize {db_mh.ksize}") + + # check moltype. + if moltype is not None and db_mh.moltype != moltype: + raise ValueError(f"search moltype {moltype} is different from database moltype {db_mh.moltype}") + + # containment requires 'scaled'. + if containment: + if not scaled: + raise ValueError("'containment' requires 'scaled' in SBT.select'") + if not db_mh.scaled: + raise ValueError("cannot search this SBT for containment; signatures are not calculated with scaled") + + # 'num' and 'scaled' do not mix. + if num: + if not db_mh.num: + raise ValueError(f"this database was created with 'scaled' MinHash sketches, not 'num'") + if num != db_mh.num: + raise ValueError(f"num mismatch for SBT: num={num}, {db_mh.num}") + + if scaled: + if not db_mh.scaled: + raise ValueError(f"this database was created with 'num' MinHash sketches, not 'scaled'") + + # we can downsample SBTs for containment operations. + if scaled > db_mh.scaled and not containment: + raise ValueError(f"search scaled value {scaled} is less than database scaled value of {db_mh.scaled}") + + return self def new_node_pos(self, node): if not self._nodes: diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 58bf3ff3af..d8525a3fd4 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -15,7 +15,7 @@ import sourmash.exceptions from . import signature -from .logging import notify, error, debug +from .logging import notify, error, debug_literal from .index import LinearIndex, MultiIndex from . import signature as sig @@ -141,63 +141,6 @@ def traverse_find_sigs(filenames, yield_all_files=False): yield fullname -def _check_signatures_are_compatible(query, subject): - # is one scaled, and the other not? cannot do search - if query.minhash.scaled and not subject.minhash.scaled or \ - not query.minhash.scaled and subject.minhash.scaled: - error("signature {} and {} are incompatible - cannot compare.", - query, subject) - if query.minhash.scaled: - error(f"{query} was calculated with --scaled, {subject} was not.") - if subject.minhash.scaled: - error(f"{subject} was calculated with --scaled, {query} was not.") - return 0 - - return 1 - - -def check_tree_is_compatible(treename, tree, query, is_similarity_query): - # get a minhash from the tree - leaf = next(iter(tree.leaves())) - tree_mh = leaf.data.minhash - - query_mh = query.minhash - - if tree_mh.ksize != query_mh.ksize: - error(f"ksize on tree '{treename}' is {tree_mh.ksize};") - error(f"this is different from query ksize of {query_mh.ksize}.") - return 0 - - # is one scaled, and the other not? cannot do search. - if (tree_mh.scaled and not query_mh.scaled) or \ - (query_mh.scaled and not tree_mh.scaled): - error(f"for tree '{treename}', tree and query are incompatible for search.") - if tree_mh.scaled: - error("tree was calculated with scaled, query was not.") - else: - error("query was calculated with scaled, tree was not.") - return 0 - - # are the scaled values incompatible? cannot downsample tree for similarity - if tree_mh.scaled and tree_mh.scaled < query_mh.scaled and \ - is_similarity_query: - error(f"for tree '{treename}', scaled value is smaller than query.") - error(f"tree scaled: {tree_mh.scaled}; query scaled: {query_mh.scaled}. Cannot do similarity search.") - return 0 - - return 1 - - -def check_lca_db_is_compatible(filename, db, query): - query_mh = query.minhash - if db.ksize != query_mh.ksize: - error(f"ksize on db '{filename}' is {db.ksize};") - error(f"this is different from query ksize of {query_mh.ksize}.") - return 0 - - return 1 - - def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None): """ Load one or more SBTs, LCAs, and/or signatures. @@ -205,104 +148,83 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None) Check for compatibility with query. This is basically a user-focused wrapping of _load_databases. - - CTB: this can be refactored into a more generic function with 'filter'. """ - query_ksize = query.minhash.ksize - query_moltype = get_moltype(query) + query_mh = query.minhash + + containment = True + if is_similarity_query: + containment = False - n_signatures = 0 - n_databases = 0 databases = [] for filename in filenames: notify(f'loading from {filename}...', end='\r') try: - db, dbtype = _load_database(filename, False, cache_size=cache_size) - except Exception as e: + db = _load_database(filename, False, cache_size=cache_size) + except ValueError as e: + # cannot load database! notify(str(e)) sys.exit(-1) - # are we collecting signatures from an SBT? - if dbtype == DatabaseType.SBT: - if not check_tree_is_compatible(filename, db, query, - is_similarity_query): - sys.exit(-1) + try: + db = db.select(moltype=query_mh.moltype, + ksize=query_mh.ksize, + num=query_mh.num, + scaled=query_mh.scaled, + containment=containment) + except ValueError as exc: + # incompatible collection specified! + notify(f"ERROR: cannot use '{filename}' for this query.") + notify(str(exc)) + sys.exit(-1) - databases.append(db) - notify(f'loaded SBT {filename}', end='\r') - n_databases += 1 + # 'select' returns nothing => all signatures filtered out. fail! + if not db: + notify(f"no compatible signatures found in '{filename}'") + sys.exit(-1) - # or an LCA? - elif dbtype == DatabaseType.LCA: - if not check_lca_db_is_compatible(filename, db, query): - sys.exit(-1) + databases.append(db) - notify(f'loaded LCA {filename}', end='\r') + # calc num loaded info. + n_signatures = 0 + n_databases = 0 + for db in databases: + if db.is_database: n_databases += 1 - - databases.append(db) - - # or a mixed collection of signatures? - elif dbtype == DatabaseType.SIGLIST: - db = db.select(moltype=query_moltype, ksize=query_ksize) - siglist = db.signatures() - filter_fn = lambda s: _check_signatures_are_compatible(query, s) - db = db.filter(filter_fn) - - if not db: - notify(f"no compatible signatures found in '{filename}'") - sys.exit(-1) - - databases.append(db) - - notify(f'loaded {len(db)} signatures from {filename}', end='\r') - n_signatures += len(db) - - # unknown!? else: - raise ValueError(f"unknown dbtype {dbtype}") # CTB check me. - - # END for loop - + n_signatures += len(db) notify(' '*79, end='\r') if n_signatures and n_databases: notify(f'loaded {n_signatures} signatures and {n_databases} databases total.') - elif n_signatures: + elif n_signatures and not n_databases: notify(f'loaded {n_signatures} signatures.') - elif n_databases: + elif n_databases and not n_signatures: notify(f'loaded {n_databases} databases.') - else: - notify('** ERROR: no signatures or databases loaded?') - sys.exit(-1) if databases: print('') + else: + notify('** ERROR: no signatures or databases loaded?') + sys.exit(-1) return databases -class DatabaseType(Enum): - SIGLIST = 1 - SBT = 2 - LCA = 3 - - def _load_stdin(filename, **kwargs): "Load collection from .sig file streamed in via stdin" db = None if filename == '-': db = LinearIndex.load(sys.stdin) - return (db, DatabaseType.SIGLIST) + return db def _multiindex_load_from_file_list(filename, **kwargs): "Load collection from a list of signature/database files" db = MultiIndex.load_from_file_list(filename) - return (db, DatabaseType.SIGLIST) + return db def _multiindex_load_from_path(filename, **kwargs): @@ -310,7 +232,7 @@ def _multiindex_load_from_path(filename, **kwargs): traverse_yield_all = kwargs['traverse_yield_all'] db = MultiIndex.load_from_path(filename, traverse_yield_all) - return (db, DatabaseType.SIGLIST) + return db def _load_sigfile(filename, **kwargs): @@ -320,7 +242,7 @@ def _load_sigfile(filename, **kwargs): except sourmash.exceptions.SourmashError as exc: raise ValueError(exc) - return (db, DatabaseType.SIGLIST) + return db def _load_sbt(filename, **kwargs): @@ -332,13 +254,13 @@ def _load_sbt(filename, **kwargs): except FileNotFoundError as exc: raise ValueError(exc) - return (db, DatabaseType.SBT) + return db def _load_revindex(filename, **kwargs): "Load collection from an LCA database/reverse index." db, _, _ = load_single_database(filename) - return (db, DatabaseType.LCA) + return db # all loader functions, in order. @@ -355,24 +277,23 @@ def _load_revindex(filename, **kwargs): def _load_database(filename, traverse_yield_all, *, cache_size=None): """Load file as a database - list of signatures, LCA, SBT, etc. - Return (db, dbtype), where dbtype is a DatabaseType enum. + Return Index object. This is an internal function used by other functions in sourmash_args. """ loaded = False - dbtype = None # iterate through loader functions, trying them all. Catch ValueError # but nothing else. for (desc, load_fn) in _loader_functions: try: - debug(f"_load_databases: trying loader fn {desc}") - db, dbtype = load_fn(filename, - traverse_yield_all=traverse_yield_all, - cache_size=cache_size) + debug_literal(f"_load_databases: trying loader fn {desc}") + db = load_fn(filename, + traverse_yield_all=traverse_yield_all, + cache_size=cache_size) except ValueError as exc: - debug(f"_load_databases: FAIL on fn {desc}.") - debug(traceback.format_exc()) + debug_literal(f"_load_databases: FAIL on fn {desc}.") + debug_literal(traceback.format_exc()) if db: loaded = True @@ -401,7 +322,7 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): if loaded: # this is a bit redundant but safe > sorry assert db - return db, dbtype + return db def load_file_as_index(filename, yield_all_files=False): @@ -417,8 +338,7 @@ def load_file_as_index(filename, yield_all_files=False): this directory into an Index object. If yield_all_files=True, will attempt to load all files. """ - db, dbtype = _load_database(filename, yield_all_files) - return db + return _load_database(filename, yield_all_files) def load_file_as_signatures(filename, select_moltype=None, ksize=None, @@ -441,7 +361,7 @@ def load_file_as_signatures(filename, select_moltype=None, ksize=None, if progress: progress.notify(filename) - db, dbtype = _load_database(filename, yield_all_files) + db = _load_database(filename, yield_all_files) db = db.select(moltype=select_moltype, ksize=ksize) loader = db.signatures() diff --git a/tests/test_lca.py b/tests/test_lca.py index 980c3655c3..65fec92350 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -1908,8 +1908,8 @@ def test_incompat_lca_db_ksize_2(c): err = c.last_result.err print(err) - assert "ksize on db 'test.lca.json' is 25;" in err - assert 'this is different from query ksize of 31.' in err + assert "ERROR: cannot use 'test.lca.json' for this query." in err + assert "ksize on this database is 25; this is different from requested ksize of 31" @utils.in_tempdir diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index d690fd40ff..deb1b667cb 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -1851,12 +1851,14 @@ def test_search_incompatible(c): assert c.last_result.status != 0 print(c.last_result.out) print(c.last_result.err) - assert 'incompatible - cannot compare.' in c.last_result.err - assert 'was calculated with --scaled,' in c.last_result.err + + assert "no compatible signatures found in " in c.last_result.err @utils.in_tempdir def test_search_traverse_incompatible(c): + # build a directory with some signatures in it, search for compatible + # signatures. searchdir = c.output('searchme') os.mkdir(searchdir) @@ -1866,10 +1868,7 @@ def test_search_traverse_incompatible(c): shutil.copyfile(scaled_sig, c.output('searchme/scaled.sig')) c.run_sourmash("search", scaled_sig, c.output('searchme')) - print(c.last_result.out) - print(c.last_result.err) - assert 'incompatible - cannot compare.' in c.last_result.err - assert 'was calculated with --scaled,' in c.last_result.err + assert '100.0% NC_009665.1 Shewanella baltica OS185, complete genome' in c.last_result.out # explanation: you cannot downsample a scaled SBT to match a scaled @@ -1896,8 +1895,11 @@ def test_search_metagenome_downsample(): in_directory=location, fail_ok=True) assert status == -1 - assert "for tree 'gcf_all', scaled value is smaller than query." in err - assert 'tree scaled: 10000; query scaled: 100000. Cannot do similarity search.' in err + print(out) + print(err) + + assert "ERROR: cannot use 'gcf_all' for this query." in err + assert "search scaled value 100000 is less than database scaled value of 10000" in err def test_search_metagenome_downsample_containment(): @@ -2055,7 +2057,11 @@ def test_do_sourmash_sbt_search_wrong_ksize(): fail_ok=True) assert status == -1 - assert 'this is different from' in err + print(out) + print(err) + + assert "ERROR: cannot use 'zzz' for this query." in err + assert "search ksize 51 is different from database ksize 31" in err def test_do_sourmash_sbt_search_multiple(): @@ -2170,7 +2176,10 @@ def test_do_sourmash_sbt_search_downsample_2(): '--threshold=0.01'], in_directory=location, fail_ok=True) assert status == -1 - assert 'Cannot do similarity search.' in err + print(out) + print(err) + assert "ERROR: cannot use 'foo' for this query." in err + assert "search scaled value 100000 is less than database scaled value of 2000" in err def test_do_sourmash_index_single(): @@ -2465,7 +2474,10 @@ def test_do_sourmash_sbt_search_scaled_vs_num_1(): fail_ok=True) assert status == -1 - assert 'tree and query are incompatible for search' in err + print(out) + print(err) + assert "ERROR: cannot use '" in err + assert "this database was created with 'num' MinHash sketches, not 'scaled'" in err def test_do_sourmash_sbt_search_scaled_vs_num_2(): @@ -2497,7 +2509,10 @@ def test_do_sourmash_sbt_search_scaled_vs_num_2(): fail_ok=True) assert status == -1 - assert 'tree and query are incompatible for search' in err + print(out) + print(err) + assert "ERROR: cannot use '" in err + assert "this database was created with 'scaled' MinHash sketches, not 'num'" in err def test_do_sourmash_sbt_search_scaled_vs_num_3(): @@ -2522,7 +2537,9 @@ def test_do_sourmash_sbt_search_scaled_vs_num_3(): fail_ok=True) assert status == -1 - assert 'incompatible - cannot compare' in err + print(out) + print(err) + assert "no compatible signatures found in " in err def test_do_sourmash_sbt_search_scaled_vs_num_4(): @@ -2547,7 +2564,9 @@ def test_do_sourmash_sbt_search_scaled_vs_num_4(): ['search', sig_loc2, sig_loc], fail_ok=True) assert status == -1 - assert 'incompatible - cannot compare' in err + print(out) + print(err) + assert "no compatible signatures found in " in err def test_do_sourmash_check_search_vs_actual_similarity(): @@ -3604,8 +3623,7 @@ def test_gather_traverse_incompatible(c): c.run_sourmash("gather", scaled_sig, c.output('searchme')) print(c.last_result.out) print(c.last_result.err) - assert 'incompatible - cannot compare.' in c.last_result.err - assert 'was calculated with --scaled,' in c.last_result.err + assert "5.2 Mbp 100.0% 100.0% NC_009665.1 Shewanella baltica OS185,..." in c.last_result.out def test_gather_metagenome_output_unassigned(): @@ -3750,6 +3768,7 @@ def test_gather_query_downsample(): with utils.TempDirectory() as location: testdata_glob = utils.get_test_data('gather/GCF*.sig') testdata_sigs = glob.glob(testdata_glob) + print(testdata_sigs) query_sig = utils.get_test_data('GCF_000006945.2-s500.sig') From 3229b5b2fa147057360e707d5255bbb75a704fb1 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 2 Apr 2021 06:20:00 -0700 Subject: [PATCH 45/46] fix file_list -> pathlist --- src/sourmash/index.py | 6 +++--- tests/test_index.py | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/sourmash/index.py b/src/sourmash/index.py index 44e108cc1f..aeb75bb765 100644 --- a/src/sourmash/index.py +++ b/src/sourmash/index.py @@ -287,14 +287,14 @@ def load_from_path(cls, pathname, force=False): return db @classmethod - def load_from_file_list(cls, filename): + def load_from_pathlist(cls, filename): "Create a MultiIndex from all files listed in a text file." - from .sourmash_args import (load_file_list_of_signatures, + from .sourmash_args import (load_pathlist_from_file, load_file_as_index) idx_list = [] src_list = [] - file_list = load_file_list_of_signatures(filename) + file_list = load_pathlist_from_file(filename) for fname in file_list: idx = load_file_as_index(fname) src = fname diff --git a/tests/test_index.py b/tests/test_index.py index f26cc3fc0a..9ddcfa8672 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -641,14 +641,14 @@ def test_multi_index_load_from_path_no_exist(): mi = MultiIndex.load_from_path(dirname, force=True) -def test_multi_index_load_from_file_list_no_exist(): +def test_multi_index_load_from_pathlist_no_exist(): dirname = utils.get_test_data('does-not-exist') with pytest.raises(ValueError): - mi = MultiIndex.load_from_file_list(dirname) + mi = MultiIndex.load_from_pathlist(dirname) @utils.in_tempdir -def test_multi_index_load_from_file_list_1(c): +def test_multi_index_load_from_pathlist_1(c): dirname = utils.get_test_data('prot') files = list(sourmash_args.traverse_find_sigs([dirname])) assert len(files) == 6, files @@ -657,14 +657,14 @@ def test_multi_index_load_from_file_list_1(c): with open(file_list, 'wt') as fp: print("\n".join(files), file=fp) - mi = MultiIndex.load_from_file_list(file_list) + mi = MultiIndex.load_from_pathlist(file_list) sigs = list(mi.signatures()) assert len(sigs) == 6 @utils.in_tempdir -def test_multi_index_load_from_file_list_2(c): +def test_multi_index_load_from_pathlist_2(c): dirname = utils.get_test_data('prot') files = list(sourmash_args.traverse_find_sigs([dirname], True)) assert len(files) == 14, files @@ -675,4 +675,4 @@ def test_multi_index_load_from_file_list_2(c): print("\n".join(files), file=fp) with pytest.raises(ValueError): - mi = MultiIndex.load_from_file_list(file_list) + mi = MultiIndex.load_from_pathlist(file_list) From 6d6eb42baa61db24272e4fc0ce70f994a7096979 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 2 Apr 2021 06:35:13 -0700 Subject: [PATCH 46/46] fix typo --- src/sourmash/index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sourmash/index.py b/src/sourmash/index.py index aeb75bb765..30f93723e8 100644 --- a/src/sourmash/index.py +++ b/src/sourmash/index.py @@ -276,7 +276,7 @@ def load_from_path(cls, pathname, force=False): if force: continue # ignore error else: - raise # contine past error! + raise # continue past error! db = None if index_list: