sourmash-bio · ctb · Apr 19, 2020 · Apr 6, 2020 · Apr 6, 2020 · Apr 6, 2020
diff --git a/sourmash/_minhash.py b/sourmash/_minhash.py
@@ -566,16 +566,19 @@ def is_molecule_type(self, molecule):
         """Check if this MinHash is a particular human-readable molecule type.
 
         Supports 'protein', 'dayhoff', 'hp', 'DNA'.
+        @CTB deprecate for 4.0?
         """
         if molecule.lower() not in ('protein', 'dayhoff', 'hp', 'dna'):
             raise ValueError("unknown moltype in query, '{}'".format(molecule))
-        if self.is_protein and molecule == 'protein':
-            return True
-        elif self.dayhoff and molecule == 'dayhoff':
-            return True
-        elif self.hp and molecule == 'hp':
-            return True
-        elif molecule.lower() == "dna" and self.is_dna:
-            return True
-
-        return False
+        return molecule == self.moltype
+
+    @property
+    def moltype(self):                    # TODO: test in minhash tests
+        if self.is_protein:
+            return 'protein'
+        elif self.dayhoff:
+            return 'dayhoff'
+        elif self.hp:
+            return 'hp'
+        else:
+            return 'DNA'
diff --git a/sourmash/index.py b/sourmash/index.py
@@ -96,6 +96,9 @@ def gather(self, query, *args, **kwargs):
 
         return results
 
+    @abstractmethod
+    def select(self, ksize=None, moltype=None):
+        ""
 
 class LinearIndex(Index):
     def __init__(self, _signatures=None, filename=None):
@@ -125,3 +128,13 @@ def load(cls, location):
 
         lidx = LinearIndex(si, filename=location)
         return lidx
+
+    def select(self, ksize=None, moltype=None):
+        def select_sigs(siglist, ksize, moltype):
+            for ss in siglist:
+                if (ksize is None or ss.minhash.ksize == ksize) and \
+                   (moltype is None or ss.minhash.moltype == moltype):
+                   yield ss
+
+        siglist=select_sigs(self._signatures, ksize, moltype)
+        return LinearIndex(siglist, self.filename)
diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py
@@ -199,6 +199,18 @@ def signatures(self):
         for v in self._signatures.values():
             yield SourmashSignature(v)
 
+    def select(self, ksize=None, moltype=None):
+        ok = True
+        if ksize is not None and self.ksize != ksize:
+            ok = False
+        if moltype is not None and moltype != 'DNA':
+            ok = False
+
+        if ok:
+            return self
+
+        raise ValueError("cannot select LCA on ksize {} / moltype {}".format(ksize, moltype))
+
     def load(self, db_name):
         "Load from a JSON file."
         xopen = open

diff --git a/sourmash/sbt.py b/sourmash/sbt.py
@@ -137,6 +137,20 @@ def signatures(self):
         for k in self.leaves():
             yield k.data
 
+    def select(self, ksize=None, moltype=None):
+        first_sig = next(iter(self.signatures()))
+
+        ok = True
+        if ksize is not None and first_sig.minhash.ksize != ksize:
+            ok = False
+        if moltype is not None and first_sig.moltype != moltype:
+            ok = False
+
+        if ok:
+            return self
+
+        raise ValueError("cannot select SBT on ksize {} / moltype {}".format(ksize, moltype))
+
     def new_node_pos(self, node):
         if not self._nodes:
             self.next_node = 1

diff --git a/tests/test__minhash.py b/tests/test__minhash.py
@@ -65,6 +65,8 @@
 def test_basic_dna(track_abundance):
     # verify that MHs of size 1 stay size 1, & act properly as bottom sketches.
     mh = MinHash(1, 4, track_abundance=track_abundance)
+    assert mh.moltype == 'DNA'
+
     mh.add_sequence('ATGC')
     a = mh.get_mins()
 
@@ -116,7 +118,14 @@ def test_bytes_dna(track_abundance):
 
 def test_bytes_protein_dayhoff(track_abundance, dayhoff):
     # verify that we can hash protein/aa sequences
-    mh = MinHash(10, 6, True, dayhoff=dayhoff, hp=False, track_abundance=track_abundance)
+    mh = MinHash(10, 6, True, dayhoff=dayhoff, hp=False,
+                 track_abundance=track_abundance)
+
+    expected_moltype = 'protein'
+    if dayhoff:
+        expected_moltype = 'dayhoff'
+    assert mh.moltype == expected_moltype
+
     mh.add_protein('AGYYG')
     mh.add_protein('AGYYG')
     mh.add_protein(b'AGYYG')
@@ -135,6 +144,11 @@ def test_protein_dayhoff(track_abundance, dayhoff):
 def test_bytes_protein_hp(track_abundance, hp):
     # verify that we can hash protein/aa sequences
     mh = MinHash(10, 6, True, dayhoff=False, hp=hp, track_abundance=track_abundance)
+    expected_moltype = 'protein'
+    if hp:
+        expected_moltype = 'hp'
+    assert mh.moltype == expected_moltype
+
     mh.add_protein('AGYYG')
     mh.add_protein(u'AGYYG')
     mh.add_protein(b'AGYYG')
@@ -159,6 +173,8 @@ def test_protein_hp(track_abundance, hp):
 def test_translate_codon(track_abundance):
     # Ensure that translation occurs properly
     mh = MinHash(10, 6, is_protein=True)
+    assert mh.moltype == 'protein'
+
     assert "S" == mh.translate_codon('TCT')
     assert "S" == mh.translate_codon('TC')
     assert "X" == mh.translate_codon("T")
@@ -187,6 +203,8 @@ def test_hp(track_abundance):
     # verify that we can hash to hp-encoded protein/aa sequences
     mh_hp = MinHash(10, 6, is_protein=True,
                     dayhoff=False, hp=True, track_abundance=track_abundance)
+    assert mh_hp.moltype == 'hp'
+
     mh_hp.add_sequence('ACTGAC')
 
     assert len(mh_hp.get_mins()) == 2
@@ -1096,7 +1114,8 @@ def test_set_abundance():
 
 
 def test_set_abundance_2():
-    sig = sourmash.load_one_signature(utils.get_test_data("genome-s12.fa.gz.sig"),
+    datapath = utils.get_test_data("genome-s12.fa.gz.sig")
+    sig = sourmash.load_one_signature(datapath,
                                       ksize=30,
                                       select_moltype='dna')
     new_mh = sig.minhash.copy_and_clear()

diff --git a/tests/test_index.py b/tests/test_index.py
@@ -215,3 +215,51 @@ def test_linear_index_save_load():
     print([s[1].name() for s in sr])
     assert len(sr) == 1
     assert sr[0][1] == ss2
+
+
+def test_linear_index_multik_select():
+    # this loads three ksizes, 21/31/51
+    sig2 = utils.get_test_data('2.fa.sig')
+    siglist = sourmash.load_signatures(sig2)
+
+    linear = LinearIndex()
+    for ss in siglist:
+        linear.insert(ss)
+
+    # select most specifically
+    linear2 = linear.select(ksize=31, moltype='DNA')
+    assert len(linear2) == 1
+
+    # all are DNA:
+    linear2 = linear.select(moltype='DNA')
+    assert len(linear2) == 3
+
+
+def test_linear_index_moltype_select():
+    # this loads two ksizes(21, 30), and two moltypes (DNA and protein)
+    filename = utils.get_test_data('genome-s10+s11.sig')
+    siglist = sourmash.load_signatures(filename)
+
+    linear = LinearIndex()
+    for ss in siglist:
+        linear.insert(ss)
+
+    # select most specific DNA
+    linear2 = linear.select(ksize=30, moltype='DNA')
+    assert len(linear2) == 1
+
+    # select most specific protein
+    linear2 = linear.select(ksize=30, moltype='protein')
+    assert len(linear2) == 1
+
+    # can leave off ksize, selects all ksizes
+    linear2 = linear.select(moltype='DNA')
+    assert len(linear2) == 2
+
+    # can leave off ksize, selects all ksizes
+    linear2 = linear.select(moltype='protein')
+    assert len(linear2) == 2
+
+    # select something impossible
+    linear2 = linear.select(ksize=4)
+    assert len(linear2) == 0