Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] add selector framework to Index, and moltype property to MinHash object #936

Merged
merged 10 commits into from
Apr 19, 2020
23 changes: 13 additions & 10 deletions sourmash/_minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,16 +566,19 @@ def is_molecule_type(self, molecule):
"""Check if this MinHash is a particular human-readable molecule type.

Supports 'protein', 'dayhoff', 'hp', 'DNA'.
@CTB deprecate for 4.0?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes. See #751 for longer discussion, especially #751 (comment)

"""
if molecule.lower() not in ('protein', 'dayhoff', 'hp', 'dna'):
raise ValueError("unknown moltype in query, '{}'".format(molecule))
if self.is_protein and molecule == 'protein':
return True
elif self.dayhoff and molecule == 'dayhoff':
return True
elif self.hp and molecule == 'hp':
return True
elif molecule.lower() == "dna" and self.is_dna:
return True

return False
return molecule == self.moltype

@property
def moltype(self): # TODO: test in minhash tests
if self.is_protein:
return 'protein'
elif self.dayhoff:
return 'dayhoff'
elif self.hp:
return 'hp'
else:
return 'DNA'
13 changes: 13 additions & 0 deletions sourmash/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@ def gather(self, query, *args, **kwargs):

return results

@abstractmethod
def select(self, ksize=None, moltype=None):
""

class LinearIndex(Index):
def __init__(self, _signatures=None, filename=None):
Expand Down Expand Up @@ -125,3 +128,13 @@ def load(cls, location):

lidx = LinearIndex(si, filename=location)
return lidx

def select(self, ksize=None, moltype=None):
def select_sigs(siglist, ksize, moltype):
for ss in siglist:
if (ksize is None or ss.minhash.ksize == ksize) and \
(moltype is None or ss.minhash.moltype == moltype):
yield ss

siglist=select_sigs(self._signatures, ksize, moltype)
return LinearIndex(siglist, self.filename)
12 changes: 12 additions & 0 deletions sourmash/lca/lca_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,18 @@ def signatures(self):
for v in self._signatures.values():
yield SourmashSignature(v)

def select(self, ksize=None, moltype=None):
ok = True
if ksize is not None and self.ksize != ksize:
ok = False
if moltype is not None and moltype != 'DNA':
ok = False

if ok:
return self

raise ValueError("cannot select LCA on ksize {} / moltype {}".format(ksize, moltype))

def load(self, db_name):
"Load from a JSON file."
xopen = open
Expand Down
14 changes: 14 additions & 0 deletions sourmash/sbt.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,20 @@ def signatures(self):
for k in self.leaves():
yield k.data

def select(self, ksize=None, moltype=None):
first_sig = next(iter(self.signatures()))

ok = True
if ksize is not None and first_sig.minhash.ksize != ksize:
ok = False
if moltype is not None and first_sig.moltype != moltype:
ok = False

if ok:
return self

raise ValueError("cannot select SBT on ksize {} / moltype {}".format(ksize, moltype))

def new_node_pos(self, node):
if not self._nodes:
self.next_node = 1
Expand Down
23 changes: 21 additions & 2 deletions tests/test__minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@
def test_basic_dna(track_abundance):
# verify that MHs of size 1 stay size 1, & act properly as bottom sketches.
mh = MinHash(1, 4, track_abundance=track_abundance)
assert mh.moltype == 'DNA'

mh.add_sequence('ATGC')
a = mh.get_mins()

Expand Down Expand Up @@ -116,7 +118,14 @@ def test_bytes_dna(track_abundance):

def test_bytes_protein_dayhoff(track_abundance, dayhoff):
# verify that we can hash protein/aa sequences
mh = MinHash(10, 6, True, dayhoff=dayhoff, hp=False, track_abundance=track_abundance)
mh = MinHash(10, 6, True, dayhoff=dayhoff, hp=False,
track_abundance=track_abundance)

expected_moltype = 'protein'
if dayhoff:
expected_moltype = 'dayhoff'
assert mh.moltype == expected_moltype

mh.add_protein('AGYYG')
mh.add_protein('AGYYG')
mh.add_protein(b'AGYYG')
Expand All @@ -135,6 +144,11 @@ def test_protein_dayhoff(track_abundance, dayhoff):
def test_bytes_protein_hp(track_abundance, hp):
# verify that we can hash protein/aa sequences
mh = MinHash(10, 6, True, dayhoff=False, hp=hp, track_abundance=track_abundance)
expected_moltype = 'protein'
if hp:
expected_moltype = 'hp'
assert mh.moltype == expected_moltype

mh.add_protein('AGYYG')
mh.add_protein(u'AGYYG')
mh.add_protein(b'AGYYG')
Expand All @@ -159,6 +173,8 @@ def test_protein_hp(track_abundance, hp):
def test_translate_codon(track_abundance):
# Ensure that translation occurs properly
mh = MinHash(10, 6, is_protein=True)
assert mh.moltype == 'protein'

assert "S" == mh.translate_codon('TCT')
assert "S" == mh.translate_codon('TC')
assert "X" == mh.translate_codon("T")
Expand Down Expand Up @@ -187,6 +203,8 @@ def test_hp(track_abundance):
# verify that we can hash to hp-encoded protein/aa sequences
mh_hp = MinHash(10, 6, is_protein=True,
dayhoff=False, hp=True, track_abundance=track_abundance)
assert mh_hp.moltype == 'hp'

mh_hp.add_sequence('ACTGAC')

assert len(mh_hp.get_mins()) == 2
Expand Down Expand Up @@ -1096,7 +1114,8 @@ def test_set_abundance():


def test_set_abundance_2():
sig = sourmash.load_one_signature(utils.get_test_data("genome-s12.fa.gz.sig"),
datapath = utils.get_test_data("genome-s12.fa.gz.sig")
sig = sourmash.load_one_signature(datapath,
ksize=30,
select_moltype='dna')
new_mh = sig.minhash.copy_and_clear()
Expand Down
48 changes: 48 additions & 0 deletions tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,3 +215,51 @@ def test_linear_index_save_load():
print([s[1].name() for s in sr])
assert len(sr) == 1
assert sr[0][1] == ss2


def test_linear_index_multik_select():
# this loads three ksizes, 21/31/51
sig2 = utils.get_test_data('2.fa.sig')
siglist = sourmash.load_signatures(sig2)

linear = LinearIndex()
for ss in siglist:
linear.insert(ss)

# select most specifically
linear2 = linear.select(ksize=31, moltype='DNA')
assert len(linear2) == 1

# all are DNA:
linear2 = linear.select(moltype='DNA')
assert len(linear2) == 3


def test_linear_index_moltype_select():
# this loads two ksizes(21, 30), and two moltypes (DNA and protein)
filename = utils.get_test_data('genome-s10+s11.sig')
siglist = sourmash.load_signatures(filename)

linear = LinearIndex()
for ss in siglist:
linear.insert(ss)

# select most specific DNA
linear2 = linear.select(ksize=30, moltype='DNA')
assert len(linear2) == 1

# select most specific protein
linear2 = linear.select(ksize=30, moltype='protein')
assert len(linear2) == 1

# can leave off ksize, selects all ksizes
linear2 = linear.select(moltype='DNA')
assert len(linear2) == 2

# can leave off ksize, selects all ksizes
linear2 = linear.select(moltype='protein')
assert len(linear2) == 2

# select something impossible
linear2 = linear.select(ksize=4)
assert len(linear2) == 0