Skip to content

Commit

Permalink
build out a manifest class a bit
Browse files Browse the repository at this point in the history
  • Loading branch information
ctb committed Jun 14, 2021
1 parent b2547f3 commit 17b9576
Showing 1 changed file with 65 additions and 25 deletions.
90 changes: 65 additions & 25 deletions src/sourmash/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from abc import abstractmethod, ABC
from collections import namedtuple, Counter
import zipfile
import csv
from io import TextIOWrapper

from .search import make_jaccard_search_query, make_gather_query

Expand Down Expand Up @@ -290,7 +292,7 @@ def select(self, ksize=None, moltype=None, scaled=None, num=None,
"""


def select_signature(ss, ksize=None, moltype=None, scaled=0, num=0,
def select_signature(ss, *, ksize=None, moltype=None, scaled=0, num=0,
containment=False, picklist=None):
"Check that the given signature matches the specificed requirements."
# ksize match?
Expand Down Expand Up @@ -450,23 +452,20 @@ def __init__(self, zf, selection_dict=None,
self.selection_dict = selection_dict
self.traverse_yield_all = traverse_yield_all

# manifest?
# load manifest?
try:
zi = self.zf.getinfo('SOURMASH-MANIFEST.csv')
except KeyError:
self.manifest_info = None
self.manifest = None
else:
# maybe support passing manifest in on constructor?
# CTB: maybe support passing manifest in on constructor?
print(f'found manifest when loading {self.zf.filename}')
import csv
from io import TextIOWrapper

mfp = self.zf.open(zi, 'r')
manifest_l = []
r = csv.DictReader(TextIOWrapper(mfp, 'utf-8'))
for row in r:
manifest_l.append(row)
self.manifest_info = manifest_l
# wrap as text, since ZipFile.open only supports 'r' mode.
mfp = TextIOWrapper(mfp, 'utf-8')
# load manifest!
self.manifest = CollectionManifest.load_from_csv(mfp)

def __bool__(self):
"Are there any matching signatures in this zipfile? Avoid calling len."
Expand Down Expand Up @@ -524,28 +523,21 @@ def signatures(self):
"Load all signatures in the zip file."
from .signature import load_signatures

skipped_manifest = True
if self.manifest_info is not None:
manifest = None
if self.manifest:
print('.signatures() found manifest!')
picklist = None
if self.selection_dict:
picklist = self.selection_dict.get('picklist', None)

if picklist and picklist.coltype == 'md5':
skipped_manifest = False
colkey = 'md5'
elif picklist and picklist.coltype == 'md5prefix8':
skipped_manifest = False
colkey = 'md5short'

def yield_fp():
for row in self.manifest_info:
if row[colkey] in picklist.pickset:
filename = row['internal_location']
if picklist and picklist.coltype in ('md5', 'md5prefix8'):
manifest = self.manifest
def yield_fp():
for filename in manifest.select_filenames(picklist=picklist):
zi = self.zf.getinfo(filename)
yield self.zf.open(zi)

if skipped_manifest:
if not manifest:
def yield_fp():
for zipinfo in self.zf.infolist():
# should we load this file? if it ends in .sig OR we are forcing:
Expand Down Expand Up @@ -855,3 +847,51 @@ def prefetch(self, query, threshold_bp, **kwargs):
yield IndexSearchResult(score, ss, best_src)

return results


class CollectionManifest:
    """Table of per-signature metadata rows loaded from a manifest CSV.

    Each row is a dict keyed by the CSV column names; the
    'internal_location' column holds the path yielded by
    `select_filenames`. `info` is None until a manifest has been loaded,
    and a manifest is falsy only in that unloaded state (a loaded
    manifest with zero rows is still truthy).
    """

    # columns that must be present in the header of a manifest CSV.
    _required_columns = (
        'internal_location', 'md5', 'md5short', 'ksize', 'moltype',
        'num', 'scaled', 'n_hashes', 'seed', 'with_abundance', 'name',
    )

    # map picklist.coltype to manifest column names.
    # CTB: should these be the same? probably...
    _picklist_columns = {
        'md5': 'md5',
        'md5prefix8': 'md5short',
    }

    def __init__(self):
        self.info = None

    def __bool__(self):
        return self.info is not None

    @classmethod
    def load_from_csv(cls, fp):
        """Load a manifest from a CSV file.

        `fp` is a text-mode file-like object positioned at the CSV header.
        Returns a new manifest whose `info` is the list of row dicts.

        Raises ValueError if any required column is missing from the
        header, including the case where the file is empty.
        """
        r = csv.DictReader(fp)

        # DictReader.fieldnames is None when the file has no header line;
        # treat that as "every column is missing" instead of failing with
        # a TypeError on the membership test.
        fieldnames = r.fieldnames or ()
        for k in cls._required_columns:
            if k not in fieldnames:
                raise ValueError(f"missing column '{k}' in manifest.")

        obj = cls()
        obj.info = list(r)
        return obj

    def select_filenames(self, *, ksize=None, moltype=None, scaled=0, num=0,
                         containment=False, picklist=None):
        """Yield internal paths for sigs that match the specified requirements.

        Only `picklist` is currently used for filtering: rows are kept when
        their value in the column corresponding to `picklist.coltype` is in
        `picklist.pickset`. The other keyword arguments are accepted but
        not yet applied.

        This is a generator, so a ValueError for an unsupported
        `picklist.coltype` is raised on first iteration, not at call time.
        An unloaded manifest (`info` is None) yields nothing.
        """
        matching_rows = self.info or ()

        if picklist:
            try:
                colkey = self._picklist_columns[picklist.coltype]
            except KeyError:
                # explicit error rather than `assert`, which is stripped
                # under `python -O`.  support more here CTB!
                raise ValueError(
                    f"unsupported picklist column type '{picklist.coltype}'"
                    " for manifest selection") from None
            pickset = picklist.pickset
            matching_rows = (row for row in matching_rows
                             if row[colkey] in pickset)

        for row in matching_rows:
            yield row['internal_location']

0 comments on commit 17b9576

Please sign in to comment.