From 71449e241d41fb12add0000812088269402c01f7 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Mon, 27 Apr 2020 07:24:00 -0700 Subject: [PATCH] support creating zip SBTs --- sourmash/sbt.py | 54 ++++++++++++++++++++++++++++------------- sourmash/sbt_storage.py | 13 +++++----- tests/test_sbt.py | 32 ++++++++++++++++++++++-- 3 files changed, 73 insertions(+), 26 deletions(-) diff --git a/sourmash/sbt.py b/sourmash/sbt.py index 39bb514eee..f5a97fd391 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -484,26 +484,36 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False): """ version = 5 - if path.endswith('.sbt.json'): - path = path[:-9] - fn = os.path.abspath(path + '.sbt.json') + if path.endswith(".zip"): + storage = ZipStorage(path) + kind = "Zip" + backend = "FSStorage" + subdir = '.sbt.{}'.format(os.path.basename(path[:-4])) + storage_args = FSStorage("", subdir).init_args() + storage.save(subdir + "/", "") + else: + kind = "FS" + if path.endswith('.sbt.json'): + path = path[:-9] + fn = os.path.abspath(path + '.sbt.json') - if storage is None: - # default storage - location = os.path.dirname(fn) - subdir = '.sbt.{}'.format(os.path.basename(path)) + if storage is None: + # default storage + location = os.path.dirname(fn) + subdir = '.sbt.{}'.format(os.path.basename(path)) - storage = FSStorage(location, subdir) - fn = os.path.join(location, fn) + storage = FSStorage(location, subdir) + fn = os.path.join(location, fn) - backend = [k for (k, v) in STORAGES.items() if v == type(storage)][0] + backend = [k for (k, v) in STORAGES.items() if v == type(storage)][0] + storage_args = storage.init_args() info = {} info['d'] = self.d info['version'] = version info['storage'] = { 'backend': backend, - 'args': storage.init_args() + 'args': storage_args } info['factory'] = { 'class': GraphFactory.__name__, @@ -540,10 +550,11 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False): node.storage = storage - data['filename'] = node.save(data['filename']) + if kind == "Zip": + node.save(os.path.join(subdir, data['filename'])) + elif kind == "FS": + data['filename'] = node.save(data['filename']) - node.storage = storage - data['filename'] = node.save(data['filename']) if isinstance(node, Node): nodes[i] = data else: @@ -555,10 +566,19 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False): notify("\nFinished saving nodes, now saving SBT json file.") info['nodes'] = nodes info['signatures'] = leaves - with open(fn, 'w') as fp: - json.dump(info, fp) - return fn + if kind == "Zip": + tree_data = json.dumps(info) + save_path = os.path.basename(path)[:-4] + ".sbt.json" + storage.save(save_path, tree_data) + storage.close() + + elif kind == "FS": + with open(fn, 'w') as fp: + json.dump(info, fp) + + return path + @classmethod def load(cls, location, leaf_loader=None, storage=None, print_version_warning=True): diff --git a/sourmash/sbt_storage.py b/sourmash/sbt_storage.py index 14ef171fbc..869e80e1e6 100644 --- a/sourmash/sbt_storage.py +++ b/sourmash/sbt_storage.py @@ -26,6 +26,9 @@ def __enter__(self): return self def __exit__(self, type, value, traceback): + self.close() + + def close(self): pass def can_open(self, location): @@ -100,13 +103,6 @@ def init_args(self): def __exit__(self, type, value, traceback): self.tarfile.close() - @staticmethod - def can_open(location): - try: - return tarfile.is_tarfile(location) - except IOError: - return False - class ZipStorage(Storage): @@ -149,6 +145,9 @@ def init_args(self): return {'path': self.path} def __exit__(self, type, value, traceback): + self.close() + + def close(self): self.zipfile.close() @staticmethod diff --git a/tests/test_sbt.py b/tests/test_sbt.py index 210d528d52..6aa8a66f6d 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -374,10 +374,10 @@ def test_sbt_zipstorage(tmpdir): to_search.data, 0.1)} print(*old_result, sep='\n') - with ZipStorage(tmpdir.join("tree.zip")) as storage: + with ZipStorage(str(tmpdir.join("tree.zip"))) as storage: tree.save(str(tmpdir.join("tree")), storage=storage) - with ZipStorage(tmpdir.join("tree.zip")) as storage: + with ZipStorage(str(tmpdir.join("tree.zip"))) as storage: tree = SBT.load(str(tmpdir.join("tree")), leaf_loader=SigLeaf.load, storage=storage) @@ -470,6 +470,34 @@ def test_sbt_redisstorage(): assert old_result == new_result +def test_save_zip(tmpdir): + testdata = utils.get_test_data("v5.zip") + testsbt = tmpdir.join("v5.zip") + newsbt = tmpdir.join("new.zip") + + shutil.copyfile(testdata, str(testsbt)) + + tree = SBT.load(str(testsbt), leaf_loader=SigLeaf.load) + tree.save(str(newsbt)) + assert newsbt.exists() + + new_tree = SBT.load(str(newsbt), leaf_loader=SigLeaf.load) + assert isinstance(new_tree.storage, ZipStorage) + + to_search = load_one_signature(utils.get_test_data(utils.SIG_FILES[0])) + + print("*" * 60) + print("{}:".format(to_search)) + old_result = {str(s) for s in tree.find(search_minhashes, to_search, 0.1)} + new_result = {str(s) for s in new_tree.find(search_minhashes, to_search, 0.1)} + print(*new_result, sep="\n") + + + assert old_result == new_result + assert len(new_result) == 2 + + + def test_load_zip(tmpdir): testdata = utils.get_test_data("v5.zip") testsbt = tmpdir.join("v5.zip")