diff --git a/doc/command-line.md b/doc/command-line.md index bf1e6f705f..372075d753 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -1026,6 +1026,9 @@ databases, LCA databases, and directory hierarchies. `sourmash sig fileinfo` provides optional JSON and YAML output, and those formats are under semantic versioning. +Note: `sourmash signature summarize` is an alias for `fileinfo`; they are +the same command. + ### `sourmash signature split` - split signatures into individual files Split each signature in the input file(s) into individual files, with @@ -1315,8 +1318,14 @@ sourmash sig manifest tests/test-data/prot/all.zip -o manifest.csv will create a CSV file, `manifest.csv`, in the internal sourmash manifest format. The manifest will contain an entry for every signature in the file, database, or collection. This format is largely -meant for internal use, but it can serve as a picklist pickfile for -subsetting large collections. +meant for internal use, but it can serve as a +[picklist pickfile](#using-picklists-to-subset-large-collections-of-signatures) +for subsetting large collections. + +By default, `sourmash sig manifest` will rebuild the manifest by +iterating over the signatures in the input file. This can be slow for +large collections. Use `--no-rebuild-manifest` to load an existing +manifest if it is available. ## Advanced command-line usage diff --git a/src/sourmash/cli/sig/__init__.py b/src/sourmash/cli/sig/__init__.py index c240027ccf..fc3b7e6502 100644 --- a/src/sourmash/cli/sig/__init__.py +++ b/src/sourmash/cli/sig/__init__.py @@ -12,6 +12,7 @@ from . import filter from . import flatten from . import fileinfo +from . import fileinfo as summarize from . import kmers from . import intersect from . import manifest diff --git a/src/sourmash/cli/sig/fileinfo.py b/src/sourmash/cli/sig/fileinfo.py index 8030db1dcc..0b5e71df71 100644 --- a/src/sourmash/cli/sig/fileinfo.py +++ b/src/sourmash/cli/sig/fileinfo.py @@ -1,8 +1,23 @@ """provide summary information on the given file""" +usage=""" + + sourmash sig fileinfo + +This will provide a summary of the sketch contents in the given file. + +JSON output can be generated in place of the normal human-readable output +with '--json-out'. + +'sig summarize' and 'sig fileinfo' are aliases for the same command. + +""" + + def subparser(subparsers): - subparser = subparsers.add_parser('fileinfo') + subparser = subparsers.add_parser('fileinfo', aliases=['summarize'], + usage=usage) subparser.add_argument('path') subparser.add_argument( '-q', '--quiet', action='store_true', diff --git a/src/sourmash/cli/sig/manifest.py b/src/sourmash/cli/sig/manifest.py index 497208fce8..0562ee2c5d 100644 --- a/src/sourmash/cli/sig/manifest.py +++ b/src/sourmash/cli/sig/manifest.py @@ -1,8 +1,23 @@ """create a manifest for a collection of signatures""" +usage=""" + + sourmash sig manifest -o manifest.csv + +This will output a sourmash manifest in CSV format. This manifest +can be used as a picklist with --picklist manifest.csv::manifest. + +The manifest will be rebuilt by iterating over the signatures in the +file unless --no-rebuild-manifest is specified; for large +collections, rebuilding the manifest can take a long time! + +See also the 'describe' and 'fileinfo' commands under 'sourmash sig'. + +""" + def subparser(subparsers): - subparser = subparsers.add_parser('manifest') + subparser = subparsers.add_parser('manifest', usage=usage) subparser.add_argument('location') subparser.add_argument( '-q', '--quiet', action='store_true', diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index 60cbb26046..b4bbd65841 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -284,7 +284,10 @@ def manifest(args): rebuild = True if args.no_rebuild_manifest: + debug("sig manifest: not forcing rebuild.") rebuild = False + else: + debug("sig manifest: forcing rebuild.") manifest = sourmash_args.get_manifest(loader, require=True, rebuild=rebuild) @@ -292,7 +295,7 @@ def manifest(args): with open(args.output, "w", newline='') as csv_fp: manifest.write_to_csv(csv_fp, write_header=True) - notify(f"built manifest for {len(manifest)} signatures total.") + notify(f"manifest contains {len(manifest)} signatures total.") notify(f"wrote manifest to '{args.output}'") diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index c34a45eb3a..b71202c0aa 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -1231,7 +1231,7 @@ def test_sig_extract_1(runtmp): assert actual_extract_sig == test_extract_sig -def test_sig_extract_1(runtmp): +def test_sig_extract_1_from_file(runtmp): # run sig extract with --from-file c = runtmp @@ -2366,7 +2366,7 @@ def test_sig_flatten_1(runtmp): assert test_flattened.minhash == siglist[0].minhash -def test_sig_flatten_1(runtmp): +def test_sig_flatten_1_from_file(runtmp): c = runtmp # extract matches to several names from among several signatures & flatten diff --git a/tests/test_cmd_signature_fileinfo.py b/tests/test_cmd_signature_fileinfo.py index fc94db9d64..534360712b 100644 --- a/tests/test_cmd_signature_fileinfo.py +++ b/tests/test_cmd_signature_fileinfo.py @@ -1,19 +1,13 @@ """ Tests for the 'sourmash signature fileinfo' command line. """ -import csv import shutil import os -import glob import pytest -import screed import json import sourmash_tst_utils as utils -import sourmash -from sourmash.signature import load_signatures -from sourmash.manifest import CollectionManifest from sourmash_tst_utils import SourmashCommandFailed ## command line tests @@ -43,6 +37,30 @@ def test_fileinfo_1_sig(runtmp): assert line.strip() in out +def test_fileinfo_1_sig_summarize(runtmp): + # get basic info on a signature with 'summarize' as alias for fileinfo + sig47 = utils.get_test_data('47.fa.sig') + + shutil.copyfile(sig47, runtmp.output('sig47.sig')) + runtmp.run_sourmash('sig', 'summarize', 'sig47.sig') + + out = runtmp.last_result.out + print(runtmp.last_result.out) + + expected_output = """\ +path filetype: MultiIndex +location: sig47.sig +is database? no +has manifest? yes +num signatures: 1 +total hashes: 5177 +summary of sketches: + 1 sketches with DNA, k=31, scaled=1000 5177 +""".splitlines() + for line in expected_output: + assert line.strip() in out + + def test_fileinfo_1_sig_abund(runtmp): # get basic info on a signature with abundance sig47 = utils.get_test_data('47.abunds.fa.sig') @@ -126,7 +144,7 @@ def test_fileinfo_4_zip(runtmp): print(runtmp.last_result.out) # 'location' will be fully resolved, ignore it for now - expected_output = f"""\ + expected_output = """\ path filetype: ZipFileLinearIndex is database? yes has manifest? yes @@ -187,7 +205,7 @@ def test_fileinfo_4_zip_rebuild(runtmp): # CTB: note we're missing one of the 8 in the rebuilt, dna-sig.noext, # because it is not automatically included unless you load the zipfile # with traverse. This is intentional. - expected_output = f"""\ + expected_output = """\ path filetype: ZipFileLinearIndex is database? yes has manifest? yes diff --git a/tests/test_sourmash_args.py b/tests/test_sourmash_args.py index aea49a3d0b..97b9d4d129 100644 --- a/tests/test_sourmash_args.py +++ b/tests/test_sourmash_args.py @@ -285,7 +285,7 @@ def test_save_signatures_to_location_3_zip_add_with_manifest(runtmp): # construct & save manifest mf = manifest.CollectionManifest([row]) - mf_name = f"SOURMASH-MANIFEST.csv" + mf_name = "SOURMASH-MANIFEST.csv" manifest_fp = io.StringIO() mf.write_to_csv(manifest_fp, write_header=True) @@ -461,8 +461,8 @@ def _signatures_with_internal(self): assert m.rows[0]['internal_location'] == "fakeiloc" -def test_get_manifest_3_build(): - # check that manifest is building +def test_get_manifest_3_build_2(): + # check that manifest is building, but only when asked sig47 = utils.get_test_data('47.fa.sig') ss47 = sourmash.load_one_signature(sig47)