Add benchmarking scripts

aaronkollasch · Nov 24, 2021 · 3d5261f · 3d5261f
1 parent 5f0942a
commit 3d5261f
Show file tree

Hide file tree

Showing 5 changed files with 244 additions and 2 deletions.
diff --git a/benchmarking/__init__.py b/benchmarking/__init__.py
diff --git a/benchmarking/bench_database.py b/benchmarking/bench_database.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+import random
+import os
+from pathlib import Path
+import argparse
+import time
+import json
+import math
+import string
+import datetime
+from photomanager.photofile import PhotoFile
+from photomanager.database import Database
+
+
+def generate_test_database(num_uids=10000, r_seed=42):
+    random.seed(r_seed, version=2)
+    database = Database()
+    for i_uid in range(num_uids):
+        uid = "".join(random.choices(database.UID_ALPHABET, k=8))
+        database.photo_db[uid] = []
+        for i_photo in range(random.randint(1, 3)):
+            checksum = "".join(random.choices(string.hexdigits, k=64))
+            timestamp = random.randint(1037750179000000, 1637750179000000) / 1000000
+            dt = datetime.datetime.fromtimestamp(timestamp).astimezone(
+                datetime.timezone(datetime.timedelta(hours=random.randint(-12, 12)))
+            )
+            ts_str = dt.strftime("%Y-%m-%d %H:%M:%S%z")
+            img_num = random.randint(0, 9999)
+            source_path = f"/path/to/photo/{dt.year}/IMG_{img_num:04d}.JPG"
+            store_path = (
+                ""
+                if random.randint(0, 1)
+                else f"{dt.year}/{source_path.rsplit('/', 1)[-1]}"
+            )
+            filesize = random.randint(100000, 100000000)
+            photo = PhotoFile(
+                chk=checksum,
+                src=source_path,
+                ts=timestamp,
+                dt=ts_str,
+                fsz=filesize,
+                sto=store_path,
+            )
+            database.photo_db[uid].append(photo)
+    return database
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--bench-dir", type=str, default="/tmp/photomanager_bench_dir")
+    parser.add_argument("--num-uids", type=int, default=100000)
+    parser.add_argument("--r-seed", type=int, default=42)
+    parser.add_argument("-v", "--verbose", action="store_true")
+    parser.add_argument("-n", "--num-tests", type=int, default=3)
+    args = parser.parse_args()
+
+    bench_dir = Path(args.bench_dir)
+    os.makedirs(bench_dir, exist_ok=True)
+
+    results = []
+    for i_test in range(args.num_tests):
+        database = generate_test_database(
+            num_uids=args.num_uids, r_seed=args.r_seed + i_test
+        )
+
+        time0 = time.perf_counter()
+        database.to_file(bench_dir / "db.json")
+        time1 = time.perf_counter()
+        database.to_file(bench_dir / "db.json.gz")
+        time2 = time.perf_counter()
+        database.to_file(bench_dir / "db.json.zst")
+        time3 = time.perf_counter()
+        database.from_file(bench_dir / "db.json")
+        time4 = time.perf_counter()
+        database.from_file(bench_dir / "db.json.gz")
+        time5 = time.perf_counter()
+        database.from_file(bench_dir / "db.json.zst")
+        time6 = time.perf_counter()
+
+        results.append(
+            {
+                "num_photos": sum(len(photos) for photos in database.photo_db.values()),
+                "raw_save": time1 - time0,
+                "gzip_save": time2 - time1,
+                "zst_save": time3 - time2,
+                "raw_load": time4 - time3,
+                "gzip_load": time5 - time4,
+                "zst_load": time6 - time5,
+                "raw_size": os.path.getsize(bench_dir / "db.json"),
+                "gzip_size": os.path.getsize(bench_dir / "db.json.gz"),
+                "zstd_size": os.path.getsize(bench_dir / "db.json.zst"),
+            }
+        )
+
+    summary = {
+        k: round(math.fsum(d[k] for d in results) / len(results), 3)
+        for k in results[0].keys()
+    }
+    output = {"args": vars(args), "results": results, "summary": summary}
+    print(json.dumps(output, indent=2))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarking/bench_indexing.py b/benchmarking/bench_indexing.py
@@ -0,0 +1,138 @@
+#!/usr/bin/env python3
+import random
+import os
+import sys
+from pathlib import Path
+import argparse
+import time
+import json
+import shutil
+import math
+from photomanager.actions.fileops import STORAGE_TYPES
+from photomanager.cli import main as cli_main
+from photomanager.hasher import DEFAULT_HASH_ALGO, HASH_ALGORITHMS
+
+
+def make_test_files(directory, n_folders=20, r_seed=42):
+    random.seed(r_seed, version=2)
+    # total file size = n_folders * 32 MiB
+    for i_folder in range(n_folders):
+        c = hex(i_folder)[2:]
+        photo_directory = Path(directory) / c
+        os.makedirs(photo_directory, exist_ok=True)
+        with open(photo_directory / f"{c}_16.jpg", "wb") as f:  # 1 16 MiB "photo"
+            f.write(random.randbytes(2 ** 24))
+        for i_photo in range(16):  # 16 1 MiB "photos"
+            with open(photo_directory / f"{c}_{i_photo}_1.jpg", "wb") as f:
+                f.write(random.randbytes(2 ** 20))
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--bench-dir", type=str, default="/tmp/photomanager_bench_dir")
+    parser.add_argument("--test-files-dir", type=str, default=None)
+    parser.add_argument(
+        "--storage-type",
+        type=str,
+        default="SSD",
+        choices=STORAGE_TYPES,
+        help="Class of storage medium (HDD, SSD, RAID)",
+    )
+    parser.add_argument(
+        "--hash-algorithm",
+        type=str,
+        default=DEFAULT_HASH_ALGO.value,
+        choices=HASH_ALGORITHMS,
+        help=f"Hash algorithm (default={DEFAULT_HASH_ALGO.value})",
+    )
+    parser.add_argument("-v", "--verbose", action="store_true")
+    parser.add_argument("-n", "--num-tests", type=int, default=3)
+    args = parser.parse_args()
+
+    if not args.test_files_dir:
+        test_files_dir = Path(args.bench_dir) / "test_files"
+        if not os.path.exists(test_files_dir):
+            make_test_files(test_files_dir)
+    else:
+        test_files_dir = Path(args.test_files_dir)
+
+    results = []
+    for i in range(args.num_tests):
+        for database_file in Path(args.bench_dir).glob("photos*.db"):
+            os.remove(database_file)
+        database_file = Path(args.bench_dir) / "photos.db"
+        collect_dest_dir = Path(args.bench_dir) / "collect_dest"
+        try:
+            shutil.rmtree(collect_dest_dir)
+        except FileNotFoundError:
+            pass
+
+        if not args.verbose:
+            sys.stderr = open(os.devnull, "w")
+        try:
+            cli_main(
+                [
+                    "create",
+                    "--db",
+                    database_file,
+                    "--hash-algorithm",
+                    args.hash_algorithm,
+                ]
+            )
+        except SystemExit:
+            pass
+        time0 = time.perf_counter()
+        try:
+            cli_main(
+                [
+                    "index",
+                    "--db",
+                    database_file,
+                    "--source",
+                    test_files_dir,
+                    "--storage-type",
+                    args.storage_type,
+                ]
+            )
+        except SystemExit:
+            pass
+        time1 = time.perf_counter()
+        try:
+            cli_main(
+                ["collect", "--db", database_file, "--destination", collect_dest_dir]
+            )
+        except SystemExit:
+            pass
+        time2 = time.perf_counter()
+        try:
+            cli_main(
+                [
+                    "verify",
+                    "--db",
+                    database_file,
+                    "--destination",
+                    collect_dest_dir,
+                    "--storage-type",
+                    args.storage_type,
+                ]
+            )
+        except SystemExit:
+            pass
+        time3 = time.perf_counter()
+        if not args.verbose:
+            sys.stderr.close()
+            sys.stderr = sys.__stderr__
+        results.append(
+            {"index": time1 - time0, "collect": time2 - time1, "verify": time3 - time2}
+        )
+
+    summary = {
+        k: round(math.fsum(d[k] for d in results) / len(results), 3)
+        for k in results[0].keys()
+    }
+    output = {"args": vars(args), "results": results, "summary": summary}
+    print(json.dumps(output, indent=2))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
@@ -14,7 +14,7 @@ write_to_template = '''version = "{version}"
 [tool.black]
 line-length = 88
 target-version = ['py38']
-include = '^/(src|tests)/.*\.pyi?$'
+include = '^/(src|tests|benchmarking)/.*\.pyi?$'
 extend-exclude = '''
 # A regex preceded with ^/ will apply only to files and directories
 # in the root of the project.

diff --git a/setup.cfg b/setup.cfg
@@ -84,7 +84,7 @@ deps =
     flake8-black
 commands =
     black --check --diff .
-    flake8 --count src tests
+    flake8 --count src tests benchmarking
 
 [testenv:twine]
 deps =