Skip to content

Commit

Permalink
Add benchmarking scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
aaronkollasch committed Nov 24, 2021
1 parent 5f0942a commit 3d5261f
Show file tree
Hide file tree
Showing 5 changed files with 244 additions and 2 deletions.
Empty file added benchmarking/__init__.py
Empty file.
104 changes: 104 additions & 0 deletions benchmarking/bench_database.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
#!/usr/bin/env python3
import random
import os
from pathlib import Path
import argparse
import time
import json
import math
import string
import datetime
from photomanager.photofile import PhotoFile
from photomanager.database import Database


def generate_test_database(num_uids=10000, r_seed=42):
    """Build a ``Database`` filled with pseudo-random photo records.

    The global ``random`` module is seeded with ``r_seed`` so that the same
    arguments always produce the same records. Each generated uid maps to a
    list of one to three ``PhotoFile`` objects in ``database.photo_db``.

    Args:
        num_uids: Number of uid entries to generate.
        r_seed: Seed handed to ``random.seed`` (seeding version 2).

    Returns:
        The populated ``Database`` instance.
    """
    # ``string.hexdigits`` is "0123456789abcdefABCDEF": sampling from it gives
    # mixed-case strings in which a-f are over-represented. Use a uniform
    # lowercase alphabet so the fake checksums look like ordinary hex digests
    # (presumably what the real database stores -- confirm against the hasher).
    # The number of random draws per checksum is unchanged.
    hex_alphabet = "0123456789abcdef"

    random.seed(r_seed, version=2)
    database = Database()
    for _ in range(num_uids):
        uid = "".join(random.choices(database.UID_ALPHABET, k=8))
        # NOTE: a repeated uid would replace the earlier entry.
        database.photo_db[uid] = []
        for _ in range(random.randint(1, 3)):
            checksum = "".join(random.choices(hex_alphabet, k=64))
            # Integer microseconds scaled down to a float epoch timestamp.
            timestamp = random.randint(1037750179000000, 1637750179000000) / 1000000
            # Render the instant in a random whole-hour UTC offset.
            tz = datetime.timezone(datetime.timedelta(hours=random.randint(-12, 12)))
            dt = datetime.datetime.fromtimestamp(timestamp, tz=tz)
            ts_str = dt.strftime("%Y-%m-%d %H:%M:%S%z")
            img_num = random.randint(0, 9999)
            source_path = f"/path/to/photo/{dt.year}/IMG_{img_num:04d}.JPG"
            # About half of the records get an empty store path; the rest
            # reuse the source file name under a per-year folder.
            store_path = (
                ""
                if random.randint(0, 1)
                else f"{dt.year}/{source_path.rsplit('/', 1)[-1]}"
            )
            filesize = random.randint(100000, 100000000)
            photo = PhotoFile(
                chk=checksum,
                src=source_path,
                ts=timestamp,
                dt=ts_str,
                fsz=filesize,
                sto=store_path,
            )
            database.photo_db[uid].append(photo)
    return database


def main():
    """Benchmark saving and loading a generated database in three formats.

    For each of ``--num-tests`` runs a fresh database is generated (the seed
    is offset by the run index), written to ``db.json``, ``db.json.gz`` and
    ``db.json.zst`` inside ``--bench-dir`` and read back from each file. The
    per-run timings and file sizes, plus their averages, are printed to
    stdout as JSON.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--bench-dir", type=str, default="/tmp/photomanager_bench_dir")
    parser.add_argument("--num-uids", type=int, default=100000)
    parser.add_argument("--r-seed", type=int, default=42)
    # Accepted for symmetry with the other benchmark; not read by this script.
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-n", "--num-tests", type=int, default=3)
    args = parser.parse_args()

    bench_dir = Path(args.bench_dir)
    os.makedirs(bench_dir, exist_ok=True)

    results = []
    for i_test in range(args.num_tests):
        database = generate_test_database(
            num_uids=args.num_uids, r_seed=args.r_seed + i_test
        )

        # Consecutive perf_counter readings bracket each operation; the
        # values returned by from_file are discarded, only the time matters.
        time0 = time.perf_counter()
        database.to_file(bench_dir / "db.json")
        time1 = time.perf_counter()
        database.to_file(bench_dir / "db.json.gz")
        time2 = time.perf_counter()
        database.to_file(bench_dir / "db.json.zst")
        time3 = time.perf_counter()
        database.from_file(bench_dir / "db.json")
        time4 = time.perf_counter()
        database.from_file(bench_dir / "db.json.gz")
        time5 = time.perf_counter()
        database.from_file(bench_dir / "db.json.zst")
        time6 = time.perf_counter()

        results.append(
            {
                "num_photos": sum(len(photos) for photos in database.photo_db.values()),
                "raw_save": time1 - time0,
                "gzip_save": time2 - time1,
                "zst_save": time3 - time2,
                "raw_load": time4 - time3,
                "gzip_load": time5 - time4,
                "zst_load": time6 - time5,
                "raw_size": os.path.getsize(bench_dir / "db.json"),
                "gzip_size": os.path.getsize(bench_dir / "db.json.gz"),
                "zstd_size": os.path.getsize(bench_dir / "db.json.zst"),
            }
        )

    # Mean of every metric across runs. With zero runs (--num-tests <= 0)
    # there is nothing to average, so report an empty summary instead of
    # failing on results[0].
    if results:
        summary = {
            k: round(math.fsum(d[k] for d in results) / len(results), 3)
            for k in results[0]
        }
    else:
        summary = {}
    output = {"args": vars(args), "results": results, "summary": summary}
    print(json.dumps(output, indent=2))


if __name__ == "__main__":
    main()
138 changes: 138 additions & 0 deletions benchmarking/bench_indexing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
#!/usr/bin/env python3
import random
import os
import sys
from pathlib import Path
import argparse
import time
import json
import shutil
import math
from photomanager.actions.fileops import STORAGE_TYPES
from photomanager.cli import main as cli_main
from photomanager.hasher import DEFAULT_HASH_ALGO, HASH_ALGORITHMS


def make_test_files(directory, n_folders=20, r_seed=42):
    """Fill ``directory`` with folders of random-content ".jpg" files.

    Every folder is named after its index in lowercase hexadecimal and
    receives one 16 MiB file plus sixteen 1 MiB files, i.e. 32 MiB per
    folder and ``n_folders * 32`` MiB overall. The global ``random`` module
    is seeded with ``r_seed`` so the file contents are reproducible.

    Args:
        directory: Root directory under which the folders are created.
        n_folders: Number of folders to create.
        r_seed: Seed handed to ``random.seed`` (seeding version 2).
    """
    random.seed(r_seed, version=2)
    root = Path(directory)
    for folder_index in range(n_folders):
        tag = format(folder_index, "x")
        folder = root / tag
        os.makedirs(folder, exist_ok=True)
        # The large file is produced first so the random stream is consumed
        # in a fixed order: one 16 MiB draw, then sixteen 1 MiB draws.
        (folder / f"{tag}_16.jpg").write_bytes(random.randbytes(1 << 24))
        for small_index in range(16):
            small_file = folder / f"{tag}_{small_index}_1.jpg"
            small_file.write_bytes(random.randbytes(1 << 20))


def _run_cli(argv):
    """Invoke the photomanager CLI once, absorbing the SystemExit it may raise."""
    try:
        cli_main(argv)
    except SystemExit:
        # The exit status is deliberately ignored, as in the original
        # script, so a failing command is still timed like a successful one.
        pass


def main():
    """Benchmark the ``index``, ``collect`` and ``verify`` CLI commands.

    Test files are generated under ``<bench-dir>/test_files`` unless
    ``--test-files-dir`` is given or that directory already exists. Each of
    the ``--num-tests`` runs starts from a fresh database and an empty
    collect destination, then times the three commands. Per-run timings and
    their averages are printed to stdout as JSON.
    """
    # Local import: contextlib is not among this script's top-level imports.
    import contextlib

    parser = argparse.ArgumentParser()
    parser.add_argument("--bench-dir", type=str, default="/tmp/photomanager_bench_dir")
    parser.add_argument("--test-files-dir", type=str, default=None)
    parser.add_argument(
        "--storage-type",
        type=str,
        default="SSD",
        choices=STORAGE_TYPES,
        help="Class of storage medium (HDD, SSD, RAID)",
    )
    parser.add_argument(
        "--hash-algorithm",
        type=str,
        default=DEFAULT_HASH_ALGO.value,
        choices=HASH_ALGORITHMS,
        help=f"Hash algorithm (default={DEFAULT_HASH_ALGO.value})",
    )
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-n", "--num-tests", type=int, default=3)
    args = parser.parse_args()

    # Make sure the bench dir exists even when --test-files-dir points
    # elsewhere, matching what the database benchmark does.
    os.makedirs(args.bench_dir, exist_ok=True)

    if not args.test_files_dir:
        test_files_dir = Path(args.bench_dir) / "test_files"
        # An existing directory is reused as-is, without checking its contents.
        if not os.path.exists(test_files_dir):
            make_test_files(test_files_dir)
    else:
        test_files_dir = Path(args.test_files_dir)

    results = []
    for _ in range(args.num_tests):
        # Start every run from a clean slate: no database files and no
        # previously collected photos.
        for stale_db in Path(args.bench_dir).glob("photos*.db"):
            os.remove(stale_db)
        database_file = Path(args.bench_dir) / "photos.db"
        collect_dest_dir = Path(args.bench_dir) / "collect_dest"
        try:
            shutil.rmtree(collect_dest_dir)
        except FileNotFoundError:
            pass

        with contextlib.ExitStack() as stack:
            if not args.verbose:
                # Silence the CLI's stderr output. The ExitStack restores the
                # previous stream and closes the devnull handle even if a
                # command raises something other than SystemExit.
                devnull = stack.enter_context(open(os.devnull, "w"))
                stack.enter_context(contextlib.redirect_stderr(devnull))

            # Database creation is setup work and is not timed.
            _run_cli(
                [
                    "create",
                    "--db",
                    database_file,
                    "--hash-algorithm",
                    args.hash_algorithm,
                ]
            )
            time0 = time.perf_counter()
            _run_cli(
                [
                    "index",
                    "--db",
                    database_file,
                    "--source",
                    test_files_dir,
                    "--storage-type",
                    args.storage_type,
                ]
            )
            time1 = time.perf_counter()
            _run_cli(
                ["collect", "--db", database_file, "--destination", collect_dest_dir]
            )
            time2 = time.perf_counter()
            _run_cli(
                [
                    "verify",
                    "--db",
                    database_file,
                    "--destination",
                    collect_dest_dir,
                    "--storage-type",
                    args.storage_type,
                ]
            )
            time3 = time.perf_counter()

        results.append(
            {"index": time1 - time0, "collect": time2 - time1, "verify": time3 - time2}
        )

    # Mean of every timing across runs. With zero runs (--num-tests <= 0)
    # there is nothing to average, so report an empty summary instead of
    # failing on results[0].
    if results:
        summary = {
            k: round(math.fsum(d[k] for d in results) / len(results), 3)
            for k in results[0]
        }
    else:
        summary = {}
    output = {"args": vars(args), "results": results, "summary": summary}
    print(json.dumps(output, indent=2))


if __name__ == "__main__":
    main()
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ write_to_template = '''version = "{version}"
[tool.black]
line-length = 88
target-version = ['py38']
include = '^/(src|tests)/.*\.pyi?$'
include = '^/(src|tests|benchmarking)/.*\.pyi?$'
extend-exclude = '''
# A regex preceded with ^/ will apply only to files and directories
# in the root of the project.
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ deps =
flake8-black
commands =
black --check --diff .
flake8 --count src tests
flake8 --count src tests benchmarking

[testenv:twine]
deps =
Expand Down

0 comments on commit 3d5261f

Please sign in to comment.