Revert to storing checksums as strings
Using bytes was slower and increased code complexity,
and the memory gains were minimal.
aaronkollasch committed Jun 14, 2021
1 parent a0e1381 commit 692f7ec
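For context, a minimal sketch of the tradeoff the message describes, assuming the default blake2b checksum with a 32-byte digest (the `hashlib` calls are standard library; the memory comparison is illustrative, not a measurement from PhotoManager):

```python
import hashlib
import sys

data = b"test"

# As bytes: a compact 32-byte digest, but it must be converted with
# .hex() for display, file naming, and JSON serialization.
digest = hashlib.blake2b(data, digest_size=32).digest()

# As str: a 64-character hex digest, directly usable everywhere.
hexdigest = hashlib.blake2b(data, digest_size=32).hexdigest()

assert digest.hex() == hexdigest

# The per-checksum memory difference is a few dozen bytes, which is
# the "minimal" gain the commit message refers to.
print(sys.getsizeof(digest), sys.getsizeof(hexdigest))
```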
Showing 11 changed files with 80 additions and 144 deletions.
8 changes: 8 additions & 0 deletions CHANGES.rst
@@ -1,6 +1,14 @@
Changelog for PhotoManager
==========================

Unreleased - 2021-06-13
-----------------------

Reverted
^^^^^^^^

- Use str internally to represent checksum

0.0.2 - 2021-06-13
------------------

2 changes: 1 addition & 1 deletion src/photomanager/async_base.py
@@ -92,7 +92,7 @@ async def worker(self, worker_id: int):
async def close_worker(self, worker_id: int):
pass

async def execute_queue(self, all_jobs: Collection[AsyncJob]) -> dict[str, str]:
async def execute_queue(self, all_jobs: Collection[AsyncJob]) -> dict:
"""Run jobs"""
self.queue = Queue()
self.workers = []
28 changes: 14 additions & 14 deletions src/photomanager/database.py
@@ -82,6 +82,11 @@ class DatabaseException(PhotoManagerBaseException):

class Database:
VERSION = 3
"""
Database version history:
2: added tz_offset
3: shortened PhotoFile attribute names
"""
DB_KEY_ORDER = (
"version",
"hash_algorithm",
@@ -98,7 +103,7 @@ def __init__(self):
"photo_db": {},
"command_history": {},
}
self.hash_to_uid: dict[bytes, str] = {}
self.hash_to_uid: dict[str, str] = {}
self.timestamp_to_uids: dict[float, dict[str, None]] = {}

def __eq__(self, other: DB) -> bool:
@@ -159,17 +164,12 @@ def db(self, db: dict):
for uid in db["photo_db"].keys():
photos = db["photo_db"][uid]
for i in range(len(photos)):
checksum = photos[i]["checksum"]
checksum = checksum.split(":", 1)[0]
photos[i]["checksum"] = checksum.split(":", 1)[0]
photos[i] = {NAME_MAP_ENC[k]: v for k, v in photos[i].items()}

db = {k: db[k] for k in self.DB_KEY_ORDER}
db["hash_algorithm"] = HashAlgorithm(db["hash_algorithm"])
for uid in db["photo_db"].keys():
db["photo_db"][uid] = [
PhotoFile.from_json_dict(d) for d in db["photo_db"][uid]
]
db["photo_db"][uid] = [PhotoFile.from_dict(d) for d in db["photo_db"][uid]]

db["version"] = self.VERSION
self._db = db
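For reference, a hypothetical version-3 photo entry after this setter runs, with shortened attribute names and a plain hex-string checksum (the values are borrowed from the test fixtures later in this diff):

```python
# One photo entry as stored in photo_db after migration to version 3.
photo_entry = {
    "chk": "d090ce7023b57925e7e94fc80372e3434fb1897e00b4452a25930dd1b83648fb",
    "src": "A/img1.jpg",
    "dt": "2015:08:01 18:28:36.90",
    "ts": 1438468116.9,
    "fsz": 771,
    "tzo": -14400.0,
}

# With str checksums, PhotoFile.from_dict(photo_entry) restores the
# dataclass directly; no bytes.fromhex() conversion step remains.
```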
@@ -562,7 +562,7 @@ def collect_to_directory(
)
rel_store_path = (
f"{photo.local_datetime.strftime('%Y/%m-%b/%Y-%m-%d_%H-%M-%S')}-"
f"{photo.chk[:4].hex()[:7]}-"
f"{photo.chk[:7]}-"
f"{Path(photo.src).name}"
)
abs_store_path = directory / rel_store_path
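The two prefix expressions are equivalent for the same hash, since the first 4 bytes of a digest render as 8 hex characters; a quick sketch using one of the test checksums:

```python
checksum_str = "d090ce7023b57925e7e94fc80372e3434fb1897e00b4452a25930dd1b83648fb"
checksum_bytes = bytes.fromhex(checksum_str)

# Old form (bytes): first 4 bytes -> 8 hex chars -> truncate to 7.
old_prefix = checksum_bytes[:4].hex()[:7]

# New form (str): take the first 7 characters directly.
new_prefix = checksum_str[:7]

assert old_prefix == new_prefix == "d090ce7"
```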
@@ -805,8 +805,8 @@ def get_stats(self) -> tuple[int, int, int, int]:
return num_uids, num_photos, num_stored_photos, total_file_size

def make_hash_map(
self, new_algo: HashAlgorithm, hash_map: Optional[dict[bytes, bytes]] = None
) -> dict[bytes, bytes]: # pragma: no cover
self, new_algo: HashAlgorithm, hash_map: Optional[dict[str, str]] = None
) -> dict[str, str]: # pragma: no cover
"""Make a map of file checksums in order to migrate hashing algorithms.
Checks source file hashes using the old algorithm to make sure the new hashes
@@ -855,7 +855,7 @@ def make_hash_map(
return hash_map

def map_hashes(
self, new_algo: str, hash_map: dict[bytes, bytes], map_all: bool = False
self, new_algo: str, hash_map: dict[str, str], map_all: bool = False
) -> Optional[int]: # pragma: no cover
"""Map the database's checksums to a new algorithm.
@@ -877,7 +877,7 @@
all_photos = [photo for photos in self.photo_db.values() for photo in photos]
if map_all and (
num_skipped_photos := sum(
photo.chk.split(b":", 1)[0] not in hash_map for photo in all_photos
photo.chk.split(":", 1)[0] not in hash_map for photo in all_photos
)
):
print(f"Not all items will be mapped: {num_skipped_photos}")
@@ -886,12 +886,12 @@
if photo.chk in hash_map:
photo.chk = hash_map[photo.chk]
num_correct_photos += 1
elif (ca := photo.chk.split(b":", 1)) and len(ca) == 2:
elif (ca := photo.chk.split(":", 1)) and len(ca) == 2:
if c := hash_map.get(ca[0], None):
photo.chk = c
num_correct_photos += 1
else:
photo.chk = photo.chk + f":{old_algo}".encode()
photo.chk = f"{photo.chk}:{old_algo}"
num_skipped_photos += 1
self.hash_algorithm = new_algo
print(f"Mapped {num_correct_photos} items")
12 changes: 6 additions & 6 deletions src/photomanager/hasher.py
@@ -49,7 +49,7 @@ def _update_hash_obj(hash_obj, fd):
def file_checksum(
file: Union[bytes, str, PathLike, IOBase],
algorithm: HashAlgorithm = DEFAULT_HASH_ALGO,
) -> bytes:
) -> str:
if algorithm in HASH_ALGO_DEFINITIONS:
hash_obj = HASH_ALGO_DEFINITIONS[algorithm]["factory"]()
else:
@@ -59,13 +59,13 @@ def file_checksum(
else:
with open(file, "rb") as f:
_update_hash_obj(hash_obj, f)
return hash_obj.digest()
return hash_obj.hexdigest()


def check_files(
file_paths: Iterable[Union[bytes, str, PathLike, IOBase]],
algorithm: HashAlgorithm = DEFAULT_HASH_ALGO,
) -> dict[str, bytes]:
) -> dict[str, str]:
output_dict = {}
for path in tqdm(file_paths):
try:
@@ -135,7 +135,7 @@ async def do_job(self, worker_id: int, job: FileHasherJob):
for line in stdout.decode("utf-8").splitlines(keepends=False):
if line.strip():
checksum, path = line.split(maxsplit=1)
self.output_dict[path] = bytes.fromhex(checksum)
self.output_dict[path] = checksum
except Exception as e:
print("hasher output:", stdout)
raise e
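The loop above parses one `<checksum> <path>` pair per line from a command-line hasher such as `b2sum`; a standalone sketch of the same logic (the sample output below is made up):

```python
# Hypothetical stdout from a subprocess hasher (b2sum-style output).
stdout = (
    b"d090ce7023b57925e7e94fc80372e3434fb1897e00b4452a25930dd1b83648fb  A/img1.jpg\n"
    b"3b39f47d51f63e54c76417ee6e04c34bd3ff5ac47696824426dca9e200f03666  A/img2.jpg\n"
)

output_dict: dict[str, str] = {}
for line in stdout.decode("utf-8").splitlines(keepends=False):
    if line.strip():
        checksum, path = line.split(maxsplit=1)
        # Checksums stay as hex strings; no bytes.fromhex() round trip.
        output_dict[path] = checksum

assert output_dict["A/img1.jpg"].startswith("d090ce7")
```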
@@ -167,11 +167,11 @@ def check_files(
file_paths: Iterable[Union[str, PathLike]],
pbar_unit: str = "it",
file_sizes: Optional[Iterable[int]] = None,
) -> dict[str, bytes]:
) -> dict[str, str]:
if not self.use_async:
return check_files(file_paths=file_paths, algorithm=self.algorithm)

self.output_dict: dict[str, bytes] = {}
self.output_dict = {}
self.pbar_unit = pbar_unit
all_jobs = []
all_paths = list(make_chunks(self.encode(file_paths), self.batch_size))
29 changes: 5 additions & 24 deletions src/photomanager/photofile.py
@@ -3,8 +3,8 @@
from os import PathLike
from os.path import getsize
from datetime import datetime, tzinfo, timezone, timedelta
from dataclasses import dataclass, asdict, fields
from typing import Union, Optional, Type, TypeVar, ClassVar
from dataclasses import dataclass, asdict
from typing import Union, Optional, Type, TypeVar

from photomanager.pyexiftool import ExifTool
from photomanager.hasher import file_checksum, DEFAULT_HASH_ALGO, HashAlgorithm
@@ -29,7 +29,7 @@ class PhotoFile:
"""A dataclass describing a photo or other media file
Attributes:
:chk (bytes): checksum of photo file
:chk (str): checksum of photo file
:src (str): Absolute path where photo was found
:dt (str): Datetime string for best estimated creation date (original)
:ts (float): POSIX timestamp of best estimated creation date (derived)
@@ -39,7 +39,7 @@
:tzo (float): local time zone offset
"""

chk: bytes
chk: str
src: str
dt: str
ts: float
@@ -48,20 +48,6 @@
prio: int = 10
tzo: float = None

@property
def __dict__(self):
d = {name: getattr(self, name) for name in self.field_names()}
d["chk"] = d["chk"].hex()
return d

FIELD_NAMES: ClassVar = None

@classmethod
def field_names(cls):
if cls.FIELD_NAMES is None:
cls.FIELD_NAMES = tuple(f.name for f in fields(cls))
return cls.FIELD_NAMES

@property
def local_datetime(self):
tz = timezone(timedelta(seconds=self.tzo)) if self.tzo is not None else None
@@ -104,7 +90,7 @@ def from_file(
def from_file_cached(
cls: Type[PF],
source_path: str,
checksum_cache: dict[str, bytes],
checksum_cache: dict[str, str],
datetime_cache: dict[str, str],
algorithm: HashAlgorithm = DEFAULT_HASH_ALGO,
tz_default: Optional[tzinfo] = None,
@@ -148,11 +134,6 @@ def from_file_cached(
tzo=tz,
)

@classmethod
def from_json_dict(cls: Type[PF], d: dict) -> PF:
d["chk"] = bytes.fromhex(d["chk"])
return cls(**d)

@classmethod
def from_dict(cls: Type[PF], d: dict) -> PF:
return cls(**d)
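With `chk` as a plain str, a `PhotoFile` now round-trips through JSON using only `asdict` and `from_dict`; a short sketch assuming the package layout shown in this diff, with values taken from the test fixtures below:

```python
import json
from dataclasses import asdict

from photomanager.photofile import PhotoFile

photo = PhotoFile(
    chk="d090ce7023b57925e7e94fc80372e3434fb1897e00b4452a25930dd1b83648fb",
    src="A/img1.jpg",
    dt="2015:08:01 18:28:36.90",
    ts=1438468116.9,
    fsz=771,
    tzo=-14400.0,
)

# asdict() output is JSON-serializable as-is; from_dict() restores it.
restored = PhotoFile.from_dict(json.loads(json.dumps(asdict(photo))))
assert restored == photo
```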
1 change: 0 additions & 1 deletion src/photomanager/pyexiftool/pyexiftool_async.py
@@ -118,7 +118,6 @@ def __init__(
)
self.executable = executable if executable_ is None else executable_
self.running = False
self.output_dict = {}
self.queue = None
self.batch_size = batch_size
self.pbar = None
32 changes: 8 additions & 24 deletions tests/integ_tests/test_cli.py
@@ -16,30 +16,14 @@
keep_top_dir=True,
)
EXPECTED_HASHES = {
"A/img1.jpg": bytes.fromhex(
"d090ce7023b57925e7e94fc80372e3434fb1897e00b4452a25930dd1b83648fb"
),
"A/img2.jpg": bytes.fromhex(
"3b39f47d51f63e54c76417ee6e04c34bd3ff5ac47696824426dca9e200f03666"
),
"A/img1.png": bytes.fromhex(
"1e10df2e3abe4c810551525b6cb2eb805886de240e04cc7c13c58ae208cabfb9"
),
"A/img4.jpg": bytes.fromhex(
"79ac4a89fb3d81ab1245b21b11ff7512495debca60f6abf9afbb1e1fbfe9d98c"
),
"B/img1.jpg": bytes.fromhex(
"d090ce7023b57925e7e94fc80372e3434fb1897e00b4452a25930dd1b83648fb"
),
"B/img2.jpg": bytes.fromhex(
"e9fec87008fd240309b81c997e7ec5491fee8da7eb1a76fc39b8fcafa76bb583"
),
"B/img4.jpg": bytes.fromhex(
"2b0f304f86655ebd04272cc5e7e886e400b79a53ecfdc789f75dd380cbcc8317"
),
"C/img3.tiff": bytes.fromhex(
"2aca4e78afbcebf2526ad8ac544d90b92991faae22499eec45831ef7be392391"
),
"A/img1.jpg": "d090ce7023b57925e7e94fc80372e3434fb1897e00b4452a25930dd1b83648fb",
"A/img2.jpg": "3b39f47d51f63e54c76417ee6e04c34bd3ff5ac47696824426dca9e200f03666",
"A/img1.png": "1e10df2e3abe4c810551525b6cb2eb805886de240e04cc7c13c58ae208cabfb9",
"A/img4.jpg": "79ac4a89fb3d81ab1245b21b11ff7512495debca60f6abf9afbb1e1fbfe9d98c",
"B/img1.jpg": "d090ce7023b57925e7e94fc80372e3434fb1897e00b4452a25930dd1b83648fb",
"B/img2.jpg": "e9fec87008fd240309b81c997e7ec5491fee8da7eb1a76fc39b8fcafa76bb583",
"B/img4.jpg": "2b0f304f86655ebd04272cc5e7e886e400b79a53ecfdc789f75dd380cbcc8317",
"C/img3.tiff": "2aca4e78afbcebf2526ad8ac544d90b92991faae22499eec45831ef7be392391",
}


18 changes: 3 additions & 15 deletions tests/integ_tests/test_hasher.py
@@ -4,24 +4,12 @@
from photomanager.hasher import AsyncFileHasher, file_checksum, HashAlgorithm

checksums = [
    (
        b"",
        bytes.fromhex(
            "0e5751c026e543b2e8ab2eb06099daa1d1e5df47778f7787faab45cdf12fe3a8"
        ),
    ),
    (b"", "0e5751c026e543b2e8ab2eb06099daa1d1e5df47778f7787faab45cdf12fe3a8"),
    (
        b"\xff\xd8\xff\xe0",
        bytes.fromhex(
            "7d13007a8afed521cfc13306cbd6747bbc59556e3ca9514c8d94f900fbb56230"
        ),
    ),
    (
        b"\xff\xd8\xff\xe0",
        "7d13007a8afed521cfc13306cbd6747bbc59556e3ca9514c8d94f900fbb56230",
    ),
    (
        b"test",
        bytes.fromhex(
            "928b20366943e2afd11ebc0eae2e53a93bf177a4fcf35bcc64d503704e65e202"
        ),
    ),
    (b"test", "928b20366943e2afd11ebc0eae2e53a93bf177a4fcf35bcc64d503704e65e202"),
]
for _ in range(100):
st = bytes([random.randint(0, 255) for _ in range(1000)])
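These expected values are consistent with `hashlib.blake2b` at a 32-byte digest size; the empty-input entry above is the well-known blake2b-256 digest of `b""`. A quick check, assuming that is the suite's default algorithm:

```python
import hashlib

def blake2b_256_hex(data: bytes) -> str:
    """Hex checksum with a 32-byte (256-bit) blake2b digest."""
    return hashlib.blake2b(data, digest_size=32).hexdigest()

assert blake2b_256_hex(b"") == (
    "0e5751c026e543b2e8ab2eb06099daa1d1e5df47778f7787faab45cdf12fe3a8"
)
```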
32 changes: 8 additions & 24 deletions tests/integ_tests/test_photofile.py
@@ -12,79 +12,63 @@
)
photofile_expected_results = [
database.PhotoFile(
chk=bytes.fromhex(
"d090ce7023b57925e7e94fc80372e3434fb1897e00b4452a25930dd1b83648fb"
),
chk="d090ce7023b57925e7e94fc80372e3434fb1897e00b4452a25930dd1b83648fb",
src="A/img1.jpg",
dt="2015:08:01 18:28:36.90",
ts=1438468116.9,
fsz=771,
tzo=-14400.0,
),
database.PhotoFile(
chk=bytes.fromhex(
"3b39f47d51f63e54c76417ee6e04c34bd3ff5ac47696824426dca9e200f03666"
),
chk="3b39f47d51f63e54c76417ee6e04c34bd3ff5ac47696824426dca9e200f03666",
src="A/img2.jpg",
dt="2015:08:01 18:28:36.99",
ts=1438450116.99,
fsz=771,
tzo=3600.0,
),
database.PhotoFile(
chk=bytes.fromhex(
"1e10df2e3abe4c810551525b6cb2eb805886de240e04cc7c13c58ae208cabfb9"
),
chk="1e10df2e3abe4c810551525b6cb2eb805886de240e04cc7c13c58ae208cabfb9",
src="A/img1.png",
dt="2015:08:01 18:28:36.90",
ts=1438453716.9,
fsz=382,
tzo=0.0,
),
database.PhotoFile(
chk=bytes.fromhex(
"79ac4a89fb3d81ab1245b21b11ff7512495debca60f6abf9afbb1e1fbfe9d98c"
),
chk="79ac4a89fb3d81ab1245b21b11ff7512495debca60f6abf9afbb1e1fbfe9d98c",
src="A/img4.jpg",
dt="2018:08:01 20:28:36",
ts=1533169716.0,
fsz=759,
tzo=-14400.0,
),
database.PhotoFile(
chk=bytes.fromhex(
"d090ce7023b57925e7e94fc80372e3434fb1897e00b4452a25930dd1b83648fb"
),
chk="d090ce7023b57925e7e94fc80372e3434fb1897e00b4452a25930dd1b83648fb",
src="B/img1.jpg",
dt="2015:08:01 18:28:36.90",
ts=1438468116.9,
fsz=771,
tzo=-14400.0,
),
database.PhotoFile(
chk=bytes.fromhex(
"e9fec87008fd240309b81c997e7ec5491fee8da7eb1a76fc39b8fcafa76bb583"
),
chk="e9fec87008fd240309b81c997e7ec5491fee8da7eb1a76fc39b8fcafa76bb583",
src="B/img2.jpg",
dt="2015:08:01 18:28:36.99",
ts=1438468116.99,
fsz=789,
tzo=-14400.0,
),
database.PhotoFile(
chk=bytes.fromhex(
"2b0f304f86655ebd04272cc5e7e886e400b79a53ecfdc789f75dd380cbcc8317"
),
chk="2b0f304f86655ebd04272cc5e7e886e400b79a53ecfdc789f75dd380cbcc8317",
src="B/img4.jpg",
dt="2018:08:01 20:28:36",
ts=1533169716.0,
fsz=777,
tzo=-14400.0,
),
database.PhotoFile(
chk=bytes.fromhex(
"2aca4e78afbcebf2526ad8ac544d90b92991faae22499eec45831ef7be392391"
),
chk="2aca4e78afbcebf2526ad8ac544d90b92991faae22499eec45831ef7be392391",
src="C/img3.tiff",
dt="2018:08:01 19:28:36",
ts=1533166116.0,
(2 remaining changed files not shown.)
