Revert to storing checksums as strings
Using bytes was slower and increased code complexity,
and the memory gains were minimal.
aaronkollasch committed Jun 14, 2021
1 parent a0e1381 commit 692f7ec
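For context, a minimal sketch of the tradeoff the message describes, assuming the default blake2b checksum with a 32-byte digest (the `hashlib` calls are standard library; the memory comparison is illustrative, not a measurement from PhotoManager):

```python
import hashlib
import sys

data = b"test"

# As bytes: a compact 32-byte digest, but it must be converted with
# .hex() for display, file naming, and JSON serialization.
digest = hashlib.blake2b(data, digest_size=32).digest()

# As str: a 64-character hex digest, directly usable everywhere.
hexdigest = hashlib.blake2b(data, digest_size=32).hexdigest()

assert digest.hex() == hexdigest

# The per-checksum memory difference is a few dozen bytes, which is
# the "minimal" gain the commit message refers to.
print(sys.getsizeof(digest), sys.getsizeof(hexdigest))
```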
Showing 11 changed files with 80 additions and 144 deletions.
8 changes: 8 additions & 0 deletions CHANGES.rst
@@ -1,6 +1,14 @@
Changelog for PhotoManager
==========================

Unreleased - 2021-06-13
-----------------------

Reverted
^^^^^^^^

- Use str internally to represent checksum

0.0.2 - 2021-06-13
------------------

2 changes: 1 addition & 1 deletion src/photomanager/async_base.py
@@ -92,7 +92,7 @@ async def worker(self, worker_id: int):
async def close_worker(self, worker_id: int):
pass

async def execute_queue(self, all_jobs: Collection[AsyncJob]) -> dict[str, str]:
async def execute_queue(self, all_jobs: Collection[AsyncJob]) -> dict:
"""Run jobs"""
self.queue = Queue()
self.workers = []
28 changes: 14 additions & 14 deletions src/photomanager/database.py
@@ -82,6 +82,11 @@ class DatabaseException(PhotoManagerBaseException):

class Database:
VERSION = 3
"""
Database version history:
2: added tz_offset
3: shortened PhotoFile attribute names
"""
DB_KEY_ORDER = (
"version",
"hash_algorithm",
@@ -98,7 +103,7 @@ def __init__(self):
"photo_db": {},
"command_history": {},
}
self.hash_to_uid: dict[bytes, str] = {}
self.hash_to_uid: dict[str, str] = {}
self.timestamp_to_uids: dict[float, dict[str, None]] = {}

def __eq__(self, other: DB) -> bool:
@@ -159,17 +164,12 @@ def db(self, db: dict):
for uid in db["photo_db"].keys():
photos = db["photo_db"][uid]
for i in range(len(photos)):
checksum = photos[i]["checksum"]
checksum = checksum.split(":", 1)[0]
photos[i]["checksum"] = checksum.split(":", 1)[0]
photos[i] = {NAME_MAP_ENC[k]: v for k, v in photos[i].items()}

db = {k: db[k] for k in self.DB_KEY_ORDER}
db["hash_algorithm"] = HashAlgorithm(db["hash_algorithm"])
for uid in db["photo_db"].keys():
db["photo_db"][uid] = [
PhotoFile.from_json_dict(d) for d in db["photo_db"][uid]
]
db["photo_db"][uid] = [PhotoFile.from_dict(d) for d in db["photo_db"][uid]]

db["version"] = self.VERSION
self._db = db
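For reference, a hypothetical version-3 photo entry after this setter runs, with shortened attribute names and a plain hex-string checksum (the values are borrowed from the test fixtures later in this diff):

```python
# One photo entry as stored in photo_db after migration to version 3.
photo_entry = {
    "chk": "d090ce7023b57925e7e94fc80372e3434fb1897e00b4452a25930dd1b83648fb",
    "src": "A/img1.jpg",
    "dt": "2015:08:01 18:28:36.90",
    "ts": 1438468116.9,
    "fsz": 771,
    "tzo": -14400.0,
}

# With str checksums, PhotoFile.from_dict(photo_entry) restores the
# dataclass directly; no bytes.fromhex() conversion step remains.
```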
@@ -562,7 +562,7 @@ def collect_to_directory(
)
rel_store_path = (
f"{photo.local_datetime.strftime('%Y/%m-%b/%Y-%m-%d_%H-%M-%S')}-"
f"{photo.chk[:4].hex()[:7]}-"
f"{photo.chk[:7]}-"
f"{Path(photo.src).name}"
)
abs_store_path = directory / rel_store_path
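The two prefix expressions are equivalent for the same hash, since the first 4 bytes of a digest render as 8 hex characters; a quick sketch using one of the test checksums:

```python
checksum_str = "d090ce7023b57925e7e94fc80372e3434fb1897e00b4452a25930dd1b83648fb"
checksum_bytes = bytes.fromhex(checksum_str)

# Old form (bytes): first 4 bytes -> 8 hex chars -> truncate to 7.
old_prefix = checksum_bytes[:4].hex()[:7]

# New form (str): take the first 7 characters directly.
new_prefix = checksum_str[:7]

assert old_prefix == new_prefix == "d090ce7"
```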
@@ -805,8 +805,8 @@ def get_stats(self) -> tuple[int, int, int, int]:
return num_uids, num_photos, num_stored_photos, total_file_size

def make_hash_map(
self, new_algo: HashAlgorithm, hash_map: Optional[dict[bytes, bytes]] = None
) -> dict[bytes, bytes]: # pragma: no cover
self, new_algo: HashAlgorithm, hash_map: Optional[dict[str, str]] = None
) -> dict[str, str]: # pragma: no cover
"""Make a map of file checksums in order to migrate hashing algorithms.
Checks source file hashes using the old algorithm to make sure the new hashes
@@ -855,7 +855,7 @@ def make_hash_map(
return hash_map

def map_hashes(
self, new_algo: str, hash_map: dict[bytes, bytes], map_all: bool = False
self, new_algo: str, hash_map: dict[str, str], map_all: bool = False
) -> Optional[int]: # pragma: no cover
"""Map the database's checksums to a new algorithm.
@@ -877,7 +877,7 @@
all_photos = [photo for photos in self.photo_db.values() for photo in photos]
if map_all and (
num_skipped_photos := sum(
photo.chk.split(b":", 1)[0] not in hash_map for photo in all_photos
photo.chk.split(":", 1)[0] not in hash_map for photo in all_photos
)
):
print(f"Not all items will be mapped: {num_skipped_photos}")
@@ -886,12 +886,12 @@
if photo.chk in hash_map:
photo.chk = hash_map[photo.chk]
num_correct_photos += 1
elif (ca := photo.chk.split(b":", 1)) and len(ca) == 2:
elif (ca := photo.chk.split(":", 1)) and len(ca) == 2:
if c := hash_map.get(ca[0], None):
photo.chk = c
num_correct_photos += 1
else:
photo.chk = photo.chk + f":{old_algo}".encode()
photo.chk = f"{photo.chk}:{old_algo}"
num_skipped_photos += 1
self.hash_algorithm = new_algo
print(f"Mapped {num_correct_photos} items")
12 changes: 6 additions & 6 deletions src/photomanager/hasher.py
@@ -49,7 +49,7 @@ def _update_hash_obj(hash_obj, fd):
def file_checksum(
file: Union[bytes, str, PathLike, IOBase],
algorithm: HashAlgorithm = DEFAULT_HASH_ALGO,
) -> bytes:
) -> str:
if algorithm in HASH_ALGO_DEFINITIONS:
hash_obj = HASH_ALGO_DEFINITIONS[algorithm]["factory"]()
else:
@@ -59,13 +59,13 @@ def file_checksum(
else:
with open(file, "rb") as f:
_update_hash_obj(hash_obj, f)
return hash_obj.digest()
return hash_obj.hexdigest()


def check_files(
file_paths: Iterable[Union[bytes, str, PathLike, IOBase]],
algorithm: HashAlgorithm = DEFAULT_HASH_ALGO,
) -> dict[str, bytes]:
) -> dict[str, str]:
output_dict = {}
for path in tqdm(file_paths):
try:
@@ -135,7 +135,7 @@ async def do_job(self, worker_id: int, job: FileHasherJob):
for line in stdout.decode("utf-8").splitlines(keepends=False):
if line.strip():
checksum, path = line.split(maxsplit=1)
self.output_dict[path] = bytes.fromhex(checksum)
self.output_dict[path] = checksum
except Exception as e:
print("hasher output:", stdout)
raise e
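The loop above parses one `<checksum> <path>` pair per line from a command-line hasher such as `b2sum`; a standalone sketch of the same logic (the sample output below is made up):

```python
# Hypothetical stdout from a subprocess hasher (b2sum-style output).
stdout = (
    b"d090ce7023b57925e7e94fc80372e3434fb1897e00b4452a25930dd1b83648fb  A/img1.jpg\n"
    b"3b39f47d51f63e54c76417ee6e04c34bd3ff5ac47696824426dca9e200f03666  A/img2.jpg\n"
)

output_dict: dict[str, str] = {}
for line in stdout.decode("utf-8").splitlines(keepends=False):
    if line.strip():
        checksum, path = line.split(maxsplit=1)
        # Checksums stay as hex strings; no bytes.fromhex() round trip.
        output_dict[path] = checksum

assert output_dict["A/img1.jpg"].startswith("d090ce7")
```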
@@ -167,11 +167,11 @@ def check_files(
file_paths: Iterable[Union[str, PathLike]],
pbar_unit: str = "it",
file_sizes: Optional[Iterable[int]] = None,
) -> dict[str, bytes]:
) -> dict[str, str]:
if not self.use_async:
return check_files(file_paths=file_paths, algorithm=self.algorithm)

self.output_dict: dict[str, bytes] = {}
self.output_dict = {}
self.pbar_unit = pbar_unit
all_jobs = []
all_paths = list(make_chunks(self.encode(file_paths), self.batch_size))
29 changes: 5 additions & 24 deletions src/photomanager/photofile.py
@@ -3,8 +3,8 @@
from os import PathLike
from os.path import getsize
from datetime import datetime, tzinfo, timezone, timedelta
from dataclasses import dataclass, asdict, fields
from typing import Union, Optional, Type, TypeVar, ClassVar
from dataclasses import dataclass, asdict
from typing import Union, Optional, Type, TypeVar

from photomanager.pyexiftool import ExifTool
from photomanager.hasher import file_checksum, DEFAULT_HASH_ALGO, HashAlgorithm
@@ -29,7 +29,7 @@ class PhotoFile:
"""A dataclass describing a photo or other media file
Attributes:
:chk (bytes): checksum of photo file
:chk (str): checksum of photo file
:src (str): Absolute path where photo was found
:dt (str): Datetime string for best estimated creation date (original)
:ts (float): POSIX timestamp of best estimated creation date (derived)
@@ -39,7 +39,7 @@
:tzo (float): local time zone offset
"""

chk: bytes
chk: str
src: str
dt: str
ts: float
@@ -48,20 +48,6 @@
prio: int = 10
tzo: float = None

@property
def __dict__(self):
d = {name: getattr(self, name) for name in self.field_names()}
d["chk"] = d["chk"].hex()
return d

FIELD_NAMES: ClassVar = None

@classmethod
def field_names(cls):
if cls.FIELD_NAMES is None:
cls.FIELD_NAMES = tuple(f.name for f in fields(cls))
return cls.FIELD_NAMES

@property
def local_datetime(self):
tz = timezone(timedelta(seconds=self.tzo)) if self.tzo is not None else None
@@ -104,7 +90,7 @@ def from_file(
def from_file_cached(
cls: Type[PF],
source_path: str,
checksum_cache: dict[str, bytes],
checksum_cache: dict[str, str],
datetime_cache: dict[str, str],
algorithm: HashAlgorithm = DEFAULT_HASH_ALGO,
tz_default: Optional[tzinfo] = None,
@@ -148,11 +134,6 @@ def from_file_cached(
tzo=tz,
)

@classmethod
def from_json_dict(cls: Type[PF], d: dict) -> PF:
d["chk"] = bytes.fromhex(d["chk"])
return cls(**d)

@classmethod
def from_dict(cls: Type[PF], d: dict) -> PF:
return cls(**d)
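With `chk` as a plain str, a `PhotoFile` now round-trips through JSON using only `asdict` and `from_dict`; a short sketch assuming the package layout shown in this diff, with values taken from the test fixtures below:

```python
import json
from dataclasses import asdict

from photomanager.photofile import PhotoFile

photo = PhotoFile(
    chk="d090ce7023b57925e7e94fc80372e3434fb1897e00b4452a25930dd1b83648fb",
    src="A/img1.jpg",
    dt="2015:08:01 18:28:36.90",
    ts=1438468116.9,
    fsz=771,
    tzo=-14400.0,
)

# asdict() output is JSON-serializable as-is; from_dict() restores it.
restored = PhotoFile.from_dict(json.loads(json.dumps(asdict(photo))))
assert restored == photo
```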
1 change: 0 additions & 1 deletion src/photomanager/pyexiftool/pyexiftool_async.py
@@ -118,7 +118,6 @@ def __init__(
)
self.executable = executable if executable_ is None else executable_
self.running = False
self.output_dict = {}
self.queue = None
self.batch_size = batch_size
self.pbar = None
32 changes: 8 additions & 24 deletions tests/integ_tests/test_cli.py
@@ -16,30 +16,14 @@
keep_top_dir=True,
)
EXPECTED_HASHES = {
"A/img1.jpg": bytes.fromhex(
"d090ce7023b57925e7e94fc80372e3434fb1897e00b4452a25930dd1b83648fb"
),
"A/img2.jpg": bytes.fromhex(
"3b39f47d51f63e54c76417ee6e04c34bd3ff5ac47696824426dca9e200f03666"
),
"A/img1.png": bytes.fromhex(
"1e10df2e3abe4c810551525b6cb2eb805886de240e04cc7c13c58ae208cabfb9"
),
"A/img4.jpg": bytes.fromhex(
"79ac4a89fb3d81ab1245b21b11ff7512495debca60f6abf9afbb1e1fbfe9d98c"
),
"B/img1.jpg": bytes.fromhex(
"d090ce7023b57925e7e94fc80372e3434fb1897e00b4452a25930dd1b83648fb"
),
"B/img2.jpg": bytes.fromhex(
"e9fec87008fd240309b81c997e7ec5491fee8da7eb1a76fc39b8fcafa76bb583"
),
"B/img4.jpg": bytes.fromhex(
"2b0f304f86655ebd04272cc5e7e886e400b79a53ecfdc789f75dd380cbcc8317"
),
"C/img3.tiff": bytes.fromhex(
"2aca4e78afbcebf2526ad8ac544d90b92991faae22499eec45831ef7be392391"
),
"A/img1.jpg": "d090ce7023b57925e7e94fc80372e3434fb1897e00b4452a25930dd1b83648fb",
"A/img2.jpg": "3b39f47d51f63e54c76417ee6e04c34bd3ff5ac47696824426dca9e200f03666",
"A/img1.png": "1e10df2e3abe4c810551525b6cb2eb805886de240e04cc7c13c58ae208cabfb9",
"A/img4.jpg": "79ac4a89fb3d81ab1245b21b11ff7512495debca60f6abf9afbb1e1fbfe9d98c",
"B/img1.jpg": "d090ce7023b57925e7e94fc80372e3434fb1897e00b4452a25930dd1b83648fb",
"B/img2.jpg": "e9fec87008fd240309b81c997e7ec5491fee8da7eb1a76fc39b8fcafa76bb583",
"B/img4.jpg": "2b0f304f86655ebd04272cc5e7e886e400b79a53ecfdc789f75dd380cbcc8317",
"C/img3.tiff": "2aca4e78afbcebf2526ad8ac544d90b92991faae22499eec45831ef7be392391",
}


18 changes: 3 additions & 15 deletions tests/integ_tests/test_hasher.py
@@ -4,24 +4,12 @@
from photomanager.hasher import AsyncFileHasher, file_checksum, HashAlgorithm

checksums = [
    (
        b"",
        bytes.fromhex(
            "0e5751c026e543b2e8ab2eb06099daa1d1e5df47778f7787faab45cdf12fe3a8"
        ),
    ),
    (b"", "0e5751c026e543b2e8ab2eb06099daa1d1e5df47778f7787faab45cdf12fe3a8"),
    (
        b"\xff\xd8\xff\xe0",
        bytes.fromhex(
            "7d13007a8afed521cfc13306cbd6747bbc59556e3ca9514c8d94f900fbb56230"
        ),
    ),
    (
        b"\xff\xd8\xff\xe0",
        "7d13007a8afed521cfc13306cbd6747bbc59556e3ca9514c8d94f900fbb56230",
    ),
    (
        b"test",
        bytes.fromhex(
            "928b20366943e2afd11ebc0eae2e53a93bf177a4fcf35bcc64d503704e65e202"
        ),
    ),
    (b"test", "928b20366943e2afd11ebc0eae2e53a93bf177a4fcf35bcc64d503704e65e202"),
]
for _ in range(100):
st = bytes([random.randint(0, 255) for _ in range(1000)])
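These expected values are consistent with `hashlib.blake2b` at a 32-byte digest size; the empty-input entry above is the well-known blake2b-256 digest of `b""`. A quick check, assuming that is the suite's default algorithm:

```python
import hashlib

def blake2b_256_hex(data: bytes) -> str:
    """Hex checksum with a 32-byte (256-bit) blake2b digest."""
    return hashlib.blake2b(data, digest_size=32).hexdigest()

assert blake2b_256_hex(b"") == (
    "0e5751c026e543b2e8ab2eb06099daa1d1e5df47778f7787faab45cdf12fe3a8"
)
```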
32 changes: 8 additions & 24 deletions tests/integ_tests/test_photofile.py
@@ -12,79 +12,63 @@
)
photofile_expected_results = [
database.PhotoFile(
chk=bytes.fromhex(
"d090ce7023b57925e7e94fc80372e3434fb1897e00b4452a25930dd1b83648fb"
),
chk="d090ce7023b57925e7e94fc80372e3434fb1897e00b4452a25930dd1b83648fb",
src="A/img1.jpg",
dt="2015:08:01 18:28:36.90",
ts=1438468116.9,
fsz=771,
tzo=-14400.0,
),
database.PhotoFile(
chk=bytes.fromhex(
"3b39f47d51f63e54c76417ee6e04c34bd3ff5ac47696824426dca9e200f03666"
),
chk="3b39f47d51f63e54c76417ee6e04c34bd3ff5ac47696824426dca9e200f03666",
src="A/img2.jpg",
dt="2015:08:01 18:28:36.99",
ts=1438450116.99,
fsz=771,
tzo=3600.0,
),
database.PhotoFile(
chk=bytes.fromhex(
"1e10df2e3abe4c810551525b6cb2eb805886de240e04cc7c13c58ae208cabfb9"
),
chk="1e10df2e3abe4c810551525b6cb2eb805886de240e04cc7c13c58ae208cabfb9",
src="A/img1.png",
dt="2015:08:01 18:28:36.90",
ts=1438453716.9,
fsz=382,
tzo=0.0,
),
database.PhotoFile(
chk=bytes.fromhex(
"79ac4a89fb3d81ab1245b21b11ff7512495debca60f6abf9afbb1e1fbfe9d98c"
),
chk="79ac4a89fb3d81ab1245b21b11ff7512495debca60f6abf9afbb1e1fbfe9d98c",
src="A/img4.jpg",
dt="2018:08:01 20:28:36",
ts=1533169716.0,
fsz=759,
tzo=-14400.0,
),
database.PhotoFile(
chk=bytes.fromhex(
"d090ce7023b57925e7e94fc80372e3434fb1897e00b4452a25930dd1b83648fb"
),
chk="d090ce7023b57925e7e94fc80372e3434fb1897e00b4452a25930dd1b83648fb",
src="B/img1.jpg",
dt="2015:08:01 18:28:36.90",
ts=1438468116.9,
fsz=771,
tzo=-14400.0,
),
database.PhotoFile(
chk=bytes.fromhex(
"e9fec87008fd240309b81c997e7ec5491fee8da7eb1a76fc39b8fcafa76bb583"
),
chk="e9fec87008fd240309b81c997e7ec5491fee8da7eb1a76fc39b8fcafa76bb583",
src="B/img2.jpg",
dt="2015:08:01 18:28:36.99",
ts=1438468116.99,
fsz=789,
tzo=-14400.0,
),
database.PhotoFile(
chk=bytes.fromhex(
"2b0f304f86655ebd04272cc5e7e886e400b79a53ecfdc789f75dd380cbcc8317"
),
chk="2b0f304f86655ebd04272cc5e7e886e400b79a53ecfdc789f75dd380cbcc8317",
src="B/img4.jpg",
dt="2018:08:01 20:28:36",
ts=1533169716.0,
fsz=777,
tzo=-14400.0,
),
database.PhotoFile(
chk=bytes.fromhex(
"2aca4e78afbcebf2526ad8ac544d90b92991faae22499eec45831ef7be392391"
),
chk="2aca4e78afbcebf2526ad8ac544d90b92991faae22499eec45831ef7be392391",
src="C/img3.tiff",
dt="2018:08:01 19:28:36",
ts=1533166116.0,
(2 remaining changed files not shown.)
