Check MD5 sums of downloaded atom data (tardis-sn#2267)
* Add MD5 as a value in the atom data YAML configuration file; modify downloader functions to check checksums; add retry capability; add logging

* Sort modules

* Reformatted with black
epassaro authored and light2802 committed May 27, 2023
1 parent 7c57877 commit f9b0025
Showing 3 changed files with 30 additions and 12 deletions.
1 change: 1 addition & 0 deletions tardis/data/atomic_data_repo.yml
@@ -6,3 +6,4 @@ kurucz_cd23_chianti_H_He:
 - https://dev.azure.com/tardis-sn/TARDIS/_apis/git/repositories/tardis-refdata/items?path=atom_data/kurucz_cd23_chianti_H_He.h5&resolveLfs=true
 - https://media.githubusercontent.com/media/tardis-sn/tardis-refdata/master/atom_data/kurucz_cd23_chianti_H_He.h5
 uuid: NA
+md5: 69a304e1e85e06508fe02dd8c5ba9397
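
For reference, a minimal sketch of how an entry like this can be checked by hand; the file paths below are assumptions for illustration, not part of the commit:

```python
import hashlib

import yaml

# Assumed locations; adjust to your checkout and download directory.
REPO_YAML = "tardis/data/atomic_data_repo.yml"
ATOM_DATA_FILE = "kurucz_cd23_chianti_H_He.h5"

# Read the expected MD5 recorded in the repository configuration.
with open(REPO_YAML) as f:
    expected_md5 = yaml.safe_load(f)["kurucz_cd23_chianti_H_He"]["md5"]

# Hash the downloaded file and compare.
with open(ATOM_DATA_FILE, "rb") as f:
    actual_md5 = hashlib.md5(f.read()).hexdigest()

print("OK" if actual_md5 == expected_md5 else "checksum mismatch")
```
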
5 changes: 3 additions & 2 deletions tardis/io/atom_data/atom_web_download.py
@@ -44,7 +44,8 @@ def download_atom_data(atomic_data_name=None):

     dst_dir = os.path.join(get_data_dir(), f"{atomic_data_name}.h5")
     src_url = atomic_repo[atomic_data_name]["url"]
-    mirrors = atomic_repo[atomic_data_name]["mirrors"]
+    mirrors = tuple(atomic_repo[atomic_data_name]["mirrors"])
+    checksum = atomic_repo[atomic_data_name]["md5"]
 
     logger.info(f"Downloading atomic data from {src_url} to {dst_dir}")
-    download_from_url(src_url, dst_dir, mirrors)
+    download_from_url(src_url, dst_dir, checksum, mirrors)
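
The mirror list is converted to a tuple here, presumably because `download_from_url` is now wrapped in `lru_cache` (see the next file), which requires hashable arguments. From the caller's side nothing changes; a usage sketch, with the import path taken from the file header above and the dataset name from `atomic_data_repo.yml`:

```python
from tardis.io.atom_data.atom_web_download import download_atom_data

# Downloads kurucz_cd23_chianti_H_He.h5 to the TARDIS data directory,
# verifies its MD5 against atomic_data_repo.yml, and retries on mismatch.
download_atom_data("kurucz_cd23_chianti_H_He")
```
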
36 changes: 26 additions & 10 deletions tardis/io/util.py
@@ -1,22 +1,22 @@
 # Utility functions for the IO part of TARDIS
 
+import collections.abc as collections_abc
+import hashlib
+import logging
 import os
 import re
 import shutil
-import logging
-
-import pandas as pd
-import numpy as np
-import collections.abc as collections_abc
 from collections import OrderedDict
+from functools import lru_cache
 
+import numpy as np
+import pandas as pd
 import yaml
-
-from tardis import constants as const
 from astropy import units as u
 from astropy.utils.data import download_file
 
 from tardis import __path__ as TARDIS_PATH
+from tardis import constants as const
 
 logger = logging.getLogger(__name__)

@@ -387,7 +387,8 @@ def to_hdf(
 )
 
 
-def download_from_url(url, dst, src=None):
+@lru_cache(maxsize=None)
+def download_from_url(url, dst, checksum, src=None, retries=3):
     """Download files from a given URL
 
     Parameters
@@ -396,9 +397,24 @@ def download_from_url(url, dst, src=None):
         URL to download from
     dst : str
         Destination folder for the downloaded file
-    src : list
+    src : tuple
         List of URLs to use as mirrors
     """
 
     cached_file_path = download_file(url, sources=src, pkgname="tardis")
-    shutil.copy(cached_file_path, dst)
+
+    with open(cached_file_path, "rb") as f:
+        new_checksum = hashlib.md5(f.read()).hexdigest()
+
+    if checksum == new_checksum:
+        shutil.copy(cached_file_path, dst)
+
+    elif checksum != new_checksum and retries > 0:
+        retries -= 1
+        logger.warning(
+            f"Incorrect checksum, retrying... ({retries+1} attempts remaining)"
+        )
+        download_from_url(url, dst, checksum, src, retries)
+
+    else:
+        logger.error("Maximum number of retries reached. Aborting")
