Check MD5 sums of downloaded atom data (tardis-sn#2267)
* Add MD5 as a value in the atom data YAML configuration file; modify downloader functions to check checksums; add retry capability; add logging

* Sort modules

* Reformatted with black
epassaro authored and light2802 committed May 27, 2023
1 parent 7c57877 commit f9b0025
Showing 3 changed files with 30 additions and 12 deletions.
1 change: 1 addition & 0 deletions tardis/data/atomic_data_repo.yml
@@ -6,3 +6,4 @@ kurucz_cd23_chianti_H_He:
 - https://dev.azure.com/tardis-sn/TARDIS/_apis/git/repositories/tardis-refdata/items?path=atom_data/kurucz_cd23_chianti_H_He.h5&resolveLfs=true
 - https://media.githubusercontent.com/media/tardis-sn/tardis-refdata/master/atom_data/kurucz_cd23_chianti_H_He.h5
 uuid: NA
+md5: 69a304e1e85e06508fe02dd8c5ba9397
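
For reference, a minimal sketch of how an entry like this can be checked by hand; the file paths below are assumptions for illustration, not part of the commit:

```python
import hashlib

import yaml

# Assumed locations; adjust to your checkout and download directory.
REPO_YAML = "tardis/data/atomic_data_repo.yml"
ATOM_DATA_FILE = "kurucz_cd23_chianti_H_He.h5"

# Read the expected MD5 recorded in the repository configuration.
with open(REPO_YAML) as f:
    expected_md5 = yaml.safe_load(f)["kurucz_cd23_chianti_H_He"]["md5"]

# Hash the downloaded file and compare.
with open(ATOM_DATA_FILE, "rb") as f:
    actual_md5 = hashlib.md5(f.read()).hexdigest()

print("OK" if actual_md5 == expected_md5 else "checksum mismatch")
```
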
5 changes: 3 additions & 2 deletions tardis/io/atom_data/atom_web_download.py
@@ -44,7 +44,8 @@ def download_atom_data(atomic_data_name=None):

     dst_dir = os.path.join(get_data_dir(), f"{atomic_data_name}.h5")
     src_url = atomic_repo[atomic_data_name]["url"]
-    mirrors = atomic_repo[atomic_data_name]["mirrors"]
+    mirrors = tuple(atomic_repo[atomic_data_name]["mirrors"])
+    checksum = atomic_repo[atomic_data_name]["md5"]
 
     logger.info(f"Downloading atomic data from {src_url} to {dst_dir}")
-    download_from_url(src_url, dst_dir, mirrors)
+    download_from_url(src_url, dst_dir, checksum, mirrors)
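
The mirror list is converted to a tuple here, presumably because `download_from_url` is now wrapped in `lru_cache` (see the next file), which requires hashable arguments. From the caller's side nothing changes; a usage sketch, with the import path taken from the file header above and the dataset name from `atomic_data_repo.yml`:

```python
from tardis.io.atom_data.atom_web_download import download_atom_data

# Downloads kurucz_cd23_chianti_H_He.h5 to the TARDIS data directory,
# verifies its MD5 against atomic_data_repo.yml, and retries on mismatch.
download_atom_data("kurucz_cd23_chianti_H_He")
```
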
36 changes: 26 additions & 10 deletions tardis/io/util.py
@@ -1,22 +1,22 @@
 # Utility functions for the IO part of TARDIS
 
+import collections.abc as collections_abc
+import hashlib
+import logging
 import os
 import re
 import shutil
-import logging
-
-import pandas as pd
-import numpy as np
-import collections.abc as collections_abc
 from collections import OrderedDict
+from functools import lru_cache
 
+import numpy as np
+import pandas as pd
 import yaml
-
-from tardis import constants as const
 from astropy import units as u
 from astropy.utils.data import download_file
 
 from tardis import __path__ as TARDIS_PATH
+from tardis import constants as const
 
 logger = logging.getLogger(__name__)

@@ -387,7 +387,8 @@ def to_hdf(
 )
 
 
-def download_from_url(url, dst, src=None):
+@lru_cache(maxsize=None)
+def download_from_url(url, dst, checksum, src=None, retries=3):
     """Download files from a given URL
 
     Parameters
@@ -396,9 +397,24 @@ def download_from_url(url, dst, src=None):
         URL to download from
     dst : str
         Destination folder for the downloaded file
-    src : list
+    src : tuple
         List of URLs to use as mirrors
     """
 
     cached_file_path = download_file(url, sources=src, pkgname="tardis")
-    shutil.copy(cached_file_path, dst)
+
+    with open(cached_file_path, "rb") as f:
+        new_checksum = hashlib.md5(f.read()).hexdigest()
+
+    if checksum == new_checksum:
+        shutil.copy(cached_file_path, dst)
+
+    elif checksum != new_checksum and retries > 0:
+        retries -= 1
+        logger.warning(
+            f"Incorrect checksum, retrying... ({retries+1} attempts remaining)"
+        )
+        download_from_url(url, dst, checksum, src, retries)
+
+    else:
+        logger.error("Maximum number of retries reached. Aborting")
