Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check MD5 sums of downloaded atom data #2267

Merged
merged 3 commits into from
Apr 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions tardis/data/atomic_data_repo.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ kurucz_cd23_chianti_H_He:
- https://dev.azure.com/tardis-sn/TARDIS/_apis/git/repositories/tardis-refdata/items?path=atom_data/kurucz_cd23_chianti_H_He.h5&resolveLfs=true
- https://media.githubusercontent.com/media/tardis-sn/tardis-refdata/master/atom_data/kurucz_cd23_chianti_H_He.h5
uuid: NA
md5: 69a304e1e85e06508fe02dd8c5ba9397
5 changes: 3 additions & 2 deletions tardis/io/atom_data/atom_web_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ def download_atom_data(atomic_data_name=None):

dst_dir = os.path.join(get_data_dir(), f"{atomic_data_name}.h5")
src_url = atomic_repo[atomic_data_name]["url"]
mirrors = atomic_repo[atomic_data_name]["mirrors"]
mirrors = tuple(atomic_repo[atomic_data_name]["mirrors"])
checksum = atomic_repo[atomic_data_name]["md5"]

logger.info(f"Downloading atomic data from {src_url} to {dst_dir}")
download_from_url(src_url, dst_dir, mirrors)
download_from_url(src_url, dst_dir, checksum, mirrors)
36 changes: 26 additions & 10 deletions tardis/io/util.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
# Utility functions for the IO part of TARDIS

import collections.abc as collections_abc
import hashlib
import logging
import os
import re
import shutil
import logging

import pandas as pd
import numpy as np
import collections.abc as collections_abc
from collections import OrderedDict
from functools import lru_cache

import numpy as np
import pandas as pd
import yaml

from tardis import constants as const
from astropy import units as u
from astropy.utils.data import download_file

from tardis import __path__ as TARDIS_PATH
from tardis import constants as const

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -387,7 +387,8 @@ def to_hdf(
)


@lru_cache(maxsize=None)
def download_from_url(url, dst, checksum, src=None, retries=3):
    """Download a file from a URL and verify its MD5 checksum.

    The function is memoized with ``lru_cache`` so that repeated requests
    for the same (url, dst, checksum, src) within one session do not
    trigger a second download.

    Parameters
    ----------
    url : str
        URL to download from
    dst : str
        Destination path for the downloaded file
    checksum : str
        Expected MD5 hex digest of the downloaded file
    src : tuple, optional
        Tuple of mirror URLs to try. This must be a tuple (not a list)
        because every argument has to be hashable for ``lru_cache`` to
        memoize this function.
    retries : int, optional
        Number of re-download attempts remaining after a checksum
        mismatch (default 3). On exhaustion an error is logged and the
        file is NOT copied to ``dst``.
    """

    cached_file_path = download_file(url, sources=src, pkgname="tardis")

    # Hash in chunks so large atomic-data files are never read into
    # memory all at once.
    md5_hash = hashlib.md5()
    with open(cached_file_path, "rb") as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            md5_hash.update(chunk)

    if checksum == md5_hash.hexdigest():
        shutil.copy(cached_file_path, dst)

    elif retries > 0:
        # Checksum mismatch (implied by falling through the branch
        # above): retry with one fewer attempt remaining.
        retries -= 1
        logger.warning(
            f"Incorrect checksum, retrying... ({retries+1} attempts remaining)"
        )
        download_from_url(url, dst, checksum, src, retries)

    else:
        logger.error("Maximum number of retries reached. Aborting")