Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Resource Manager: Unlink archives after extracting #1245 #1246

Merged
merged 14 commits on
Jul 11, 2024
55 changes: 28 additions & 27 deletions src/ocrd/resource_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@
from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json
from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT

class OcrdResourceManager():

class OcrdResourceManager:

"""
Managing processor resources
Expand Down Expand Up @@ -81,7 +82,7 @@ def load_resource_list(self, list_filename, database=None):
report = OcrdResourceListValidator.validate(list_loaded)
if not report.is_valid:
self.log.error('\n'.join(report.errors))
raise ValueError("Resource list %s is invalid!" % (list_filename))
raise ValueError(f"Resource list {list_filename} is invalid!")
for executable, resource_list in list_loaded.items():
if executable not in database:
database[executable] = []
Expand Down Expand Up @@ -176,7 +177,8 @@ def add_to_user_database(self, executable, res_filename, url=None, resource_type
Add a stub entry to the user resource.yml
"""
res_name = Path(res_filename).name
self.log.info("%s resource '%s' (%s) not a known resource, creating stub in %s'", executable, res_name, str(res_filename), self.user_list)
self.log.info(f"{executable} resource '{res_name}' ({str(res_filename)}) not a known resource, "
f"creating stub in {self.user_list}'")
if Path(res_filename).is_dir():
res_size = directory_size(res_filename)
else:
Expand All @@ -190,7 +192,7 @@ def add_to_user_database(self, executable, res_filename, url=None, resource_type
resdict = {
'name': res_name,
'url': url if url else '???',
'description': 'Found at %s on %s' % (self.resource_dir_to_location(res_filename), datetime.now()),
'description': f'Found at {self.resource_dir_to_location(res_filename)} on {datetime.now()}',
'version_range': '???',
'type': resource_type,
'size': res_size
Expand Down Expand Up @@ -218,21 +220,23 @@ def resource_dir_to_location(self, resource_path):
'cwd' if resource_path.startswith(getcwd()) else \
resource_path

def parameter_usage(self, name, usage='as-is'):
@staticmethod
def parameter_usage(name, usage='as-is'):
if usage == 'as-is':
return name
elif usage == 'without-extension':
return Path(name).stem
raise ValueError("No such usage '%s'" % usage)
raise ValueError(f"No such usage '{usage}'")

def _download_impl(self, url, filename, progress_cb=None, size=None):
@staticmethod
def _download_impl(url, filename, progress_cb=None, size=None):
log = getLogger('ocrd.resource_manager._download_impl')
log.info("Downloading %s to %s" % (url, filename))
log.info(f"Downloading {url} to {filename}")
with open(filename, 'wb') as f:
gdrive_file_id, is_gdrive_download_link = gparse_url(url, warning=False)
if gdrive_file_id:
if not is_gdrive_download_link:
url = "https://drive.google.com/uc?id={id}".format(id=gdrive_file_id)
url = f"https://drive.google.com/uc?id={gdrive_file_id}"
try:
with requests.get(url, stream=True) as r:
if "Content-Disposition" not in r.headers:
Expand All @@ -246,9 +250,10 @@ def _download_impl(self, url, filename, progress_cb=None, size=None):
progress_cb(len(data))
f.write(data)

def _copy_impl(self, src_filename, filename, progress_cb=None):
@staticmethod
def _copy_impl(src_filename, filename, progress_cb=None):
log = getLogger('ocrd.resource_manager._copy_impl')
log.info("Copying %s to %s", src_filename, filename)
log.info(f"Copying {src_filename} to {filename}")
if Path(src_filename).is_dir():
log.info(f"Copying recursively from {src_filename} to {filename}")
for child in Path(src_filename).rglob('*'):
Expand Down Expand Up @@ -276,16 +281,8 @@ def _copy_impl(self, src_filename, filename, progress_cb=None):

# TODO Proper caching (make head request for size, If-Modified etc)
def download(
self,
executable,
url,
basedir,
overwrite=False,
no_subdir=False,
name=None,
resource_type='file',
path_in_archive='.',
progress_cb=None,
self, executable, url, basedir, overwrite=False, no_subdir=False, name=None, resource_type='file',
path_in_archive='.', progress_cb=None,
):
"""
Download a resource by URL
Expand All @@ -299,12 +296,13 @@ def download(
is_url = url.startswith('https://') or url.startswith('http://')
if fpath.exists():
if not overwrite:
raise FileExistsError("%s %s already exists but --overwrite is not set" % ('Directory' if fpath.is_dir() else 'File', fpath))
fpath_type = 'Directory' if fpath.is_dir() else 'File'
raise FileExistsError(f"{fpath_type} {fpath} already exists but --overwrite is not set")
if fpath.is_dir():
log.info("Removing existing target directory {fpath}")
log.info(f"Removing existing target directory {fpath}")
rmtree(str(fpath))
else:
log.info("Removing existing target file {fpath}")
log.info(f"Removing existing target file {fpath}")
unlink(str(fpath))
destdir.mkdir(parents=True, exist_ok=True)
if resource_type in ('file', 'directory'):
Expand All @@ -322,20 +320,23 @@ def download(
Path('out').mkdir()
with pushd_popd('out'):
mimetype = guess_media_type(f'../{archive_fname}', fallback='application/octet-stream')
log.info("Extracting %s archive to %s/out" % (mimetype, tempdir))
log.info(f"Extracting {mimetype} archive to {tempdir}/out")
if mimetype == 'application/zip':
with ZipFile(f'../{archive_fname}', 'r') as zipf:
zipf.extractall()
elif mimetype in ('application/gzip', 'application/x-xz'):
with open_tarfile(f'../{archive_fname}', 'r:*') as tar:
tar.extractall()
else:
raise RuntimeError("Unable to handle extraction of %s archive %s" % (mimetype, url))
log.info("Copying '%s' from archive to %s" % (path_in_archive, fpath))
raise RuntimeError(f"Unable to handle extraction of {mimetype} archive {url}")
log.info(f"Copying '{path_in_archive}' from archive to {fpath}")
if Path(path_in_archive).is_dir():
copytree(path_in_archive, str(fpath))
else:
copy(path_in_archive, str(fpath))
if Path(tempdir).exists():
log.info(f"Removing temp dir {tempdir}")
rmtree(tempdir)
MehmedGIT marked this conversation as resolved.
Show resolved Hide resolved
return fpath

def _dedup_database(self, database=None, dedup_key='name'):
Expand Down
Loading