From 4cf03e8c9bf1e5f6fc648cfb22f96f07095d9959 Mon Sep 17 00:00:00 2001 From: Erik Nordin Date: Tue, 15 Oct 2024 14:17:34 -0500 Subject: [PATCH] fix: Improve command_static_url repack archive logic This patch improves the logic of static URL fetch tasks to better be able to determine if a file simply needs to be renamed, or if it is an archive that needs to be repackaged. --- src/taskgraph/run-task/fetch-content | 54 ++++++++++++++++-- test/test_scripts_fetch_content.py | 82 ++++++++++++++++++++++++++++ 2 files changed, 132 insertions(+), 4 deletions(-) diff --git a/src/taskgraph/run-task/fetch-content b/src/taskgraph/run-task/fetch-content index e23192b8..f7d0819d 100755 --- a/src/taskgraph/run-task/fetch-content +++ b/src/taskgraph/run-task/fetch-content @@ -435,11 +435,49 @@ def extract_archive(path, dest_dir): log("%s extracted in %.3fs" % (path, time.time() - t0)) +def should_repack_archive(orig: pathlib.Path, dest: pathlib.Path, strip_components=0, add_prefix="") -> bool: + """ + Determines whether we should attempt to repack an archive based on the naming conventions used + in the original file path, the destination file path, and any modifications such as stripping + components or adding a prefix. + """ + + if strip_components or add_prefix: + # If strip_components or add_prefix is specified, we should always repack. + return True + + if orig.suffixes == dest.suffixes: + # If all suffixes are exactly the same, then a rename will suffice. + return False + + if dest.suffixes[-2:] == [".tar", ".zst"]: + # If the destination is a ".tar.zst" file then we will always try to repack it ourselves. + return True + + if orig.suffix == dest.suffix: + # If the final suffix is the same, this is likely just a rename. + # + # This may be a case where multiple suffixes were detected unnecessarily due to the file name + # containing a semantic version. For example the file "python-3.8.10-amd64.exe" would be detected + # to have three suffixes [".8", ".10-amd64", ".exe"]. Changing this to "python.exe" should be a + # simple rename, rather than a repack. + # + # In this case, we will try to determine if the original path has a supported archive suffix. + # If the original path is detected to be an archive, we will try to repack, otherwise rename. + return archive_type(orig) is not None + + # Otherwise, if the paths aren't the same, assume it's an archive and try to repack. + # + # It would be best to fail early if the repack fails than to fail during a test because a renamed + # file was incorrect type after all. + return True + + def repack_archive( orig: pathlib.Path, dest: pathlib.Path, strip_components=0, prefix="" ): assert orig != dest - log("Repacking as %s" % dest) + log(f"Repacking {orig} as {dest}") orig_typ, ifh = open_stream(orig) typ = archive_type(dest) if not typ: @@ -534,6 +572,9 @@ def repack_archive( # We only change compression here. The tar stream is unchanged. ctx.copy_stream(ifh, fh) + else: + raise Exception(f"Attempt to repack an archive of unsupported type {orig_typ}") + def fetch_and_extract(url, dest_dir, extract=True, sha256=None, size=None): """Fetch a URL and extract it to a destination path. @@ -773,8 +814,13 @@ def command_static_url(args): if gpg_sig_url: gpg_verify_path(dl_dest, gpg_key, gpg_signature) - if dl_dest != dest or args.strip_components or args.add_prefix: - repack_archive(dl_dest, dest, args.strip_components, args.add_prefix) + if dl_dest != dest: + if should_repack_archive(dl_dest, dest, args.strip_components, args.add_prefix): + repack_archive(dl_dest, dest, args.strip_components, args.add_prefix) + else: + log(f"Renaming {dl_dest} to {dest}") + dl_dest.rename(dest) + except Exception: try: dl_dest.unlink() @@ -783,7 +829,7 @@ def command_static_url(args): raise - if dl_dest != dest: + if dl_dest != dest and dl_dest.exists(): log("Removing %s" % dl_dest) dl_dest.unlink() diff --git a/test/test_scripts_fetch_content.py b/test/test_scripts_fetch_content.py index 761a112c..b8450d03 100644 --- a/test/test_scripts_fetch_content.py +++ b/test/test_scripts_fetch_content.py @@ -1,4 +1,5 @@ import os +import pathlib import urllib.request from importlib.machinery import SourceFileLoader from importlib.util import module_from_spec, spec_from_loader @@ -94,3 +95,84 @@ def getheader(field): except fetch_content_mod.IntegrityError: if not raises: raise + + +@pytest.mark.parametrize( + "expected,orig,dest,strip_components,add_prefix", + [ + # Archives to repack + (True, pathlib.Path("archive"), pathlib.Path("archive.tar.zst"), 0, ""), + (True, pathlib.Path("archive.tar"), pathlib.Path("archive.tar.zst"), 0, ""), + (True, pathlib.Path("archive.tgz"), pathlib.Path("archive.tar.zst"), 0, ""), + (True, pathlib.Path("archive.zip"), pathlib.Path("archive.tar.zst"), 0, ""), + (True, pathlib.Path("archive.tar.xz"), pathlib.Path("archive.tar.zst"), 0, ""), + (True, pathlib.Path("archive.zst"), pathlib.Path("archive.tar.zst"), 0, ""), + # Path is exactly the same + (False, pathlib.Path("archive"), pathlib.Path("archive"), 0, ""), + (False, pathlib.Path("file.txt"), pathlib.Path("file.txt"), 0, ""), + (False, pathlib.Path("archive.tar"), pathlib.Path("archive.tar"), 0, ""), + (False, pathlib.Path("archive.tgz"), pathlib.Path("archive.tgz"), 0, ""), + (False, pathlib.Path("archive.zip"), pathlib.Path("archive.zip"), 0, ""), + ( + False, + pathlib.Path("archive.tar.zst"), + pathlib.Path("archive.tar.zst"), + 0, + "", + ), + ( + False, + pathlib.Path("archive-before.tar.zst"), + pathlib.Path("archive-after.tar.zst"), + 0, + "", + ), + ( + False, + pathlib.Path("before.foo.bar.baz"), + pathlib.Path("after.foo.bar.baz"), + 0, + "", + ), + # Non-default values for strip_components and add_prefix parameters + (True, pathlib.Path("archive.tar.zst"), pathlib.Path("archive.tar.zst"), 1, ""), + ( + True, + pathlib.Path("archive.tar.zst"), + pathlib.Path("archive.tar.zst"), + 0, + "prefix", + ), + ( + True, + pathlib.Path("archive.tar.zst"), + pathlib.Path("archive.tar.zst"), + 1, + "prefix", + ), + # Real edge cases that should not be repacks + ( + False, + pathlib.Path("python-3.8.10-amd64.exe"), + pathlib.Path("python.exe"), + 0, + "", + ), + ( + False, + pathlib.Path("9ee26e91-9b52-44ba-8d30-c0230dd587b2.bin"), + pathlib.Path("model.esen.intgemm.alphas.bin"), + 0, + "", + ), + ], +) +def test_should_repack_archive( + fetch_content_mod, orig, dest, expected, strip_components, add_prefix +): + assert ( + fetch_content_mod.should_repack_archive( + orig, dest, strip_components, add_prefix + ) + == expected + ), f"Failed for orig: {orig}, dest: {dest}, strip_components: {strip_components}, add_prefix: {add_prefix}, expected {expected} but received {not expected}"