Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor and expand download_hash.py #11513

Merged
merged 10 commits into from
Sep 12, 2024
203 changes: 166 additions & 37 deletions scripts/download_hash.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@

import sys

from itertools import count
from itertools import count, groupby
from collections import defaultdict
from functools import cache
import argparse
import requests
from ruamel.yaml import YAML
from packaging.version import Version
Expand All @@ -25,52 +27,179 @@ def open_checksums_yaml():

return data, yaml


def download_hash(minors):
architectures = ["arm", "arm64", "amd64", "ppc64le"]
downloads = ["kubelet", "kubectl", "kubeadm"]
def version_compare(version):
return Version(version.removeprefix("v"))

downloads = {
"calicoctl_binary": "https://github.com/projectcalico/calico/releases/download/{version}/SHA256SUMS",
"ciliumcli_binary": "https://github.com/cilium/cilium-cli/releases/download/{version}/cilium-{os}-{arch}.tar.gz.sha256sum",
"cni_binary": "https://github.com/containernetworking/plugins/releases/download/{version}/cni-plugins-{os}-{arch}-{version}.tgz.sha256",
"containerd_archive": "https://github.com/containerd/containerd/releases/download/v{version}/containerd-{version}-{os}-{arch}.tar.gz.sha256sum",
"crictl": "https://github.com/kubernetes-sigs/cri-tools/releases/download/{version}/critest-{version}-{os}-{arch}.tar.gz.sha256",
"crio_archive": "https://storage.googleapis.com/cri-o/artifacts/cri-o.{arch}.{version}.tar.gz.sha256sum",
"etcd_binary": "https://github.com/etcd-io/etcd/releases/download/{version}/SHA256SUMS",
"kubeadm": "https://dl.k8s.io/release/{version}/bin/linux/{arch}/kubeadm.sha256",
"kubectl": "https://dl.k8s.io/release/{version}/bin/linux/{arch}/kubectl.sha256",
"kubelet": "https://dl.k8s.io/release/{version}/bin/linux/{arch}/kubelet.sha256",
"nerdctl_archive": "https://github.com/containerd/nerdctl/releases/download/v{version}/SHA256SUMS",
"runc": "https://github.com/opencontainers/runc/releases/download/{version}/runc.sha256sum",
"skopeo_binary": "https://github.com/lework/skopeo-binary/releases/download/{version}/skopeo-{os}-{arch}.sha256",
"yq": "https://github.com/mikefarah/yq/releases/download/{version}/checksums-bsd", # see https://github.com/mikefarah/yq/pull/1691 for why we use this url
}
# TODO: downloads not supported
# youki: no checkusms in releases
# kata: no checksums in releases
# gvisor: sha512 checksums
# crun : PGP signatures
# cri_dockerd: no checksums or signatures
# helm_archive: PGP signatures
# krew_archive: different yaml structure
# calico_crds_archive: different yaml structure

# TODO:
# noarch support -> k8s manifests, helm charts
# different checksum format (needs download role changes)
# different verification methods (gpg, cosign) ( needs download role changes) (or verify the sig in this script and only use the checksum in the playbook)
# perf improvements (async)

def download_hash(only_downloads: [str]) -> None:
# Handle file with multiples hashes, with various formats.
# the lambda is expected to produce a dictionary of hashes indexed by arch name
download_hash_extract = {
"calicoctl_binary": lambda hashes : {
line.split('-')[-1] : line.split()[0]
for line in hashes.strip().split('\n')
if line.count('-') == 2 and line.split('-')[-2] == "linux"
},
"etcd_binary": lambda hashes : {
line.split('-')[-1].removesuffix('.tar.gz') : line.split()[0]
for line in hashes.strip().split('\n')
if line.split('-')[-2] == "linux"
},
"nerdctl_archive": lambda hashes : {
line.split()[1].removesuffix('.tar.gz').split('-')[3] : line.split()[0]
for line in hashes.strip().split('\n')
if [x for x in line.split(' ') if x][1].split('-')[2] == "linux"
},
"runc": lambda hashes : {
parts[1].split('.')[1] : parts[0]
for parts in (line.split()
for line in hashes.split('\n')[3:9])
},
"yq": lambda rhashes_bsd : {
pair[0].split('_')[-1] : pair[1]
# pair = (yq_<os>_<arch>, <hash>)
for pair in ((line.split()[1][1:-1], line.split()[3])
for line in rhashes_bsd.splitlines()
if line.startswith("SHA256"))
if pair[0].startswith("yq")
and pair[0].split('_')[1] == "linux"
and not pair[0].endswith(".tar.gz")
},
}

data, yaml = open_checksums_yaml()
if not minors:
minors = {'.'.join(minor.split('.')[:-1]) for minor in data["kubelet_checksums"]["amd64"].keys()}

for download in downloads:
s = requests.Session()

@cache
def _get_hash_by_arch(download: str, version: str) -> {str: str}:

hash_file = s.get(downloads[download].format(
version = version,
os = "linux",
),
allow_redirects=True)
if hash_file.status_code == 404:
print(f"Unable to find {download} hash file for version {version} at {hash_file.url}")
return None
hash_file.raise_for_status()
return download_hash_extract[download](hash_file.content.decode())

for download, url in (downloads if only_downloads == []
else {k:downloads[k] for k in downloads.keys() & only_downloads}).items():
Comment on lines +118 to +119
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Having such complex expression directly in the for loop makes it super hard to read IMO, could you refactor this in a few lines before the for loop?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You mean something like for instance ?

Suggested change
for download, url in (downloads if only_downloads == []
else {k:downloads[k] for k in downloads.keys() & only_downloads}).items():
if only_downloads != []:
downloads = {k:downloads[k] for k in downloads.keys() & only_downloads}
for download, url in downloads.items():

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep 👍

Copy link
Member

@MrFreezeex MrFreezeex Sep 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

btw maybe you have a good reason for it but in general I think this:

if not only_downloads:

instead of:

if only_downloads != []:

is slightly more idiomatic (although it's not technically checking the same thing strictly speaking) but that's more a nit feel free to do what you prefer

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not a specific reason, it's just I tend to avoid "implicit conversion to boolean" even in Python, I find it confusing (for later)

checksum_name = f"{download}_checksums"
data[checksum_name] = defaultdict(dict, data[checksum_name])
for arch in architectures:
for minor in minors:
if not minor.startswith("v"):
minor = f"v{minor}"
for release in (f"{minor}.{patch}" for patch in count(start=0, step=1)):
if release in data[checksum_name][arch]:
# Propagate new patch versions to all architectures
for arch in data[checksum_name].values():
for arch2 in data[checksum_name].values():
arch.update({
v:("NONE" if arch2[v] == "NONE" else 0)
for v in (set(arch2.keys()) - set(arch.keys()))
if v.split('.')[2] == '0'})
# this is necessary to make the script indempotent,
# by only adding a vX.X.0 version (=minor release) in each arch
# and letting the rest of the script populate the potential
# patch versions

for arch, versions in data[checksum_name].items():
for minor, patches in groupby(versions.copy().keys(), lambda v : '.'.join(v.split('.')[:-1])):
for version in (f"{minor}.{patch}" for patch in
count(start=int(max(patches, key=version_compare).split('.')[-1]),
step=1)):
# Those barbaric generators do the following:
# Group all patches versions by minor number, take the newest and start from that
# to find new versions
Comment on lines +134 to +140
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could these generators be independent functions? it would probably be much more readable that way IMO

Copy link
Contributor Author

@VannTen VannTen Sep 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hum, I might be able to convert this two loops into one, since I don't use minor of patches expect in the sub-loop, and then put that in a distinct fonction.

if version in versions and versions[version] != 0:
continue
hash_file = requests.get(f"https://dl.k8s.io/release/{release}/bin/linux/{arch}/{download}.sha256", allow_redirects=True)
if hash_file.status_code == 404:
print(f"Unable to find {download} hash file for release {release} (arch: {arch})")
break
hash_file.raise_for_status()
sha256sum = hash_file.content.decode().strip()
if download in download_hash_extract:
hashes = _get_hash_by_arch(download, version)
if hashes == None:
break
sha256sum = hashes.get(arch)
if sha256sum == None:
break
else:
hash_file = s.get(downloads[download].format(
version = version,
os = "linux",
arch = arch
),
allow_redirects=True)
if hash_file.status_code == 404:
print(f"Unable to find {download} hash file for version {version} (arch: {arch}) at {hash_file.url}")
break
hash_file.raise_for_status()
sha256sum = hash_file.content.decode().split()[0]
Comment on lines +143 to +161
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel like there is some unnecessary duplicated code here and you could find a way to do the file download/error checking in a common function and/or still rely on _get_hash_by_arch and handle the if download in download_hash_extract there somehow

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comes from the need to cache memoize _get_hash_by_arch (to avoid redundant HTTP calls). functools.cache memoize by parameters, so I can't pass arch to _get_hash_by_arch because it would break the memoization. OTOH, I need it for "simple hash" download .

I could extract the GET + check 404 + raise in it's own little function though 🤔

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ahh ok I see, hmm well if you find some elegant way to not duplicate this piece of code that could be nice but otherwise feel free to resolve that comment then


if len(sha256sum) != 64:
raise Exception(f"Checksum has an unexpected length: {len(sha256sum)} (binary: {download}, arch: {arch}, release: 1.{minor}.{patch})")
data[checksum_name][arch][release] = sha256sum
raise Exception(f"Checksum has an unexpected length: {len(sha256sum)} (binary: {download}, arch: {arch}, release: {version}, checksum: '{sha256sum}')")
data[checksum_name][arch][version] = sha256sum
data[checksum_name] = {arch : {r : releases[r] for r in sorted(releases.keys(),
key=lambda v : Version(v[1:]),
key=version_compare,
reverse=True)}
for arch, releases in data[checksum_name].items()}

with open(CHECKSUMS_YML, "w") as checksums_yml:
yaml.dump(data, checksums_yml)
print(f"\n\nUpdated {CHECKSUMS_YML}\n")


def usage():
print(f"USAGE:\n {sys.argv[0]} [k8s_version1] [[k8s_version2]....[k8s_versionN]]")


def main(argv=None):
download_hash(sys.argv[1:])
return 0


if __name__ == "__main__":
sys.exit(main())
parser = argparse.ArgumentParser(description=f"Add new patch versions hashes in {CHECKSUMS_YML}",
formatter_class=argparse.RawTextHelpFormatter,
epilog=f"""
This script only lookup new patch versions relative to those already existing
in the data in {CHECKSUMS_YML},
which means it won't add new major or minor versions.
In order to add one of these, edit {CHECKSUMS_YML}
by hand, adding the new versions with a patch number of 0 (or the lowest relevant patch versions)
; then run this script.

Note that the script will try to add the versions on all
architecture keys already present for a given download target.

The '0' value for a version hash is treated as a missing hash, so the script will try to download it again.
To notify a non-existing version (yanked, or upstream does not have monotonically increasing versions numbers),
use the special value 'NONE'.

EXAMPLES:

crictl_checksums:
...
amd64:
+ v1.30.0: 0
v1.29.0: d16a1ffb3938f5a19d5c8f45d363bd091ef89c0bc4d44ad16b933eede32fdcbb
v1.28.0: 8dc78774f7cbeaf787994d386eec663f0a3cf24de1ea4893598096cb39ef2508"""

)
parser.add_argument('binaries', nargs='*', choices=downloads.keys())

args = parser.parse_args()
download_hash(args.binaries)