Skip to content

Commit

Permalink
Merge pull request #71 from konflux-ci/cache-rpmdb
Browse files Browse the repository at this point in the history
Add caching for downloaded rpmdb
  • Loading branch information
lubomir authored Nov 19, 2024
2 parents a36b096 + cba1afd commit 3a29832
Show file tree
Hide file tree
Showing 5 changed files with 302 additions and 32 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,16 @@

## Unreleased

### Added

- Caching for extracted rpm databases has been added. This avoids the need to
repeatedly download the same image if resolving runs multiple times on the
same base image. The cached data is stored in
`~/.cache/rpm-lockfile-prototype/rpmdbs`, and there is no automatic cleanup.

  Once the filesystem holding the cache is more than 80 % full, no new data
  will be cached.

### Changed

- Repositories can now use `mirrorlist` or `metalink` instead of just `baseurl`.
Expand Down
86 changes: 74 additions & 12 deletions rpm_lockfile/containers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import json
import logging
import os
import re
import shutil
import subprocess
import tarfile
import tempfile
from pathlib import Path
Expand All @@ -10,12 +13,12 @@
# Known locations for rpmdb inside the image.
RPMDB_PATHS = ["usr/lib/sysimage/rpm", "var/lib/rpm"]

CACHE_PATH = Path.home() / ".cache" / "rpm-lockfile-prototype" / "rpmdbs"

def _translate_arch(arch):
# This is a horrible hack. Skopeo will reject x86_64, but is happy with
# amd64. The same goes for aarch64 -> arm64.
ARCHES = {"aarch64": "arm64", "x86_64": "amd64"}
return ARCHES.get(arch, arch)
# Storage usage limit. If the filesystem with the cache fills up over this
# limit, nothing new will be added into the cache.
# Value in percent.
USAGE_THRESHOLD = 80


def _copy_image(baseimage, arch, destdir):
Expand All @@ -24,14 +27,47 @@ def _copy_image(baseimage, arch, destdir):
"skopeo",
f"--override-arch={arch}",
"copy",
f"docker://{utils.strip_tag(baseimage)}",
f"docker://{baseimage}",
f"dir:{destdir}",
]
utils.logged_run(cmd, check=True)


def setup_rpmdb(cache_dir, baseimage, arch):
arch = _translate_arch(arch)
def setup_rpmdb(dest_dir, baseimage, arch):
    """
    Extract rpmdb from `baseimage` for `arch` to `dest_dir`.
    """
    repo, _, digest = utils.split_image(baseimage)

    if not digest:
        # The pull spec is not pinned yet; ask the registry which digest the
        # reference currently resolves to.
        digest = utils.inspect_image(baseimage, arch)["Digest"]

    # Build a pull spec that contains only the digest (any tag is dropped).
    # Pulling by the same digest that keys the cache avoids a race condition
    # where the image is updated between the `skopeo inspect` and
    # `skopeo copy` calls.
    pinned_image = utils.make_image_spec(repo, None, digest)

    # Cache entries are stored per architecture; within one architecture the
    # digest uniquely identifies the image.
    cached_rpmdb = CACHE_PATH / arch / digest

    if cached_rpmdb.exists():
        logging.info("Using already downloaded rpmdb")
    else:
        # Nothing cached for this image yet: download it and extract the
        # rpmdb into the cache.
        _online_setup_rpmdb(cached_rpmdb, pinned_image, arch)

    # Hand a copy of the cached data to the caller's destination directory.
    shutil.copytree(cached_rpmdb, dest_dir, dirs_exist_ok=True)

    # Drop the cache entry again if the filesystem is running out of space.
    _maybe_cleanup(cached_rpmdb)


def _online_setup_rpmdb(dest_dir, baseimage, arch):
arch = utils.translate_arch(arch)

with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
Expand Down Expand Up @@ -59,7 +95,7 @@ def filter_rpmdb(member, path):
# ...find all files in interesting locations and extract them to
# the destination cache.
archive = tarfile.open(tmpdir / digest)
archive.extractall(path=cache_dir, filter=filter_rpmdb)
archive.extractall(path=dest_dir, filter=filter_rpmdb)

if dbpaths and utils.RPMDB_PATH not in dbpaths:
# If we have at least one possible rpmdb location populated by the
Expand All @@ -73,10 +109,36 @@ def filter_rpmdb(member, path):
dbpath = dbpaths.pop()
logging.debug("Creating rpmdb symlink %s -> %s", utils.RPMDB_PATH, dbpath)
os.makedirs(
os.path.dirname(os.path.join(cache_dir, utils.RPMDB_PATH)),
os.path.dirname(os.path.join(dest_dir, utils.RPMDB_PATH)),
exist_ok=True,
)
os.symlink(
os.path.join(cache_dir, dbpath),
os.path.join(cache_dir, utils.RPMDB_PATH),
os.path.join(dest_dir, dbpath),
os.path.join(dest_dir, utils.RPMDB_PATH),
)


def _maybe_cleanup(directory):
    """Check if there's enough free space on the filesystem with given
    directory. If not, delete the directory.

    Does nothing when the usage can not be determined.
    """
    usage = _get_storage_usage(directory)
    if usage is not None and usage >= USAGE_THRESHOLD:
        # "%%" renders a literal percent sign; a bare "%" here would start a
        # second conversion specifier and make the log call fail to format.
        logging.info("Storage is %d %% full. Cleaning up cached rpmdb.", usage)
        shutil.rmtree(directory)


def _get_storage_usage(directory):
"""Return disk usage of filesystem with given directory as an integer
representing percentage. Returns None on failure.
"""
cp = subprocess.run(
["df", "--output=pcent", directory], stdout=subprocess.PIPE, text=True
)
if cp.returncode != 0:
logging.debug("Failed to check free storage size...")
else:
m = re.search(r"\b(\d+)%", cp.stdout)
if m:
return int(m.group(1))
return None
59 changes: 46 additions & 13 deletions rpm_lockfile/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import functools
import hashlib
import json
import logging
Expand Down Expand Up @@ -93,15 +94,26 @@ def subst_vars(template, vars):
return template


def translate_arch(arch):
    """Translate an RPM architecture name to the name skopeo expects.

    This is a horrible hack: skopeo rejects x86_64 but is happy with amd64,
    and the same goes for aarch64 -> arm64. Any other value is passed
    through unchanged.
    """
    return {"aarch64": "arm64", "x86_64": "amd64"}.get(arch, arch)


@functools.lru_cache
def inspect_image(image_spec, arch=None):
    """Return parsed JSON metadata from `skopeo inspect` for the image.

    When `arch` is given, metadata for that architecture is requested.
    Results are memoized, so the registry is queried at most once per
    (image_spec, arch) pair.
    """
    command = ["skopeo"]
    if arch:
        command += [f"--override-arch={translate_arch(arch)}"]
    command += ["inspect", f"docker://{strip_tag(image_spec)}"]
    result = logged_run(command, stdout=subprocess.PIPE, check=True)
    return json.loads(result.stdout)


def _get_image_labels(image_spec):
    """Given an image specification, return a dict with labels from the image."""
    # Delegates to the cached inspect_image helper so repeated lookups of the
    # same image do not hit the registry again.
    return inspect_image(image_spec)["Labels"]


def _get_containerfile_labels(containerfile, config_dir):
Expand All @@ -120,6 +132,28 @@ def _get_containerfile_labels(containerfile, config_dir):
return _get_image_labels(extract_image(os.path.join(config_dir, fp), **filters))


def split_image(image_spec):
    """Split an image pull spec into a (repository, tag, digest) tuple.

    Missing components are returned as None. The digest is returned without
    the leading "@" and the tag without the leading ":", so the parts can be
    fed back into `make_image_spec` unchanged. We don't want to validate the
    digest here in any way, so even a wrong length is accepted.

    Raises RuntimeError if the spec can not be parsed at all.
    """
    # The repository may itself contain a colon (registry port, e.g.
    # localhost:5000/img), so a tag is only a colon-delimited suffix with no
    # "/" in it. The lazy repository group lets the optional tag/digest
    # groups claim their suffixes when present.
    m = re.match(r"([^@]+?)(:[^:/@]+)?(@sha\d+:[a-f0-9]+)?$", image_spec)
    if not m:
        raise RuntimeError(f"Unknown format for image specification: {image_spec}")
    repo, tag, digest = m.groups()
    return (
        repo,
        tag[1:] if tag else None,
        digest[1:] if digest else None,
    )


def make_image_spec(repo, tag, digest):
    """Assemble an image pull spec from its parts.

    Both `tag` and `digest` are optional; falsy values are simply left out
    of the resulting spec.
    """
    pieces = [repo]
    if tag:
        pieces.append(f":{tag}")
    if digest:
        pieces.append(f"@{digest}")
    return "".join(pieces)


def strip_tag(image_spec):
    """
    If the image specification contains both a tag and a digest, remove the
    tag and keep only the digest, since skopeo refuses to process a spec
    that carries both.

    https://github.com/containers/image/issues/1736
    """
    repo, tag, digest = split_image(image_spec)
    if digest:
        if tag:
            # Lazy %-formatting: the message is only rendered if this level
            # is actually emitted.
            logging.info("Digest was provided, ignoring tag %s", tag)
        return f"{repo}@{digest}"
    return image_spec


Expand Down
Loading

0 comments on commit 3a29832

Please sign in to comment.