Skip to content

Commit

Permalink
Merge pull request #71 from konflux-ci/cache-rpmdb
Browse files Browse the repository at this point in the history
Add caching for downloaded rpmdb
  • Loading branch information
lubomir authored Nov 19, 2024
2 parents a36b096 + cba1afd commit 3a29832
Show file tree
Hide file tree
Showing 5 changed files with 302 additions and 32 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,16 @@

## Unreleased

### Added

- Caching for extracted rpm databases has been added. This avoids the need to
repeatedly download the same image if resolving runs multiple times on the
same base image. The cached data is stored in
`~/.cache/rpm-lockfile-prototype/rpmdbs`, and there is no automatic cleanup.

  Once the filesystem holding the cache is more than 80 % full, no new data
  will be cached.

### Changed

- Repositories can now use `mirrorlist` or `metalink` instead of just `baseurl`.
Expand Down
86 changes: 74 additions & 12 deletions rpm_lockfile/containers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import json
import logging
import os
import re
import shutil
import subprocess
import tarfile
import tempfile
from pathlib import Path
Expand All @@ -10,12 +13,12 @@
# Known locations for rpmdb inside the image.
RPMDB_PATHS = ["usr/lib/sysimage/rpm", "var/lib/rpm"]

CACHE_PATH = Path.home() / ".cache" / "rpm-lockfile-prototype" / "rpmdbs"

def _translate_arch(arch):
# This is a horrible hack. Skopeo will reject x86_64, but is happy with
# amd64. The same goes for aarch64 -> arm64.
ARCHES = {"aarch64": "arm64", "x86_64": "amd64"}
return ARCHES.get(arch, arch)
# Storage usage limit. If the filesystem with the cache fills up over this
# limit, nothing new will be added into the cache.
# Value in percent.
USAGE_THRESHOLD = 80


def _copy_image(baseimage, arch, destdir):
Expand All @@ -24,14 +27,47 @@ def _copy_image(baseimage, arch, destdir):
"skopeo",
f"--override-arch={arch}",
"copy",
f"docker://{utils.strip_tag(baseimage)}",
f"docker://{baseimage}",
f"dir:{destdir}",
]
utils.logged_run(cmd, check=True)


def setup_rpmdb(cache_dir, baseimage, arch):
arch = _translate_arch(arch)
def setup_rpmdb(dest_dir, baseimage, arch):
    """
    Extract rpmdb from `baseimage` for `arch` to `dest_dir`.
    """
    repo, _, digest = utils.split_image(baseimage)

    if not digest:
        # The pull spec is not pinned yet; ask the registry which digest the
        # reference currently resolves to.
        digest = utils.inspect_image(baseimage, arch)["Digest"]

    # Build a pull spec that contains only the digest (any tag is dropped).
    # Pulling by the same digest that keys the cache avoids a race condition
    # where the image is updated between the `skopeo inspect` and
    # `skopeo copy` calls.
    pinned_image = utils.make_image_spec(repo, None, digest)

    # Cache entries are stored per architecture; within one architecture the
    # digest uniquely identifies the image.
    cached_rpmdb = CACHE_PATH / arch / digest

    if cached_rpmdb.exists():
        logging.info("Using already downloaded rpmdb")
    else:
        # Nothing cached for this image yet: download it and extract the
        # rpmdb into the cache.
        _online_setup_rpmdb(cached_rpmdb, pinned_image, arch)

    # Hand a copy of the cached data to the caller's destination directory.
    shutil.copytree(cached_rpmdb, dest_dir, dirs_exist_ok=True)

    # Drop the cache entry again if the filesystem is running out of space.
    _maybe_cleanup(cached_rpmdb)


def _online_setup_rpmdb(dest_dir, baseimage, arch):
arch = utils.translate_arch(arch)

with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
Expand Down Expand Up @@ -59,7 +95,7 @@ def filter_rpmdb(member, path):
# ...find all files in interesting locations and extract them to
# the destination cache.
archive = tarfile.open(tmpdir / digest)
archive.extractall(path=cache_dir, filter=filter_rpmdb)
archive.extractall(path=dest_dir, filter=filter_rpmdb)

if dbpaths and utils.RPMDB_PATH not in dbpaths:
# If we have at least one possible rpmdb location populated by the
Expand All @@ -73,10 +109,36 @@ def filter_rpmdb(member, path):
dbpath = dbpaths.pop()
logging.debug("Creating rpmdb symlink %s -> %s", utils.RPMDB_PATH, dbpath)
os.makedirs(
os.path.dirname(os.path.join(cache_dir, utils.RPMDB_PATH)),
os.path.dirname(os.path.join(dest_dir, utils.RPMDB_PATH)),
exist_ok=True,
)
os.symlink(
os.path.join(cache_dir, dbpath),
os.path.join(cache_dir, utils.RPMDB_PATH),
os.path.join(dest_dir, dbpath),
os.path.join(dest_dir, utils.RPMDB_PATH),
)


def _maybe_cleanup(directory):
    """Check if there's enough free space on the filesystem with given
    directory. If not, delete the directory.

    Does nothing when the usage can not be determined.
    """
    usage = _get_storage_usage(directory)
    if usage is not None and usage >= USAGE_THRESHOLD:
        # "%%" renders a literal percent sign; a bare "%" here would start a
        # second conversion specifier and make the log call fail to format.
        logging.info("Storage is %d %% full. Cleaning up cached rpmdb.", usage)
        shutil.rmtree(directory)


def _get_storage_usage(directory):
"""Return disk usage of filesystem with given directory as an integer
representing percentage. Returns None on failure.
"""
cp = subprocess.run(
["df", "--output=pcent", directory], stdout=subprocess.PIPE, text=True
)
if cp.returncode != 0:
logging.debug("Failed to check free storage size...")
else:
m = re.search(r"\b(\d+)%", cp.stdout)
if m:
return int(m.group(1))
return None
59 changes: 46 additions & 13 deletions rpm_lockfile/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import functools
import hashlib
import json
import logging
Expand Down Expand Up @@ -93,15 +94,26 @@ def subst_vars(template, vars):
return template


def translate_arch(arch):
    """Translate an RPM architecture name to the name skopeo expects.

    This is a horrible hack: skopeo rejects x86_64 but is happy with amd64,
    and the same goes for aarch64 -> arm64. Any other value is passed
    through unchanged.
    """
    return {"aarch64": "arm64", "x86_64": "amd64"}.get(arch, arch)


@functools.lru_cache
def inspect_image(image_spec, arch=None):
    """Return parsed JSON metadata from `skopeo inspect` for the image.

    When `arch` is given, metadata for that architecture is requested.
    Results are memoized, so the registry is queried at most once per
    (image_spec, arch) pair.
    """
    command = ["skopeo"]
    if arch:
        command += [f"--override-arch={translate_arch(arch)}"]
    command += ["inspect", f"docker://{strip_tag(image_spec)}"]
    result = logged_run(command, stdout=subprocess.PIPE, check=True)
    return json.loads(result.stdout)


def _get_image_labels(image_spec):
    """Given an image specification, return a dict with labels from the image."""
    # Delegates to the cached inspect_image helper so repeated lookups of the
    # same image do not hit the registry again.
    return inspect_image(image_spec)["Labels"]


def _get_containerfile_labels(containerfile, config_dir):
Expand All @@ -120,6 +132,28 @@ def _get_containerfile_labels(containerfile, config_dir):
return _get_image_labels(extract_image(os.path.join(config_dir, fp), **filters))


def split_image(image_spec):
    """Split an image pull spec into a (repository, tag, digest) tuple.

    Missing components are returned as None. The digest is returned without
    the leading "@" and the tag without the leading ":", so the parts can be
    fed back into `make_image_spec` unchanged. We don't want to validate the
    digest here in any way, so even a wrong length is accepted.

    Raises RuntimeError if the spec can not be parsed at all.
    """
    # The repository may itself contain a colon (registry port, e.g.
    # localhost:5000/img), so a tag is only a colon-delimited suffix with no
    # "/" in it. The lazy repository group lets the optional tag/digest
    # groups claim their suffixes when present.
    m = re.match(r"([^@]+?)(:[^:/@]+)?(@sha\d+:[a-f0-9]+)?$", image_spec)
    if not m:
        raise RuntimeError(f"Unknown format for image specification: {image_spec}")
    repo, tag, digest = m.groups()
    return (
        repo,
        tag[1:] if tag else None,
        digest[1:] if digest else None,
    )


def make_image_spec(repo, tag, digest):
    """Assemble an image pull spec from its parts.

    Both `tag` and `digest` are optional; falsy values are simply left out
    of the resulting spec.
    """
    pieces = [repo]
    if tag:
        pieces.append(f":{tag}")
    if digest:
        pieces.append(f"@{digest}")
    return "".join(pieces)


def strip_tag(image_spec):
    """
    If the image specification contains both a tag and a digest, remove the
    tag and keep only the digest, since skopeo refuses to process a spec
    that carries both.

    https://github.com/containers/image/issues/1736
    """
    repo, tag, digest = split_image(image_spec)
    if digest:
        if tag:
            # Lazy %-formatting: the message is only rendered if this level
            # is actually emitted.
            logging.info("Digest was provided, ignoring tag %s", tag)
        return f"{repo}@{digest}"
    return image_spec


Expand Down
Loading

0 comments on commit 3a29832

Please sign in to comment.