Skip to content

Commit

Permalink
Add support for zstd compression.
Browse files Browse the repository at this point in the history
Squashed version of PR conan-io#14706 as of 03/10/2024.
  • Loading branch information
bentonj-omnissa committed Oct 7, 2024
1 parent 66008be commit 58c6e11
Show file tree
Hide file tree
Showing 8 changed files with 163 additions and 40 deletions.
69 changes: 44 additions & 25 deletions conan/internal/api/uploader.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
import fnmatch
import os
import shutil
import tarfile
import time

from conan.internal.conan_app import ConanApp
from conan.api.output import ConanOutput
from conans.client.source import retrieve_exports_sources
from conans.errors import ConanException, NotFoundException
from conan.internal.paths import (CONAN_MANIFEST, CONANFILE, EXPORT_SOURCES_TGZ_NAME,
EXPORT_TGZ_NAME, PACKAGE_TGZ_NAME, CONANINFO)
EXPORT_TGZ_NAME, PACKAGE_TGZ_NAME, PACKAGE_TZSTD_NAME, CONANINFO)
from conans.util.files import (clean_dirty, is_dirty, gather_files,
gzopen_without_timestamps, set_dirty_context_manager, mkdir,
human_size)
human_size, tar_zst_compress)

UPLOAD_POLICY_FORCE = "force-upload"
UPLOAD_POLICY_SKIP = "skip-upload"
Expand Down Expand Up @@ -173,11 +174,24 @@ def _prepare_package(self, pref, prev_bundle):
def _compress_package_files(self, layout, pref):
output = ConanOutput(scope=str(pref))
download_pkg_folder = layout.download_package()
package_tgz = os.path.join(download_pkg_folder, PACKAGE_TGZ_NAME)
if is_dirty(package_tgz):
output.warning("Removing %s, marked as dirty" % PACKAGE_TGZ_NAME)
os.remove(package_tgz)
clean_dirty(package_tgz)

compression_format = self._global_conf.get("core.upload:compression_format",
default="gzip")
if compression_format == "gzip":
compress_level_config = "core.gzip:compresslevel"
package_file_name = PACKAGE_TGZ_NAME
package_file = os.path.join(download_pkg_folder, PACKAGE_TGZ_NAME)
elif compression_format == "zstd":
compress_level_config = "core.zstd:compresslevel"
package_file_name = PACKAGE_TZSTD_NAME
package_file = os.path.join(download_pkg_folder, PACKAGE_TZSTD_NAME)
else:
raise ConanException(f"Unsupported compression format '{compression_format}'")

if is_dirty(package_file):
output.warning(f"Removing {package_file_name}, marked as dirty")
os.remove(package_file)
clean_dirty(package_file)

# Get all the files in that directory
# existing package, will use short paths if defined
Expand All @@ -198,15 +212,17 @@ def _compress_package_files(self, layout, pref):
files.pop(CONANINFO)
files.pop(CONAN_MANIFEST)

if not os.path.isfile(package_tgz):
if not os.path.isfile(package_file):
tgz_files = {f: path for f, path in files.items()}
compresslevel = self._global_conf.get("core.gzip:compresslevel", check_type=int)
tgz_path = compress_files(tgz_files, PACKAGE_TGZ_NAME, download_pkg_folder,
compresslevel=compresslevel, ref=pref)
assert tgz_path == package_tgz
assert os.path.exists(package_tgz)
compresslevel = self._global_conf.get(compress_level_config, check_type=int)
compressed_path = compress_files(tgz_files, package_file_name, download_pkg_folder,
compresslevel=compresslevel, compressformat=compression_format,
ref=pref)

return {PACKAGE_TGZ_NAME: package_tgz,
assert compressed_path == package_file
assert os.path.exists(package_file)

return {package_file_name: package_file,
CONANINFO: os.path.join(download_pkg_folder, CONANINFO),
CONAN_MANIFEST: os.path.join(download_pkg_folder, CONAN_MANIFEST)}

Expand Down Expand Up @@ -254,22 +270,25 @@ def upload_package(self, pref, prev_bundle, remote):
output.debug(f"Upload {pref} in {duration} time")


def compress_files(files, name, dest_dir, compresslevel=None, ref=None):
def compress_files(files, name, dest_dir, compressformat=None, compresslevel=None, ref=None):
t1 = time.time()
# FIXME, better write to disk sequentially and not keep tgz contents in memory
tgz_path = os.path.join(dest_dir, name)
tar_path = os.path.join(dest_dir, name)
ConanOutput(scope=str(ref)).info(f"Compressing {name}")
with set_dirty_context_manager(tgz_path), open(tgz_path, "wb") as tgz_handle:
tgz = gzopen_without_timestamps(name, mode="w", fileobj=tgz_handle,
compresslevel=compresslevel)
for filename, abs_path in sorted(files.items()):
# recursive is False in case it is a symlink to a folder
tgz.add(abs_path, filename, recursive=False)
tgz.close()

if compressformat == "zstd":
tar_zst_compress(tar_path, files, compresslevel=compresslevel)
else:
with set_dirty_context_manager(tar_path), open(tar_path, "wb") as tgz_handle:
tgz = gzopen_without_timestamps(name, mode="w", fileobj=tgz_handle,
compresslevel=compresslevel)
for filename, abs_path in sorted(files.items()):
# recursive is False in case it is a symlink to a folder
tgz.add(abs_path, filename, recursive=False)
tgz.close()

duration = time.time() - t1
ConanOutput().debug(f"{name} compressed in {duration} time")
return tgz_path
return tar_path


def _total_size(cache_files):
Expand Down
1 change: 1 addition & 0 deletions conan/internal/paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ def _user_home_from_conanrc_file():
CONAN_MANIFEST = "conanmanifest.txt"
CONANINFO = "conaninfo.txt"
PACKAGE_TGZ_NAME = "conan_package.tgz"
PACKAGE_TZSTD_NAME = "conan_package.tzst"
EXPORT_TGZ_NAME = "conan_export.tgz"
EXPORT_SOURCES_TGZ_NAME = "conan_sources.tgz"
DATA_YML = "conandata.yml"
20 changes: 15 additions & 5 deletions conans/client/remote_manager.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import os
import shutil
import tarfile
import time
from typing import List

from requests.exceptions import ConnectionError
Expand All @@ -16,7 +18,7 @@
from conans.model.package_ref import PkgReference
from conans.model.recipe_ref import RecipeReference
from conans.util.files import rmdir, human_size
from conan.internal.paths import EXPORT_SOURCES_TGZ_NAME, EXPORT_TGZ_NAME, PACKAGE_TGZ_NAME
from conan.internal.paths import EXPORT_SOURCES_TGZ_NAME, EXPORT_TGZ_NAME, PACKAGE_TGZ_NAME, PACKAGE_TZSTD_NAME
from conans.util.files import mkdir, tar_extract


Expand Down Expand Up @@ -164,14 +166,21 @@ def _get_package(self, layout, pref, remote, scoped_output, metadata):
metadata, only_metadata=False)
zipped_files = {k: v for k, v in zipped_files.items() if not k.startswith(METADATA)}
# quick server package integrity check:
for f in ("conaninfo.txt", "conanmanifest.txt", "conan_package.tgz"):
for f in ("conaninfo.txt", "conanmanifest.txt"):
if f not in zipped_files:
raise ConanException(f"Corrupted {pref} in '{remote.name}' remote: no {f}")
accepted_package_files = [PACKAGE_TZSTD_NAME, PACKAGE_TGZ_NAME]
package_file = next((f for f in zipped_files if f in accepted_package_files), None)
if not package_file:
raise ConanException(f"Corrupted {pref} in '{remote.name}' remote: no {accepted_package_files} found")
self._signer.verify(pref, download_pkg_folder, zipped_files)

tgz_file = zipped_files.pop(PACKAGE_TGZ_NAME, None)
package_file = zipped_files.pop(package_file, None)
package_folder = layout.package()
uncompress_file(tgz_file, package_folder, scope=str(pref.ref))
t1 = time.time()
uncompress_file(package_file, package_folder, scope=str(pref.ref))
duration = time.time() - t1
scoped_output.debug(f"Decompressed {package_file} in {duration} seconds")
mkdir(package_folder) # Just in case it doesn't exist, because uncompress did nothing
for file_name, file_path in zipped_files.items(): # copy CONANINFO and CONANMANIFEST
shutil.move(file_path, os.path.join(package_folder, file_name))
Expand Down Expand Up @@ -276,7 +285,8 @@ def uncompress_file(src_path, dest_folder, scope=None):
hs = human_size(filesize)
ConanOutput(scope=scope).info(f"Decompressing {hs} {os.path.basename(src_path)}")
with open(src_path, mode='rb') as file_handler:
tar_extract(file_handler, dest_folder)
tar_extract(file_handler, dest_folder,
is_tar_zst=src_path.endswith((".tar.zst", ".tzst")))
except Exception as e:
error_msg = "Error while extracting downloaded file '%s' to %s\n%s\n"\
% (src_path, dest_folder, str(e))
Expand Down
10 changes: 7 additions & 3 deletions conans/client/rest/rest_client_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from conans.errors import ConanException, NotFoundException, PackageNotFoundException, \
RecipeNotFoundException, AuthenticationException, ForbiddenException, EXCEPTION_CODE_MAPPING
from conans.model.package_ref import PkgReference
from conan.internal.paths import EXPORT_SOURCES_TGZ_NAME
from conan.internal.paths import EXPORT_SOURCES_TGZ_NAME, PACKAGE_TGZ_NAME, PACKAGE_TZSTD_NAME
from conans.model.recipe_ref import RecipeReference
from conans.util.dates import from_iso8601_to_timestamp
from conans.util.thread import ExceptionThread
Expand Down Expand Up @@ -296,8 +296,12 @@ def get_package(self, pref, dest_folder, metadata, only_metadata):
result = {}
# Download only known files, but not metadata (except sign)
if not only_metadata: # Retrieve package first, then metadata
accepted_files = ["conaninfo.txt", "conan_package.tgz", "conanmanifest.txt",
"metadata/sign"]
accepted_package_files = [PACKAGE_TZSTD_NAME, PACKAGE_TGZ_NAME]
accepted_files = ["conaninfo.txt", "conanmanifest.txt", "metadata/sign"]
for f in accepted_package_files:
if f in server_files:
accepted_files = [f] + accepted_files
break
files = [f for f in server_files if any(f.startswith(m) for m in accepted_files)]
# If we didn't indicated reference, server got the latest, use absolute now, it's safer
urls = {fn: self.router.package_file(pref, fn) for fn in files}
Expand Down
5 changes: 4 additions & 1 deletion conans/model/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,11 @@
"core.net.http:cacert_path": "Path containing a custom Cacert file",
"core.net.http:client_cert": "Path or tuple of files containing a client cert (and key)",
"core.net.http:clean_system_proxy": "If defined, the proxies system env-vars will be discarded",
# Gzip compression
# Compression for `conan upload`
"core.upload:compression_format": "The compression format used when uploading Conan packages. "
"Possible values: 'zstd', 'gzip' (default=gzip)",
"core.gzip:compresslevel": "The Gzip compression level for Conan artifacts (default=9)",
"core.zstd:compresslevel": "The zstd compression level for Conan artifacts (default=3)",
# Excluded from revision_mode = "scm" dirty and Git().is_dirty() checks
"core.scm:excluded": "List of excluded patterns for builtin git dirty checks",
"core.scm:local_url": "By default allows to store local folders as remote url, but not upload them. Use 'allow' for allowing upload and 'block' to completely forbid it",
Expand Down
4 changes: 2 additions & 2 deletions conans/model/manifest.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
from collections import defaultdict

from conan.internal.paths import CONAN_MANIFEST, EXPORT_SOURCES_TGZ_NAME, EXPORT_TGZ_NAME, PACKAGE_TGZ_NAME
from conan.internal.paths import CONAN_MANIFEST, EXPORT_SOURCES_TGZ_NAME, EXPORT_TGZ_NAME, PACKAGE_TGZ_NAME, PACKAGE_TZSTD_NAME
from conans.util.dates import timestamp_now, timestamp_to_str
from conans.util.files import load, md5, md5sum, save, gather_files

Expand Down Expand Up @@ -91,7 +91,7 @@ def create(cls, folder, exports_sources_folder=None):
"""
files, _ = gather_files(folder)
# The folders symlinks are discarded for the manifest
for f in (PACKAGE_TGZ_NAME, EXPORT_TGZ_NAME, CONAN_MANIFEST, EXPORT_SOURCES_TGZ_NAME):
for f in (PACKAGE_TGZ_NAME, PACKAGE_TZSTD_NAME, EXPORT_TGZ_NAME, CONAN_MANIFEST, EXPORT_SOURCES_TGZ_NAME):
files.pop(f, None)

file_dict = {}
Expand Down
62 changes: 60 additions & 2 deletions conans/util/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@
import tarfile
import time

try:
import zstandard
zstandard_exception = None
except ImportError as e:
zstandard_exception = e

from contextlib import contextmanager


Expand Down Expand Up @@ -291,15 +297,67 @@ def gzopen_without_timestamps(name, mode="r", fileobj=None, compresslevel=None,
return t


def tar_extract(fileobj, destination_dir):
the_tar = tarfile.open(fileobj=fileobj)
def raise_if_zstandard_not_present(operation):
if zstandard_exception:
raise ConanException(
f"zstandard {operation} was requested, but the required package is not present. "
f"Please install it using 'pip install zstandard' and try again. "
f"Exception details: {zstandard_exception}")


def tar_zst_compress(tar_path, files, compresslevel=None):
raise_if_zstandard_not_present("compression")

with open(tar_path, "wb") as tarfile_obj:
# Only provide level if it was overridden by config.
zstd_kwargs = {}
if compresslevel is not None:
zstd_kwargs["level"] = compresslevel

dctx = zstandard.ZstdCompressor(write_checksum=True, threads=-1, **zstd_kwargs)

# Create a zstd stream writer so tarfile writes uncompressed data to
# the zstd stream writer, which in turn writes compressed data to the
# output tar.zst file.
with dctx.stream_writer(tarfile_obj) as stream_writer:
# The choice of bufsize=32768 comes from profiling compression at various
# values and finding that bufsize value consistently performs well.
# The variance in compression times at bufsize<=64KB is small. It is only
# when bufsize>=128KB that compression times start increasing.
with tarfile.open(mode="w|", fileobj=stream_writer, bufsize=32768,
format=tarfile.PAX_FORMAT) as tar:
current_frame_bytes = 0
for filename, abs_path in sorted(files.items()):
tar.add(abs_path, filename, recursive=False)

# Flush the current frame if it has reached a large enough size.
# There is no required size, but 128MB is a good starting point
# because it allows for faster random access to the file.
current_frame_bytes += os.path.getsize(abs_path)
if current_frame_bytes >= 134217728:
stream_writer.flush(zstandard.FLUSH_FRAME)
current_frame_bytes = 0


def tar_extract(fileobj, destination_dir, is_tar_zst=False):
if is_tar_zst:
raise_if_zstandard_not_present("decompression")
dctx = zstandard.ZstdDecompressor()
stream_reader = dctx.stream_reader(fileobj)
the_tar = tarfile.open(fileobj=stream_reader, bufsize=32768, mode="r|")
else:
the_tar = tarfile.open(fileobj=fileobj)

# NOTE: The errorlevel=2 has been removed because it was failing in Win10, it didn't allow to
# "could not change modification time", with time=0
# the_tar.errorlevel = 2 # raise exception if any error
the_tar.extraction_filter = (lambda member, path: member) # fully_trusted, avoid Py3.14 break
the_tar.extractall(path=destination_dir)
the_tar.close()

if is_tar_zst:
stream_reader.close()


def exception_message_safe(exc):
try:
Expand Down
32 changes: 30 additions & 2 deletions test/unittests/client/remote_manager_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,15 @@
import unittest

from conan.internal.api.uploader import compress_files
from conan.internal.paths import PACKAGE_TGZ_NAME
from conans.client.remote_manager import uncompress_file
from conan.internal.paths import PACKAGE_TGZ_NAME, PACKAGE_TZSTD_NAME
from conan.test.utils.test_files import temp_folder
from conans.util.files import save


class RemoteManagerTest(unittest.TestCase):

def test_compress_files(self):
def test_compress_files_tgz(self):
folder = temp_folder()
save(os.path.join(folder, "one_file.txt"), "The contents")
save(os.path.join(folder, "Two_file.txt"), "Two contents")
Expand All @@ -23,3 +24,30 @@ def test_compress_files(self):
self.assertTrue(os.path.exists(path))
expected_path = os.path.join(folder, PACKAGE_TGZ_NAME)
self.assertEqual(path, expected_path)

def test_compress_and_uncompress_zst_files(self):
folder = temp_folder()
save(os.path.join(folder, "one_file.txt"), "The contents")
save(os.path.join(folder, "Two_file.txt"), "Two contents")

files = {
"one_file.txt": os.path.join(folder, "one_file.txt"),
"Two_file.txt": os.path.join(folder, "Two_file.txt"),
}

path = compress_files(files, PACKAGE_TZSTD_NAME, dest_dir=folder, compressformat="zstd")
self.assertTrue(os.path.exists(path))
expected_path = os.path.join(folder, PACKAGE_TZSTD_NAME)
self.assertEqual(path, expected_path)

extract_dir = os.path.join(folder, "extracted")
uncompress_file(path, extract_dir)

extract_files = list(sorted(os.listdir(extract_dir)))
expected_files = sorted(files.keys())
self.assertEqual(extract_files, expected_files)

for name, path in sorted(files.items()):
extract_path = os.path.join(extract_dir, name)
with open(path, "r") as f1, open(extract_path, "r") as f2:
self.assertEqual(f1.read(), f2.read())

0 comments on commit 58c6e11

Please sign in to comment.