From 05767638e66632b887ca3f7d6f9cbe05bf36a377 Mon Sep 17 00:00:00 2001 From: James Benton Date: Sun, 6 Oct 2024 14:53:07 +0100 Subject: [PATCH] Add support for zstd compression. Squashed version of PR #14706 as of 03/10/2024. --- conan/internal/api/uploader.py | 69 +++++++++++++------- conan/internal/paths.py | 1 + conans/client/remote_manager.py | 20 ++++-- conans/client/rest/rest_client_v2.py | 10 ++- conans/model/conf.py | 5 +- conans/model/manifest.py | 4 +- conans/util/files.py | 62 +++++++++++++++++- test/unittests/client/remote_manager_test.py | 32 ++++++++- 8 files changed, 163 insertions(+), 40 deletions(-) diff --git a/conan/internal/api/uploader.py b/conan/internal/api/uploader.py index 8d8b8a4d93c..de2cda95727 100644 --- a/conan/internal/api/uploader.py +++ b/conan/internal/api/uploader.py @@ -1,6 +1,7 @@ import fnmatch import os import shutil +import tarfile import time from conan.internal.conan_app import ConanApp @@ -9,10 +10,10 @@ from conan.internal.errors import NotFoundException from conan.errors import ConanException from conan.internal.paths import (CONAN_MANIFEST, CONANFILE, EXPORT_SOURCES_TGZ_NAME, - EXPORT_TGZ_NAME, PACKAGE_TGZ_NAME, CONANINFO) + EXPORT_TGZ_NAME, PACKAGE_TGZ_NAME, PACKAGE_TZSTD_NAME, CONANINFO) from conans.util.files import (clean_dirty, is_dirty, gather_files, gzopen_without_timestamps, set_dirty_context_manager, mkdir, - human_size) + human_size, tar_zst_compress) UPLOAD_POLICY_FORCE = "force-upload" UPLOAD_POLICY_SKIP = "skip-upload" @@ -174,11 +175,24 @@ def _prepare_package(self, pref, prev_bundle): def _compress_package_files(self, layout, pref): output = ConanOutput(scope=str(pref)) download_pkg_folder = layout.download_package() - package_tgz = os.path.join(download_pkg_folder, PACKAGE_TGZ_NAME) - if is_dirty(package_tgz): - output.warning("Removing %s, marked as dirty" % PACKAGE_TGZ_NAME) - os.remove(package_tgz) - clean_dirty(package_tgz) + + compression_format = self._global_conf.get("core.upload:compression_format", + default="gzip") + if compression_format == "gzip": + compress_level_config = "core.gzip:compresslevel" + package_file_name = PACKAGE_TGZ_NAME + package_file = os.path.join(download_pkg_folder, PACKAGE_TGZ_NAME) + elif compression_format == "zstd": + compress_level_config = "core.zstd:compresslevel" + package_file_name = PACKAGE_TZSTD_NAME + package_file = os.path.join(download_pkg_folder, PACKAGE_TZSTD_NAME) + else: + raise ConanException(f"Unsupported compression format '{compression_format}'") + + if is_dirty(package_file): + output.warning(f"Removing {package_file_name}, marked as dirty") + os.remove(package_file) + clean_dirty(package_file) # Get all the files in that directory # existing package, will use short paths if defined @@ -199,15 +213,17 @@ def _compress_package_files(self, layout, pref): files.pop(CONANINFO) files.pop(CONAN_MANIFEST) - if not os.path.isfile(package_tgz): + if not os.path.isfile(package_file): tgz_files = {f: path for f, path in files.items()} - compresslevel = self._global_conf.get("core.gzip:compresslevel", check_type=int) - tgz_path = compress_files(tgz_files, PACKAGE_TGZ_NAME, download_pkg_folder, - compresslevel=compresslevel, ref=pref) - assert tgz_path == package_tgz - assert os.path.exists(package_tgz) + compresslevel = self._global_conf.get(compress_level_config, check_type=int) + compressed_path = compress_files(tgz_files, package_file_name, download_pkg_folder, + compresslevel=compresslevel, compressformat=compression_format, + ref=pref) - return {PACKAGE_TGZ_NAME: package_tgz, + assert compressed_path == package_file + assert os.path.exists(package_file) + + return {package_file_name: package_file, CONANINFO: os.path.join(download_pkg_folder, CONANINFO), CONAN_MANIFEST: os.path.join(download_pkg_folder, CONAN_MANIFEST)} @@ -255,22 +271,25 @@ def upload_package(self, pref, prev_bundle, remote): output.debug(f"Upload {pref} in {duration} time") -def compress_files(files, name, dest_dir, compresslevel=None, ref=None): +def compress_files(files, name, dest_dir, compressformat=None, compresslevel=None, ref=None): t1 = time.time() - # FIXME, better write to disk sequentially and not keep tgz contents in memory - tgz_path = os.path.join(dest_dir, name) + tar_path = os.path.join(dest_dir, name) ConanOutput(scope=str(ref)).info(f"Compressing {name}") - with set_dirty_context_manager(tgz_path), open(tgz_path, "wb") as tgz_handle: - tgz = gzopen_without_timestamps(name, mode="w", fileobj=tgz_handle, - compresslevel=compresslevel) - for filename, abs_path in sorted(files.items()): - # recursive is False in case it is a symlink to a folder - tgz.add(abs_path, filename, recursive=False) - tgz.close() + + if compressformat == "zstd": + tar_zst_compress(tar_path, files, compresslevel=compresslevel) + else: + with set_dirty_context_manager(tar_path), open(tar_path, "wb") as tgz_handle: + tgz = gzopen_without_timestamps(name, mode="w", fileobj=tgz_handle, + compresslevel=compresslevel) + for filename, abs_path in sorted(files.items()): + # recursive is False in case it is a symlink to a folder + tgz.add(abs_path, filename, recursive=False) + tgz.close() duration = time.time() - t1 ConanOutput().debug(f"{name} compressed in {duration} time") - return tgz_path + return tar_path def _total_size(cache_files): diff --git a/conan/internal/paths.py b/conan/internal/paths.py index 34c85929f3f..acce04ab49a 100644 --- a/conan/internal/paths.py +++ b/conan/internal/paths.py @@ -82,6 +82,7 @@ def _user_home_from_conanrc_file(): CONAN_MANIFEST = "conanmanifest.txt" CONANINFO = "conaninfo.txt" PACKAGE_TGZ_NAME = "conan_package.tgz" +PACKAGE_TZSTD_NAME = "conan_package.tzst" EXPORT_TGZ_NAME = "conan_export.tgz" EXPORT_SOURCES_TGZ_NAME = "conan_sources.tgz" DATA_YML = "conandata.yml" diff --git a/conans/client/remote_manager.py b/conans/client/remote_manager.py index e5d2defd752..78d9e7c23b4 100644 --- a/conans/client/remote_manager.py +++ b/conans/client/remote_manager.py @@ -1,5 +1,7 @@ import os import shutil +import tarfile +import time from typing import List from requests.exceptions import ConnectionError @@ -16,7 +18,7 @@ from conans.model.package_ref import PkgReference from conans.model.recipe_ref import RecipeReference from conans.util.files import rmdir, human_size -from conan.internal.paths import EXPORT_SOURCES_TGZ_NAME, EXPORT_TGZ_NAME, PACKAGE_TGZ_NAME +from conan.internal.paths import EXPORT_SOURCES_TGZ_NAME, EXPORT_TGZ_NAME, PACKAGE_TGZ_NAME, PACKAGE_TZSTD_NAME from conans.util.files import mkdir, tar_extract @@ -164,14 +166,21 @@ def _get_package(self, layout, pref, remote, scoped_output, metadata): metadata, only_metadata=False) zipped_files = {k: v for k, v in zipped_files.items() if not k.startswith(METADATA)} # quick server package integrity check: - for f in ("conaninfo.txt", "conanmanifest.txt", "conan_package.tgz"): + for f in ("conaninfo.txt", "conanmanifest.txt"): if f not in zipped_files: raise ConanException(f"Corrupted {pref} in '{remote.name}' remote: no {f}") + accepted_package_files = [PACKAGE_TZSTD_NAME, PACKAGE_TGZ_NAME] + package_file = next((f for f in zipped_files if f in accepted_package_files), None) + if not package_file: + raise ConanException(f"Corrupted {pref} in '{remote.name}' remote: no {accepted_package_files} found") self._signer.verify(pref, download_pkg_folder, zipped_files) - tgz_file = zipped_files.pop(PACKAGE_TGZ_NAME, None) + package_file = zipped_files.pop(package_file, None) package_folder = layout.package() - uncompress_file(tgz_file, package_folder, scope=str(pref.ref)) + t1 = time.time() + uncompress_file(package_file, package_folder, scope=str(pref.ref)) + duration = time.time() - t1 + scoped_output.debug(f"Decompressed {package_file} in {duration} seconds") mkdir(package_folder) # Just in case it doesn't exist, because uncompress did nothing for file_name, file_path in zipped_files.items(): # copy CONANINFO and CONANMANIFEST shutil.move(file_path, os.path.join(package_folder, file_name)) @@ -276,7 +285,8 @@ def uncompress_file(src_path, dest_folder, scope=None): hs = human_size(filesize) ConanOutput(scope=scope).info(f"Decompressing {hs} {os.path.basename(src_path)}") with open(src_path, mode='rb') as file_handler: - tar_extract(file_handler, dest_folder) + tar_extract(file_handler, dest_folder, + is_tar_zst=src_path.endswith((".tar.zst", ".tzst"))) except Exception as e: error_msg = "Error while extracting downloaded file '%s' to %s\n%s\n"\ % (src_path, dest_folder, str(e)) diff --git a/conans/client/rest/rest_client_v2.py b/conans/client/rest/rest_client_v2.py index e4ffe9a8234..52298345c92 100644 --- a/conans/client/rest/rest_client_v2.py +++ b/conans/client/rest/rest_client_v2.py @@ -17,7 +17,7 @@ RecipeNotFoundException, PackageNotFoundException, EXCEPTION_CODE_MAPPING from conan.errors import ConanException from conans.model.package_ref import PkgReference -from conan.internal.paths import EXPORT_SOURCES_TGZ_NAME +from conan.internal.paths import EXPORT_SOURCES_TGZ_NAME, PACKAGE_TGZ_NAME, PACKAGE_TZSTD_NAME from conans.model.recipe_ref import RecipeReference from conans.util.dates import from_iso8601_to_timestamp from conans.util.thread import ExceptionThread @@ -256,8 +256,12 @@ def get_package(self, pref, dest_folder, metadata, only_metadata): result = {} # Download only known files, but not metadata (except sign) if not only_metadata: # Retrieve package first, then metadata - accepted_files = ["conaninfo.txt", "conan_package.tgz", "conanmanifest.txt", - "metadata/sign"] + accepted_package_files = [PACKAGE_TZSTD_NAME, PACKAGE_TGZ_NAME] + accepted_files = ["conaninfo.txt", "conanmanifest.txt", "metadata/sign"] + for f in accepted_package_files: + if f in server_files: + accepted_files = [f] + accepted_files + break files = [f for f in server_files if any(f.startswith(m) for m in accepted_files)] # If we didn't indicated reference, server got the latest, use absolute now, it's safer urls = {fn: self.router.package_file(pref, fn) for fn in files} diff --git a/conans/model/conf.py b/conans/model/conf.py index ebc41bc5885..349afc1306e 100644 --- a/conans/model/conf.py +++ b/conans/model/conf.py @@ -48,8 +48,11 @@ "core.net.http:cacert_path": "Path containing a custom Cacert file", "core.net.http:client_cert": "Path or tuple of files containing a client cert (and key)", "core.net.http:clean_system_proxy": "If defined, the proxies system env-vars will be discarded", - # Gzip compression + # Compression for `conan upload` + "core.upload:compression_format": "The compression format used when uploading Conan packages. " + "Possible values: 'zstd', 'gzip' (default=gzip)", "core.gzip:compresslevel": "The Gzip compression level for Conan artifacts (default=9)", + "core.zstd:compresslevel": "The zstd compression level for Conan artifacts (default=3)", # Excluded from revision_mode = "scm" dirty and Git().is_dirty() checks "core.scm:excluded": "List of excluded patterns for builtin git dirty checks", "core.scm:local_url": "By default allows to store local folders as remote url, but not upload them. Use 'allow' for allowing upload and 'block' to completely forbid it", diff --git a/conans/model/manifest.py b/conans/model/manifest.py index 06ec8daa540..a9b1192e073 100644 --- a/conans/model/manifest.py +++ b/conans/model/manifest.py @@ -1,7 +1,7 @@ import os from collections import defaultdict -from conan.internal.paths import CONAN_MANIFEST, EXPORT_SOURCES_TGZ_NAME, EXPORT_TGZ_NAME, PACKAGE_TGZ_NAME +from conan.internal.paths import CONAN_MANIFEST, EXPORT_SOURCES_TGZ_NAME, EXPORT_TGZ_NAME, PACKAGE_TGZ_NAME, PACKAGE_TZSTD_NAME from conans.util.dates import timestamp_now, timestamp_to_str from conans.util.files import load, md5, md5sum, save, gather_files @@ -91,7 +91,7 @@ def create(cls, folder, exports_sources_folder=None): """ files, _ = gather_files(folder) # The folders symlinks are discarded for the manifest - for f in (PACKAGE_TGZ_NAME, EXPORT_TGZ_NAME, CONAN_MANIFEST, EXPORT_SOURCES_TGZ_NAME): + for f in (PACKAGE_TGZ_NAME, PACKAGE_TZSTD_NAME, EXPORT_TGZ_NAME, CONAN_MANIFEST, EXPORT_SOURCES_TGZ_NAME): files.pop(f, None) file_dict = {} diff --git a/conans/util/files.py b/conans/util/files.py index fd03715ee08..ade0e75f9a9 100644 --- a/conans/util/files.py +++ b/conans/util/files.py @@ -9,6 +9,12 @@ import tarfile import time +try: + import zstandard + zstandard_exception = None +except ImportError as e: + zstandard_exception = e + from contextlib import contextmanager from conan.errors import ConanException @@ -290,8 +296,57 @@ def gzopen_without_timestamps(name, mode="r", fileobj=None, compresslevel=None, return t -def tar_extract(fileobj, destination_dir): - the_tar = tarfile.open(fileobj=fileobj) +def raise_if_zstandard_not_present(operation): + if zstandard_exception: + raise ConanException( + f"zstandard {operation} was requested, but the required package is not present. " + f"Please install it using 'pip install zstandard' and try again. " + f"Exception details: {zstandard_exception}") + + +def tar_zst_compress(tar_path, files, compresslevel=None): + raise_if_zstandard_not_present("compression") + + with open(tar_path, "wb") as tarfile_obj: + # Only provide level if it was overridden by config. + zstd_kwargs = {} + if compresslevel is not None: + zstd_kwargs["level"] = compresslevel + + dctx = zstandard.ZstdCompressor(write_checksum=True, threads=-1, **zstd_kwargs) + + # Create a zstd stream writer so tarfile writes uncompressed data to + # the zstd stream writer, which in turn writes compressed data to the + # output tar.zst file. + with dctx.stream_writer(tarfile_obj) as stream_writer: + # The choice of bufsize=32768 comes from profiling compression at various + # values and finding that bufsize value consistently performs well. + # The variance in compression times at bufsize<=64KB is small. It is only + # when bufsize>=128KB that compression times start increasing. + with tarfile.open(mode="w|", fileobj=stream_writer, bufsize=32768, + format=tarfile.PAX_FORMAT) as tar: + current_frame_bytes = 0 + for filename, abs_path in sorted(files.items()): + tar.add(abs_path, filename, recursive=False) + + # Flush the current frame if it has reached a large enough size. + # There is no required size, but 128MB is a good starting point + # because it allows for faster random access to the file. + current_frame_bytes += os.path.getsize(abs_path) + if current_frame_bytes >= 134217728: + stream_writer.flush(zstandard.FLUSH_FRAME) + current_frame_bytes = 0 + + +def tar_extract(fileobj, destination_dir, is_tar_zst=False): + if is_tar_zst: + raise_if_zstandard_not_present("decompression") + dctx = zstandard.ZstdDecompressor() + stream_reader = dctx.stream_reader(fileobj) + the_tar = tarfile.open(fileobj=stream_reader, bufsize=32768, mode="r|") + else: + the_tar = tarfile.open(fileobj=fileobj) + # NOTE: The errorlevel=2 has been removed because it was failing in Win10, it didn't allow to # "could not change modification time", with time=0 # the_tar.errorlevel = 2 # raise exception if any error @@ -299,6 +354,9 @@ def tar_extract(fileobj, destination_dir): the_tar.extractall(path=destination_dir) the_tar.close() + if is_tar_zst: + stream_reader.close() + def merge_directories(src, dst): from conan.tools.files import copy diff --git a/test/unittests/client/remote_manager_test.py b/test/unittests/client/remote_manager_test.py index 871cf609731..c7bbb3ed701 100644 --- a/test/unittests/client/remote_manager_test.py +++ b/test/unittests/client/remote_manager_test.py @@ -2,14 +2,15 @@ import unittest from conan.internal.api.uploader import compress_files -from conan.internal.paths import PACKAGE_TGZ_NAME +from conans.client.remote_manager import uncompress_file +from conan.internal.paths import PACKAGE_TGZ_NAME, PACKAGE_TZSTD_NAME from conan.test.utils.test_files import temp_folder from conans.util.files import save class RemoteManagerTest(unittest.TestCase): - def test_compress_files(self): + def test_compress_files_tgz(self): folder = temp_folder() save(os.path.join(folder, "one_file.txt"), "The contents") save(os.path.join(folder, "Two_file.txt"), "Two contents") @@ -23,3 +24,30 @@ def test_compress_files(self): self.assertTrue(os.path.exists(path)) expected_path = os.path.join(folder, PACKAGE_TGZ_NAME) self.assertEqual(path, expected_path) + + def test_compress_and_uncompress_zst_files(self): + folder = temp_folder() + save(os.path.join(folder, "one_file.txt"), "The contents") + save(os.path.join(folder, "Two_file.txt"), "Two contents") + + files = { + "one_file.txt": os.path.join(folder, "one_file.txt"), + "Two_file.txt": os.path.join(folder, "Two_file.txt"), + } + + path = compress_files(files, PACKAGE_TZSTD_NAME, dest_dir=folder, compressformat="zstd") + self.assertTrue(os.path.exists(path)) + expected_path = os.path.join(folder, PACKAGE_TZSTD_NAME) + self.assertEqual(path, expected_path) + + extract_dir = os.path.join(folder, "extracted") + uncompress_file(path, extract_dir) + + extract_files = list(sorted(os.listdir(extract_dir))) + expected_files = sorted(files.keys()) + self.assertEqual(extract_files, expected_files) + + for name, path in sorted(files.items()): + extract_path = os.path.join(extract_dir, name) + with open(path, "r") as f1, open(extract_path, "r") as f2: + self.assertEqual(f1.read(), f2.read())