Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for zstd compression #14706

Open
wants to merge 15 commits into
base: develop2
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions conan/internal/paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ def _user_home_from_conanrc_file():
CONAN_MANIFEST = "conanmanifest.txt"
CONANINFO = "conaninfo.txt"
PACKAGE_TGZ_NAME = "conan_package.tgz"
PACKAGE_TZSTD_NAME = "conan_package.tar.zst"
grossag marked this conversation as resolved.
Show resolved Hide resolved
EXPORT_TGZ_NAME = "conan_export.tgz"
EXPORT_SOURCES_TGZ_NAME = "conan_sources.tgz"
DATA_YML = "conandata.yml"
97 changes: 72 additions & 25 deletions conans/client/cmd/uploader.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
import os
import shutil
import tarfile
import time
import zstandard

from conan.internal.conan_app import ConanApp
from conan.api.output import ConanOutput
from conans.client.source import retrieve_exports_sources
from conans.errors import ConanException, NotFoundException
from conan.internal.paths import (CONAN_MANIFEST, CONANFILE, EXPORT_SOURCES_TGZ_NAME,
EXPORT_TGZ_NAME, PACKAGE_TGZ_NAME, CONANINFO)
EXPORT_TGZ_NAME, PACKAGE_TGZ_NAME, PACKAGE_TZSTD_NAME, CONANINFO)
from conans.util.files import (clean_dirty, is_dirty, gather_files,
gzopen_without_timestamps, set_dirty_context_manager, mkdir,
human_size)
Expand Down Expand Up @@ -172,11 +174,24 @@ def _prepare_package(self, pref, prev_bundle):
def _compress_package_files(self, layout, pref):
output = ConanOutput(scope=str(pref))
download_pkg_folder = layout.download_package()
package_tgz = os.path.join(download_pkg_folder, PACKAGE_TGZ_NAME)
if is_dirty(package_tgz):
output.warning("Removing %s, marked as dirty" % PACKAGE_TGZ_NAME)
os.remove(package_tgz)
clean_dirty(package_tgz)

compression_format = self._global_conf.get("core.upload:compression_format",
default="gzip")
if compression_format == "gzip":
compress_level_config = "core.gzip:compresslevel"
package_file_name = PACKAGE_TGZ_NAME
package_file = os.path.join(download_pkg_folder, PACKAGE_TGZ_NAME)
elif compression_format == "zstd":
compress_level_config = "core.zstd:compresslevel"
package_file_name = PACKAGE_TZSTD_NAME
package_file = os.path.join(download_pkg_folder, PACKAGE_TZSTD_NAME)
else:
raise ConanException(f"Unsupported compression format '{compression_format}'")

if is_dirty(package_file):
output.warning(f"Removing {package_file_name}, marked as dirty")
os.remove(package_file)
clean_dirty(package_file)

# Get all the files in that directory
# existing package, will use short paths if defined
Expand All @@ -197,15 +212,19 @@ def _compress_package_files(self, layout, pref):
files.pop(CONANINFO)
files.pop(CONAN_MANIFEST)

if not os.path.isfile(package_tgz):
tgz_files = {f: path for f, path in files.items()}
compresslevel = self._global_conf.get("core.gzip:compresslevel", check_type=int)
tgz_path = compress_files(tgz_files, PACKAGE_TGZ_NAME, download_pkg_folder,
compresslevel=compresslevel, ref=pref)
assert tgz_path == package_tgz
assert os.path.exists(package_tgz)
if os.path.isfile(package_file):
output.info(f"Not writing '{package_file_name}' because it already exists.")
grossag marked this conversation as resolved.
Show resolved Hide resolved
else:
source_files = {f: path for f, path in files.items()}
compresslevel = self._global_conf.get(compress_level_config, check_type=int)
compressed_path = compress_files(source_files, package_file_name, download_pkg_folder,
compresslevel=compresslevel, compressformat=compression_format,
ref=pref)

assert compressed_path == package_file
assert os.path.exists(package_file)

return {PACKAGE_TGZ_NAME: package_tgz,
return {package_file_name: package_file,
CONANINFO: os.path.join(download_pkg_folder, CONANINFO),
CONAN_MANIFEST: os.path.join(download_pkg_folder, CONAN_MANIFEST)}

Expand Down Expand Up @@ -253,22 +272,50 @@ def upload_package(self, pref, prev_bundle, remote):
output.debug(f"Upload {pref} in {duration} time")


def compress_files(files, name, dest_dir, compressformat=None, compresslevel=None, ref=None):
    """Pack ``files`` into the archive ``name`` inside ``dest_dir`` and return its path.

    :param files: mapping of archive member name -> absolute path on disk. Members are
                  added to the archive in sorted member-name order.
    :param name: file name of the archive created inside ``dest_dir``.
    :param dest_dir: folder in which the archive is written.
    :param compressformat: ``"zstd"`` writes a zstd-compressed tar stream; any other value
                           (including the default ``None``) writes a gzip tarball.
    :param compresslevel: compression level for the selected compressor. For zstd it is
                          only passed on when it is not ``None``; for gzip it is forwarded
                          as-is to ``gzopen_without_timestamps``.
    :param ref: reference used only as the scope of the "Compressing" output message.
    :return: path of the written archive, ``os.path.join(dest_dir, name)``.

    NOTE: ``compressformat`` precedes ``compresslevel`` in the signature, so callers should
    pass both by keyword.
    """
    t1 = time.time()
    # FIXME, better write to disk sequentially and not keep tgz contents in memory
    tar_path = os.path.join(dest_dir, name)
    ConanOutput(scope=str(ref)).info(f"Compressing {name}")
    members = sorted(files.items())

    # Both formats are written under the dirty marker: the uploader checks is_dirty() on
    # the package archive regardless of the format, so the zstd archive must be covered
    # by the same marker as the gzip one.
    with set_dirty_context_manager(tar_path), open(tar_path, "wb") as archive_handle:
        if compressformat == "zstd":
            _write_tar_zst(members, archive_handle, compresslevel)
        else:
            tgz = gzopen_without_timestamps(name, mode="w", fileobj=archive_handle,
                                            compresslevel=compresslevel)
            try:
                for filename, abs_path in members:
                    # recursive is False in case it is a symlink to a folder
                    tgz.add(abs_path, filename, recursive=False)
            finally:
                # Release the tar/gzip objects even if adding a member failed
                tgz.close()

    duration = time.time() - t1
    ConanOutput().debug(f"{name} compressed in {duration} time")
    return tar_path


def _write_tar_zst(members, fileobj, compresslevel=None):
    """Write ``members`` (``(name, abs_path)`` pairs) as a zstd-compressed tar to ``fileobj``."""
    # Amount of added input (2 MiB) after which the current zstd block is flushed
    flush_threshold = 2 * 1024 * 1024

    # Only provide level if it was overridden by config.
    zstd_kwargs = {} if compresslevel is None else {"level": compresslevel}
    cctx = zstandard.ZstdCompressor(write_checksum=True, threads=-1, **zstd_kwargs)

    # Create a zstd stream writer so tarfile writes uncompressed data to
    # the zstd stream writer, which in turn writes compressed data to the
    # output tar.zst file.
    with cctx.stream_writer(fileobj) as stream_writer:
        # The choice of bufsize=32768 comes from profiling compression at various
        # values and finding that bufsize value consistently performs well.
        # The variance in compression times at bufsize<=64KB is small. It is only
        # when bufsize>=128KB that compression times start increasing.
        with tarfile.open(mode="w|", fileobj=stream_writer, bufsize=32768,
                          format=tarfile.PAX_FORMAT) as tar:
            unflushed_bytes = 0
            for filename, abs_path in members:
                # recursive is False in case it is a symlink to a folder
                tar.add(abs_path, filename, recursive=False)

                # lstat() does not follow symlinks; os.path.getsize() would raise on a
                # dangling symlink even though tar.add() can archive the link itself.
                unflushed_bytes += os.lstat(abs_path).st_size
                if unflushed_bytes >= flush_threshold:
                    stream_writer.flush()  # Flush the current zstd block.
                    unflushed_bytes = 0


def _total_size(cache_files):
Expand Down
20 changes: 15 additions & 5 deletions conans/client/remote_manager.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import os
import shutil
import tarfile
import time
from typing import List

from requests.exceptions import ConnectionError
Expand All @@ -16,7 +18,7 @@
from conans.model.package_ref import PkgReference
from conans.model.recipe_ref import RecipeReference
from conans.util.files import rmdir, human_size
from conan.internal.paths import EXPORT_SOURCES_TGZ_NAME, EXPORT_TGZ_NAME, PACKAGE_TGZ_NAME
from conan.internal.paths import EXPORT_SOURCES_TGZ_NAME, EXPORT_TGZ_NAME, PACKAGE_TGZ_NAME, PACKAGE_TZSTD_NAME
from conans.util.files import mkdir, tar_extract


Expand Down Expand Up @@ -164,14 +166,21 @@ def _get_package(self, layout, pref, remote, scoped_output, metadata):
metadata, only_metadata=False)
zipped_files = {k: v for k, v in zipped_files.items() if not k.startswith(METADATA)}
# quick server package integrity check:
for f in ("conaninfo.txt", "conanmanifest.txt", "conan_package.tgz"):
for f in ("conaninfo.txt", "conanmanifest.txt"):
if f not in zipped_files:
raise ConanException(f"Corrupted {pref} in '{remote.name}' remote: no {f}")
accepted_package_files = [PACKAGE_TZSTD_NAME, PACKAGE_TGZ_NAME]
package_file = next((f for f in zipped_files if f in accepted_package_files), None)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So a package could contain both compressed artifacts, but the client would prioritize the zstd one and download only that one if it exists?

Wouldn't it be less confusing to disallow having artifacts in both compression formats in the same package?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A package is only supposed to contain one. Let's say an organization switches to zstd compression on Jan 1 2025. The expectation would be that packages produced before that date would have the .tgz extension and packages produced after it would have the .tar.zst extension. I would like to avoid producing both because it would result in unnecessary storage usage in Artifactory.

if not package_file:
raise ConanException(f"Corrupted {pref} in '{remote.name}' remote: no {accepted_package_files} found")
self._signer.verify(pref, download_pkg_folder, zipped_files)

tgz_file = zipped_files.pop(PACKAGE_TGZ_NAME, None)
package_file = zipped_files.pop(package_file, None)
package_folder = layout.package()
uncompress_file(tgz_file, package_folder, scope=str(pref.ref))
t1 = time.time()
uncompress_file(package_file, package_folder, scope=str(pref.ref))
duration = time.time() - t1
scoped_output.debug(f"Decompressed {package_file} in {duration} seconds")
mkdir(package_folder) # Just in case it doesn't exist, because uncompress did nothing
for file_name, file_path in zipped_files.items(): # copy CONANINFO and CONANMANIFEST
shutil.move(file_path, os.path.join(package_folder, file_name))
Expand Down Expand Up @@ -276,7 +285,8 @@ def uncompress_file(src_path, dest_folder, scope=None):
hs = human_size(filesize)
ConanOutput(scope=scope).info(f"Decompressing {hs} {os.path.basename(src_path)}")
with open(src_path, mode='rb') as file_handler:
tar_extract(file_handler, dest_folder)
tar_extract(file_handler, dest_folder,
is_tar_zst=src_path.endswith(".tar.zst"))
except Exception as e:
error_msg = "Error while extracting downloaded file '%s' to %s\n%s\n"\
% (src_path, dest_folder, str(e))
Expand Down
10 changes: 7 additions & 3 deletions conans/client/rest/rest_client_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from conans.errors import ConanException, NotFoundException, PackageNotFoundException, \
RecipeNotFoundException, AuthenticationException, ForbiddenException
from conans.model.package_ref import PkgReference
from conan.internal.paths import EXPORT_SOURCES_TGZ_NAME
from conan.internal.paths import EXPORT_SOURCES_TGZ_NAME, PACKAGE_TGZ_NAME, PACKAGE_TZSTD_NAME
from conans.util.dates import from_iso8601_to_timestamp
from conans.util.thread import ExceptionThread

Expand Down Expand Up @@ -81,8 +81,12 @@ def get_package(self, pref, dest_folder, metadata, only_metadata):
result = {}
# Download only known files, but not metadata (except sign)
if not only_metadata: # Retrieve package first, then metadata
accepted_files = ["conaninfo.txt", "conan_package.tgz", "conanmanifest.txt",
"metadata/sign"]
accepted_package_files = [PACKAGE_TZSTD_NAME, PACKAGE_TGZ_NAME]
accepted_files = ["conaninfo.txt", "conanmanifest.txt", "metadata/sign"]
for f in accepted_package_files:
if f in server_files:
accepted_files = [f] + accepted_files
break
Comment on lines +84 to +89
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we assumed there can only be 1 compressed artifact in one of the formats, this would be simplified?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I think I'm missing what you are saying here. I don't have the context about if/how these accepted files changed over time. But Artifactory would only have .tgz or .tar.zst, not both. If that means we can simplify this a bit, that's fine with me.

files = [f for f in server_files if any(f.startswith(m) for m in accepted_files)]
# If we didn't indicate a reference, the server returned the latest one; use the absolute one now, it's safer
urls = {fn: self.router.package_file(pref, fn) for fn in files}
Expand Down
5 changes: 4 additions & 1 deletion conans/model/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,11 @@
"core.net.http:cacert_path": "Path containing a custom Cacert file",
"core.net.http:client_cert": "Path or tuple of files containing a client cert (and key)",
"core.net.http:clean_system_proxy": "If defined, the proxies system env-vars will be discarded",
# Gzip compression
# Compression for `conan upload`
"core.upload:compression_format": "The compression format used when uploading Conan packages. "
"Possible values: 'zstd', 'gzip' (default=gzip)",
"core.gzip:compresslevel": "The Gzip compression level for Conan artifacts (default=9)",
"core.zstd:compresslevel": "The zstd compression level for Conan artifacts",
grossag marked this conversation as resolved.
Show resolved Hide resolved
# Excluded from revision_mode = "scm" dirty and Git().is_dirty() checks
"core.scm:excluded": "List of excluded patterns for builtin git dirty checks",
"core.scm:local_url": "By default allows to store local folders as remote url, but not upload them. Use 'allow' for allowing upload and 'block' to completely forbid it",
Expand Down
4 changes: 2 additions & 2 deletions conans/model/manifest.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
from collections import defaultdict

from conan.internal.paths import CONAN_MANIFEST, EXPORT_SOURCES_TGZ_NAME, EXPORT_TGZ_NAME, PACKAGE_TGZ_NAME
from conan.internal.paths import CONAN_MANIFEST, EXPORT_SOURCES_TGZ_NAME, EXPORT_TGZ_NAME, PACKAGE_TGZ_NAME, PACKAGE_TZSTD_NAME
from conans.util.dates import timestamp_now, timestamp_to_str
from conans.util.files import load, md5, md5sum, save, gather_files

Expand Down Expand Up @@ -91,7 +91,7 @@ def create(cls, folder, exports_sources_folder=None):
"""
files, _ = gather_files(folder)
# Symlinks to folders are discarded for the manifest
for f in (PACKAGE_TGZ_NAME, EXPORT_TGZ_NAME, CONAN_MANIFEST, EXPORT_SOURCES_TGZ_NAME):
for f in (PACKAGE_TGZ_NAME, PACKAGE_TZSTD_NAME, EXPORT_TGZ_NAME, CONAN_MANIFEST, EXPORT_SOURCES_TGZ_NAME):
files.pop(f, None)

file_dict = {}
Expand Down
1 change: 1 addition & 0 deletions conans/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ fasteners>=0.15
distro>=1.4.0, <=1.8.0; platform_system == 'Linux' or platform_system == 'FreeBSD'
Jinja2>=3.0, <4.0.0
python-dateutil>=2.8.0, <3
zstandard>=0.20, <= 0.23
grossag marked this conversation as resolved.
Show resolved Hide resolved
22 changes: 15 additions & 7 deletions conans/util/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import sys
import tarfile
import time
import zstandard

from contextlib import contextmanager

Expand Down Expand Up @@ -281,13 +282,20 @@ def gzopen_without_timestamps(name, mode="r", fileobj=None, compresslevel=None,
return t


def tar_extract(fileobj, destination_dir, is_tar_zst=False):
    """Extract every member of the tar archive read from ``fileobj`` into ``destination_dir``.

    :param fileobj: binary file object providing the archive data.
    :param destination_dir: folder that receives the extracted members.
    :param is_tar_zst: when True the data is treated as a zstd-compressed tar stream and
                       is decompressed on the fly; otherwise ``tarfile`` opens ``fileobj``
                       directly with its default read mode.
    """
    # NOTE: errorlevel=2 (raise an exception on any error) is intentionally not set: it
    # was failing in Win10 with "could not change modification time" when time=0.
    if is_tar_zst:
        decompressor = zstandard.ZstdDecompressor()
        with decompressor.stream_reader(fileobj) as stream_reader:
            # The choice of bufsize=32768 comes from profiling decompression at various
            # values and finding that bufsize value consistently performs well.
            with tarfile.open(fileobj=stream_reader, bufsize=32768, mode="r|") as the_tar:
                the_tar.extractall(path=destination_dir)
    else:
        # The context manager closes the TarFile even if extraction raises
        with tarfile.open(fileobj=fileobj) as the_tar:
            the_tar.extractall(path=destination_dir)


def exception_message_safe(exc):
Expand Down
32 changes: 30 additions & 2 deletions test/unittests/client/remote_manager_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,15 @@
import unittest

from conans.client.cmd.uploader import compress_files
from conan.internal.paths import PACKAGE_TGZ_NAME
from conans.client.remote_manager import uncompress_file
from conan.internal.paths import PACKAGE_TGZ_NAME, PACKAGE_TZSTD_NAME
from conan.test.utils.test_files import temp_folder
from conans.util.files import save


class RemoteManagerTest(unittest.TestCase):

def test_compress_files(self):
def test_compress_files_tgz(self):
folder = temp_folder()
save(os.path.join(folder, "one_file.txt"), "The contents")
save(os.path.join(folder, "Two_file.txt"), "Two contents")
Expand All @@ -23,3 +24,30 @@ def test_compress_files(self):
self.assertTrue(os.path.exists(path))
expected_path = os.path.join(folder, PACKAGE_TGZ_NAME)
self.assertEqual(path, expected_path)

def test_compress_and_uncompress_zst_files(self):
    """Round-trip two files through a zstd archive and compare names and contents."""
    folder = temp_folder()
    files = {}
    for file_name, contents in (("one_file.txt", "The contents"),
                                ("Two_file.txt", "Two contents")):
        file_path = os.path.join(folder, file_name)
        save(file_path, contents)
        files[file_name] = file_path

    # Compress into the same folder and check the archive lands at the expected path
    archive_path = compress_files(files, PACKAGE_TZSTD_NAME, dest_dir=folder,
                                  compressformat="zstd")
    self.assertTrue(os.path.exists(archive_path))
    self.assertEqual(archive_path, os.path.join(folder, PACKAGE_TZSTD_NAME))

    extract_dir = os.path.join(folder, "extracted")
    uncompress_file(archive_path, extract_dir)

    # The extracted folder must hold exactly the archived member names
    self.assertEqual(sorted(os.listdir(extract_dir)), sorted(files))

    # ... and every extracted file must match its source contents
    for file_name, original_path in sorted(files.items()):
        extracted_path = os.path.join(extract_dir, file_name)
        with open(original_path, "r") as original, open(extracted_path, "r") as extracted:
            self.assertEqual(original.read(), extracted.read())