From 8b2fd37aabbac1e4edd8cc0dc6ddaf44e12cfb68 Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Wed, 1 May 2019 20:56:43 -0700 Subject: [PATCH 01/31] Add deterministic_datetime() function --- pex/common.py | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/pex/common.py b/pex/common.py index 6f8a9bad6..e796bd05f 100644 --- a/pex/common.py +++ b/pex/common.py @@ -15,9 +15,32 @@ import threading import zipfile from collections import defaultdict +from datetime import datetime from uuid import uuid4 +def deterministic_datetime(): + """Return a deterministic UTC datetime object that does not depend on the system time. + + First try to read from the standard $SOURCE_DATE_EPOCH, then default to midnight January 1, + 1980, which is the start of time for MS-DOS time. Per section 4.4.6 of the zip spec at + https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TX, Zipfiles use MS-DOS time. So, we + use this default to ensure no issues with Zipfiles. + + Even though we use MS-DOS to inform the default value, note that we still return a normal UTC + datetime object for flexibility with how the result is used. + + For more information on $SOURCE_DATE_EPOCH, refer to + https://reproducible-builds.org/docs/source-date-epoch/.""" + return ( + datetime.utcfromtimestamp(int(os.environ['SOURCE_DATE_EPOCH'])) + if "SOURCE_DATE_EPOCH" in os.environ + else datetime( + year=1980, month=1, day=1, hour=0, minute=0, second=0, microsecond=0, tzinfo=None + ) + ) + + def die(msg, exit_code=1): print(msg, file=sys.stderr) sys.exit(exit_code) @@ -366,4 +389,8 @@ def delete(self): def zip(self, filename, mode='w'): with open_zip(filename, mode) as zf: for f in sorted(self.files()): - zf.write(os.path.join(self.chroot, f), arcname=f, compress_type=zipfile.ZIP_DEFLATED) + zf.write( + os.path.join(self.chroot, f), + arcname=f, + compress_type=zipfile.ZIP_DEFLATED + ) From b440fbf1bfcfc3f5b2bfa0ce6c7572d45fb497b6 Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Thu, 2 May 2019 15:54:20 -0700 Subject: [PATCH 02/31] Proof of concept #1: monkey patch approach that works on 3.7 --- pex/common.py | 70 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 65 insertions(+), 5 deletions(-) diff --git a/pex/common.py b/pex/common.py index e796bd05f..e38c85f55 100644 --- a/pex/common.py +++ b/pex/common.py @@ -118,11 +118,71 @@ def _chmod(self, info, path): attr = info.external_attr >> 16 os.chmod(path, attr) +class DeterministicTimestampZipFile(PermPreservingZipFile): + """A ZipFile that always uses the same timestamp, in addition to preserving permissions.""" + + def write(self, filename, arcname=None, + compress_type=None, compresslevel=None): + """Put the bytes from filename into the archive under the name + arcname. """ + # NB: Everything is copied from https://github.com/python/cpython/blob/master/Lib/zipfile.py, + # excluding lines marked with NB. + if not self.fp: + raise ValueError( + "Attempt to write to ZIP archive that was already closed") + if self._writing: + raise ValueError( + "Can't write to ZIP archive while an open writing handle exists" + ) + + zinfo = zipfile.ZipInfo.from_file(filename, arcname) + # NB: We hardcode the date to be deterministic. + zinfo.date_time = deterministic_datetime().timetuple()[:6] + + if zinfo.is_dir(): + zinfo.compress_size = 0 + zinfo.CRC = 0 + else: + if compress_type is not None: + zinfo.compress_type = compress_type + else: + zinfo.compress_type = self.compression + + if compresslevel is not None: + zinfo._compresslevel = compresslevel + else: + zinfo._compresslevel = self.compresslevel + + if zinfo.is_dir(): + with self._lock: + if self._seekable: + self.fp.seek(self.start_dir) + zinfo.header_offset = self.fp.tell() # Start of header bytes + if zinfo.compress_type == ZIP_LZMA: + # Compressed data includes an end-of-stream (EOS) marker + zinfo.flag_bits |= 0x02 + + self._writecheck(zinfo) + self._didModify = True + + self.filelist.append(zinfo) + self.NameToInfo[zinfo.filename] = zinfo + self.fp.write(zinfo.FileHeader(False)) + self.start_dir = self.fp.tell() + else: + with open(filename, "rb") as src, self.open(zinfo, 'w') as dest: + shutil.copyfileobj(src, dest, 1024*8) + @contextlib.contextmanager -def open_zip(path, *args, **kwargs): - """A contextmanager for zip files. Passes through positional and kwargs to zipfile.ZipFile.""" - with contextlib.closing(PermPreservingZipFile(path, *args, **kwargs)) as zip: +def open_zip(path, *args, use_deterministic_timestamp=False, **kwargs): + """A contextmanager for zip files. Passes through positional and kwargs to zipfile.ZipFile.""" + zf = ( + DeterministicTimestampZipFile(path, *args, **kwargs) + if use_deterministic_timestamp + else PermPreservingZipFile(path, *args, **kwargs) + ) + with contextlib.closing(zf) as zip: yield zip @@ -386,8 +446,8 @@ def __str__(self): def delete(self): shutil.rmtree(self.chroot) - def zip(self, filename, mode='w'): - with open_zip(filename, mode) as zf: + def zip(self, filename, mode='w', use_deterministic_timestamp=False): + with open_zip(filename, mode, use_deterministic_timestamp=use_deterministic_timestamp) as zf: for f in sorted(self.files()): zf.write( os.path.join(self.chroot, f), From 4b53a9e9e2d172e7a35388d2ccce80ad4d71ea0b Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Thu, 2 May 2019 16:06:28 -0700 Subject: [PATCH 03/31] Add new CLI flag --use-system-time --- pex/bin/pex.py | 18 +++++++++++++++++- pex/common.py | 8 ++++---- pex/pex_builder.py | 5 +++-- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/pex/bin/pex.py b/pex/bin/pex.py index 3fdeca69b..f731bd5f3 100755 --- a/pex/bin/pex.py +++ b/pex/bin/pex.py @@ -302,6 +302,18 @@ def configure_clp_pex_options(parser): 'likely will not be reproducible, meaning that if you were to run `./pex -o` with the ' 'same inputs then the new pex would not be byte-for-byte identical to the original.') + group.add_option( + '--use-system-time', '--no-use-system-time', + dest='use_system_time', + # TODO: set this to True. It's only False for now to test what happens in CI if we + # always use deterministic timestamps. + default=False, + action='callback', + callback=parse_bool, + help='Use the current system time to generate timestamps for the new pex. Otherwise, Pex ' + 'will first try to read the time from the environment variable SOURCE_DATE_EPOCH and ' + 'will fallback to midnight on January 1, 1980. TODO: explain reproducibility.') + parser.add_option_group(group) @@ -694,7 +706,11 @@ def main(args=None): log('Saving PEX file to %s' % options.pex_name, V=options.verbosity) tmp_name = options.pex_name + '~' safe_delete(tmp_name) - pex_builder.build(tmp_name, bytecode_compile=options.compile) + pex_builder.build( + tmp_name, + bytecode_compile=options.compile, + deterministic_timestamp=not options.use_system_time + ) os.rename(tmp_name, options.pex_name) else: if not _compatible_with_current_platform(options.platforms): diff --git a/pex/common.py b/pex/common.py index e38c85f55..1c6051783 100644 --- a/pex/common.py +++ b/pex/common.py @@ -175,11 +175,11 @@ def write(self, filename, arcname=None, @contextlib.contextmanager -def open_zip(path, *args, use_deterministic_timestamp=False, **kwargs): +def open_zip(path, *args, deterministic_timestamp=False, **kwargs): """A contextmanager for zip files. Passes through positional and kwargs to zipfile.ZipFile.""" zf = ( DeterministicTimestampZipFile(path, *args, **kwargs) - if use_deterministic_timestamp + if deterministic_timestamp else PermPreservingZipFile(path, *args, **kwargs) ) with contextlib.closing(zf) as zip: @@ -446,8 +446,8 @@ def __str__(self): def delete(self): shutil.rmtree(self.chroot) - def zip(self, filename, mode='w', use_deterministic_timestamp=False): - with open_zip(filename, mode, use_deterministic_timestamp=use_deterministic_timestamp) as zf: + def zip(self, filename, mode='w', deterministic_timestamp=False): + with open_zip(filename, mode, deterministic_timestamp=deterministic_timestamp) as zf: for f in sorted(self.files()): zf.write( os.path.join(self.chroot, f), diff --git a/pex/pex_builder.py b/pex/pex_builder.py index f39fe4434..0439fae59 100644 --- a/pex/pex_builder.py +++ b/pex/pex_builder.py @@ -428,11 +428,12 @@ def freeze(self, bytecode_compile=True): self._precompile_source() self._frozen = True - def build(self, filename, bytecode_compile=True): + def build(self, filename, bytecode_compile=True, deterministic_timestamp=False): """Package the PEX into a zipfile. :param filename: The filename where the PEX should be stored. :param bytecode_compile: If True, precompile .py files into .pyc files. + :param deterministic_timestamp: If True, will not use system time for the zipfile timestamps. If the PEXBuilder is not yet frozen, it will be frozen by ``build``. This renders the PEXBuilder immutable. @@ -450,7 +451,7 @@ def build(self, filename, bytecode_compile=True): with open(filename + '~', 'ab') as pexfile: assert os.path.getsize(pexfile.name) == 0 pexfile.write(to_bytes('%s\n' % self._shebang)) - self._chroot.zip(filename + '~', mode='a') + self._chroot.zip(filename + '~', mode='a', deterministic_timestamp=deterministic_timestamp) if os.path.exists(filename): os.unlink(filename) os.rename(filename + '~', filename) From 226c3bcc00f3f2032715485eb73f488c9f315614 Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Thu, 2 May 2019 16:57:26 -0700 Subject: [PATCH 04/31] Approach #2: use writestr() Much better and works accross Py27 and Py3! --- pex/common.py | 106 ++++++++++++++++++-------------------------------- 1 file changed, 38 insertions(+), 68 deletions(-) diff --git a/pex/common.py b/pex/common.py index 1c6051783..5194320be 100644 --- a/pex/common.py +++ b/pex/common.py @@ -118,71 +118,11 @@ def _chmod(self, info, path): attr = info.external_attr >> 16 os.chmod(path, attr) -class DeterministicTimestampZipFile(PermPreservingZipFile): - """A ZipFile that always uses the same timestamp, in addition to preserving permissions.""" - - def write(self, filename, arcname=None, - compress_type=None, compresslevel=None): - """Put the bytes from filename into the archive under the name - arcname. """ - # NB: Everything is copied from https://github.com/python/cpython/blob/master/Lib/zipfile.py, - # excluding lines marked with NB. - if not self.fp: - raise ValueError( - "Attempt to write to ZIP archive that was already closed") - if self._writing: - raise ValueError( - "Can't write to ZIP archive while an open writing handle exists" - ) - - zinfo = zipfile.ZipInfo.from_file(filename, arcname) - # NB: We hardcode the date to be deterministic. - zinfo.date_time = deterministic_datetime().timetuple()[:6] - - if zinfo.is_dir(): - zinfo.compress_size = 0 - zinfo.CRC = 0 - else: - if compress_type is not None: - zinfo.compress_type = compress_type - else: - zinfo.compress_type = self.compression - - if compresslevel is not None: - zinfo._compresslevel = compresslevel - else: - zinfo._compresslevel = self.compresslevel - - if zinfo.is_dir(): - with self._lock: - if self._seekable: - self.fp.seek(self.start_dir) - zinfo.header_offset = self.fp.tell() # Start of header bytes - if zinfo.compress_type == ZIP_LZMA: - # Compressed data includes an end-of-stream (EOS) marker - zinfo.flag_bits |= 0x02 - - self._writecheck(zinfo) - self._didModify = True - - self.filelist.append(zinfo) - self.NameToInfo[zinfo.filename] = zinfo - self.fp.write(zinfo.FileHeader(False)) - self.start_dir = self.fp.tell() - else: - with open(filename, "rb") as src, self.open(zinfo, 'w') as dest: - shutil.copyfileobj(src, dest, 1024*8) - @contextlib.contextmanager -def open_zip(path, *args, deterministic_timestamp=False, **kwargs): +def open_zip(path, *args, **kwargs): """A contextmanager for zip files. Passes through positional and kwargs to zipfile.ZipFile.""" - zf = ( - DeterministicTimestampZipFile(path, *args, **kwargs) - if deterministic_timestamp - else PermPreservingZipFile(path, *args, **kwargs) - ) - with contextlib.closing(zf) as zip: + with contextlib.closing(PermPreservingZipFile(path, *args, **kwargs)) as zip: yield zip @@ -447,10 +387,40 @@ def delete(self): shutil.rmtree(self.chroot) def zip(self, filename, mode='w', deterministic_timestamp=False): - with open_zip(filename, mode, deterministic_timestamp=deterministic_timestamp) as zf: + with open_zip(filename, mode) as zf: for f in sorted(self.files()): - zf.write( - os.path.join(self.chroot, f), - arcname=f, - compress_type=zipfile.ZIP_DEFLATED - ) + full_path = os.path.join(self.chroot, f) + if not deterministic_timestamp: + zf.write(full_path, compress_type=zipfile.ZIP_DEFLATED) + else: + # This code mostly reimplements ZipInfo.from_file(), which is not available in Python + # 2.7. See https://github.com/python/cpython/blob/master/Lib/zipfile.py#L495. + st = os.stat(filename) + isdir = stat.S_ISDIR(st.st_mode) + # Determine arcname, i.e. the archive name. + arcname = f + arcname = os.path.normpath(os.path.splitdrive(arcname)[1]) + while arcname[0] in (os.sep, os.altsep): + arcname = arcname[1:] + if isdir: + arcname += '/' + # Setup ZipInfo with deterministic datetime. + zinfo = zipfile.ZipInfo( + filename=arcname, + # NB: the constructor expects a six tuple of year-month-day-hour-minute-second. + date_time=deterministic_datetime().timetuple()[:6] + ) + zinfo.external_attr = (st.st_mode & 0xFFFF) << 16 + if isdir: + zinfo.file_size = 0 + zinfo.external_attr |= 0x10 # MS-DOS directory flag + else: + zinfo.file_size = st.st_size + # Determine the data to write. + if isdir: + data = b'' + else: + with open(full_path, 'rb') as open_f: + data = open_f.read() + # Write to the archive. + zf.writestr(zinfo, data, compress_type=zipfile.ZIP_DEFLATED) From baa20c4a64181d74c4c963c42c0c159886bb050a Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Thu, 2 May 2019 17:31:15 -0700 Subject: [PATCH 05/31] Add test for deterministic timestamp --- tests/test_pex_builder.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/test_pex_builder.py b/tests/test_pex_builder.py index bc2b91694..853971f79 100644 --- a/tests/test_pex_builder.py +++ b/tests/test_pex_builder.py @@ -1,8 +1,10 @@ # Copyright 2014 Pants project contributors (see CONTRIBUTORS.md). # Licensed under the Apache License, Version 2.0 (see LICENSE). +import datetime import os import stat +import zipfile import pytest @@ -177,3 +179,23 @@ def build_and_check(path, copy): build_and_check(td2, False) build_and_check(td3, True) + + +def test_pex_builder_deterministic_timestamp(): + def assert_all_files_have_timestamp(timestamp): + pb = PEXBuilder() + with temporary_dir() as td: + target = os.path.join(td, 'foo.pex') + pb.build(target, deterministic_timestamp=True) + with zipfile.ZipFile(target) as zf: + assert all(zinfo.date_time == timestamp for zinfo in zf.infolist()) + + assert_all_files_have_timestamp((1980, 1, 1, 0, 0, 0)) + # Also test that $SOURCE_DATE_EPOCH is respected. + requested_date = (2019, 5, 1, 10, 10, 10) + utc_timestamp = int(( + datetime.datetime(*requested_date) - datetime.datetime(1970, 1, 1) + ).total_seconds()) + os.environ["SOURCE_DATE_EPOCH"] = str(utc_timestamp) + assert_all_files_have_timestamp(requested_date) + os.environ.pop("SOURCE_DATE_EPOCH") From a27122c1564d1c7656f215815b94a8c64e0064f8 Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Thu, 2 May 2019 17:32:47 -0700 Subject: [PATCH 06/31] Unksip 6/7 of the acceptance tests! --- tests/test_integration.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/test_integration.py b/tests/test_integration.py index b93121053..9bf2374be 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1375,7 +1375,6 @@ def create_pex(path, seed): assert filecmp.cmp(pex1, pex2, shallow=False) -@pytest.mark.skip("Acceptance test for landing https://github.com/pantsbuild/pex/issues/716.") def test_reproducible_build_no_args(): assert_reproducible_build([]) @@ -1386,17 +1385,14 @@ def test_reproducible_build_bdist_requirements(): assert_reproducible_build(['six==1.12.0', 'cryptography==2.6.1']) -@pytest.mark.skip("Acceptance test for landing https://github.com/pantsbuild/pex/issues/716.") def test_reproducible_build_sdist_requirements(): assert_reproducible_build(['pycparser==2.19', '--no-build']) -@pytest.mark.skip("Acceptance test for landing https://github.com/pantsbuild/pex/issues/716.") def test_reproducible_build_m_flag(): assert_reproducible_build(['-m', 'pydoc']) -@pytest.mark.skip("Acceptance test for landing https://github.com/pantsbuild/pex/issues/716.") def test_reproducible_build_c_flag(): setup_py = dedent(""" from setuptools import setup @@ -1410,11 +1406,9 @@ def test_reproducible_build_c_flag(): assert_reproducible_build([project_dir, '-c', 'my_app']) -@pytest.mark.skip("Acceptance test for landing https://github.com/pantsbuild/pex/issues/716.") def test_reproducible_build_python_flag(): assert_reproducible_build(['--python=python2.7']) -@pytest.mark.skip("Acceptance test for landing https://github.com/pantsbuild/pex/issues/716.") def test_reproducible_build_python_shebang_flag(): assert_reproducible_build(['--python-shebang=/usr/bin/python']) From e9a067a0403386afa15621b984107b6aa51dc74e Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Thu, 2 May 2019 17:36:39 -0700 Subject: [PATCH 07/31] Explain reproducibility --- pex/bin/pex.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pex/bin/pex.py b/pex/bin/pex.py index f731bd5f3..86ae95e20 100755 --- a/pex/bin/pex.py +++ b/pex/bin/pex.py @@ -312,7 +312,9 @@ def configure_clp_pex_options(parser): callback=parse_bool, help='Use the current system time to generate timestamps for the new pex. Otherwise, Pex ' 'will first try to read the time from the environment variable SOURCE_DATE_EPOCH and ' - 'will fallback to midnight on January 1, 1980. TODO: explain reproducibility.') + 'will fallback to midnight on January 1, 1980. By using system time, the generated pex ' + 'will not be reproducible, meaning that if you were to run `./pex -o` with the ' + 'same inputs then the new pex would not be byte-for-byte identical to the original.') parser.add_option_group(group) From 0295ddc6d548c3367621ec0c48064b6ba201dacf Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Thu, 2 May 2019 17:40:58 -0700 Subject: [PATCH 08/31] Setup integration tests to use --no-use-system-time --- tests/test_integration.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_integration.py b/tests/test_integration.py index 9bf2374be..e6c568805 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1351,7 +1351,10 @@ def assert_reproducible_build(args): # from the random seed, such as data structures, as Tox sets this value by default. See # https://tox.readthedocs.io/en/latest/example/basic.html#special-handling-of-pythonhashseed. def create_pex(path, seed): - run_pex_command(args + ['-o', path, '--no-compile'], env=make_env(PYTHONHASHSEED=seed)) + run_pex_command( + args + ['-o', path, '--no-compile', '--no-use-system-time'], + env=make_env(PYTHONHASHSEED=seed) + ) create_pex(pex1, seed=111) # We sleep to ensure that there is no non-reproducibility from timestamps or # anything that may depend on the system time. Note that we must sleep for at least From 80adeff6d3b4c0ab270a3d368ebcfef16e78d55b Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Fri, 3 May 2019 15:29:40 -0700 Subject: [PATCH 09/31] Fix regression to normal case --- pex/common.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pex/common.py b/pex/common.py index 5194320be..77beaf3be 100644 --- a/pex/common.py +++ b/pex/common.py @@ -391,7 +391,11 @@ def zip(self, filename, mode='w', deterministic_timestamp=False): for f in sorted(self.files()): full_path = os.path.join(self.chroot, f) if not deterministic_timestamp: - zf.write(full_path, compress_type=zipfile.ZIP_DEFLATED) + zf.write( + full_path, + arcname=f, + compress_type=zipfile.ZIP_DEFLATED + ) else: # This code mostly reimplements ZipInfo.from_file(), which is not available in Python # 2.7. See https://github.com/python/cpython/blob/master/Lib/zipfile.py#L495. From 5fe4a5938a4d0c20ff1e98fc28acd7371cb557f6 Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Fri, 3 May 2019 15:30:47 -0700 Subject: [PATCH 10/31] Set default to true to ensure we arent breaking anything --- pex/bin/pex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pex/bin/pex.py b/pex/bin/pex.py index 86ae95e20..da45ffa5f 100755 --- a/pex/bin/pex.py +++ b/pex/bin/pex.py @@ -307,7 +307,7 @@ def configure_clp_pex_options(parser): dest='use_system_time', # TODO: set this to True. It's only False for now to test what happens in CI if we # always use deterministic timestamps. - default=False, + default=True, action='callback', callback=parse_bool, help='Use the current system time to generate timestamps for the new pex. Otherwise, Pex ' From 829d12d37a39c8d6e6de93721ff32e5780c9d1f9 Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Fri, 3 May 2019 15:34:39 -0700 Subject: [PATCH 11/31] Test deterministic timestamp against all CI --- pex/bin/pex.py | 2 +- pex/common.py | 2 +- pex/pex_builder.py | 2 +- tests/test_common.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pex/bin/pex.py b/pex/bin/pex.py index da45ffa5f..86ae95e20 100755 --- a/pex/bin/pex.py +++ b/pex/bin/pex.py @@ -307,7 +307,7 @@ def configure_clp_pex_options(parser): dest='use_system_time', # TODO: set this to True. It's only False for now to test what happens in CI if we # always use deterministic timestamps. - default=True, + default=False, action='callback', callback=parse_bool, help='Use the current system time to generate timestamps for the new pex. Otherwise, Pex ' diff --git a/pex/common.py b/pex/common.py index 77beaf3be..ed29890a5 100644 --- a/pex/common.py +++ b/pex/common.py @@ -386,7 +386,7 @@ def __str__(self): def delete(self): shutil.rmtree(self.chroot) - def zip(self, filename, mode='w', deterministic_timestamp=False): + def zip(self, filename, mode='w', deterministic_timestamp=True): with open_zip(filename, mode) as zf: for f in sorted(self.files()): full_path = os.path.join(self.chroot, f) diff --git a/pex/pex_builder.py b/pex/pex_builder.py index 0439fae59..a5ab69506 100644 --- a/pex/pex_builder.py +++ b/pex/pex_builder.py @@ -428,7 +428,7 @@ def freeze(self, bytecode_compile=True): self._precompile_source() self._frozen = True - def build(self, filename, bytecode_compile=True, deterministic_timestamp=False): + def build(self, filename, bytecode_compile=True, deterministic_timestamp=True): """Package the PEX into a zipfile. :param filename: The filename where the PEX should be stored. diff --git a/tests/test_common.py b/tests/test_common.py index c3ac69041..23c0673c6 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -105,7 +105,7 @@ def assert_chroot_perms(copyfn): assert extract_perms(two) == extract_perms(os.path.join(chroot.path(), 'two')) zip_path = os.path.join(src, 'chroot.zip') - chroot.zip(zip_path) + chroot.zip(zip_path, deterministic_timestamp=True) with temporary_dir() as extract_dir: with contextlib.closing(PermPreservingZipFile(zip_path)) as zf: zf.extractall(extract_dir) From fdc1dc0ff93bb64019fd8d3f0683efefeb8e91bd Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Fri, 3 May 2019 15:38:16 -0700 Subject: [PATCH 12/31] Skip acceptance tests still failing --- tests/test_integration.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_integration.py b/tests/test_integration.py index e6c568805..d994253e2 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1388,6 +1388,7 @@ def test_reproducible_build_bdist_requirements(): assert_reproducible_build(['six==1.12.0', 'cryptography==2.6.1']) +@pytest.mark.skip("Acceptance test for landing https://github.com/pantsbuild/pex/issues/716.") def test_reproducible_build_sdist_requirements(): assert_reproducible_build(['pycparser==2.19', '--no-build']) @@ -1396,6 +1397,7 @@ def test_reproducible_build_m_flag(): assert_reproducible_build(['-m', 'pydoc']) +@pytest.mark.skip("Acceptance test for landing https://github.com/pantsbuild/pex/issues/716.") def test_reproducible_build_c_flag(): setup_py = dedent(""" from setuptools import setup From 80b5008d262b96dee61a1c0e2223815905598640 Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Fri, 3 May 2019 15:51:42 -0700 Subject: [PATCH 13/31] Fix typo in link --- pex/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pex/common.py b/pex/common.py index ed29890a5..086cf08b1 100644 --- a/pex/common.py +++ b/pex/common.py @@ -24,7 +24,7 @@ def deterministic_datetime(): First try to read from the standard $SOURCE_DATE_EPOCH, then default to midnight January 1, 1980, which is the start of time for MS-DOS time. Per section 4.4.6 of the zip spec at - https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TX, Zipfiles use MS-DOS time. So, we + https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT, Zipfiles use MS-DOS time. So, we use this default to ensure no issues with Zipfiles. Even though we use MS-DOS to inform the default value, note that we still return a normal UTC From 794084ec9cf1ca1b077a560c5fcfd946d4e6a918 Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Fri, 3 May 2019 16:05:35 -0700 Subject: [PATCH 14/31] Fix bug with permissions not copying Was accidently using the zip's overall filename to determine permissions, rather than the actual files themselves. --- pex/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pex/common.py b/pex/common.py index 086cf08b1..f5aafc568 100644 --- a/pex/common.py +++ b/pex/common.py @@ -399,7 +399,7 @@ def zip(self, filename, mode='w', deterministic_timestamp=True): else: # This code mostly reimplements ZipInfo.from_file(), which is not available in Python # 2.7. See https://github.com/python/cpython/blob/master/Lib/zipfile.py#L495. - st = os.stat(filename) + st = os.stat(full_path) isdir = stat.S_ISDIR(st.st_mode) # Determine arcname, i.e. the archive name. arcname = f From 952cf9ee3761620bee9e36dd13c73a2f8b00c345 Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Fri, 3 May 2019 16:10:34 -0700 Subject: [PATCH 15/31] Revert "Test deterministic timestamp against all CI" This reverts commit 829d12d37a39c8d6e6de93721ff32e5780c9d1f9. --- pex/bin/pex.py | 4 +--- pex/common.py | 2 +- pex/pex_builder.py | 2 +- tests/test_common.py | 2 +- 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/pex/bin/pex.py b/pex/bin/pex.py index 86ae95e20..d7737a2f6 100755 --- a/pex/bin/pex.py +++ b/pex/bin/pex.py @@ -305,9 +305,7 @@ def configure_clp_pex_options(parser): group.add_option( '--use-system-time', '--no-use-system-time', dest='use_system_time', - # TODO: set this to True. It's only False for now to test what happens in CI if we - # always use deterministic timestamps. - default=False, + default=True, action='callback', callback=parse_bool, help='Use the current system time to generate timestamps for the new pex. Otherwise, Pex ' diff --git a/pex/common.py b/pex/common.py index f5aafc568..5270dd7c6 100644 --- a/pex/common.py +++ b/pex/common.py @@ -386,7 +386,7 @@ def __str__(self): def delete(self): shutil.rmtree(self.chroot) - def zip(self, filename, mode='w', deterministic_timestamp=True): + def zip(self, filename, mode='w', deterministic_timestamp=False): with open_zip(filename, mode) as zf: for f in sorted(self.files()): full_path = os.path.join(self.chroot, f) diff --git a/pex/pex_builder.py b/pex/pex_builder.py index a5ab69506..0439fae59 100644 --- a/pex/pex_builder.py +++ b/pex/pex_builder.py @@ -428,7 +428,7 @@ def freeze(self, bytecode_compile=True): self._precompile_source() self._frozen = True - def build(self, filename, bytecode_compile=True, deterministic_timestamp=True): + def build(self, filename, bytecode_compile=True, deterministic_timestamp=False): """Package the PEX into a zipfile. :param filename: The filename where the PEX should be stored. diff --git a/tests/test_common.py b/tests/test_common.py index 23c0673c6..c3ac69041 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -105,7 +105,7 @@ def assert_chroot_perms(copyfn): assert extract_perms(two) == extract_perms(os.path.join(chroot.path(), 'two')) zip_path = os.path.join(src, 'chroot.zip') - chroot.zip(zip_path, deterministic_timestamp=True) + chroot.zip(zip_path) with temporary_dir() as extract_dir: with contextlib.closing(PermPreservingZipFile(zip_path)) as zf: zf.extractall(extract_dir) From a7b8558d5b79f12e70be6a64af461760f0985cb2 Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Fri, 3 May 2019 16:18:46 -0700 Subject: [PATCH 16/31] Ensure env gets cleaned up, even if test fails --- tests/test_pex_builder.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_pex_builder.py b/tests/test_pex_builder.py index 853971f79..3cf6a8f4c 100644 --- a/tests/test_pex_builder.py +++ b/tests/test_pex_builder.py @@ -197,5 +197,7 @@ def assert_all_files_have_timestamp(timestamp): datetime.datetime(*requested_date) - datetime.datetime(1970, 1, 1) ).total_seconds()) os.environ["SOURCE_DATE_EPOCH"] = str(utc_timestamp) - assert_all_files_have_timestamp(requested_date) - os.environ.pop("SOURCE_DATE_EPOCH") + try: + assert_all_files_have_timestamp(requested_date) + finally: + os.environ.pop("SOURCE_DATE_EPOCH") From f7364de731d90a72e4185990cda84f0324a96162 Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Sun, 5 May 2019 10:20:11 -0700 Subject: [PATCH 17/31] Convert deterministic_datetime from a function to a constant We use the result many times, and it is not expected to ever change between a pex run, so this is better set to be a property. We eagerly evaluate the constant for simplicity of implementation. Because the calculation is so cheap, it is not necessary to lazily evaluate. --- pex/common.py | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/pex/common.py b/pex/common.py index 5270dd7c6..6b064a879 100644 --- a/pex/common.py +++ b/pex/common.py @@ -19,26 +19,21 @@ from uuid import uuid4 -def deterministic_datetime(): - """Return a deterministic UTC datetime object that does not depend on the system time. - - First try to read from the standard $SOURCE_DATE_EPOCH, then default to midnight January 1, - 1980, which is the start of time for MS-DOS time. Per section 4.4.6 of the zip spec at - https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT, Zipfiles use MS-DOS time. So, we - use this default to ensure no issues with Zipfiles. - - Even though we use MS-DOS to inform the default value, note that we still return a normal UTC - datetime object for flexibility with how the result is used. - - For more information on $SOURCE_DATE_EPOCH, refer to - https://reproducible-builds.org/docs/source-date-epoch/.""" - return ( - datetime.utcfromtimestamp(int(os.environ['SOURCE_DATE_EPOCH'])) - if "SOURCE_DATE_EPOCH" in os.environ - else datetime( - year=1980, month=1, day=1, hour=0, minute=0, second=0, microsecond=0, tzinfo=None - ) +# We resolve a deterministic UTC datetime object that does not depend on the system time. +# * First try to read from the standard $SOURCE_DATE_EPOCH. See +# https://reproducible-builds.org/docs/source-date-epoch/. +# * Fallback to midnight January 1, 1980, which is the start of MS-DOS time. Per section +# 4.4.6 of the zip spec at https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT, +# Zipfiles use MS-DOS time. +# * Even though we use MS-DOS to inform the default value, note that we still return a normal +# UTC datetime object for flexibility with how the result is used. +DETERMINISTIC_DATETIME = ( + datetime.utcfromtimestamp(int(os.environ['SOURCE_DATE_EPOCH'])) + if "SOURCE_DATE_EPOCH" in os.environ + else datetime( + year=1980, month=1, day=1, hour=0, minute=0, second=0, microsecond=0, tzinfo=None ) +) def die(msg, exit_code=1): @@ -412,7 +407,7 @@ def zip(self, filename, mode='w', deterministic_timestamp=False): zinfo = zipfile.ZipInfo( filename=arcname, # NB: the constructor expects a six tuple of year-month-day-hour-minute-second. - date_time=deterministic_datetime().timetuple()[:6] + date_time=DETERMINISTIC_DATETIME.timetuple()[:6] ) zinfo.external_attr = (st.st_mode & 0xFFFF) << 16 if isdir: From 0835f606ea678e065e4c8dcfd163a7ffda58dfd9 Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Sun, 5 May 2019 12:25:55 -0700 Subject: [PATCH 18/31] Revert making deterministic_datetime a constant If it's a constant, theen it becomes very difficult to test that $SOURCE_DATE_EPOCH works because constants get evaluated at import time, and at import time os.environ does not have the env var set. It will ignore the test setting the value, because the constant never gets re-evaluated. --- pex/common.py | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/pex/common.py b/pex/common.py index 6b064a879..79378b0fb 100644 --- a/pex/common.py +++ b/pex/common.py @@ -19,21 +19,25 @@ from uuid import uuid4 -# We resolve a deterministic UTC datetime object that does not depend on the system time. -# * First try to read from the standard $SOURCE_DATE_EPOCH. See -# https://reproducible-builds.org/docs/source-date-epoch/. -# * Fallback to midnight January 1, 1980, which is the start of MS-DOS time. Per section -# 4.4.6 of the zip spec at https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT, -# Zipfiles use MS-DOS time. -# * Even though we use MS-DOS to inform the default value, note that we still return a normal -# UTC datetime object for flexibility with how the result is used. -DETERMINISTIC_DATETIME = ( - datetime.utcfromtimestamp(int(os.environ['SOURCE_DATE_EPOCH'])) - if "SOURCE_DATE_EPOCH" in os.environ - else datetime( - year=1980, month=1, day=1, hour=0, minute=0, second=0, microsecond=0, tzinfo=None +def deterministic_datetime(): + """Resolve a deterministic UTC datetime object that does not depend on the system time. + + First try to read from the standard $SOURCE_DATE_EPOCH. See + https://reproducible-builds.org/docs/source-date-epoch/. + + Fallback to midnight January 1, 1980, which is the start of MS-DOS time. Per section 4.4.6 of + the zip spec at https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT, Zipfiles use + MS-DOS time. + + Even though we use MS-DOS to inform the default value, note that we still return a normal + UTC datetime object for flexibility with how the result is used.""" + return ( + datetime.utcfromtimestamp(int(os.environ['SOURCE_DATE_EPOCH'])) + if "SOURCE_DATE_EPOCH" in os.environ + else datetime( + year=1980, month=1, day=1, hour=0, minute=0, second=0, microsecond=0, tzinfo=None + ) ) -) def die(msg, exit_code=1): @@ -407,7 +411,7 @@ def zip(self, filename, mode='w', deterministic_timestamp=False): zinfo = zipfile.ZipInfo( filename=arcname, # NB: the constructor expects a six tuple of year-month-day-hour-minute-second. - date_time=DETERMINISTIC_DATETIME.timetuple()[:6] + date_time=deterministic_datetime().timetuple()[:6] ) zinfo.external_attr = (st.st_mode & 0xFFFF) << 16 if isdir: From 0c9bd41cf9b082539fa5ef8302d9d8b7ddba8c7d Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Mon, 6 May 2019 11:05:30 -0700 Subject: [PATCH 19/31] No longer respect SOURCE_DATE_EPOCH We have no compelling use case for it and it complicates our code. --- pex/bin/pex.py | 1 - pex/common.py | 32 ++++++++++++-------------------- tests/test_pex_builder.py | 26 ++++++-------------------- 3 files changed, 18 insertions(+), 41 deletions(-) diff --git a/pex/bin/pex.py b/pex/bin/pex.py index d7737a2f6..5f55d3858 100755 --- a/pex/bin/pex.py +++ b/pex/bin/pex.py @@ -309,7 +309,6 @@ def configure_clp_pex_options(parser): action='callback', callback=parse_bool, help='Use the current system time to generate timestamps for the new pex. Otherwise, Pex ' - 'will first try to read the time from the environment variable SOURCE_DATE_EPOCH and ' 'will fallback to midnight on January 1, 1980. By using system time, the generated pex ' 'will not be reproducible, meaning that if you were to run `./pex -o` with the ' 'same inputs then the new pex would not be byte-for-byte identical to the original.') diff --git a/pex/common.py b/pex/common.py index 79378b0fb..766dd81e8 100644 --- a/pex/common.py +++ b/pex/common.py @@ -19,25 +19,17 @@ from uuid import uuid4 -def deterministic_datetime(): - """Resolve a deterministic UTC datetime object that does not depend on the system time. - - First try to read from the standard $SOURCE_DATE_EPOCH. See - https://reproducible-builds.org/docs/source-date-epoch/. - - Fallback to midnight January 1, 1980, which is the start of MS-DOS time. Per section 4.4.6 of - the zip spec at https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT, Zipfiles use - MS-DOS time. - - Even though we use MS-DOS to inform the default value, note that we still return a normal - UTC datetime object for flexibility with how the result is used.""" - return ( - datetime.utcfromtimestamp(int(os.environ['SOURCE_DATE_EPOCH'])) - if "SOURCE_DATE_EPOCH" in os.environ - else datetime( - year=1980, month=1, day=1, hour=0, minute=0, second=0, microsecond=0, tzinfo=None - ) - ) +# We hardcode the datetime to January 1, 1980, which is the start of MS-DOS time. Per section +# 4.4.6 of the zip spec at https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT, +# Zipfiles use MS-DOS time. +# NB: we do not respect the standard env var $SOURCE_DATE_EPOCH often used by tools for +# deterministic timestamps, as the intended use case is security by checking if the binary was +# tampered with; we do not have a compelling use case for respecing this env var, and it only +# complicates the code, but this decision can be revisited if a user has a compelling use +# case for it. +DETERMINISTIC_DATETIME = datetime( + year=1980, month=1, day=1, hour=0, minute=0, second=0, microsecond=0, tzinfo=None +) def die(msg, exit_code=1): @@ -411,7 +403,7 @@ def zip(self, filename, mode='w', deterministic_timestamp=False): zinfo = zipfile.ZipInfo( filename=arcname, # NB: the constructor expects a six tuple of year-month-day-hour-minute-second. - date_time=deterministic_datetime().timetuple()[:6] + date_time=DETERMINISTIC_DATETIME.timetuple()[:6] ) zinfo.external_attr = (st.st_mode & 0xFFFF) << 16 if isdir: diff --git a/tests/test_pex_builder.py b/tests/test_pex_builder.py index 3cf6a8f4c..19a583bc9 100644 --- a/tests/test_pex_builder.py +++ b/tests/test_pex_builder.py @@ -1,7 +1,6 @@ # Copyright 2014 Pants project contributors (see CONTRIBUTORS.md). # Licensed under the Apache License, Version 2.0 (see LICENSE). -import datetime import os import stat import zipfile @@ -182,22 +181,9 @@ def build_and_check(path, copy): def test_pex_builder_deterministic_timestamp(): - def assert_all_files_have_timestamp(timestamp): - pb = PEXBuilder() - with temporary_dir() as td: - target = os.path.join(td, 'foo.pex') - pb.build(target, deterministic_timestamp=True) - with zipfile.ZipFile(target) as zf: - assert all(zinfo.date_time == timestamp for zinfo in zf.infolist()) - - assert_all_files_have_timestamp((1980, 1, 1, 0, 0, 0)) - # Also test that $SOURCE_DATE_EPOCH is respected. - requested_date = (2019, 5, 1, 10, 10, 10) - utc_timestamp = int(( - datetime.datetime(*requested_date) - datetime.datetime(1970, 1, 1) - ).total_seconds()) - os.environ["SOURCE_DATE_EPOCH"] = str(utc_timestamp) - try: - assert_all_files_have_timestamp(requested_date) - finally: - os.environ.pop("SOURCE_DATE_EPOCH") + pb = PEXBuilder() + with temporary_dir() as td: + target = os.path.join(td, 'foo.pex') + pb.build(target, deterministic_timestamp=True) + with zipfile.ZipFile(target) as zf: + assert all(zinfo.date_time == (1980, 1, 1, 0, 0, 0) for zinfo in zf.infolist()) From 9774c2a820306d20bffd9cca901873a0b1dcba60 Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Mon, 6 May 2019 11:25:08 -0700 Subject: [PATCH 20/31] Improve comments --- pex/bin/pex.py | 2 +- pex/common.py | 10 +++++----- pex/pex_builder.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pex/bin/pex.py b/pex/bin/pex.py index 5f55d3858..5697605b7 100755 --- a/pex/bin/pex.py +++ b/pex/bin/pex.py @@ -309,7 +309,7 @@ def configure_clp_pex_options(parser): action='callback', callback=parse_bool, help='Use the current system time to generate timestamps for the new pex. Otherwise, Pex ' - 'will fallback to midnight on January 1, 1980. By using system time, the generated pex ' + 'will use midnight on January 1, 1980. By using system time, the generated pex ' 'will not be reproducible, meaning that if you were to run `./pex -o` with the ' 'same inputs then the new pex would not be byte-for-byte identical to the original.') diff --git a/pex/common.py b/pex/common.py index 766dd81e8..4de704285 100644 --- a/pex/common.py +++ b/pex/common.py @@ -22,11 +22,11 @@ # We hardcode the datetime to January 1, 1980, which is the start of MS-DOS time. Per section # 4.4.6 of the zip spec at https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT, # Zipfiles use MS-DOS time. -# NB: we do not respect the standard env var $SOURCE_DATE_EPOCH often used by tools for -# deterministic timestamps, as the intended use case is security by checking if the binary was -# tampered with; we do not have a compelling use case for respecing this env var, and it only -# complicates the code, but this decision can be revisited if a user has a compelling use -# case for it. +# NB: We do not respect the standard env var $SOURCE_DATE_EPOCH often used by tools for +# deterministic timestamps, as the intended use case is for security by checking if the binary +# was tampered with. We do not have a compelling use case for respecting this env var, and it +# only complicates the code, so we do not use it. However, this decision can be revisited if a +# user has a compelling use case for it. DETERMINISTIC_DATETIME = datetime( year=1980, month=1, day=1, hour=0, minute=0, second=0, microsecond=0, tzinfo=None ) diff --git a/pex/pex_builder.py b/pex/pex_builder.py index 0439fae59..0e6ba3a8a 100644 --- a/pex/pex_builder.py +++ b/pex/pex_builder.py @@ -433,7 +433,7 @@ def build(self, filename, bytecode_compile=True, deterministic_timestamp=False): :param filename: The filename where the PEX should be stored. :param bytecode_compile: If True, precompile .py files into .pyc files. - :param deterministic_timestamp: If True, will not use system time for the zipfile timestamps. + :param deterministic_timestamp: If True, will use our hardcoded time for zipfile timestamps. If the PEXBuilder is not yet frozen, it will be frozen by ``build``. This renders the PEXBuilder immutable. From 477e2ebbc73e1954a40ea22a1f0a7c96a8c6c3ea Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Mon, 6 May 2019 11:29:43 -0700 Subject: [PATCH 21/31] Change rationale for not respecting SOURCE_DATE_EPOCH --- pex/common.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pex/common.py b/pex/common.py index 4de704285..7a78632be 100644 --- a/pex/common.py +++ b/pex/common.py @@ -23,10 +23,12 @@ # 4.4.6 of the zip spec at https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT, # Zipfiles use MS-DOS time. # NB: We do not respect the standard env var $SOURCE_DATE_EPOCH often used by tools for -# deterministic timestamps, as the intended use case is for security by checking if the binary -# was tampered with. We do not have a compelling use case for respecting this env var, and it -# only complicates the code, so we do not use it. However, this decision can be revisited if a -# user has a compelling use case for it. +# deterministic timestamps, as doing so risks reducing the reproducibility of built pexes across +# different platforms, e.g. if two platforms set the env var differently. Usually, this is +# supposed to be set for the sake of security to check that a shipped binary was not tampered +# with, but that is not our primary use case, so we do not respect it both for simplicity of +# our codebase and to avoid this potential reduction in reproducibility. Refer to +# https://reproducible-builds.org/docs/source-date-epoch/ for more information. DETERMINISTIC_DATETIME = datetime( year=1980, month=1, day=1, hour=0, minute=0, second=0, microsecond=0, tzinfo=None ) From 631b3ed99810d0c946d757a3dd294d42d7d40274 Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Tue, 7 May 2019 15:25:25 -0700 Subject: [PATCH 22/31] Drop overly verbose comment about SOURCE_DATE_EPOCH We don't need to explain this here because readers of the code are unlikely to anticipate it being an issue, so are likely to get distracted more than anything. Instead, this PR description is updated to make the explanation if anyone is wondering "wait why didn't they do this?" Also make the MS-DOS explanation more concise. --- pex/common.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/pex/common.py b/pex/common.py index 7a78632be..4891e1a3e 100644 --- a/pex/common.py +++ b/pex/common.py @@ -19,16 +19,8 @@ from uuid import uuid4 -# We hardcode the datetime to January 1, 1980, which is the start of MS-DOS time. Per section -# 4.4.6 of the zip spec at https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT, -# Zipfiles use MS-DOS time. -# NB: We do not respect the standard env var $SOURCE_DATE_EPOCH often used by tools for -# deterministic timestamps, as doing so risks reducing the reproducibility of built pexes across -# different platforms, e.g. if two platforms set the env var differently. Usually, this is -# supposed to be set for the sake of security to check that a shipped binary was not tampered -# with, but that is not our primary use case, so we do not respect it both for simplicity of -# our codebase and to avoid this potential reduction in reproducibility. Refer to -# https://reproducible-builds.org/docs/source-date-epoch/ for more information. +# We use the start of MS-DOS time, which is what zipfiles use (see section 4.4.6 of +# https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT). DETERMINISTIC_DATETIME = datetime( year=1980, month=1, day=1, hour=0, minute=0, second=0, microsecond=0, tzinfo=None ) From 69106c051ad9ccfe86e2f6561a1ebda1fc70c595 Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Tue, 7 May 2019 15:42:31 -0700 Subject: [PATCH 23/31] Create new zip_info_from_file() function as part of PermPreservingZipFile --- pex/common.py | 62 +++++++++++++++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 27 deletions(-) diff --git a/pex/common.py b/pex/common.py index 4891e1a3e..e80d86315 100644 --- a/pex/common.py +++ b/pex/common.py @@ -90,6 +90,35 @@ def teardown(self): class PermPreservingZipFile(zipfile.ZipFile, object): """A ZipFile that works around https://bugs.python.org/issue15795""" + @classmethod + def zip_info_from_file(cls, filename, arcname=None, date_time=None): + """Construct a ZipInfo for a file on the filesystem. + + Usually this is provided directly as a method of ZipInfo, but it is not implemented in Python + 2.7 so we re-implement it here. The main divergance we make from the original is adding a + parameter for the datetime (a six-tuple), which allows us to use a deterministic timestamp. + See https://github.com/python/cpython/blob/master/Lib/zipfile.py#L495.""" + st = os.stat(filename) + isdir = stat.S_ISDIR(st.st_mode) + if arcname is None: + arcname = filename + arcname = os.path.normpath(os.path.splitdrive(arcname)[1]) + while arcname[0] in (os.sep, os.altsep): + arcname = arcname[1:] + if isdir: + arcname += '/' + if date_time is None: + mtime = time.localtime(st.st_mtime) + date_time = mtime[0:6] + zinfo = zipfile.ZipInfo(filename=arcname, date_time=date_time) + zinfo.external_attr = (st.st_mode & 0xFFFF) << 16 # Unix attributes + if isdir: + zinfo.file_size = 0 + zinfo.external_attr |= 0x10 # MS-DOS directory flag + else: + zinfo.file_size = st.st_size + return zinfo + def _extract_member(self, member, targetpath, pwd): result = super(PermPreservingZipFile, self)._extract_member(member, targetpath, pwd) info = member if isinstance(member, zipfile.ZipInfo) else self.getinfo(member) @@ -104,6 +133,7 @@ def _chmod(self, info, path): os.chmod(path, attr) + @contextlib.contextmanager def open_zip(path, *args, **kwargs): """A contextmanager for zip files. Passes through positional and kwargs to zipfile.ZipFile.""" @@ -382,34 +412,12 @@ def zip(self, filename, mode='w', deterministic_timestamp=False): compress_type=zipfile.ZIP_DEFLATED ) else: - # This code mostly reimplements ZipInfo.from_file(), which is not available in Python - # 2.7. See https://github.com/python/cpython/blob/master/Lib/zipfile.py#L495. - st = os.stat(full_path) - isdir = stat.S_ISDIR(st.st_mode) - # Determine arcname, i.e. the archive name. - arcname = f - arcname = os.path.normpath(os.path.splitdrive(arcname)[1]) - while arcname[0] in (os.sep, os.altsep): - arcname = arcname[1:] - if isdir: - arcname += '/' - # Setup ZipInfo with deterministic datetime. - zinfo = zipfile.ZipInfo( - filename=arcname, + zinfo = zf.zip_info_from_file( + filename=full_path, + arcname=f, # NB: the constructor expects a six tuple of year-month-day-hour-minute-second. date_time=DETERMINISTIC_DATETIME.timetuple()[:6] ) - zinfo.external_attr = (st.st_mode & 0xFFFF) << 16 - if isdir: - zinfo.file_size = 0 - zinfo.external_attr |= 0x10 # MS-DOS directory flag - else: - zinfo.file_size = st.st_size - # Determine the data to write. - if isdir: - data = b'' - else: - with open(full_path, 'rb') as open_f: - data = open_f.read() - # Write to the archive. + with open(full_path, 'rb') as open_f: + data = open_f.read() zf.writestr(zinfo, data, compress_type=zipfile.ZIP_DEFLATED) From d82cd99cef58d2d07181a163572fba9b27e4f393 Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Tue, 7 May 2019 15:48:35 -0700 Subject: [PATCH 24/31] Always use zf.writestr() Makes the code more consistent and simpler. --- pex/common.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/pex/common.py b/pex/common.py index e80d86315..bfbb980a3 100644 --- a/pex/common.py +++ b/pex/common.py @@ -405,19 +405,13 @@ def zip(self, filename, mode='w', deterministic_timestamp=False): with open_zip(filename, mode) as zf: for f in sorted(self.files()): full_path = os.path.join(self.chroot, f) - if not deterministic_timestamp: - zf.write( - full_path, - arcname=f, - compress_type=zipfile.ZIP_DEFLATED - ) - else: - zinfo = zf.zip_info_from_file( + zinfo = zf.zip_info_from_file( filename=full_path, arcname=f, - # NB: the constructor expects a six tuple of year-month-day-hour-minute-second. - date_time=DETERMINISTIC_DATETIME.timetuple()[:6] - ) - with open(full_path, 'rb') as open_f: - data = open_f.read() - zf.writestr(zinfo, data, compress_type=zipfile.ZIP_DEFLATED) + date_time=( + None if not deterministic_timestamp else DETERMINISTIC_DATETIME.timetuple()[:6] + ) + ) + with open(full_path, 'rb') as open_f: + data = open_f.read() + zf.writestr(zinfo, data, compress_type=zipfile.ZIP_DEFLATED) From b9d36e8e0d1e5be5b5cdf8b5984a174b3da2d940 Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Tue, 7 May 2019 15:51:23 -0700 Subject: [PATCH 25/31] Remove bad extra whitespace --- pex/common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pex/common.py b/pex/common.py index bfbb980a3..1dc4fde9c 100644 --- a/pex/common.py +++ b/pex/common.py @@ -133,7 +133,6 @@ def _chmod(self, info, path): os.chmod(path, attr) - @contextlib.contextmanager def open_zip(path, *args, **kwargs): """A contextmanager for zip files. Passes through positional and kwargs to zipfile.ZipFile.""" From ebe15c256d4c44577c95fa2170187f15fb182cea Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Tue, 7 May 2019 16:49:50 -0700 Subject: [PATCH 26/31] Store DETERMINISTIC_DATETIME as a 6-tuple rather than datetime We have no need for the more general datetime format yet. Don't make things more general until necessary! --- pex/common.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pex/common.py b/pex/common.py index 1dc4fde9c..655419513 100644 --- a/pex/common.py +++ b/pex/common.py @@ -20,10 +20,11 @@ # We use the start of MS-DOS time, which is what zipfiles use (see section 4.4.6 of -# https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT). +# https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT). ZipInfo expects a 6-tuple of +# year-month-day-hour-minute-second, which we use here. DETERMINISTIC_DATETIME = datetime( - year=1980, month=1, day=1, hour=0, minute=0, second=0, microsecond=0, tzinfo=None -) + year=1980, month=1, day=1, hour=0, minute=0, second=0, tzinfo=None +).timetuple[:6] def die(msg, exit_code=1): @@ -407,9 +408,7 @@ def zip(self, filename, mode='w', deterministic_timestamp=False): zinfo = zf.zip_info_from_file( filename=full_path, arcname=f, - date_time=( - None if not deterministic_timestamp else DETERMINISTIC_DATETIME.timetuple()[:6] - ) + date_time=None if not deterministic_timestamp else DETERMINISTIC_DATETIME ) with open(full_path, 'rb') as open_f: data = open_f.read() From 099f14663644c6900206125a364f8e09dd2342bb Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Tue, 7 May 2019 16:53:25 -0700 Subject: [PATCH 27/31] Invert ternary expression --- pex/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pex/common.py b/pex/common.py index 655419513..7c5e9d0d6 100644 --- a/pex/common.py +++ b/pex/common.py @@ -408,7 +408,7 @@ def zip(self, filename, mode='w', deterministic_timestamp=False): zinfo = zf.zip_info_from_file( filename=full_path, arcname=f, - date_time=None if not deterministic_timestamp else DETERMINISTIC_DATETIME + date_time=DETERMINISTIC_DATETIME if deterministic_timestamp else None ) with open(full_path, 'rb') as open_f: data = open_f.read() From 23c5a6fc86f5b2d3cb6302a9eb6c293db4f06c50 Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Tue, 7 May 2019 16:54:29 -0700 Subject: [PATCH 28/31] Oops, typo in constant My bad for not testing before pushing. --- pex/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pex/common.py b/pex/common.py index 7c5e9d0d6..1fc7ec2bc 100644 --- a/pex/common.py +++ b/pex/common.py @@ -24,7 +24,7 @@ # year-month-day-hour-minute-second, which we use here. DETERMINISTIC_DATETIME = datetime( year=1980, month=1, day=1, hour=0, minute=0, second=0, tzinfo=None -).timetuple[:6] +).timetuple()[:6] def die(msg, exit_code=1): From 8bb8ed493db2e18165dde24d6739d9845354b03b Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Tue, 7 May 2019 17:28:29 -0700 Subject: [PATCH 29/31] Deduplicate slicing the time.struct_time --- pex/common.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/pex/common.py b/pex/common.py index 1fc7ec2bc..8ceb56ea2 100644 --- a/pex/common.py +++ b/pex/common.py @@ -20,11 +20,11 @@ # We use the start of MS-DOS time, which is what zipfiles use (see section 4.4.6 of -# https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT). ZipInfo expects a 6-tuple of -# year-month-day-hour-minute-second, which we use here. +# https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT). Note that ZipInfo expects a +# time.struct_time, which we set here. DETERMINISTIC_DATETIME = datetime( year=1980, month=1, day=1, hour=0, minute=0, second=0, tzinfo=None -).timetuple()[:6] +).timetuple() def die(msg, exit_code=1): @@ -97,8 +97,8 @@ def zip_info_from_file(cls, filename, arcname=None, date_time=None): Usually this is provided directly as a method of ZipInfo, but it is not implemented in Python 2.7 so we re-implement it here. The main divergance we make from the original is adding a - parameter for the datetime (a six-tuple), which allows us to use a deterministic timestamp. - See https://github.com/python/cpython/blob/master/Lib/zipfile.py#L495.""" + parameter for the datetime (a time.struct_time), which allows us to use a deterministic + timestamp. See https://github.com/python/cpython/blob/master/Lib/zipfile.py#L495.""" st = os.stat(filename) isdir = stat.S_ISDIR(st.st_mode) if arcname is None: @@ -109,9 +109,8 @@ def zip_info_from_file(cls, filename, arcname=None, date_time=None): if isdir: arcname += '/' if date_time is None: - mtime = time.localtime(st.st_mtime) - date_time = mtime[0:6] - zinfo = zipfile.ZipInfo(filename=arcname, date_time=date_time) + datetime = time.localtime(st.st_mtime) + zinfo = zipfile.ZipInfo(filename=arcname, date_time=date_time[:6]) zinfo.external_attr = (st.st_mode & 0xFFFF) << 16 # Unix attributes if isdir: zinfo.file_size = 0 From a528eb4fc212f5188be48a29822aa3594e28dfeb Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Tue, 7 May 2019 17:49:22 -0700 Subject: [PATCH 30/31] Don't lie about expected input --- pex/common.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pex/common.py b/pex/common.py index 8ceb56ea2..ef3409a9c 100644 --- a/pex/common.py +++ b/pex/common.py @@ -20,11 +20,10 @@ # We use the start of MS-DOS time, which is what zipfiles use (see section 4.4.6 of -# https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT). Note that ZipInfo expects a -# time.struct_time, which we set here. +# https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT). DETERMINISTIC_DATETIME = datetime( year=1980, month=1, day=1, hour=0, minute=0, second=0, tzinfo=None -).timetuple() +) def die(msg, exit_code=1): @@ -407,7 +406,7 @@ def zip(self, filename, mode='w', deterministic_timestamp=False): zinfo = zf.zip_info_from_file( filename=full_path, arcname=f, - date_time=DETERMINISTIC_DATETIME if deterministic_timestamp else None + date_time=DETERMINISTIC_DATETIME.timetuple() if deterministic_timestamp else None ) with open(full_path, 'rb') as open_f: data = open_f.read() From adf272654ef7cdf553b113a2f30e47ac48994b16 Mon Sep 17 00:00:00 2001 From: Eric Arellano <ericarellano@me.com> Date: Tue, 7 May 2019 17:50:55 -0700 Subject: [PATCH 31/31] Fix typo that was causing failures --- pex/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pex/common.py b/pex/common.py index ef3409a9c..3daca1009 100644 --- a/pex/common.py +++ b/pex/common.py @@ -108,7 +108,7 @@ def zip_info_from_file(cls, filename, arcname=None, date_time=None): if isdir: arcname += '/' if date_time is None: - datetime = time.localtime(st.st_mtime) + date_time = time.localtime(st.st_mtime) zinfo = zipfile.ZipInfo(filename=arcname, date_time=date_time[:6]) zinfo.external_attr = (st.st_mode & 0xFFFF) << 16 # Unix attributes if isdir: