From bf42f38f7c08a96c60cb4fffb3153d1544eda2c2 Mon Sep 17 00:00:00 2001 From: Tim Bauer Date: Thu, 24 Mar 2022 16:38:57 +0100 Subject: [PATCH 01/25] Add filter classes and tests Signed-off-by: Tim Bauer --- flytekit/tools/filter.py | 100 ++++++++ tests/flytekit/unit/tools/test_filter.py | 281 +++++++++++++++++++++++ 2 files changed, 381 insertions(+) create mode 100644 flytekit/tools/filter.py create mode 100644 tests/flytekit/unit/tools/test_filter.py diff --git a/flytekit/tools/filter.py b/flytekit/tools/filter.py new file mode 100644 index 0000000000..a524b22f03 --- /dev/null +++ b/flytekit/tools/filter.py @@ -0,0 +1,100 @@ +from fnmatch import fnmatch +import os +import subprocess +from abc import ABC, abstractmethod +from pathlib import Path +import tarfile as _tarfile +from shutil import which +from typing import List, Optional +from docker.utils.build import PatternMatcher + +from flytekit.loggers import cli_logger + + +STANDARD_IGNORE_PATTERNS = [ + "*.pyc", + ".cache", + ".cache/*", + "__pycache__", + "**/__pycache__" +] + +class Ignore(ABC): + """ Base for Ignores, implements core logic. Children have to implement _is_ignored """ + def __init__(self, root: str): + self.root = root + + def is_ignored(self, path: str) -> bool: + if os.path.isabs(path): + path = os.path.relpath(path, self.root) + return self._is_ignored(path) + + def tar_filter(self, tarinfo: _tarfile.TarInfo) -> Optional[_tarfile.TarInfo]: + if self.is_ignored(tarinfo.name): + return None + return tarinfo + + @abstractmethod + def _is_ignored(self, path: str) -> bool: + pass + + +class GitIgnore(Ignore): + """ Uses git cli (if available) to check whether a path is ignored. """ + def __init__(self, root: Path): + super().__init__(root) + self.has_git = which("git") is not None + + def _is_ignored(self, path: str) -> bool: + if self.has_git: + out = subprocess.run(["git", "check-ignore", path], cwd=self.root) + # Returncode is 0 if file is ignored and 1 if otherwise + return not out.returncode + cli_logger.info(f"No git executable found, not applying any filters") + return False + + +class DockerIgnore(Ignore): + """ Uses docker-py's PatternMatcher to check whether a path is ignored. """ + def __init__(self, root: Path): + super().__init__(root) + self.pm = self._parse() + + def _parse(self) -> List[str]: + patterns = [] + dockerignore = os.path.join(self.root, ".dockerignore") + if os.path.isfile(dockerignore): + with open(dockerignore, "r") as f: + patterns = [l.strip() for l in f.readlines() if l and not l.startswith("#")] + cli_logger.info(f"No .dockerignore found in {self.root}, not applying any filters") + return PatternMatcher(patterns) + + def _is_ignored(self, path: str) -> bool: + return self.pm.matches(path) + + +class StandardIgnore(Ignore): + """ Retains the standard ignore functionality that previously existed. Could in theory + by fed with custom ignore patterns from cli. """ + def __init__(self, root: Path, patterns: Optional[List[str]] = None): + super().__init__(root) + self.patterns = patterns if patterns else STANDARD_IGNORE_PATTERNS + + def _is_ignored(self, path: str) -> bool: + for pattern in self.patterns: + if fnmatch(path, pattern): + return True + return False + + +class IgnoreGroup(Ignore): + """ Groups multiple Ignores and checks a path against them. A file is ignored if any + Ignore considers it ignored.""" + def __init__(self, root: str, ignores: List[Ignore]): + self.ignores = [ignore(root) for ignore in ignores] + + def _is_ignored(self, path: str) -> bool: + for ignore in self.ignores: + if ignore.is_ignored(path): + return True + return False diff --git a/tests/flytekit/unit/tools/test_filter.py b/tests/flytekit/unit/tools/test_filter.py new file mode 100644 index 0000000000..f4db2686fd --- /dev/null +++ b/tests/flytekit/unit/tools/test_filter.py @@ -0,0 +1,281 @@ +import pytest +from typing import List, Dict +from flytekit.tools.filter import DockerIgnore, GitIgnore, IgnoreGroup, StandardIgnore +from pathlib import Path +import subprocess +from unittest.mock import patch +from docker.utils.build import PatternMatcher +from tarfile import TarInfo + +def make_tree(root: Path, tree: Dict): + for name, content in tree.items(): + if isinstance(content, dict): + directory = root / name + directory.mkdir() + make_tree(directory, content) + if isinstance(content, str): + file = root / name + file.write_text(content) + + +@pytest.fixture +def simple_gitignore(tmp_path): + tree = { + "sub" : { + "some.bar": "" + }, + "test.foo": "", + "keep.foo": "", + ".gitignore": "\n".join([ + "*.foo", + "!keep.foo", + "# A comment", + "sub" + ]) + } + + make_tree(tmp_path, tree) + subprocess.run(["git", "init", tmp_path]) + return tmp_path + + +@pytest.fixture +def nested_gitignore(tmp_path): + tree = { + "sub" : { + "some.foo": "", + "another.foo": "", + ".gitignore": "!another.foo" + }, + "data" : { + ".gitignore" : "*", + "large.file" : "" + }, + "test.foo": "", + "keep.foo": "", + ".gitignore": "\n".join([ + "*.foo", + "!keep.foo", + "# A comment", + ]) + } + + make_tree(tmp_path, tree) + subprocess.run(["git", "init", tmp_path]) + return tmp_path + + +@pytest.fixture +def simple_dockerignore(tmp_path): + tree = { + "sub" : { + "some.bar": "" + }, + "test.foo": "", + "keep.foo": "", + ".dockerignore": "\n".join([ + "*.foo", + "!keep.foo", + "# A comment", + "sub" + ]) + } + + make_tree(tmp_path, tree) + return tmp_path + + +@pytest.fixture +def no_ignore(tmp_path): + tree = { + "sub" : { + "some.bar": "" + }, + "test.foo": "", + "keep.foo": "", + } + + make_tree(tmp_path, tree) + return tmp_path + + +@pytest.fixture +def all_ignore(tmp_path): + tree = { + "sub" : { + "some.bar": "", + "__pycache__": { + "some.pyc" + }, + }, + "data" : { + "reallybigfile.bar": "" + }, + ".cache" : { + "something.cached": "" + }, + "test.foo": "", + "keep.foo": "", + ".dockerignore": "\n".join([ + "# A comment", + "data", + ".git" + ]), + ".gitignore": "\n".join([ + "*.foo", + "!keep.foo" + ]) + } + + make_tree(tmp_path, tree) + subprocess.run(["git", "init", tmp_path]) + return tmp_path + + +def test_simple_gitignore(simple_gitignore): + gitignore = GitIgnore(simple_gitignore) + assert gitignore.is_ignored(str(simple_gitignore / "test.foo")) + assert gitignore.is_ignored(str(simple_gitignore / "sub")) + assert gitignore.is_ignored(str(simple_gitignore / "sub" / "some.bar")) + assert not gitignore.is_ignored(str(simple_gitignore / "keep.foo")) + assert not gitignore.is_ignored(str(simple_gitignore / ".gitignore")) + assert not gitignore.is_ignored(str(simple_gitignore / ".git")) + + +def test_not_subpath(simple_gitignore): + """ Test edge case that if path is not on root it cannot not be ignored """ + gitignore = GitIgnore(simple_gitignore) + assert not gitignore.is_ignored("/whatever/test.foo") + + +def test_nested_gitignore(nested_gitignore): + """ Test override with nested gitignore and star-ignore """ + gitignore = GitIgnore(nested_gitignore) + assert gitignore.is_ignored(str(nested_gitignore / "test.foo")) + assert not gitignore.is_ignored(str(nested_gitignore / "sub")) + assert gitignore.is_ignored(str(nested_gitignore / "sub" / "some.foo")) + assert gitignore.is_ignored(str(nested_gitignore / "data" / "large.file")) + assert not gitignore.is_ignored(str(nested_gitignore / "sub" / "another.foo")) + assert not gitignore.is_ignored(str(nested_gitignore / ".gitignore")) + assert not gitignore.is_ignored(str(nested_gitignore / ".git")) + + +@patch('flytekit.tools.filter.which') +def test_no_git(mock_which, simple_gitignore): + """ Test that nothing is ignored if no git cli available """ + mock_which.return_value = None + gitignore = GitIgnore(simple_gitignore) + assert not gitignore.has_git + assert not gitignore.is_ignored(str(simple_gitignore / "test.foo")) + assert not gitignore.is_ignored(str(simple_gitignore / "sub")) + assert not gitignore.is_ignored(str(simple_gitignore / "keep.foo")) + assert not gitignore.is_ignored(str(simple_gitignore / ".gitignore")) + assert not gitignore.is_ignored(str(simple_gitignore / ".git")) + + +def test_dockerignore_parse(simple_dockerignore): + """ Test .dockerignore file parsing """ + dockerignore = DockerIgnore(simple_dockerignore) + assert [p.cleaned_pattern for p in dockerignore.pm.patterns] == [ + "*.foo", + "keep.foo", + "sub", + ".dockerignore" + ] + assert [p.exclusion for p in dockerignore.pm.patterns] == [ + False, + True, + False, + True + ] + +def test_patternmatcher(): + """ Test that PatternMatcher works as expected """ + patterns = [ + "*.foo", + "!keep.foo", + "sub" + ] + pm = PatternMatcher(patterns) + assert pm.matches("whatever.foo") + assert not pm.matches("keep.foo") + assert pm.matches("sub") + assert pm.matches("sub/stuff.txt") + + +def test_simple_dockerignore(simple_dockerignore): + dockerignore = DockerIgnore(simple_dockerignore) + assert dockerignore.is_ignored(str(simple_dockerignore / "test.foo")) + assert dockerignore.is_ignored(str(simple_dockerignore / "sub")) + assert dockerignore.is_ignored(str(simple_dockerignore / "sub" / "some.bar")) + assert not dockerignore.is_ignored(str(simple_dockerignore / "keep.foo")) + + +def test_no_ignore(no_ignore): + """ Test that nothing is ignored if no ignore files present """ + dockerignore = DockerIgnore(no_ignore) + assert not dockerignore.is_ignored(str(no_ignore / "test.foo")) + assert not dockerignore.is_ignored(str(no_ignore / "keep.foo")) + assert not dockerignore.is_ignored(str(no_ignore / "sub")) + assert not dockerignore.is_ignored(str(no_ignore / "sub" / "some.bar")) + + gitignore = GitIgnore(no_ignore) + assert not gitignore.is_ignored(str(no_ignore / "test.foo")) + assert not gitignore.is_ignored(str(no_ignore / "keep.foo")) + assert not gitignore.is_ignored(str(no_ignore / "sub")) + assert not gitignore.is_ignored(str(no_ignore / "sub" / "some.bar")) + + +def test_standard_ignore(): + """ Test the standard ignore cases previously hardcoded """ + patterns = [ + "*.pyc", + ".cache", + ".cache/*", + "__pycache__", + "**/__pycache__", + "*.foo" + ] + ignore = StandardIgnore(root=".", patterns=patterns) + assert not ignore.is_ignored("foo.py") + assert ignore.is_ignored("foo.pyc") + assert ignore.is_ignored(".cache/foo") + assert ignore.is_ignored("__pycache__") + assert ignore.is_ignored("foo/__pycache__") + assert ignore.is_ignored("spam/ham/some.foo") + + +def test_all_ignore(all_ignore): + """ Test all ignores grouped together """ + ignore = IgnoreGroup(all_ignore, [GitIgnore, DockerIgnore, StandardIgnore]) + assert not ignore.is_ignored("sub") + assert not ignore.is_ignored("sub/some.bar") + assert ignore.is_ignored("sub/__pycache__") + assert ignore.is_ignored("sub/__pycache__/some.pyc") + assert ignore.is_ignored("data") + assert ignore.is_ignored("data/reallybigfile.bar") + assert ignore.is_ignored(".cache") + assert ignore.is_ignored(".cache/something.cached") + assert ignore.is_ignored("test.foo") + assert not ignore.is_ignored("keep.foo") + assert not ignore.is_ignored(".gitignore") + assert not ignore.is_ignored(".dockerignore") + assert ignore.is_ignored(".git") + + +def test_all_ignore_tar_filter(all_ignore): + """ Test tar_filter method of all ignores grouped together """ + ignore = IgnoreGroup(all_ignore, [GitIgnore, DockerIgnore, StandardIgnore]) + assert ignore.tar_filter(TarInfo(name="sub")).name == "sub" + assert ignore.tar_filter(TarInfo(name="sub/some.bar")).name == "sub/some.bar" + assert not ignore.tar_filter(TarInfo(name="sub/__pycache__")) + assert not ignore.tar_filter(TarInfo(name="sub/__pycache__/some.pyc")) + assert not ignore.tar_filter(TarInfo(name="data")) + assert not ignore.tar_filter(TarInfo(name="data/reallybigfile.bar")) + assert not ignore.tar_filter(TarInfo(name=".cache")) + assert not ignore.tar_filter(TarInfo(name=".cache/something.cached")) + assert not ignore.tar_filter(TarInfo(name="test.foo")) + assert ignore.tar_filter(TarInfo(name="keep.foo")).name == "keep.foo" + assert ignore.tar_filter(TarInfo(name=".gitignore")).name == ".gitignore" + assert ignore.tar_filter(TarInfo(name=".dockerignore")).name == ".dockerignore" + assert not ignore.tar_filter(TarInfo(name=".git")) From 9e925cdaf9695181ae075dd49f4a1ff8040b35cd Mon Sep 17 00:00:00 2001 From: Tim Bauer Date: Thu, 24 Mar 2022 17:33:10 +0100 Subject: [PATCH 02/25] Rename to ignore Signed-off-by: Tim Bauer --- flytekit/tools/{filter.py => ignore.py} | 0 tests/flytekit/unit/tools/{test_filter.py => test_ignore.py} | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) rename flytekit/tools/{filter.py => ignore.py} (100%) rename tests/flytekit/unit/tools/{test_filter.py => test_ignore.py} (98%) diff --git a/flytekit/tools/filter.py b/flytekit/tools/ignore.py similarity index 100% rename from flytekit/tools/filter.py rename to flytekit/tools/ignore.py diff --git a/tests/flytekit/unit/tools/test_filter.py b/tests/flytekit/unit/tools/test_ignore.py similarity index 98% rename from tests/flytekit/unit/tools/test_filter.py rename to tests/flytekit/unit/tools/test_ignore.py index f4db2686fd..15e571d309 100644 --- a/tests/flytekit/unit/tools/test_filter.py +++ b/tests/flytekit/unit/tools/test_ignore.py @@ -1,6 +1,6 @@ import pytest from typing import List, Dict -from flytekit.tools.filter import DockerIgnore, GitIgnore, IgnoreGroup, StandardIgnore +from flytekit.tools.ignore import DockerIgnore, GitIgnore, IgnoreGroup, StandardIgnore from pathlib import Path import subprocess from unittest.mock import patch @@ -160,7 +160,7 @@ def test_nested_gitignore(nested_gitignore): assert not gitignore.is_ignored(str(nested_gitignore / ".git")) -@patch('flytekit.tools.filter.which') +@patch('flytekit.tools.ignore.which') def test_no_git(mock_which, simple_gitignore): """ Test that nothing is ignored if no git cli available """ mock_which.return_value = None From 8f298a209ec57e672f0d41f0f9a7ba8a0c3715cf Mon Sep 17 00:00:00 2001 From: Tim Bauer Date: Thu, 24 Mar 2022 17:35:18 +0100 Subject: [PATCH 03/25] Remove old filter test Signed-off-by: Tim Bauer --- .../unit/tools/test_fast_registration.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/tests/flytekit/unit/tools/test_fast_registration.py b/tests/flytekit/unit/tools/test_fast_registration.py index 62201d515e..f3bcf0f111 100644 --- a/tests/flytekit/unit/tools/test_fast_registration.py +++ b/tests/flytekit/unit/tools/test_fast_registration.py @@ -1,20 +1,6 @@ import tarfile -from flytekit.tools.fast_registration import filter_tar_file_fn, get_additional_distribution_loc - - -def testfilter_tar_file_fn(): - valid_tarinfo = tarfile.TarInfo(name="foo.py") - assert filter_tar_file_fn(valid_tarinfo) is not None - - invalid_tarinfo = tarfile.TarInfo(name="foo.pyc") - assert not filter_tar_file_fn(invalid_tarinfo) - - invalid_tarinfo = tarfile.TarInfo(name=".cache/foo") - assert not filter_tar_file_fn(invalid_tarinfo) - - invalid_tarinfo = tarfile.TarInfo(name="__pycache__") - assert not filter_tar_file_fn(invalid_tarinfo) +from flytekit.tools.fast_registration import get_additional_distribution_loc def test_get_additional_distribution_loc(): From a6ea4e2fd441b0e01421542681f009a8ea017cf6 Mon Sep 17 00:00:00 2001 From: Tim Bauer Date: Thu, 24 Mar 2022 17:35:40 +0100 Subject: [PATCH 04/25] Create archive function and test Signed-off-by: Tim Bauer --- flytekit/tools/package_helpers.py | 8 +++ tests/flytekit/unit/tools/test_archive.py | 64 +++++++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 flytekit/tools/package_helpers.py create mode 100644 tests/flytekit/unit/tools/test_archive.py diff --git a/flytekit/tools/package_helpers.py b/flytekit/tools/package_helpers.py new file mode 100644 index 0000000000..97e0e28f7b --- /dev/null +++ b/flytekit/tools/package_helpers.py @@ -0,0 +1,8 @@ +import tarfile +from flytekit.tools.ignore import IgnoreGroup, GitIgnore, DockerIgnore, StandardIgnore + + +def create_archive(source: str, name: str) -> None: + ignore = IgnoreGroup(source, [GitIgnore, DockerIgnore, StandardIgnore]) + with tarfile.open(name, "w:gz") as tar: + tar.add(source, arcname="", filter=ignore.tar_filter) diff --git a/tests/flytekit/unit/tools/test_archive.py b/tests/flytekit/unit/tools/test_archive.py new file mode 100644 index 0000000000..469a9a4912 --- /dev/null +++ b/tests/flytekit/unit/tools/test_archive.py @@ -0,0 +1,64 @@ +import pytest +import subprocess +import tarfile +from flytekit.tools.package_helpers import create_archive +from tests.flytekit.unit.tools.test_ignore import make_tree + + +@pytest.fixture +def flyte_project(tmp_path): + tree = { + "data" : { + "large.file": "", + "more.files": "" + }, + "src" : { + "workflows" : { + "__pycache__" : { + "some.pyc": "" + }, + "hello_world.py": "print('Hello World!')", + } + }, + ".venv": { + "lots": "", + "of": "", + "packages": "" + }, + ".env": "supersecret", + "some.bar": "", + "some.foo": "", + "keep.foo": "", + ".gitignore": "\n".join([ + ".env", + ".venv", + "# A comment", + "data", + "*.foo", + "!keep.foo" + ]), + ".dockerignore": "\n".join([ + "data", + "*.bar", + ".git" + ]) + } + + make_tree(tmp_path, tree) + subprocess.run(["git", "init", tmp_path]) + return tmp_path + + +def test_archive(flyte_project, tmp_path): + archive_fname = tmp_path / "archive.tar.gz" + create_archive(source=flyte_project, name=archive_fname) + with tarfile.open(archive_fname) as tar: + assert tar.getnames() == [ + '', # tar root, output removes leading '/' + '.dockerignore', + '.gitignore', + 'keep.foo', + 'src', + 'src/workflows', + 'src/workflows/hello_world.py' + ] From 7c9e6cb9a0fcb9b905b74a4fa183645e5d3d1d84 Mon Sep 17 00:00:00 2001 From: Tim Bauer Date: Thu, 24 Mar 2022 17:36:04 +0100 Subject: [PATCH 05/25] Refactor to use archive function Signed-off-by: Tim Bauer --- flytekit/clis/sdk_in_container/package.py | 3 +++ flytekit/clis/sdk_in_container/serialize.py | 6 ++++-- flytekit/tools/fast_registration.py | 17 ++--------------- 3 files changed, 9 insertions(+), 17 deletions(-) diff --git a/flytekit/clis/sdk_in_container/package.py b/flytekit/clis/sdk_in_container/package.py index 71efeab576..863151e2fc 100644 --- a/flytekit/clis/sdk_in_container/package.py +++ b/flytekit/clis/sdk_in_container/package.py @@ -12,6 +12,9 @@ ) from flytekit.tools.repo import NoSerializableEntitiesError, serialize_and_package +from flytekit.core import context_manager +from flytekit.tools import fast_registration, module_loader, serialize_helpers +from flytekit.tools.package_helpers import create_archive @click.command("package") @click.option( diff --git a/flytekit/clis/sdk_in_container/serialize.py b/flytekit/clis/sdk_in_container/serialize.py index 311ff01fad..ca4a092e31 100644 --- a/flytekit/clis/sdk_in_container/serialize.py +++ b/flytekit/clis/sdk_in_container/serialize.py @@ -13,6 +13,9 @@ from flytekit.tools.fast_registration import compute_digest as _compute_digest from flytekit.tools.fast_registration import filter_tar_file_fn as _filter_tar_file_fn from flytekit.tools.repo import serialize_to_folder +from flytekit.tools.module_loader import trigger_loading +from flytekit.tools.serialize_helpers import get_registrable_entities, persist_registrable_entities +from flytekit.tools.package_helpers import create_archive CTX_IMAGE = "image" CTX_LOCAL_SRC_ROOT = "local_source_root" @@ -170,8 +173,7 @@ def fast_workflows(ctx, folder=None): archive_fname = os.path.join(folder, f"{digest}.tar.gz") click.echo(f"Writing compressed archive to {archive_fname}") # Write using gzip - with _tarfile.open(archive_fname, "w:gz") as tar: - tar.add(source_dir, arcname="", filter=_filter_tar_file_fn) + create_archive(source_dir, archive_fname) pkgs = ctx.obj[CTX_PACKAGES] dir = ctx.obj[CTX_LOCAL_SRC_ROOT] diff --git a/flytekit/tools/fast_registration.py b/flytekit/tools/fast_registration.py index feb23b0f45..0be611ba2a 100644 --- a/flytekit/tools/fast_registration.py +++ b/flytekit/tools/fast_registration.py @@ -8,6 +8,7 @@ import checksumdir from flytekit.core.context_manager import FlyteContextManager +from flytekit.tools.package_helpers import create_archive _tmp_versions_dir = "tmp/versions" @@ -30,19 +31,6 @@ def _write_marker(marker: _os.PathLike): pass -def filter_tar_file_fn(tarinfo: _tarfile.TarInfo) -> _tarfile.TarInfo: - """ - Excludes designated file types from tar archive - :param _tarfile.TarInfo tarinfo: - :return _tarfile.TarInfo: - """ - if tarinfo.name.endswith(".pyc"): - return None - if tarinfo.name.startswith(".cache"): - return None - if "__pycache__" in tarinfo.name: - return None - return tarinfo def get_additional_distribution_loc(remote_location: str, identifier: str) -> str: @@ -79,8 +67,7 @@ def upload_package(source_dir: _os.PathLike, identifier: str, remote_location: s with _tempfile.NamedTemporaryFile() as fp: # Write using gzip - with _tarfile.open(fp.name, "w:gz") as tar: - tar.add(source_dir, arcname="", filter=filter_tar_file_fn) + create_archive(source_dir, fp.name) if dry_run: print("Would upload {} to {}".format(fp.name, full_remote_path)) else: From d05a32595a765ff90550bbfeeeb6732d40b71b7a Mon Sep 17 00:00:00 2001 From: Tim Bauer Date: Fri, 25 Mar 2022 10:34:43 +0100 Subject: [PATCH 06/25] Lint Signed-off-by: Tim Bauer --- flytekit/clis/sdk_in_container/package.py | 1 + flytekit/clis/sdk_in_container/serialize.py | 2 +- flytekit/tools/fast_registration.py | 2 - flytekit/tools/ignore.py | 34 ++--- flytekit/tools/package_helpers.py | 3 +- tests/flytekit/unit/tools/test_archive.py | 54 +++----- tests/flytekit/unit/tools/test_ignore.py | 137 ++++++-------------- 7 files changed, 82 insertions(+), 151 deletions(-) diff --git a/flytekit/clis/sdk_in_container/package.py b/flytekit/clis/sdk_in_container/package.py index 863151e2fc..e9d529eddc 100644 --- a/flytekit/clis/sdk_in_container/package.py +++ b/flytekit/clis/sdk_in_container/package.py @@ -16,6 +16,7 @@ from flytekit.tools import fast_registration, module_loader, serialize_helpers from flytekit.tools.package_helpers import create_archive + @click.command("package") @click.option( "-i", diff --git a/flytekit/clis/sdk_in_container/serialize.py b/flytekit/clis/sdk_in_container/serialize.py index ca4a092e31..0026c6a859 100644 --- a/flytekit/clis/sdk_in_container/serialize.py +++ b/flytekit/clis/sdk_in_container/serialize.py @@ -14,8 +14,8 @@ from flytekit.tools.fast_registration import filter_tar_file_fn as _filter_tar_file_fn from flytekit.tools.repo import serialize_to_folder from flytekit.tools.module_loader import trigger_loading -from flytekit.tools.serialize_helpers import get_registrable_entities, persist_registrable_entities from flytekit.tools.package_helpers import create_archive +from flytekit.tools.serialize_helpers import get_registrable_entities, persist_registrable_entities CTX_IMAGE = "image" CTX_LOCAL_SRC_ROOT = "local_source_root" diff --git a/flytekit/tools/fast_registration.py b/flytekit/tools/fast_registration.py index 0be611ba2a..e9f5eecc56 100644 --- a/flytekit/tools/fast_registration.py +++ b/flytekit/tools/fast_registration.py @@ -31,8 +31,6 @@ def _write_marker(marker: _os.PathLike): pass - - def get_additional_distribution_loc(remote_location: str, identifier: str) -> str: """ :param Text remote_location: diff --git a/flytekit/tools/ignore.py b/flytekit/tools/ignore.py index a524b22f03..6e882cf044 100644 --- a/flytekit/tools/ignore.py +++ b/flytekit/tools/ignore.py @@ -1,29 +1,25 @@ -from fnmatch import fnmatch import os import subprocess +import tarfile as _tarfile from abc import ABC, abstractmethod +from fnmatch import fnmatch from pathlib import Path -import tarfile as _tarfile from shutil import which from typing import List, Optional + from docker.utils.build import PatternMatcher from flytekit.loggers import cli_logger +STANDARD_IGNORE_PATTERNS = ["*.pyc", ".cache", ".cache/*", "__pycache__", "**/__pycache__"] -STANDARD_IGNORE_PATTERNS = [ - "*.pyc", - ".cache", - ".cache/*", - "__pycache__", - "**/__pycache__" -] class Ignore(ABC): - """ Base for Ignores, implements core logic. Children have to implement _is_ignored """ + """Base for Ignores, implements core logic. Children have to implement _is_ignored""" + def __init__(self, root: str): self.root = root - + def is_ignored(self, path: str) -> bool: if os.path.isabs(path): path = os.path.relpath(path, self.root) @@ -40,7 +36,8 @@ def _is_ignored(self, path: str) -> bool: class GitIgnore(Ignore): - """ Uses git cli (if available) to check whether a path is ignored. """ + """Uses git cli (if available) to check whether a path is ignored.""" + def __init__(self, root: Path): super().__init__(root) self.has_git = which("git") is not None @@ -55,12 +52,13 @@ def _is_ignored(self, path: str) -> bool: class DockerIgnore(Ignore): - """ Uses docker-py's PatternMatcher to check whether a path is ignored. """ + """Uses docker-py's PatternMatcher to check whether a path is ignored.""" + def __init__(self, root: Path): super().__init__(root) self.pm = self._parse() - def _parse(self) -> List[str]: + def _parse(self) -> PatternMatcher: patterns = [] dockerignore = os.path.join(self.root, ".dockerignore") if os.path.isfile(dockerignore): @@ -74,8 +72,9 @@ def _is_ignored(self, path: str) -> bool: class StandardIgnore(Ignore): - """ Retains the standard ignore functionality that previously existed. Could in theory - by fed with custom ignore patterns from cli. """ + """Retains the standard ignore functionality that previously existed. Could in theory + by fed with custom ignore patterns from cli.""" + def __init__(self, root: Path, patterns: Optional[List[str]] = None): super().__init__(root) self.patterns = patterns if patterns else STANDARD_IGNORE_PATTERNS @@ -88,8 +87,9 @@ def _is_ignored(self, path: str) -> bool: class IgnoreGroup(Ignore): - """ Groups multiple Ignores and checks a path against them. A file is ignored if any + """Groups multiple Ignores and checks a path against them. A file is ignored if any Ignore considers it ignored.""" + def __init__(self, root: str, ignores: List[Ignore]): self.ignores = [ignore(root) for ignore in ignores] diff --git a/flytekit/tools/package_helpers.py b/flytekit/tools/package_helpers.py index 97e0e28f7b..288c66e73c 100644 --- a/flytekit/tools/package_helpers.py +++ b/flytekit/tools/package_helpers.py @@ -1,5 +1,6 @@ import tarfile -from flytekit.tools.ignore import IgnoreGroup, GitIgnore, DockerIgnore, StandardIgnore + +from flytekit.tools.ignore import DockerIgnore, GitIgnore, IgnoreGroup, StandardIgnore def create_archive(source: str, name: str) -> None: diff --git a/tests/flytekit/unit/tools/test_archive.py b/tests/flytekit/unit/tools/test_archive.py index 469a9a4912..485a3ca765 100644 --- a/tests/flytekit/unit/tools/test_archive.py +++ b/tests/flytekit/unit/tools/test_archive.py @@ -1,6 +1,8 @@ -import pytest import subprocess import tarfile + +import pytest + from flytekit.tools.package_helpers import create_archive from tests.flytekit.unit.tools.test_ignore import make_tree @@ -8,40 +10,20 @@ @pytest.fixture def flyte_project(tmp_path): tree = { - "data" : { - "large.file": "", - "more.files": "" - }, - "src" : { - "workflows" : { - "__pycache__" : { - "some.pyc": "" - }, + "data": {"large.file": "", "more.files": ""}, + "src": { + "workflows": { + "__pycache__": {"some.pyc": ""}, "hello_world.py": "print('Hello World!')", } }, - ".venv": { - "lots": "", - "of": "", - "packages": "" - }, + ".venv": {"lots": "", "of": "", "packages": ""}, ".env": "supersecret", "some.bar": "", "some.foo": "", - "keep.foo": "", - ".gitignore": "\n".join([ - ".env", - ".venv", - "# A comment", - "data", - "*.foo", - "!keep.foo" - ]), - ".dockerignore": "\n".join([ - "data", - "*.bar", - ".git" - ]) + "keep.foo": "", + ".gitignore": "\n".join([".env", ".venv", "# A comment", "data", "*.foo", "!keep.foo"]), + ".dockerignore": "\n".join(["data", "*.bar", ".git"]), } make_tree(tmp_path, tree) @@ -54,11 +36,11 @@ def test_archive(flyte_project, tmp_path): create_archive(source=flyte_project, name=archive_fname) with tarfile.open(archive_fname) as tar: assert tar.getnames() == [ - '', # tar root, output removes leading '/' - '.dockerignore', - '.gitignore', - 'keep.foo', - 'src', - 'src/workflows', - 'src/workflows/hello_world.py' + "", # tar root, output removes leading '/' + ".dockerignore", + ".gitignore", + "keep.foo", + "src", + "src/workflows", + "src/workflows/hello_world.py", ] diff --git a/tests/flytekit/unit/tools/test_ignore.py b/tests/flytekit/unit/tools/test_ignore.py index 15e571d309..ebfed2394a 100644 --- a/tests/flytekit/unit/tools/test_ignore.py +++ b/tests/flytekit/unit/tools/test_ignore.py @@ -1,11 +1,14 @@ -import pytest -from typing import List, Dict -from flytekit.tools.ignore import DockerIgnore, GitIgnore, IgnoreGroup, StandardIgnore -from pathlib import Path import subprocess +from pathlib import Path +from tarfile import TarInfo +from typing import Dict, List from unittest.mock import patch + +import pytest from docker.utils.build import PatternMatcher -from tarfile import TarInfo + +from flytekit.tools.ignore import DockerIgnore, GitIgnore, IgnoreGroup, StandardIgnore + def make_tree(root: Path, tree: Dict): for name, content in tree.items(): @@ -21,17 +24,10 @@ def make_tree(root: Path, tree: Dict): @pytest.fixture def simple_gitignore(tmp_path): tree = { - "sub" : { - "some.bar": "" - }, + "sub": {"some.bar": ""}, "test.foo": "", "keep.foo": "", - ".gitignore": "\n".join([ - "*.foo", - "!keep.foo", - "# A comment", - "sub" - ]) + ".gitignore": "\n".join(["*.foo", "!keep.foo", "# A comment", "sub"]), } make_tree(tmp_path, tree) @@ -42,22 +38,17 @@ def simple_gitignore(tmp_path): @pytest.fixture def nested_gitignore(tmp_path): tree = { - "sub" : { - "some.foo": "", - "another.foo": "", - ".gitignore": "!another.foo" - }, - "data" : { - ".gitignore" : "*", - "large.file" : "" - }, + "sub": {"some.foo": "", "another.foo": "", ".gitignore": "!another.foo"}, + "data": {".gitignore": "*", "large.file": ""}, "test.foo": "", "keep.foo": "", - ".gitignore": "\n".join([ - "*.foo", - "!keep.foo", - "# A comment", - ]) + ".gitignore": "\n".join( + [ + "*.foo", + "!keep.foo", + "# A comment", + ] + ), } make_tree(tmp_path, tree) @@ -68,17 +59,10 @@ def nested_gitignore(tmp_path): @pytest.fixture def simple_dockerignore(tmp_path): tree = { - "sub" : { - "some.bar": "" - }, + "sub": {"some.bar": ""}, "test.foo": "", "keep.foo": "", - ".dockerignore": "\n".join([ - "*.foo", - "!keep.foo", - "# A comment", - "sub" - ]) + ".dockerignore": "\n".join(["*.foo", "!keep.foo", "# A comment", "sub"]), } make_tree(tmp_path, tree) @@ -88,9 +72,7 @@ def simple_dockerignore(tmp_path): @pytest.fixture def no_ignore(tmp_path): tree = { - "sub" : { - "some.bar": "" - }, + "sub": {"some.bar": ""}, "test.foo": "", "keep.foo": "", } @@ -102,29 +84,16 @@ def no_ignore(tmp_path): @pytest.fixture def all_ignore(tmp_path): tree = { - "sub" : { + "sub": { "some.bar": "", - "__pycache__": { - "some.pyc" - }, - }, - "data" : { - "reallybigfile.bar": "" - }, - ".cache" : { - "something.cached": "" + "__pycache__": {"some.pyc"}, }, + "data": {"reallybigfile.bar": ""}, + ".cache": {"something.cached": ""}, "test.foo": "", "keep.foo": "", - ".dockerignore": "\n".join([ - "# A comment", - "data", - ".git" - ]), - ".gitignore": "\n".join([ - "*.foo", - "!keep.foo" - ]) + ".dockerignore": "\n".join(["# A comment", "data", ".git"]), + ".gitignore": "\n".join(["*.foo", "!keep.foo"]), } make_tree(tmp_path, tree) @@ -143,13 +112,13 @@ def test_simple_gitignore(simple_gitignore): def test_not_subpath(simple_gitignore): - """ Test edge case that if path is not on root it cannot not be ignored """ + """Test edge case that if path is not on root it cannot not be ignored""" gitignore = GitIgnore(simple_gitignore) assert not gitignore.is_ignored("/whatever/test.foo") def test_nested_gitignore(nested_gitignore): - """ Test override with nested gitignore and star-ignore """ + """Test override with nested gitignore and star-ignore""" gitignore = GitIgnore(nested_gitignore) assert gitignore.is_ignored(str(nested_gitignore / "test.foo")) assert not gitignore.is_ignored(str(nested_gitignore / "sub")) @@ -160,9 +129,9 @@ def test_nested_gitignore(nested_gitignore): assert not gitignore.is_ignored(str(nested_gitignore / ".git")) -@patch('flytekit.tools.ignore.which') +@patch("flytekit.tools.ignore.which") def test_no_git(mock_which, simple_gitignore): - """ Test that nothing is ignored if no git cli available """ + """Test that nothing is ignored if no git cli available""" mock_which.return_value = None gitignore = GitIgnore(simple_gitignore) assert not gitignore.has_git @@ -174,28 +143,15 @@ def test_no_git(mock_which, simple_gitignore): def test_dockerignore_parse(simple_dockerignore): - """ Test .dockerignore file parsing """ + """Test .dockerignore file parsing""" dockerignore = DockerIgnore(simple_dockerignore) - assert [p.cleaned_pattern for p in dockerignore.pm.patterns] == [ - "*.foo", - "keep.foo", - "sub", - ".dockerignore" - ] - assert [p.exclusion for p in dockerignore.pm.patterns] == [ - False, - True, - False, - True - ] + assert [p.cleaned_pattern for p in dockerignore.pm.patterns] == ["*.foo", "keep.foo", "sub", ".dockerignore"] + assert [p.exclusion for p in dockerignore.pm.patterns] == [False, True, False, True] + def test_patternmatcher(): - """ Test that PatternMatcher works as expected """ - patterns = [ - "*.foo", - "!keep.foo", - "sub" - ] + """Test that PatternMatcher works as expected""" + patterns = ["*.foo", "!keep.foo", "sub"] pm = PatternMatcher(patterns) assert pm.matches("whatever.foo") assert not pm.matches("keep.foo") @@ -212,7 +168,7 @@ def test_simple_dockerignore(simple_dockerignore): def test_no_ignore(no_ignore): - """ Test that nothing is ignored if no ignore files present """ + """Test that nothing is ignored if no ignore files present""" dockerignore = DockerIgnore(no_ignore) assert not dockerignore.is_ignored(str(no_ignore / "test.foo")) assert not dockerignore.is_ignored(str(no_ignore / "keep.foo")) @@ -227,15 +183,8 @@ def test_no_ignore(no_ignore): def test_standard_ignore(): - """ Test the standard ignore cases previously hardcoded """ - patterns = [ - "*.pyc", - ".cache", - ".cache/*", - "__pycache__", - "**/__pycache__", - "*.foo" - ] + """Test the standard ignore cases previously hardcoded""" + patterns = ["*.pyc", ".cache", ".cache/*", "__pycache__", "**/__pycache__", "*.foo"] ignore = StandardIgnore(root=".", patterns=patterns) assert not ignore.is_ignored("foo.py") assert ignore.is_ignored("foo.pyc") @@ -246,7 +195,7 @@ def test_standard_ignore(): def test_all_ignore(all_ignore): - """ Test all ignores grouped together """ + """Test all ignores grouped together""" ignore = IgnoreGroup(all_ignore, [GitIgnore, DockerIgnore, StandardIgnore]) assert not ignore.is_ignored("sub") assert not ignore.is_ignored("sub/some.bar") @@ -264,7 +213,7 @@ def test_all_ignore(all_ignore): def test_all_ignore_tar_filter(all_ignore): - """ Test tar_filter method of all ignores grouped together """ + """Test tar_filter method of all ignores grouped together""" ignore = IgnoreGroup(all_ignore, [GitIgnore, DockerIgnore, StandardIgnore]) assert ignore.tar_filter(TarInfo(name="sub")).name == "sub" assert ignore.tar_filter(TarInfo(name="sub/some.bar")).name == "sub/some.bar" From 226d24d3934630520d9b593ef6ce5fe5e6f82b81 Mon Sep 17 00:00:00 2001 From: Tim Bauer Date: Wed, 30 Mar 2022 19:38:26 +0200 Subject: [PATCH 07/25] Add alternative version with ls-files Signed-off-by: Tim Bauer --- flytekit/tools/ignore.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/flytekit/tools/ignore.py b/flytekit/tools/ignore.py index 6e882cf044..3eefb8a1de 100644 --- a/flytekit/tools/ignore.py +++ b/flytekit/tools/ignore.py @@ -35,7 +35,7 @@ def _is_ignored(self, path: str) -> bool: pass -class GitIgnore(Ignore): +class GitIgnoreV1(Ignore): """Uses git cli (if available) to check whether a path is ignored.""" def __init__(self, root: Path): @@ -51,6 +51,31 @@ def _is_ignored(self, path: str) -> bool: return False +class GitIgnore(Ignore): + """Uses git cli (if available) to check whether a path is ignored.""" + + def __init__(self, root: Path): + super().__init__(root) + self.has_git = which("git") is not None + self.ignored = self._list_ignored() + + def _list_ignored(self) -> List[str]: + if self.has_git: + out = subprocess.run(["git", "ls-files", "-io", "--exclude-standard"], cwd=self.root, capture_output=True) + if out.returncode == 0: + return out.stdout.decode("utf-8").split("\n")[:-1] + cli_logger.warning(f"Could not determine ignored files due to:\n{out.stderr}\nNot applying any filters") + return [] + cli_logger.info("No git executable found, not applying any filters") + return [] + + def _is_ignored(self, path: str) -> bool: + if self.ignored: + if path in self.ignored: + return True + return False + + class DockerIgnore(Ignore): """Uses docker-py's PatternMatcher to check whether a path is ignored.""" From 17b6131dea6991d98f6362f3d3eb4cc4b3d9cfc1 Mon Sep 17 00:00:00 2001 From: Tim Bauer Date: Wed, 30 Mar 2022 21:06:48 +0200 Subject: [PATCH 08/25] Add benchmark test Signed-off-by: Tim Bauer --- tests/flytekit/unit/tools/test_benchmark.py | 53 +++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 tests/flytekit/unit/tools/test_benchmark.py diff --git a/tests/flytekit/unit/tools/test_benchmark.py b/tests/flytekit/unit/tools/test_benchmark.py new file mode 100644 index 0000000000..7fdf15d82f --- /dev/null +++ b/tests/flytekit/unit/tools/test_benchmark.py @@ -0,0 +1,53 @@ +import pytest +import subprocess +import tarfile + +from flytekit.tools.ignore import GitIgnore, GitIgnoreV1, IgnoreGroup + +from tests.flytekit.unit.tools.test_ignore import make_tree + + +def create_archive_v1(source: str, name: str) -> None: + ignore = IgnoreGroup(source, [GitIgnoreV1]) + with tarfile.open(name, "w:gz") as tar: + tar.add(source, arcname="", filter=ignore.tar_filter) + + +def create_archive_v2(source: str, name: str) -> None: + ignore = IgnoreGroup(source, [GitIgnore]) + with tarfile.open(name, "w:gz") as tar: + tar.add(source, arcname="", filter=ignore.tar_filter) + + +@pytest.fixture +def flyte_project(tmp_path): + tree = { + "data": {"large.file": "", "more.files": ""}, + "src": { + "workflows": { + "__pycache__": {"some.pyc": ""}, + "hello_world.py": "print('Hello World!')", + } + }, + ".venv": {"lots": "", "of": "", "packages": ""}, + ".env": "supersecret", + "some.bar": "", + "some.foo": "", + "keep.foo": "", + ".gitignore": "\n".join([".env", ".venv", "# A comment", "data", "*.foo", "!keep.foo"]), + ".dockerignore": "\n".join(["data", "*.bar", ".git"]), + } + + make_tree(tmp_path, tree) + subprocess.run(["git", "init", tmp_path]) + return tmp_path + + +def test_v1(benchmark, flyte_project, tmp_path): + archive_fname = tmp_path / "archive.tar.gz" + benchmark(create_archive_v1, source=flyte_project, name=archive_fname) + + +def test_v2(benchmark, flyte_project, tmp_path): + archive_fname = tmp_path / "archive.tar.gz" + benchmark(create_archive_v2, source=flyte_project, name=archive_fname) From c5791901348ebb2b531876456eab29808c30a03f Mon Sep 17 00:00:00 2001 From: Tim Bauer Date: Mon, 4 Apr 2022 10:13:03 +0200 Subject: [PATCH 09/25] Adjust tests to ignore included empty dirs Signed-off-by: Tim Bauer --- tests/flytekit/unit/tools/test_archive.py | 1 + tests/flytekit/unit/tools/test_ignore.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/flytekit/unit/tools/test_archive.py b/tests/flytekit/unit/tools/test_archive.py index 485a3ca765..c3d6117218 100644 --- a/tests/flytekit/unit/tools/test_archive.py +++ b/tests/flytekit/unit/tools/test_archive.py @@ -39,6 +39,7 @@ def test_archive(flyte_project, tmp_path): "", # tar root, output removes leading '/' ".dockerignore", ".gitignore", + ".venv", "keep.foo", "src", "src/workflows", diff --git a/tests/flytekit/unit/tools/test_ignore.py b/tests/flytekit/unit/tools/test_ignore.py index ebfed2394a..8f2ec2949c 100644 --- a/tests/flytekit/unit/tools/test_ignore.py +++ b/tests/flytekit/unit/tools/test_ignore.py @@ -104,7 +104,6 @@ def all_ignore(tmp_path): def test_simple_gitignore(simple_gitignore): gitignore = GitIgnore(simple_gitignore) assert gitignore.is_ignored(str(simple_gitignore / "test.foo")) - assert gitignore.is_ignored(str(simple_gitignore / "sub")) assert gitignore.is_ignored(str(simple_gitignore / "sub" / "some.bar")) assert not gitignore.is_ignored(str(simple_gitignore / "keep.foo")) assert not gitignore.is_ignored(str(simple_gitignore / ".gitignore")) From 149bfef1196149e85628257f804931c648f3250a Mon Sep 17 00:00:00 2001 From: Tim Bauer Date: Mon, 4 Apr 2022 10:31:30 +0200 Subject: [PATCH 10/25] Remove old version and benchmark test Signed-off-by: Tim Bauer --- flytekit/tools/ignore.py | 18 +------ tests/flytekit/unit/tools/test_benchmark.py | 53 --------------------- 2 files changed, 1 insertion(+), 70 deletions(-) delete mode 100644 tests/flytekit/unit/tools/test_benchmark.py diff --git a/flytekit/tools/ignore.py b/flytekit/tools/ignore.py index 3eefb8a1de..64588a6435 100644 --- a/flytekit/tools/ignore.py +++ b/flytekit/tools/ignore.py @@ -35,24 +35,8 @@ def _is_ignored(self, path: str) -> bool: pass -class GitIgnoreV1(Ignore): - """Uses git cli (if available) to check whether a path is ignored.""" - - def __init__(self, root: Path): - super().__init__(root) - self.has_git = which("git") is not None - - def _is_ignored(self, path: str) -> bool: - if self.has_git: - out = subprocess.run(["git", "check-ignore", path], cwd=self.root) - # Returncode is 0 if file is ignored and 1 if otherwise - return not out.returncode - cli_logger.info(f"No git executable found, not applying any filters") - return False - - class GitIgnore(Ignore): - """Uses git cli (if available) to check whether a path is ignored.""" + """Uses git cli (if available) to list all ignored files and compare with those.""" def __init__(self, root: Path): super().__init__(root) diff --git a/tests/flytekit/unit/tools/test_benchmark.py b/tests/flytekit/unit/tools/test_benchmark.py deleted file mode 100644 index 7fdf15d82f..0000000000 --- a/tests/flytekit/unit/tools/test_benchmark.py +++ /dev/null @@ -1,53 +0,0 @@ -import pytest -import subprocess -import tarfile - -from flytekit.tools.ignore import GitIgnore, GitIgnoreV1, IgnoreGroup - -from tests.flytekit.unit.tools.test_ignore import make_tree - - -def create_archive_v1(source: str, name: str) -> None: - ignore = IgnoreGroup(source, [GitIgnoreV1]) - with tarfile.open(name, "w:gz") as tar: - tar.add(source, arcname="", filter=ignore.tar_filter) - - -def create_archive_v2(source: str, name: str) -> None: - ignore = IgnoreGroup(source, [GitIgnore]) - with tarfile.open(name, "w:gz") as tar: - tar.add(source, arcname="", filter=ignore.tar_filter) - - -@pytest.fixture -def flyte_project(tmp_path): - tree = { - "data": {"large.file": "", "more.files": ""}, - "src": { - "workflows": { - "__pycache__": {"some.pyc": ""}, - "hello_world.py": "print('Hello World!')", - } - }, - ".venv": {"lots": "", "of": "", "packages": ""}, - ".env": "supersecret", - "some.bar": "", - "some.foo": "", - "keep.foo": "", - ".gitignore": "\n".join([".env", ".venv", "# A comment", "data", "*.foo", "!keep.foo"]), - ".dockerignore": "\n".join(["data", "*.bar", ".git"]), - } - - make_tree(tmp_path, tree) - subprocess.run(["git", "init", tmp_path]) - return tmp_path - - -def test_v1(benchmark, flyte_project, tmp_path): - archive_fname = tmp_path / "archive.tar.gz" - benchmark(create_archive_v1, source=flyte_project, name=archive_fname) - - -def test_v2(benchmark, flyte_project, tmp_path): - archive_fname = tmp_path / "archive.tar.gz" - benchmark(create_archive_v2, source=flyte_project, name=archive_fname) From 2961a29e60100fe896573f6dad8291eac48ea26d Mon Sep 17 00:00:00 2001 From: Tim Bauer Date: Mon, 4 Apr 2022 15:17:17 +0200 Subject: [PATCH 11/25] Add list file functionality Signed-off-by: Tim Bauer --- flytekit/tools/ignore.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/flytekit/tools/ignore.py b/flytekit/tools/ignore.py index 64588a6435..7238327846 100644 --- a/flytekit/tools/ignore.py +++ b/flytekit/tools/ignore.py @@ -100,6 +100,7 @@ class IgnoreGroup(Ignore): Ignore considers it ignored.""" def __init__(self, root: str, ignores: List[Ignore]): + super().__init__(root) self.ignores = [ignore(root) for ignore in ignores] def _is_ignored(self, path: str) -> bool: @@ -107,3 +108,12 @@ def _is_ignored(self, path: str) -> bool: if ignore.is_ignored(path): return True return False + + def list_ignored(self) -> List[str]: + ignored = [] + for root, _, files in os.walk(self.root): + for file in files: + abs_path = os.path.join(root, file) + if self.is_ignored(abs_path): + ignored.append(os.path.relpath(abs_path, self.root)) + return ignored From 046fcc6e3e0caf5947ff027ce0c1a45aac71d0e2 Mon Sep 17 00:00:00 2001 From: Tim Bauer Date: Mon, 4 Apr 2022 15:19:32 +0200 Subject: [PATCH 12/25] Refactor digest as part of create archive Signed-off-by: Tim Bauer --- flytekit/clis/sdk_in_container/package.py | 2 +- flytekit/clis/sdk_in_container/serialize.py | 7 ++---- flytekit/tools/package_helpers.py | 26 +++++++++++++++++++-- 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/flytekit/clis/sdk_in_container/package.py b/flytekit/clis/sdk_in_container/package.py index e9d529eddc..69c6f603a8 100644 --- a/flytekit/clis/sdk_in_container/package.py +++ b/flytekit/clis/sdk_in_container/package.py @@ -13,7 +13,7 @@ from flytekit.tools.repo import NoSerializableEntitiesError, serialize_and_package from flytekit.core import context_manager -from flytekit.tools import fast_registration, module_loader, serialize_helpers +from flytekit.tools import module_loader, serialize_helpers from flytekit.tools.package_helpers import create_archive diff --git a/flytekit/clis/sdk_in_container/serialize.py b/flytekit/clis/sdk_in_container/serialize.py index 0026c6a859..cb9c9216e9 100644 --- a/flytekit/clis/sdk_in_container/serialize.py +++ b/flytekit/clis/sdk_in_container/serialize.py @@ -168,12 +168,9 @@ def fast_workflows(ctx, folder=None): click.echo(f"Writing output to {folder}") source_dir = ctx.obj[CTX_LOCAL_SRC_ROOT] - digest = _compute_digest(source_dir) - folder = folder if folder else "" - archive_fname = os.path.join(folder, f"{digest}.tar.gz") - click.echo(f"Writing compressed archive to {archive_fname}") # Write using gzip - create_archive(source_dir, archive_fname) + archive_fname = create_archive(source_dir, folder) + click.echo(f"Writing compressed archive to {archive_fname}") pkgs = ctx.obj[CTX_PACKAGES] dir = ctx.obj[CTX_LOCAL_SRC_ROOT] diff --git a/flytekit/tools/package_helpers.py b/flytekit/tools/package_helpers.py index 288c66e73c..6993fce365 100644 --- a/flytekit/tools/package_helpers.py +++ b/flytekit/tools/package_helpers.py @@ -1,9 +1,31 @@ +import os import tarfile +import checksumdir +from typing import List, Optional from flytekit.tools.ignore import DockerIgnore, GitIgnore, IgnoreGroup, StandardIgnore +FAST_PREFIX = "fast" +FAST_FILEENDING = ".tar.gz" -def create_archive(source: str, name: str) -> None: +def compute_digest(source_dir: os.PathLike, excluded_files: Optional[List[os.PathLike]] = None) -> str: + """ + Walks the entirety of the source dir to compute a deterministic hex digest of the dir contents. + :param _os.PathLike source_dir: + :return Text: + """ + return checksumdir.dirhash(source_dir, 'md5', include_paths=True, excluded_files=excluded_files) + + +def create_archive(source: os.PathLike, output_dir: Optional[os.PathLike] = None) -> os.PathLike: ignore = IgnoreGroup(source, [GitIgnore, DockerIgnore, StandardIgnore]) - with tarfile.open(name, "w:gz") as tar: + ignored_files = ignore.list_ignored() + digest = compute_digest(source, ignored_files) + archive_fname = f"{FAST_PREFIX}{digest}{FAST_FILEENDING}" + + if output_dir: + archive_fname = os.path.join(output_dir, archive_fname) + + with tarfile.open(archive_fname, "w:gz") as tar: tar.add(source, arcname="", filter=ignore.tar_filter) + return archive_fname From 576bb27ab8362c8024ff06f8a032888b30400fa9 Mon Sep 17 00:00:00 2001 From: Tim Bauer Date: Mon, 4 Apr 2022 15:19:47 +0200 Subject: [PATCH 13/25] Remove unused code Signed-off-by: Tim Bauer --- flytekit/tools/fast_registration.py | 61 ----------------------------- 1 file changed, 61 deletions(-) diff --git a/flytekit/tools/fast_registration.py b/flytekit/tools/fast_registration.py index e9f5eecc56..814bc92828 100644 --- a/flytekit/tools/fast_registration.py +++ b/flytekit/tools/fast_registration.py @@ -1,36 +1,13 @@ import os as _os import posixpath import subprocess as _subprocess -import tarfile as _tarfile -import tempfile as _tempfile -from pathlib import Path as _Path - -import checksumdir from flytekit.core.context_manager import FlyteContextManager -from flytekit.tools.package_helpers import create_archive -_tmp_versions_dir = "tmp/versions" file_access = FlyteContextManager.current_context().file_access -def compute_digest(source_dir: _os.PathLike) -> str: - """ - Walks the entirety of the source dir to compute a deterministic hex digest of the dir contents. - :param _os.PathLike source_dir: - :return Text: - """ - return f"fast{checksumdir.dirhash(source_dir, 'md5', include_paths=True)}" - - -def _write_marker(marker: _os.PathLike): - try: - open(marker, "x") - except FileExistsError: - pass - - def get_additional_distribution_loc(remote_location: str, identifier: str) -> str: """ :param Text remote_location: @@ -40,44 +17,6 @@ def get_additional_distribution_loc(remote_location: str, identifier: str) -> st return posixpath.join(remote_location, "{}.{}".format(identifier, "tar.gz")) -def upload_package(source_dir: _os.PathLike, identifier: str, remote_location: str, dry_run=False) -> str: - """ - Uploads the contents of the source dir as a tar package to a destination specified by the unique identifier and - remote_location. - :param _os.PathLike source_dir: - :param Text identifier: - :param Text remote_location: - :param bool dry_run: - :return Text: - """ - tmp_versions_dir = _os.path.join(_os.getcwd(), _tmp_versions_dir) - _os.makedirs(tmp_versions_dir, exist_ok=True) - marker = _Path(_os.path.join(tmp_versions_dir, identifier)) - full_remote_path = get_additional_distribution_loc(remote_location, identifier) - if _os.path.exists(marker): - print("Local marker for identifier {} already exists, skipping upload".format(identifier)) - return full_remote_path - - if file_access.exists(full_remote_path): - print("Remote file {} already exists, skipping upload".format(full_remote_path)) - _write_marker(marker) - return full_remote_path - - with _tempfile.NamedTemporaryFile() as fp: - # Write using gzip - create_archive(source_dir, fp.name) - if dry_run: - print("Would upload {} to {}".format(fp.name, full_remote_path)) - else: - file_access.put_data(fp.name, full_remote_path) - print("Uploaded {} to {}".format(fp.name, full_remote_path)) - - # Finally, touch the marker file so we have a flag in the future to avoid re-uploading the package dir as an - # optimization - _write_marker(marker) - return full_remote_path - - def download_distribution(additional_distribution: str, destination: str): """ Downloads a remote code distribution and overwrites any local files. From e614e51ebe51203c175c82efc0d91739a5f2f877 Mon Sep 17 00:00:00 2001 From: Tim Bauer Date: Mon, 4 Apr 2022 15:20:06 +0200 Subject: [PATCH 14/25] Add test for digest Signed-off-by: Tim Bauer --- tests/flytekit/unit/tools/test_archive.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/tests/flytekit/unit/tools/test_archive.py b/tests/flytekit/unit/tools/test_archive.py index c3d6117218..1d250f5e3c 100644 --- a/tests/flytekit/unit/tools/test_archive.py +++ b/tests/flytekit/unit/tools/test_archive.py @@ -3,8 +3,9 @@ import pytest -from flytekit.tools.package_helpers import create_archive +from flytekit.tools.package_helpers import FAST_FILEENDING, create_archive, compute_digest, FAST_PREFIX from tests.flytekit.unit.tools.test_ignore import make_tree +from flytekit.tools.ignore import DockerIgnore, GitIgnore, IgnoreGroup, StandardIgnore @pytest.fixture @@ -32,8 +33,7 @@ def flyte_project(tmp_path): def test_archive(flyte_project, tmp_path): - archive_fname = tmp_path / "archive.tar.gz" - create_archive(source=flyte_project, name=archive_fname) + archive_fname = create_archive(source=flyte_project, output_dir=tmp_path) with tarfile.open(archive_fname) as tar: assert tar.getnames() == [ "", # tar root, output removes leading '/' @@ -45,3 +45,18 @@ def test_archive(flyte_project, tmp_path): "src/workflows", "src/workflows/hello_world.py", ] + assert str(archive_fname).startswith(FAST_PREFIX) + assert str(archive_fname).endswith(FAST_FILEENDING) + + +def test_digest_ignore(flyte_project, tmp_path): + ignore = IgnoreGroup(flyte_project, [GitIgnore, DockerIgnore, StandardIgnore]) + ignored_files = ignore.list_ignored() + digest1 = compute_digest(flyte_project, ignored_files) + + change_file = flyte_project / "data" / "large.file" + assert ignore.is_ignored(change_file) + change_file.write_text("I don't matter") + + digest2 = compute_digest(flyte_project, ignored_files) + assert digest1 == digest2 From 6da60360e9d0dae87766fcd7da9dd75d0f41c365 Mon Sep 17 00:00:00 2001 From: Tim Bauer Date: Tue, 5 Apr 2022 16:02:31 +0200 Subject: [PATCH 15/25] Include own digest and refactor to fast_package Signed-off-by: Tim Bauer --- flytekit/clis/sdk_in_container/package.py | 2 +- flytekit/clis/sdk_in_container/serialize.py | 5 +- flytekit/tools/fast_registration.py | 76 +++++++++++++++++- flytekit/tools/package_helpers.py | 31 -------- tests/flytekit/unit/tools/test_archive.py | 62 --------------- .../unit/tools/test_fast_registration.py | 79 ++++++++++++++++++- tests/flytekit/unit/tools/test_ignore.py | 2 +- 7 files changed, 154 insertions(+), 103 deletions(-) delete mode 100644 flytekit/tools/package_helpers.py delete mode 100644 tests/flytekit/unit/tools/test_archive.py diff --git a/flytekit/clis/sdk_in_container/package.py b/flytekit/clis/sdk_in_container/package.py index 69c6f603a8..0c4cc16d7c 100644 --- a/flytekit/clis/sdk_in_container/package.py +++ b/flytekit/clis/sdk_in_container/package.py @@ -14,7 +14,7 @@ from flytekit.core import context_manager from flytekit.tools import module_loader, serialize_helpers -from flytekit.tools.package_helpers import create_archive +from flytekit.tools.fast_registration import fast_package @click.command("package") diff --git a/flytekit/clis/sdk_in_container/serialize.py b/flytekit/clis/sdk_in_container/serialize.py index cb9c9216e9..2933c521a6 100644 --- a/flytekit/clis/sdk_in_container/serialize.py +++ b/flytekit/clis/sdk_in_container/serialize.py @@ -1,6 +1,5 @@ import os import sys -import tarfile as _tarfile import typing from enum import Enum as _Enum @@ -13,8 +12,8 @@ from flytekit.tools.fast_registration import compute_digest as _compute_digest from flytekit.tools.fast_registration import filter_tar_file_fn as _filter_tar_file_fn from flytekit.tools.repo import serialize_to_folder +from flytekit.tools.fast_registration import fast_package from flytekit.tools.module_loader import trigger_loading -from flytekit.tools.package_helpers import create_archive from flytekit.tools.serialize_helpers import get_registrable_entities, persist_registrable_entities CTX_IMAGE = "image" @@ -169,7 +168,7 @@ def fast_workflows(ctx, folder=None): source_dir = ctx.obj[CTX_LOCAL_SRC_ROOT] # Write using gzip - archive_fname = create_archive(source_dir, folder) + archive_fname = fast_package(source_dir, folder) click.echo(f"Writing compressed archive to {archive_fname}") pkgs = ctx.obj[CTX_PACKAGES] diff --git a/flytekit/tools/fast_registration.py b/flytekit/tools/fast_registration.py index 814bc92828..ade2903a26 100644 --- a/flytekit/tools/fast_registration.py +++ b/flytekit/tools/fast_registration.py @@ -1,13 +1,81 @@ -import os as _os +from __future__ import annotations + +import hashlib +import os import posixpath import subprocess as _subprocess +import tarfile +from typing import Optional from flytekit.core.context_manager import FlyteContextManager +from flytekit.tools.ignore import DockerIgnore, GitIgnore, IgnoreGroup, StandardIgnore +FAST_PREFIX = "fast" +FAST_FILEENDING = ".tar.gz" file_access = FlyteContextManager.current_context().file_access +def fast_package(source: os.PathLike, output_dir: os.PathLike) -> os.PathLike: + """ + Takes a source directory and packages everything not covered by common ignores into a tarball + named after a hexdigest of the included files. + :param os.PathLike source: + :param os.PathLike output_dir: + :return os.PathLike: + """ + ignore = IgnoreGroup(source, [GitIgnore, DockerIgnore, StandardIgnore]) + digest = compute_digest(source, ignore.is_ignored) + archive_fname = f"{FAST_PREFIX}{digest}{FAST_FILEENDING}" + + if output_dir: + archive_fname = os.path.join(output_dir, archive_fname) + + with tarfile.open(archive_fname, "w:gz") as tar: + tar.add(source, arcname="", filter=ignore.tar_filter) + + return archive_fname + + +def compute_digest(source: os.PathLike, filter: Optional[callable] = None) -> str: + """ + Walks the entirety of the source dir to compute a deterministic md5 hex digest of the dir contents. + :param os.PathLike source: + :param Ignore ignore: + :return Text: + """ + hasher = hashlib.md5() + for root, _, files in os.walk(source, topdown=True): + + files.sort() + + for fname in files: + abspath = os.path.join(root, fname) + relpath = os.path.relpath(abspath, source) + if filter: + if filter(relpath): + continue + + _filehash_update(abspath, hasher) + _pathhash_update(relpath, hasher) + + return hasher.hexdigest() + + +def _filehash_update(path: os.PathLike, hasher: hashlib._Hash) -> None: + blocksize = 65536 + with open(path, "rb") as f: + bytes = f.read(blocksize) + while bytes: + hasher.update(bytes) + bytes = f.read(blocksize) + + +def _pathhash_update(path: os.PathLike, hasher: hashlib._Hash) -> None: + path_list = path.split(os.sep) + hasher.update("".join(path_list).encode("utf-8")) + + def get_additional_distribution_loc(remote_location: str, identifier: str) -> str: """ :param Text remote_location: @@ -21,16 +89,16 @@ def download_distribution(additional_distribution: str, destination: str): """ Downloads a remote code distribution and overwrites any local files. :param Text additional_distribution: - :param _os.PathLike destination: + :param os.PathLike destination: """ file_access.get_data(additional_distribution, destination) - tarfile_name = _os.path.basename(additional_distribution) + tarfile_name = os.path.basename(additional_distribution) if not tarfile_name.endswith(".tar.gz"): raise ValueError("Unrecognized additional distribution format for {}".format(additional_distribution)) # This will overwrite the existing user flyte workflow code in the current working code dir. result = _subprocess.run( - ["tar", "-xvf", _os.path.join(destination, tarfile_name), "-C", destination], + ["tar", "-xvf", os.path.join(destination, tarfile_name), "-C", destination], stdout=_subprocess.PIPE, ) result.check_returncode() diff --git a/flytekit/tools/package_helpers.py b/flytekit/tools/package_helpers.py deleted file mode 100644 index 6993fce365..0000000000 --- a/flytekit/tools/package_helpers.py +++ /dev/null @@ -1,31 +0,0 @@ -import os -import tarfile -import checksumdir -from typing import List, Optional - -from flytekit.tools.ignore import DockerIgnore, GitIgnore, IgnoreGroup, StandardIgnore - -FAST_PREFIX = "fast" -FAST_FILEENDING = ".tar.gz" - -def compute_digest(source_dir: os.PathLike, excluded_files: Optional[List[os.PathLike]] = None) -> str: - """ - Walks the entirety of the source dir to compute a deterministic hex digest of the dir contents. - :param _os.PathLike source_dir: - :return Text: - """ - return checksumdir.dirhash(source_dir, 'md5', include_paths=True, excluded_files=excluded_files) - - -def create_archive(source: os.PathLike, output_dir: Optional[os.PathLike] = None) -> os.PathLike: - ignore = IgnoreGroup(source, [GitIgnore, DockerIgnore, StandardIgnore]) - ignored_files = ignore.list_ignored() - digest = compute_digest(source, ignored_files) - archive_fname = f"{FAST_PREFIX}{digest}{FAST_FILEENDING}" - - if output_dir: - archive_fname = os.path.join(output_dir, archive_fname) - - with tarfile.open(archive_fname, "w:gz") as tar: - tar.add(source, arcname="", filter=ignore.tar_filter) - return archive_fname diff --git a/tests/flytekit/unit/tools/test_archive.py b/tests/flytekit/unit/tools/test_archive.py deleted file mode 100644 index 1d250f5e3c..0000000000 --- a/tests/flytekit/unit/tools/test_archive.py +++ /dev/null @@ -1,62 +0,0 @@ -import subprocess -import tarfile - -import pytest - -from flytekit.tools.package_helpers import FAST_FILEENDING, create_archive, compute_digest, FAST_PREFIX -from tests.flytekit.unit.tools.test_ignore import make_tree -from flytekit.tools.ignore import DockerIgnore, GitIgnore, IgnoreGroup, StandardIgnore - - -@pytest.fixture -def flyte_project(tmp_path): - tree = { - "data": {"large.file": "", "more.files": ""}, - "src": { - "workflows": { - "__pycache__": {"some.pyc": ""}, - "hello_world.py": "print('Hello World!')", - } - }, - ".venv": {"lots": "", "of": "", "packages": ""}, - ".env": "supersecret", - "some.bar": "", - "some.foo": "", - "keep.foo": "", - ".gitignore": "\n".join([".env", ".venv", "# A comment", "data", "*.foo", "!keep.foo"]), - ".dockerignore": "\n".join(["data", "*.bar", ".git"]), - } - - make_tree(tmp_path, tree) - subprocess.run(["git", "init", tmp_path]) - return tmp_path - - -def test_archive(flyte_project, tmp_path): - archive_fname = create_archive(source=flyte_project, output_dir=tmp_path) - with tarfile.open(archive_fname) as tar: - assert tar.getnames() == [ - "", # tar root, output removes leading '/' - ".dockerignore", - ".gitignore", - ".venv", - "keep.foo", - "src", - "src/workflows", - "src/workflows/hello_world.py", - ] - assert str(archive_fname).startswith(FAST_PREFIX) - assert str(archive_fname).endswith(FAST_FILEENDING) - - -def test_digest_ignore(flyte_project, tmp_path): - ignore = IgnoreGroup(flyte_project, [GitIgnore, DockerIgnore, StandardIgnore]) - ignored_files = ignore.list_ignored() - digest1 = compute_digest(flyte_project, ignored_files) - - change_file = flyte_project / "data" / "large.file" - assert ignore.is_ignored(change_file) - change_file.write_text("I don't matter") - - digest2 = compute_digest(flyte_project, ignored_files) - assert digest1 == digest2 diff --git a/tests/flytekit/unit/tools/test_fast_registration.py b/tests/flytekit/unit/tools/test_fast_registration.py index f3bcf0f111..6a2b3eb498 100644 --- a/tests/flytekit/unit/tools/test_fast_registration.py +++ b/tests/flytekit/unit/tools/test_fast_registration.py @@ -1,6 +1,83 @@ +import os +import subprocess import tarfile -from flytekit.tools.fast_registration import get_additional_distribution_loc +import pytest + +from flytekit.tools.fast_registration import ( + FAST_FILEENDING, + FAST_PREFIX, + compute_digest, + fast_package, + get_additional_distribution_loc, +) +from flytekit.tools.ignore import DockerIgnore, GitIgnore, IgnoreGroup, StandardIgnore +from tests.flytekit.unit.tools.test_ignore import make_tree + + +@pytest.fixture +def flyte_project(tmp_path): + tree = { + "data": {"large.file": "", "more.files": ""}, + "src": { + "workflows": { + "__pycache__": {"some.pyc": ""}, + "hello_world.py": "print('Hello World!')", + } + }, + ".venv": {"lots": "", "of": "", "packages": ""}, + ".env": "supersecret", + "some.bar": "", + "some.foo": "", + "keep.foo": "", + ".gitignore": "\n".join([".env", ".venv", "# A comment", "data", "*.foo", "!keep.foo"]), + ".dockerignore": "\n".join(["data", "*.bar", ".git"]), + } + + make_tree(tmp_path, tree) + subprocess.run(["git", "init", tmp_path]) + return tmp_path + + +def test_package(flyte_project, tmp_path): + archive_fname = fast_package(source=flyte_project, output_dir=tmp_path) + with tarfile.open(archive_fname) as tar: + assert tar.getnames() == [ + "", # tar root, output removes leading '/' + ".dockerignore", + ".gitignore", + ".venv", # Included in archive but empty - git ls-files operates on files only + "keep.foo", + "src", + "src/workflows", + "src/workflows/hello_world.py", + ] + assert str(os.path.basename(archive_fname)).startswith(FAST_PREFIX) + assert str(archive_fname).endswith(FAST_FILEENDING) + + +def test_digest_ignore(flyte_project): + ignore = IgnoreGroup(flyte_project, [GitIgnore, DockerIgnore, StandardIgnore]) + digest1 = compute_digest(flyte_project, ignore.is_ignored) + + change_file = flyte_project / "data" / "large.file" + assert ignore.is_ignored(change_file) + change_file.write_text("I don't matter") + + digest2 = compute_digest(flyte_project, ignore.is_ignored) + assert digest1 == digest2 + + +def test_digest_change(flyte_project): + ignore = IgnoreGroup(flyte_project, [GitIgnore, DockerIgnore, StandardIgnore]) + digest1 = compute_digest(flyte_project, ignore.is_ignored) + + change_file = flyte_project / "src" / "workflows" / "hello_world.py" + assert not ignore.is_ignored(change_file) + change_file.write_text("print('I do matter!')") + + digest2 = compute_digest(flyte_project, ignore.is_ignored) + assert digest1 != digest2 def test_get_additional_distribution_loc(): diff --git a/tests/flytekit/unit/tools/test_ignore.py b/tests/flytekit/unit/tools/test_ignore.py index 8f2ec2949c..5d8d22607c 100644 --- a/tests/flytekit/unit/tools/test_ignore.py +++ b/tests/flytekit/unit/tools/test_ignore.py @@ -1,7 +1,7 @@ import subprocess from pathlib import Path from tarfile import TarInfo -from typing import Dict, List +from typing import Dict from unittest.mock import patch import pytest From acf07999b05debbfb16e0f2181eba9b5be2ac965 Mon Sep 17 00:00:00 2001 From: Tim Bauer Date: Tue, 5 Apr 2022 16:31:22 +0200 Subject: [PATCH 16/25] Clean up rebase, integrate fast_package to package Signed-off-by: Tim Bauer --- flytekit/clis/sdk_in_container/package.py | 4 ---- flytekit/clis/sdk_in_container/serialize.py | 5 +---- flytekit/tools/repo.py | 7 +------ 3 files changed, 2 insertions(+), 14 deletions(-) diff --git a/flytekit/clis/sdk_in_container/package.py b/flytekit/clis/sdk_in_container/package.py index 0c4cc16d7c..71efeab576 100644 --- a/flytekit/clis/sdk_in_container/package.py +++ b/flytekit/clis/sdk_in_container/package.py @@ -12,10 +12,6 @@ ) from flytekit.tools.repo import NoSerializableEntitiesError, serialize_and_package -from flytekit.core import context_manager -from flytekit.tools import module_loader, serialize_helpers -from flytekit.tools.fast_registration import fast_package - @click.command("package") @click.option( diff --git a/flytekit/clis/sdk_in_container/serialize.py b/flytekit/clis/sdk_in_container/serialize.py index 2933c521a6..1e46fe6830 100644 --- a/flytekit/clis/sdk_in_container/serialize.py +++ b/flytekit/clis/sdk_in_container/serialize.py @@ -9,12 +9,9 @@ from flytekit.clis.sdk_in_container.constants import CTX_PACKAGES from flytekit.configuration import FastSerializationSettings, ImageConfig, SerializationSettings from flytekit.exceptions.scopes import system_entry_point -from flytekit.tools.fast_registration import compute_digest as _compute_digest -from flytekit.tools.fast_registration import filter_tar_file_fn as _filter_tar_file_fn from flytekit.tools.repo import serialize_to_folder from flytekit.tools.fast_registration import fast_package -from flytekit.tools.module_loader import trigger_loading -from flytekit.tools.serialize_helpers import get_registrable_entities, persist_registrable_entities + CTX_IMAGE = "image" CTX_LOCAL_SRC_ROOT = "local_source_root" diff --git a/flytekit/tools/repo.py b/flytekit/tools/repo.py index 92079b3812..5bcb95f952 100644 --- a/flytekit/tools/repo.py +++ b/flytekit/tools/repo.py @@ -84,12 +84,7 @@ def package( # If Fast serialization is enabled, then an archive is also created and packaged if fast: - digest = fast_registration.compute_digest(source) - archive_fname = os.path.join(output_tmpdir, f"{digest}.tar.gz") - click.secho(f"Fast mode enabled: compressed archive {archive_fname}", dim=True) - # Write using gzip - with tarfile.open(archive_fname, "w:gz") as tar: - tar.add(source, arcname="", filter=fast_registration.filter_tar_file_fn) + fast_registration.fast_package(source, output_tmpdir) with tarfile.open(output, "w:gz") as tar: tar.add(output_tmpdir, arcname="") From 9a9091a3bbaec436a02eded2ef2bdd2ae44da99a Mon Sep 17 00:00:00 2001 From: Tim Bauer Date: Tue, 5 Apr 2022 17:52:21 +0200 Subject: [PATCH 17/25] Include recursive check for empty dirs Signed-off-by: Tim Bauer --- flytekit/tools/ignore.py | 3 +++ tests/flytekit/unit/tools/test_fast_registration.py | 1 - tests/flytekit/unit/tools/test_ignore.py | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/flytekit/tools/ignore.py b/flytekit/tools/ignore.py index 7238327846..4202c21567 100644 --- a/flytekit/tools/ignore.py +++ b/flytekit/tools/ignore.py @@ -57,6 +57,9 @@ def _is_ignored(self, path: str) -> bool: if self.ignored: if path in self.ignored: return True + # Ignore empty directories + if os.path.isdir(os.path.join(self.root, path)) and all([self.is_ignored(os.path.join(path, f)) for f in os.listdir(os.path.join(self.root, path))]): + return True return False diff --git a/tests/flytekit/unit/tools/test_fast_registration.py b/tests/flytekit/unit/tools/test_fast_registration.py index 6a2b3eb498..e7219f1d9a 100644 --- a/tests/flytekit/unit/tools/test_fast_registration.py +++ b/tests/flytekit/unit/tools/test_fast_registration.py @@ -46,7 +46,6 @@ def test_package(flyte_project, tmp_path): "", # tar root, output removes leading '/' ".dockerignore", ".gitignore", - ".venv", # Included in archive but empty - git ls-files operates on files only "keep.foo", "src", "src/workflows", diff --git a/tests/flytekit/unit/tools/test_ignore.py b/tests/flytekit/unit/tools/test_ignore.py index 5d8d22607c..86e8fbcfbe 100644 --- a/tests/flytekit/unit/tools/test_ignore.py +++ b/tests/flytekit/unit/tools/test_ignore.py @@ -104,6 +104,7 @@ def all_ignore(tmp_path): def test_simple_gitignore(simple_gitignore): gitignore = GitIgnore(simple_gitignore) assert gitignore.is_ignored(str(simple_gitignore / "test.foo")) + assert gitignore.is_ignored(str(simple_gitignore / "sub")) assert gitignore.is_ignored(str(simple_gitignore / "sub" / "some.bar")) assert not gitignore.is_ignored(str(simple_gitignore / "keep.foo")) assert not gitignore.is_ignored(str(simple_gitignore / ".gitignore")) From da379a9abededf0e62e8d6173c518779ffb92c51 Mon Sep 17 00:00:00 2001 From: Tim Bauer Date: Tue, 5 Apr 2022 17:53:22 +0200 Subject: [PATCH 18/25] Remove obsolete checksumdir, add docker-py Signed-off-by: Tim Bauer --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 337a7689a1..30b4c71321 100644 --- a/setup.py +++ b/setup.py @@ -44,6 +44,7 @@ "click>=6.6,<9.0", "croniter>=0.3.20,<4.0.0", "deprecated>=1.0,<2.0", + "docker-py>=1.10.6,<2.0", "python-dateutil>=2.1", "grpcio>=1.44.0,!=1.45.0,<2.0", "grpcio-status>=1.44,!=1.45.0", @@ -68,7 +69,6 @@ "typing_extensions", "docstring-parser>=0.9.0", "diskcache>=5.2.1", - "checksumdir>=1.2.0", "cloudpickle>=2.0.0", "cookiecutter>=1.7.3", "numpy<=1.22.1; python_version < '3.8.0'", From 69183ae15e9fd447893f63639817bba0fa791c1c Mon Sep 17 00:00:00 2001 From: Tim Bauer Date: Tue, 5 Apr 2022 18:10:31 +0200 Subject: [PATCH 19/25] Dict key lookup for performance increase Signed-off-by: Tim Bauer --- flytekit/tools/ignore.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/flytekit/tools/ignore.py b/flytekit/tools/ignore.py index 4202c21567..4f6228f443 100644 --- a/flytekit/tools/ignore.py +++ b/flytekit/tools/ignore.py @@ -5,7 +5,7 @@ from fnmatch import fnmatch from pathlib import Path from shutil import which -from typing import List, Optional +from typing import Dict, List, Optional from docker.utils.build import PatternMatcher @@ -43,15 +43,15 @@ def __init__(self, root: Path): self.has_git = which("git") is not None self.ignored = self._list_ignored() - def _list_ignored(self) -> List[str]: + def _list_ignored(self) -> Dict: if self.has_git: out = subprocess.run(["git", "ls-files", "-io", "--exclude-standard"], cwd=self.root, capture_output=True) if out.returncode == 0: - return out.stdout.decode("utf-8").split("\n")[:-1] + return dict.fromkeys(out.stdout.decode("utf-8").split("\n")[:-1]) cli_logger.warning(f"Could not determine ignored files due to:\n{out.stderr}\nNot applying any filters") - return [] + return {} cli_logger.info("No git executable found, not applying any filters") - return [] + return {} def _is_ignored(self, path: str) -> bool: if self.ignored: From d031cb5888d7d9b63a922577463aa50d37e782d3 Mon Sep 17 00:00:00 2001 From: Tim Bauer Date: Tue, 5 Apr 2022 18:55:15 +0200 Subject: [PATCH 20/25] Fix requirements Signed-off-by: Tim Bauer --- requirements.in | 2 ++ requirements.txt | 14 ++++++++++++-- setup.py | 2 +- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/requirements.in b/requirements.in index a5e0af332e..b690b42427 100644 --- a/requirements.in +++ b/requirements.in @@ -11,3 +11,5 @@ pyyaml<6 # releases (pandas>=1.4.0 and numpy>=1.22.0). More details in https://github.com/flyteorg/flyte/issues/2115. pandas<1.4.0 numpy<1.22.0 +# This is required by docker-compose and otherwise clashes with docker-py +websocket-client<1.0.0 diff --git a/requirements.txt b/requirements.txt index 1ce263688f..720aa9d4ea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,8 +22,6 @@ chardet==4.0.0 # via binaryornot charset-normalizer==2.0.12 # via requests -checksumdir==1.2.0 - # via flytekit click==8.1.2 # via # cookiecutter @@ -46,6 +44,10 @@ diskcache==5.4.0 # via flytekit docker-image-py==0.1.12 # via flytekit +docker-py==1.10.6 + # via flytekit +docker-pycreds==0.4.0 + # via docker-py docstring-parser==0.13 # via flytekit flyteidl==0.24.13 @@ -150,6 +152,7 @@ regex==2022.3.15 requests==2.27.1 # via # cookiecutter + # docker-py # flytekit # responses responses==0.20.0 @@ -161,9 +164,12 @@ secretstorage==3.3.1 six==1.16.0 # via # cookiecutter + # docker-py + # docker-pycreds # grpcio # jsonschema # python-dateutil + # websocket-client sortedcontainers==2.4.0 # via flytekit statsd==3.3.0 @@ -181,6 +187,10 @@ urllib3==1.26.9 # flytekit # requests # responses +websocket-client==0.59.0 + # via + # -r requirements.in + # docker-py wheel==0.37.1 # via flytekit wrapt==1.14.0 diff --git a/setup.py b/setup.py index 30b4c71321..00cc282a8a 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ "click>=6.6,<9.0", "croniter>=0.3.20,<4.0.0", "deprecated>=1.0,<2.0", - "docker-py>=1.10.6,<2.0", + "docker-py>=1.10.6,<2.0.0", "python-dateutil>=2.1", "grpcio>=1.44.0,!=1.45.0,<2.0", "grpcio-status>=1.44,!=1.45.0", From e239da987be5b16e57cc7ae4a440cf061f1ea187 Mon Sep 17 00:00:00 2001 From: Tim Bauer Date: Tue, 5 Apr 2022 19:02:00 +0200 Subject: [PATCH 21/25] Adjust click echos Signed-off-by: Tim Bauer --- flytekit/clis/sdk_in_container/serialize.py | 2 +- flytekit/tools/repo.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/flytekit/clis/sdk_in_container/serialize.py b/flytekit/clis/sdk_in_container/serialize.py index 1e46fe6830..485ba0cfc7 100644 --- a/flytekit/clis/sdk_in_container/serialize.py +++ b/flytekit/clis/sdk_in_container/serialize.py @@ -166,7 +166,7 @@ def fast_workflows(ctx, folder=None): source_dir = ctx.obj[CTX_LOCAL_SRC_ROOT] # Write using gzip archive_fname = fast_package(source_dir, folder) - click.echo(f"Writing compressed archive to {archive_fname}") + click.echo(f"Wrote compressed archive to {archive_fname}") pkgs = ctx.obj[CTX_PACKAGES] dir = ctx.obj[CTX_LOCAL_SRC_ROOT] diff --git a/flytekit/tools/repo.py b/flytekit/tools/repo.py index 5bcb95f952..740847a2fc 100644 --- a/flytekit/tools/repo.py +++ b/flytekit/tools/repo.py @@ -84,7 +84,8 @@ def package( # If Fast serialization is enabled, then an archive is also created and packaged if fast: - fast_registration.fast_package(source, output_tmpdir) + archive_fname = fast_registration.fast_package(source, output_tmpdir) + click.secho(f"Fast mode enabled: compressed archive {archive_fname}", dim=True) with tarfile.open(output, "w:gz") as tar: tar.add(output_tmpdir, arcname="") From ebe2f2f3a4608d06f64281e0feea573866dfc174 Mon Sep 17 00:00:00 2001 From: Tim Bauer Date: Tue, 5 Apr 2022 21:07:12 +0200 Subject: [PATCH 22/25] re-compile dependencies Signed-off-by: Tim Bauer --- dev-requirements.txt | 17 +++++++++++++---- doc-requirements.txt | 11 +++++++++-- requirements-spark2.txt | 14 ++++++++++++-- 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index 42f02c41a5..1823c493fe 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -46,10 +46,6 @@ charset-normalizer==2.0.12 # via # -c requirements.txt # requests -checksumdir==1.2.0 - # via - # -c requirements.txt - # flytekit click==8.1.2 # via # -c requirements.txt @@ -106,6 +102,14 @@ docker-image-py==0.1.12 # via # -c requirements.txt # flytekit +docker-py==1.10.6 + # via + # -c requirements.txt + # flytekit +docker-pycreds==0.4.0 + # via + # -c requirements.txt + # docker-py dockerpty==0.4.1 # via docker-compose docopt==0.6.2 @@ -357,6 +361,7 @@ requests==2.27.1 # cookiecutter # docker # docker-compose + # docker-py # flytekit # google-api-core # google-cloud-bigquery @@ -380,6 +385,8 @@ six==1.16.0 # -c requirements.txt # bcrypt # cookiecutter + # docker-py + # docker-pycreds # dockerpty # google-auth # grpcio @@ -429,8 +436,10 @@ virtualenv==20.14.0 # via pre-commit websocket-client==0.59.0 # via + # -c requirements.txt # docker # docker-compose + # docker-py wheel==0.37.1 # via # -c requirements.txt diff --git a/doc-requirements.txt b/doc-requirements.txt index c29fdfb431..e7df32ae10 100644 --- a/doc-requirements.txt +++ b/doc-requirements.txt @@ -29,8 +29,6 @@ chardet==4.0.0 # via binaryornot charset-normalizer==2.0.12 # via requests -checksumdir==1.2.0 - # via flytekit click==8.1.2 # via # cookiecutter @@ -57,6 +55,10 @@ diskcache==5.4.0 # via flytekit docker-image-py==0.1.12 # via flytekit +docker-py==1.10.6 + # via flytekit +docker-pycreds==0.4.0 + # via docker-py docstring-parser==0.13 # via flytekit docutils==0.17.1 @@ -180,6 +182,7 @@ regex==2022.3.15 requests==2.27.1 # via # cookiecutter + # docker-py # flytekit # responses # sphinx @@ -192,6 +195,8 @@ secretstorage==3.3.1 six==1.16.0 # via # cookiecutter + # docker-py + # docker-pycreds # grpcio # python-dateutil # sphinx-code-include @@ -264,6 +269,8 @@ urllib3==1.26.9 # flytekit # requests # responses +websocket-client==1.3.2 + # via docker-py wheel==0.37.1 # via flytekit wrapt==1.14.0 diff --git a/requirements-spark2.txt b/requirements-spark2.txt index 60dbf9e28c..26e94ffbc1 100644 --- a/requirements-spark2.txt +++ b/requirements-spark2.txt @@ -24,8 +24,6 @@ chardet==4.0.0 # via binaryornot charset-normalizer==2.0.12 # via requests -checksumdir==1.2.0 - # via flytekit click==8.1.2 # via # cookiecutter @@ -48,6 +46,10 @@ diskcache==5.4.0 # via flytekit docker-image-py==0.1.12 # via flytekit +docker-py==1.10.6 + # via flytekit +docker-pycreds==0.4.0 + # via docker-py docstring-parser==0.13 # via flytekit flyteidl==0.24.13 @@ -152,6 +154,7 @@ regex==2022.3.15 requests==2.27.1 # via # cookiecutter + # docker-py # flytekit # responses responses==0.20.0 @@ -163,9 +166,12 @@ secretstorage==3.3.1 six==1.16.0 # via # cookiecutter + # docker-py + # docker-pycreds # grpcio # jsonschema # python-dateutil + # websocket-client sortedcontainers==2.4.0 # via flytekit statsd==3.3.0 @@ -183,6 +189,10 @@ urllib3==1.26.9 # flytekit # requests # responses +websocket-client==0.59.0 + # via + # -r requirements.in + # docker-py wheel==0.37.1 # via flytekit wrapt==1.14.0 From becdf2ce396519e4a1e74ce549e937847ec0d853 Mon Sep 17 00:00:00 2001 From: Tim Bauer Date: Tue, 5 Apr 2022 21:44:17 +0200 Subject: [PATCH 23/25] Fix lint Signed-off-by: Tim Bauer --- flytekit/clis/sdk_in_container/serialize.py | 3 +-- flytekit/tools/ignore.py | 6 ++++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/flytekit/clis/sdk_in_container/serialize.py b/flytekit/clis/sdk_in_container/serialize.py index 485ba0cfc7..0b12d6b406 100644 --- a/flytekit/clis/sdk_in_container/serialize.py +++ b/flytekit/clis/sdk_in_container/serialize.py @@ -9,9 +9,8 @@ from flytekit.clis.sdk_in_container.constants import CTX_PACKAGES from flytekit.configuration import FastSerializationSettings, ImageConfig, SerializationSettings from flytekit.exceptions.scopes import system_entry_point -from flytekit.tools.repo import serialize_to_folder from flytekit.tools.fast_registration import fast_package - +from flytekit.tools.repo import serialize_to_folder CTX_IMAGE = "image" CTX_LOCAL_SRC_ROOT = "local_source_root" diff --git a/flytekit/tools/ignore.py b/flytekit/tools/ignore.py index 4f6228f443..10068fd52d 100644 --- a/flytekit/tools/ignore.py +++ b/flytekit/tools/ignore.py @@ -58,8 +58,10 @@ def _is_ignored(self, path: str) -> bool: if path in self.ignored: return True # Ignore empty directories - if os.path.isdir(os.path.join(self.root, path)) and all([self.is_ignored(os.path.join(path, f)) for f in os.listdir(os.path.join(self.root, path))]): - return True + if os.path.isdir(os.path.join(self.root, path)) and all( + [self.is_ignored(os.path.join(path, f)) for f in os.listdir(os.path.join(self.root, path))] + ): + return True return False From 5aa1a15daae58bb653d33acf306ac8894f163120 Mon Sep 17 00:00:00 2001 From: Tim Bauer Date: Wed, 6 Apr 2022 17:07:15 +0200 Subject: [PATCH 24/25] Remove obsolete import Signed-off-by: Tim Bauer --- flytekit/tools/repo.py | 1 - 1 file changed, 1 deletion(-) diff --git a/flytekit/tools/repo.py b/flytekit/tools/repo.py index 740847a2fc..63b21324a5 100644 --- a/flytekit/tools/repo.py +++ b/flytekit/tools/repo.py @@ -1,4 +1,3 @@ -import os import tarfile import tempfile import typing From 5f83195e5ea32a62e3dcd9d862d5c77101ed3e0d Mon Sep 17 00:00:00 2001 From: Eduardo Apolinario Date: Wed, 20 Apr 2022 20:27:50 -0700 Subject: [PATCH 25/25] Fix tests on windows Signed-off-by: Eduardo Apolinario --- flytekit/tools/ignore.py | 3 ++- tests/flytekit/unit/tools/test_fast_registration.py | 2 +- tests/flytekit/unit/tools/test_ignore.py | 13 +++++++++---- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/flytekit/tools/ignore.py b/flytekit/tools/ignore.py index 10068fd52d..769727c4dd 100644 --- a/flytekit/tools/ignore.py +++ b/flytekit/tools/ignore.py @@ -55,7 +55,8 @@ def _list_ignored(self) -> Dict: def _is_ignored(self, path: str) -> bool: if self.ignored: - if path in self.ignored: + # git-ls-files uses POSIX paths + if Path(path).as_posix() in self.ignored: return True # Ignore empty directories if os.path.isdir(os.path.join(self.root, path)) and all( diff --git a/tests/flytekit/unit/tools/test_fast_registration.py b/tests/flytekit/unit/tools/test_fast_registration.py index e7219f1d9a..0b50d6fdcf 100644 --- a/tests/flytekit/unit/tools/test_fast_registration.py +++ b/tests/flytekit/unit/tools/test_fast_registration.py @@ -35,7 +35,7 @@ def flyte_project(tmp_path): } make_tree(tmp_path, tree) - subprocess.run(["git", "init", tmp_path]) + subprocess.run(["git", "init", str(tmp_path)]) return tmp_path diff --git a/tests/flytekit/unit/tools/test_ignore.py b/tests/flytekit/unit/tools/test_ignore.py index 86e8fbcfbe..1f93cd2bee 100644 --- a/tests/flytekit/unit/tools/test_ignore.py +++ b/tests/flytekit/unit/tools/test_ignore.py @@ -1,3 +1,4 @@ +import os import subprocess from pathlib import Path from tarfile import TarInfo @@ -31,7 +32,7 @@ def simple_gitignore(tmp_path): } make_tree(tmp_path, tree) - subprocess.run(["git", "init", tmp_path]) + subprocess.run(["git", "init", str(tmp_path)]) return tmp_path @@ -52,7 +53,7 @@ def nested_gitignore(tmp_path): } make_tree(tmp_path, tree) - subprocess.run(["git", "init", tmp_path]) + subprocess.run(["git", "init", str(tmp_path)]) return tmp_path @@ -97,7 +98,7 @@ def all_ignore(tmp_path): } make_tree(tmp_path, tree) - subprocess.run(["git", "init", tmp_path]) + subprocess.run(["git", "init", str(tmp_path)]) return tmp_path @@ -114,7 +115,11 @@ def test_simple_gitignore(simple_gitignore): def test_not_subpath(simple_gitignore): """Test edge case that if path is not on root it cannot not be ignored""" gitignore = GitIgnore(simple_gitignore) - assert not gitignore.is_ignored("/whatever/test.foo") + if os.name == "nt": + # Relative paths can only be compared if files are in the same drive + assert not gitignore.is_ignored(str(Path(simple_gitignore.drive) / "whatever" / "test.foo")) + else: + assert not gitignore.is_ignored("/whatever/test.foo") def test_nested_gitignore(nested_gitignore):