Skip to content

Commit

Permalink
Add support for paginating differences in initial commit
Browse files Browse the repository at this point in the history
**Why?**

Before, we were only matching against the first page of differences.
This could result in files being deleted erroneously if the repository went
through a large change such as an update that touches almost all files.

**What?**

Added support for paginating the differences and updated the tests to cover
these.
  • Loading branch information
sbkok committed Sep 26, 2022
1 parent a80d05b commit 8ecab67
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 81 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
NOT_YET_CREATED = "NOT_YET_CREATED"
CC_CLIENT = boto3.client("codecommit")
CONFIG_FILE_REGEX = re.compile(r"\A.*[.](yaml|yml|json)\Z", re.I)
EXECUTABLE_FILES = []
EXECUTABLE_FILES: List[str] = []
ADF_LOG_LEVEL = os.environ.get("ADF_LOG_LEVEL", "INFO")
logging.basicConfig(level=logging.INFO)
LOGGER = logging.getLogger(__name__)
Expand Down Expand Up @@ -410,22 +410,30 @@ def get_files_to_delete(
repo_name: str,
directory_path: Path,
) -> List[FileToDelete]:
differences = CC_CLIENT.get_differences(
paginator = CC_CLIENT.get_paginator('get_differences')
page_iterator = paginator.paginate(
repositoryName=repo_name,
afterCommitSpecifier='HEAD'
)['differences']

# We never want to delete JSON or YAML files
file_paths = [
Path(file['afterBlob']['path'])
for file in differences
if not CONFIG_FILE_REGEX.match(file['afterBlob']['path'])
]
afterCommitSpecifier='HEAD',
)
unfiltered_file_paths = []
for page in page_iterator:
unfiltered_file_paths.extend(list(
map(
lambda obj: Path(obj['afterBlob']['path']),
page['differences'],
),
))

file_paths = list(filter(
# We never want to delete JSON or YAML files
lambda path: not CONFIG_FILE_REGEX.match(str(path)),
unfiltered_file_paths,
))

blobs = [
# Get the paths relative to the directory path so we can compare them
# correctly.
filename.relative_to(directory_path)
str(filename.relative_to(directory_path))
for filename in directory_path.rglob('*')
]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

# pylint: skip-file

import os
import tempfile
from pathlib import Path
import pytest
from mock import Mock, patch
Expand Down Expand Up @@ -33,6 +35,7 @@
SHOULD_NOT_DELETE_DIRS = [
'deployment_maps',
'deployment',
'samples',
]
SHOULD_DELETE_PATHS = [
'other.txt',
Expand All @@ -56,9 +59,8 @@ def is_dir(self):
return self.path in SHOULD_NOT_DELETE_DIRS


@patch('initial_commit.Path')
@patch('initial_commit.CC_CLIENT')
def test_get_files_to_delete(cc_client, path_cls):
def test_get_files_to_delete(cc_client):
repo_name = 'some-repo-name'
difference_paths = (
SHOULD_NOT_DELETE_FILES +
Expand All @@ -69,31 +71,34 @@ def test_get_files_to_delete(cc_client, path_cls):
lambda x: {'afterBlob': {'path': x}},
difference_paths,
))
cc_client.get_differences.return_value = {
'differences': differences,
}
path_rglob_mock = Mock()
path_rglob_mock.rglob.return_value = list(map(
lambda path: "/var/task/pipelines_repository/{}".format(path),
FILES_IN_UPSTREAM_REPO,
))
path_cls.side_effect = lambda path: (
path_rglob_mock if path == '/var/task/pipelines_repository/'
else GenericPathMocked(path)
paginator = Mock()
cc_client.get_paginator.return_value = paginator
paginator.paginate.return_value = [
{
'differences': differences[:2],
},
{
'differences': differences[2:],
},
]
with tempfile.TemporaryDirectory() as temp_dir_path:
directory_path = Path(temp_dir_path)
for dir_name in SHOULD_NOT_DELETE_DIRS:
os.mkdir(str(directory_path / dir_name))
for file_name in SHOULD_NOT_DELETE_FILES:
with open(str(directory_path / file_name), "wb") as file_p:
file_p.write("Test".encode('utf-8'))

result = get_files_to_delete(repo_name, directory_path)

cc_client.get_paginator.assert_called_once_with(
'get_differences',
)

result = get_files_to_delete(repo_name)

cc_client.get_differences.assert_called_once_with(
paginator.paginate.assert_called_once_with(
repositoryName=repo_name,
afterCommitSpecifier='HEAD',
)

path_cls.assert_called_with(
'/var/task/pipelines_repository/'
)
path_rglob_mock.rglob.assert_called_once_with('*')

assert all(isinstance(x, FileToDelete) for x in result)

# Extract paths from result FileToDelete objects to make querying easier
Expand All @@ -103,26 +108,37 @@ def test_get_files_to_delete(cc_client, path_cls):
assert all(x not in result_paths for x in SHOULD_NOT_DELETE_FILES)
assert all(x not in result_paths for x in SHOULD_NOT_DELETE_DIRS)

# Should delete all other
assert result_paths == SHOULD_DELETE_PATHS
assert len(result_paths) == len(SHOULD_DELETE_PATHS)

# Extract paths from result FileToDelete objects to make querying easier
result_paths = list(map(lambda x: x.filePath, result))

# Should not delete JSON, YAML, and directories
assert all(x not in result_paths for x in SHOULD_NOT_DELETE_FILES)
assert all(x not in result_paths for x in SHOULD_NOT_DELETE_DIRS)

# Should delete all other
assert all(x in result_paths for x in SHOULD_DELETE_PATHS)
assert len(result_paths) == len(SHOULD_DELETE_PATHS)


@pytest.mark.parametrize("entry", SHOULD_NOT_BE_EXECUTABLE)
def test_determine_file_mode_normal(entry):
base_path = "test"
new_entry = f"/some/{base_path}/{entry}"
base_path = Path("/some/test")
new_entry = base_path / entry
assert determine_file_mode(
Path(new_entry),
new_entry,
base_path,
) == FileMode.NORMAL


@pytest.mark.parametrize("entry", EXECUTABLE_FILES)
def test_determine_file_mode_executable(entry):
base_path = "test"
new_entry = f"/some/{base_path}/{entry}"
base_path = Path("/some/test")
new_entry = base_path / entry
assert determine_file_mode(
Path(new_entry),
new_entry,
base_path,
) == FileMode.EXECUTABLE
34 changes: 21 additions & 13 deletions src/lambda_codebase/initial_commit/initial_commit.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,14 @@
NOT_YET_CREATED = "NOT_YET_CREATED"
CC_CLIENT = boto3.client("codecommit")
CONFIG_FILE_REGEX = re.compile(r"\A.*[.](yaml|yml|json)\Z", re.I)
REWRITE_PATHS = {
REWRITE_PATHS: Dict[str, str] = {
"bootstrap_repository/adf-bootstrap/example-global-iam.yml": (
"adf-bootstrap/global-iam.yml"
),
"adf.yml.j2": "adf-accounts/adf.yml",
"adfconfig.yml.j2": "adfconfig.yml",
}
EXECUTABLE_FILES = [
EXECUTABLE_FILES: List[str] = [
"adf-build/shared/helpers/package_transform.sh",
"adf-build/shared/helpers/retrieve_organization_accounts.py",
"adf-build/shared/helpers/sync_to_s3.py",
Expand Down Expand Up @@ -450,22 +450,30 @@ def get_files_to_delete(
repo_name: str,
directory_path: Path,
) -> List[FileToDelete]:
differences = CC_CLIENT.get_differences(
paginator = CC_CLIENT.get_paginator('get_differences')
page_iterator = paginator.paginate(
repositoryName=repo_name,
afterCommitSpecifier='HEAD'
)['differences']

# We never want to delete JSON or YAML files
file_paths = [
Path(file['afterBlob']['path'])
for file in differences
if not CONFIG_FILE_REGEX.match(file['afterBlob']['path'])
]
afterCommitSpecifier='HEAD',
)
unfiltered_file_paths = []
for page in page_iterator:
unfiltered_file_paths.extend(list(
map(
lambda obj: Path(obj['afterBlob']['path']),
page['differences'],
),
))

file_paths = list(filter(
# We never want to delete JSON or YAML files
lambda path: not CONFIG_FILE_REGEX.match(str(path)),
unfiltered_file_paths,
))

blobs = [
# Get the paths relative to the directory path so we can compare them
# correctly.
filename.relative_to(directory_path)
str(filename.relative_to(directory_path))
for filename in directory_path.rglob('*')
]

Expand Down
72 changes: 44 additions & 28 deletions src/lambda_codebase/initial_commit/tests/test_initial_commit.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

# pylint: skip-file

import os
import tempfile
from pathlib import Path
import pytest
from mock import Mock, patch
Expand Down Expand Up @@ -33,6 +35,7 @@
SHOULD_NOT_DELETE_DIRS = [
'deployment_maps',
'deployment',
'samples',
]
SHOULD_DELETE_PATHS = [
'other.txt',
Expand All @@ -56,9 +59,8 @@ def is_dir(self):
return self.path in SHOULD_NOT_DELETE_DIRS


@patch('initial_commit.Path')
@patch('initial_commit.CC_CLIENT')
def test_get_files_to_delete(cc_client, path_cls):
def test_get_files_to_delete(cc_client):
repo_name = 'some-repo-name'
difference_paths = (
SHOULD_NOT_DELETE_FILES +
Expand All @@ -69,31 +71,34 @@ def test_get_files_to_delete(cc_client, path_cls):
lambda x: {'afterBlob': {'path': x}},
difference_paths,
))
cc_client.get_differences.return_value = {
'differences': differences,
}
path_rglob_mock = Mock()
path_rglob_mock.rglob.return_value = list(map(
lambda path: "/var/task/bootstrap_repository/{}".format(path),
FILES_IN_UPSTREAM_REPO,
))
path_cls.side_effect = lambda path: (
path_rglob_mock if path == '/var/task/bootstrap_repository/'
else GenericPathMocked(path)
paginator = Mock()
cc_client.get_paginator.return_value = paginator
paginator.paginate.return_value = [
{
'differences': differences[:2],
},
{
'differences': differences[2:],
},
]
with tempfile.TemporaryDirectory() as temp_dir_path:
directory_path = Path(temp_dir_path)
for dir_name in SHOULD_NOT_DELETE_DIRS:
os.mkdir(str(directory_path / dir_name))
for file_name in SHOULD_NOT_DELETE_FILES:
with open(str(directory_path / file_name), "wb") as file_p:
file_p.write("Test".encode('utf-8'))

result = get_files_to_delete(repo_name, directory_path)

cc_client.get_paginator.assert_called_once_with(
'get_differences',
)

result = get_files_to_delete(repo_name)

cc_client.get_differences.assert_called_once_with(
paginator.paginate.assert_called_once_with(
repositoryName=repo_name,
afterCommitSpecifier='HEAD',
)

path_cls.assert_called_with(
'/var/task/bootstrap_repository/'
)
path_rglob_mock.rglob.assert_called_once_with('*')

assert all(isinstance(x, FileToDelete) for x in result)

# Extract paths from result FileToDelete objects to make querying easier
Expand All @@ -103,26 +108,37 @@ def test_get_files_to_delete(cc_client, path_cls):
assert all(x not in result_paths for x in SHOULD_NOT_DELETE_FILES)
assert all(x not in result_paths for x in SHOULD_NOT_DELETE_DIRS)

# Should delete all other
assert result_paths == SHOULD_DELETE_PATHS
assert len(result_paths) == len(SHOULD_DELETE_PATHS)

# Extract paths from result FileToDelete objects to make querying easier
result_paths = list(map(lambda x: x.filePath, result))

# Should not delete JSON, YAML, and directories
assert all(x not in result_paths for x in SHOULD_NOT_DELETE_FILES)
assert all(x not in result_paths for x in SHOULD_NOT_DELETE_DIRS)

# Should delete all other
assert all(x in result_paths for x in SHOULD_DELETE_PATHS)
assert len(result_paths) == len(SHOULD_DELETE_PATHS)


@pytest.mark.parametrize("entry", SHOULD_NOT_BE_EXECUTABLE)
def test_determine_file_mode_normal(entry):
base_path = "test"
new_entry = f"/some/{base_path}/{entry}"
base_path = Path("/some/test")
new_entry = base_path / entry
assert determine_file_mode(
Path(new_entry),
new_entry,
base_path,
) == FileMode.NORMAL


@pytest.mark.parametrize("entry", EXECUTABLE_FILES)
def test_determine_file_mode_executable(entry):
base_path = "test"
new_entry = f"/some/{base_path}/{entry}"
base_path = Path("/some/test")
new_entry = base_path / entry
assert determine_file_mode(
Path(new_entry),
new_entry,
base_path,
) == FileMode.EXECUTABLE

0 comments on commit 8ecab67

Please sign in to comment.