From a80d05b058d282bede91660b164bfe47de2fe070 Mon Sep 17 00:00:00 2001
From: Simon Kok
Date: Tue, 20 Sep 2022 17:02:54 +0200
Subject: [PATCH 1/2] Fix detecting changes upon update properly

**Why?**
CodeCommit has a limit of 100 files per commit. To work around this limit,
the implementation walked across all the files and split them into chunks
of 99 files. However, when one of those chunks did not contain any changes,
it raised an exception that was handled at the wrong level, aborting the
creation of the PR altogether, even when there were changes in another
chunk.

**What?**
Fixed handling of the "no changes in this chunk" error at the chunk level,
so it only skips that chunk and continues with the next. A list of the
commits that were created is maintained; if any were created, the pull
request is opened.

Additionally, the initial commit logic contained a lot of repetitive code,
which got refactored as well. Since there are two initial_commit.py files,
the changes had to be applied twice.
---
 .../initial_commit/initial_commit.py          | 358 +++++++++-------
 .../initial_commit/initial_commit.py          | 392 +++++++++++-------
 2 files changed, 454 insertions(+), 296 deletions(-)

diff --git a/src/lambda_codebase/initial_commit/bootstrap_repository/adf-bootstrap/deployment/lambda_codebase/initial_commit/initial_commit.py b/src/lambda_codebase/initial_commit/bootstrap_repository/adf-bootstrap/deployment/lambda_codebase/initial_commit/initial_commit.py
index 9c153be55..351326f09 100644
--- a/src/lambda_codebase/initial_commit/bootstrap_repository/adf-bootstrap/deployment/lambda_codebase/initial_commit/initial_commit.py
+++ b/src/lambda_codebase/initial_commit/bootstrap_repository/adf-bootstrap/deployment/lambda_codebase/initial_commit/initial_commit.py
@@ -3,6 +3,8 @@
 initial pipelines repository content.
 """
 
+import os
+import logging
 from typing import Mapping, Optional, Union, List, Dict, Any, Tuple
 from dataclasses import dataclass, fields
 from enum import Enum
@@ -10,7 +12,7 @@
 import re
 import boto3
 import jinja2
-from cfn_custom_resource import ( # pylint: disable=unused-import
+from cfn_custom_resource import (  # pylint: disable=unused-import
     lambda_handler,
     create,
     update,
@@ -25,6 +27,10 @@
 CC_CLIENT = boto3.client("codecommit")
 CONFIG_FILE_REGEX = re.compile(r"\A.*[.](yaml|yml|json)\Z", re.I)
 EXECUTABLE_FILES = []
+ADF_LOG_LEVEL = os.environ.get("ADF_LOG_LEVEL", "INFO")
+logging.basicConfig(level=logging.INFO)
+LOGGER = logging.getLogger(__name__)
+LOGGER.setLevel(ADF_LOG_LEVEL)
 
 PR_DESCRIPTION = """ADF Version {0}
 
@@ -61,8 +67,7 @@ def __post_init__(self):
         if self.NotificationEndpoint:
             self.NotificationEndpointType = (
                 "email"
-                if "@"
-                in self.NotificationEndpoint  # pylint:disable=unsupported-membership-test
+                if self.NotificationEndpoint.find("@") > 0
                 else "slack"
             )
 
@@ -119,6 +124,7 @@ def as_dict(self) -> Dict[str, Union[str, bytes]]:
             "fileContent": self.fileContent,
         }
 
+
 @dataclass
 class FileToDelete:
     filePath: str
@@ -155,48 +161,68 @@ def __post_init__(self):
         )
 
 
-def generate_create_branch_input(event, repo_name, commit_id):
-    return {
-        "repositoryName": repo_name,
-        "branchName": event.ResourceProperties.Version,
-        "commitId": commit_id
-    }
+def chunks(list_to_chunk, number_to_chunk_into):
+    """
+    Split the list into segments of number_to_chunk_into.
+
+    Args:
+        list_to_chunk (list(Any)): The list to split into chunks.
+        number_to_chunk_into (int): The maximum number allowed per chunk.
-def generate_delete_branch_input(event, repo_name):
-    return {
-        "repositoryName": repo_name,
-        "branchName": event.ResourceProperties.Version
-    }
+    Returns:
+        generator(list(Any)): The list of chunks of the list_to_chunk, where
+            each item in the list of chunks contains at most
+            number_to_chunk_into elements.
+    """
+    number_per_chunk = max(1, number_to_chunk_into)
+    return (
+        list_to_chunk[item:item + number_per_chunk]
+        for item in range(0, len(list_to_chunk), number_per_chunk)
+    )
 
 
-def chunks(list_to_chunk, number_to_chunk_into):
-    number_of_chunks = max(1, number_to_chunk_into)
-    return (list_to_chunk[item:item + number_of_chunks] for item in range(0, len(list_to_chunk), number_of_chunks))
-
-
-def generate_pull_request_input(event, repo_name, default_branch_name):
-    return {
-        "title": f'ADF {event.ResourceProperties.Version} Automated Update PR',
-        "description": PR_DESCRIPTION.format(event.ResourceProperties.Version),
-        "targets": [
-            {
-                'repositoryName': repo_name,
-                'sourceReference': event.ResourceProperties.Version,
-                'destinationReference': default_branch_name,
-            },
-        ]
-    }
+def generate_commit_input(
+    repo_name,
+    version,
+    index,
+    branch="main",
+    parent_commit_id=None,
+    puts=None,
+    deletes=None,
+):
+    """
+    Generate the input used to create a commit with the CodeCommit client.
+
+    Args:
+        repo_name (str): The repository name to create a commit on.
+
+        version (str): The version of ADF that is being installed or updated.
+
+        index (int): The index number of the commit.
+
+        branch (str): The branch to create a commit on, defaults to `main`.
+
+        parent_commit_id (str): The parent commit id which this commit will be
+            linked to.
+
+        puts (FileToCommit[]): The list of FileToCommit items that need to be
+            committed.
 
-def generate_commit_input(repo_name, index, branch="main", parent_commit_id=None, puts=None, deletes=None):
+        deletes (FileToDelete[]): The list of FileToDelete items that need to
+            be removed in this commit.
+
+    Returns:
+        dict(str, Any): The create_commit API call details.
+ """ commit_action = "Delete" if deletes else "Create" output = { "repositoryName": repo_name, "branchName": branch, "authorName": "AWS ADF Builders Team", "email": "adf-builders@amazon.com", - "commitMessage": f"Automated Commit - {commit_action} Part {index}", + "commitMessage": ( + f"Automated Commit - {version} {commit_action} Part {index}" + ), "putFiles": puts if puts else [], "deleteFiles": deletes if deletes else [] } @@ -205,116 +231,168 @@ def generate_commit_input(repo_name, index, branch="main", parent_commit_id=None return output -@create() -def create_(event: Mapping[str, Any], _context: Any) -> Tuple[Union[None, PhysicalResourceId], Data]: - create_event = CreateEvent(**event) - repo_name = repo_arn_to_name(create_event.ResourceProperties.RepositoryArn) - default_branch_name = create_event.ResourceProperties.DefaultBranchName - directory = create_event.ResourceProperties.DirectoryName - try: - commit_id = CC_CLIENT.get_branch( - repositoryName=repo_name, - branchName=default_branch_name, - )["branch"]["commitId"] - CC_CLIENT.create_branch( - repositoryName=repo_name, - branchName=create_event.ResourceProperties.Version, - commitId=commit_id - ) - # CodeCommit only allows 100 files per commit, so we chunk them up here - for index, files in enumerate(chunks([f.as_dict() for f in get_files_to_commit(directory)], 99)): - if index == 0: - commit_id = CC_CLIENT.create_commit( - **generate_commit_input(repo_name, index, puts=files) - )["commitId"] - else: - commit_id = CC_CLIENT.create_commit( - **generate_commit_input(repo_name, index, puts=files, parent_commit_id=commit_id) - )["commitId"] - - CC_CLIENT.create_pull_request( - **generate_pull_request_input( - create_event, - repo_name, - default_branch_name, - ) - ) - return event.get("PhysicalResourceId"), {} - - except (CC_CLIENT.exceptions.FileEntryRequiredException, CC_CLIENT.exceptions.NoChangeException): - CC_CLIENT.delete_branch(**generate_delete_branch_input(create_event, repo_name)) - return event.get("PhysicalResourceId"), {} - - except CC_CLIENT.exceptions.BranchDoesNotExistException: - files_to_commit = get_files_to_commit(directory) - if directory == "bootstrap_repository": - adf_config = create_adf_config_file(create_event.ResourceProperties) - files_to_commit.append(adf_config) +def generate_commits(event, repo_name, directory, parent_commit_id=None): + """ + Generate the commits for the specified repository. - for index, files in enumerate(chunks([f.as_dict() for f in files_to_commit], 99)): - if index == 0: - commit_id = CC_CLIENT.create_commit( - **generate_commit_input(repo_name, index, puts=files) - )["commitId"] - else: - commit_id = CC_CLIENT.create_commit( - **generate_commit_input(repo_name, index, puts=files, parent_commit_id=commit_id) - )["commitId"] + Args: + event (dict(str, Any)): The Create Event or Update Event details. - return commit_id, {} + repo_name (str): The repository name to create the commits on. + directory (str): The directory to process. -@update() -def update_(event: Mapping[str, Any], _context: Any, create_pr=False) -> Tuple[PhysicalResourceId, Data]: #pylint: disable=R0912, R0915 - update_event = UpdateEvent(**event) - repo_name = repo_arn_to_name(update_event.ResourceProperties.RepositoryArn) - default_branch_name = update_event.ResourceProperties.DefaultBranchName - files_to_delete = get_files_to_delete(repo_name) - files_to_commit = get_files_to_commit(update_event.ResourceProperties.DirectoryName) + parent_commit_id (str): The parent commit to link the commits to. 
- commit_id = CC_CLIENT.get_branch( - repositoryName=repo_name, - branchName=default_branch_name, - )["branch"]["commitId"] + Returns: + str[]: The commit ids of the commits that were created. + """ + directory_path = HERE / directory + version = event.ResourceProperties.Version + default_branch_name = event.ResourceProperties.DefaultBranchName + branch_name = version CC_CLIENT.create_branch( - **generate_create_branch_input(update_event, repo_name, commit_id) + repositoryName=repo_name, + branchName=branch_name, + commitId=parent_commit_id ) - if files_to_commit: + # CodeCommit only allows 100 files per commit, so we chunk them up here + files_to_commit = get_files_to_commit(directory_path) + create_first_branch = parent_commit_id is None + + if create_first_branch and directory == "bootstrap_repository": + adf_config = create_adf_config_file(event.ResourceProperties) + files_to_commit.append(adf_config) + + chunked_files = chunks([f.as_dict() for f in files_to_commit], 99) + commit_id = parent_commit_id + commits_created = [] + for index, files in enumerate(chunked_files): try: - for index, files in enumerate(chunks([f.as_dict() for f in files_to_commit], 99)): - commit_id = CC_CLIENT.create_commit(**generate_commit_input( + commit_id = CC_CLIENT.create_commit( + **generate_commit_input( repo_name, + version, index, + branch_name, + puts=files, parent_commit_id=commit_id, - branch=update_event.ResourceProperties.Version, - puts=files - ))["commitId"] - create_pr = True # If the commit above was able to be made, we want to create a PR afterwards - except (CC_CLIENT.exceptions.FileEntryRequiredException, CC_CLIENT.exceptions.NoChangeException): + ) + )["commitId"] + commits_created.append(commit_id) + except ( + CC_CLIENT.exceptions.FileEntryRequiredException, + CC_CLIENT.exceptions.NoChangeException + ): pass - if files_to_delete: - try: - for index, deletes in enumerate(chunks([f.as_dict() for f in files_to_delete], 99)): + + if not create_first_branch: + # If the branch exists already with files inside, we may need to + # check which of these files should be deleted: + files_to_delete = get_files_to_delete(repo_name, directory_path) + for index, deletes in enumerate( + chunks([f.as_dict() for f in files_to_delete], 99) + ): + try: commit_id = CC_CLIENT.create_commit(**generate_commit_input( repo_name, + version, index, parent_commit_id=commit_id, - branch=update_event.ResourceProperties.Version, + branch=branch_name, deletes=deletes ))["commitId"] - except (CC_CLIENT.exceptions.FileEntryRequiredException, CC_CLIENT.exceptions.NoChangeException): - pass - if create_pr or files_to_delete: + commits_created.append(commit_id) + except ( + CC_CLIENT.exceptions.FileEntryRequiredException, + CC_CLIENT.exceptions.NoChangeException, + ): + pass + + if commits_created: CC_CLIENT.create_pull_request( - **generate_pull_request_input( - update_event, - repo_name, - default_branch_name, - ) + title=f'ADF {version} Automated Update PR', + description=PR_DESCRIPTION.format(version), + targets=[ + { + 'repositoryName': repo_name, + 'sourceReference': branch_name, + 'destinationReference': default_branch_name, + }, + ], ) else: - CC_CLIENT.delete_branch(**generate_delete_branch_input(update_event, repo_name)) + CC_CLIENT.delete_branch( + repositoryName=repo_name, + branchName=branch_name, + ) + + return commits_created + + +def get_commit_id_from_branch(repo_name, branch_name): + try: + return CC_CLIENT.get_branch( + repositoryName=repo_name, + branchName=branch_name, + )["branch"]["commitId"] + except 
CC_CLIENT.exceptions.BranchDoesNotExistException: + LOGGER.info( + "Branch %s on %s does not exist. " + "Defaulting to creating the branch instead.", + branch_name, + repo_name, + ) + return None + + +@create() +def create_( + event: Mapping[str, Any], + _context: Any, +) -> Tuple[Union[None, PhysicalResourceId], Data]: + create_event = CreateEvent(**event) + repo_name = repo_arn_to_name(create_event.ResourceProperties.RepositoryArn) + default_branch_name = create_event.ResourceProperties.DefaultBranchName + directory = create_event.ResourceProperties.DirectoryName + + parent_commit_id = get_commit_id_from_branch( + repo_name, + default_branch_name, + ) + commits_created = generate_commits( + create_event, + repo_name, + directory=directory, + parent_commit_id=parent_commit_id, + ) + if parent_commit_id is None and commits_created: + # Return the last commit id that was created. + return commits_created[-1], {} + + return event.get("PhysicalResourceId"), {} + + +@update() +def update_( + event: Mapping[str, Any], + _context: Any, +) -> Tuple[PhysicalResourceId, Data]: + update_event = UpdateEvent(**event) + repo_name = repo_arn_to_name(update_event.ResourceProperties.RepositoryArn) + default_branch_name = update_event.ResourceProperties.DefaultBranchName + + parent_commit_id = get_commit_id_from_branch( + repo_name, + default_branch_name, + ) + generate_commits( + update_event, + repo_name, + directory=update_event.ResourceProperties.DirectoryName, + parent_commit_id=parent_commit_id, + ) return event["PhysicalResourceId"], {} @@ -328,7 +406,10 @@ def repo_arn_to_name(repo_arn: str) -> str: return repo_arn.split(":")[-1] -def get_files_to_delete(repo_name: str) -> List[FileToDelete]: +def get_files_to_delete( + repo_name: str, + directory_path: Path, +) -> List[FileToDelete]: differences = CC_CLIENT.get_differences( repositoryName=repo_name, afterCommitSpecifier='HEAD' @@ -341,11 +422,11 @@ def get_files_to_delete(repo_name: str) -> List[FileToDelete]: if not CONFIG_FILE_REGEX.match(file['afterBlob']['path']) ] - # 31: trimming off /var/task/pipelines_repository so - # we can compare correctly blobs = [ - str(filename)[31:] - for filename in Path('/var/task/pipelines_repository/').rglob('*') + # Get the paths relative to the directory path so we can compare them + # correctly. 
+        filename.relative_to(directory_path)
+        for filename in directory_path.rglob('*')
     ]
 
     return [
@@ -358,41 +439,28 @@ def get_files_to_delete(repo_name: str) -> List[FileToDelete]:
     ]
 
 
-def determine_file_mode(entry, directoryName):
-    if str(get_relative_name(entry, directoryName)) in EXECUTABLE_FILES:
+def determine_file_mode(entry: Path, directory_path: Path) -> FileMode:
+    if str(entry.relative_to(directory_path)) in EXECUTABLE_FILES:
         return FileMode.EXECUTABLE
     return FileMode.NORMAL
 
 
-def get_files_to_commit(directoryName: str) -> List[FileToCommit]:
-    path = HERE / directoryName
-
+def get_files_to_commit(directory_path: Path) -> List[FileToCommit]:
     return [
         FileToCommit(
-            str(get_relative_name(entry, directoryName)),
+            str(entry.relative_to(directory_path)),
             determine_file_mode(
                 entry,
-                directoryName,
+                directory_path,
             ),
             entry.read_bytes(),
         )
-        for entry in path.glob("**/*")
+        for entry in directory_path.glob("**/*")
         if not entry.is_dir()
     ]
 
 
-def get_relative_name(path: Path, directoryName: str) -> Path:
-    """
-    Search for the last occurrence of in and return only the trailing part of
-
-    >>> get_relative_name(Path('/foo/test/bar/test/xyz/abc.py') ,'test')
-    Path('xyz/abc.py')
-    """
-    index = list(reversed(path.parts)).index(directoryName)
-    return Path(*path.parts[-index:])
-
-
 def create_adf_config_file(props: CustomResourceProperties) -> FileToCommit:
     template = HERE / "adfconfig.yml.j2"
     adf_config = (
diff --git a/src/lambda_codebase/initial_commit/initial_commit.py b/src/lambda_codebase/initial_commit/initial_commit.py
index 099381f06..0d987b643 100644
--- a/src/lambda_codebase/initial_commit/initial_commit.py
+++ b/src/lambda_codebase/initial_commit/initial_commit.py
@@ -3,6 +3,8 @@
 initial bootstrap repository content.
 """
 
+import os
+import logging
 from typing import Mapping, Optional, Union, List, Dict, Any, Tuple
 from dataclasses import dataclass, fields
 from enum import Enum
@@ -10,7 +12,7 @@
 import re
 import boto3
 import jinja2
-from cfn_custom_resource import ( # pylint: disable=unused-import
+from cfn_custom_resource import (  # pylint: disable=unused-import
     lambda_handler,
     create,
     update,
@@ -34,10 +36,16 @@
 EXECUTABLE_FILES = [
     "adf-build/shared/helpers/package_transform.sh",
     "adf-build/shared/helpers/retrieve_organization_accounts.py",
+    "adf-build/shared/helpers/sync_to_s3.py",
     "adf-build/shared/helpers/sts.sh",
     "adf-build/shared/helpers/terraform/install_terraform.sh",
 ]
 
+ADF_LOG_LEVEL = os.environ.get("ADF_LOG_LEVEL", "INFO")
+logging.basicConfig(level=logging.INFO)
+LOGGER = logging.getLogger(__name__)
+LOGGER.setLevel(ADF_LOG_LEVEL)
+
 PR_DESCRIPTION = """ADF Version {0}
 
 You can find the changelog at:
@@ -77,8 +85,7 @@ def __post_init__(self):
         if self.NotificationEndpoint:
             self.NotificationEndpointType = (
                 "email"
-                if "@"
-                in self.NotificationEndpoint  # pylint:disable=unsupported-membership-test
+                if self.NotificationEndpoint.find("@") > 0
                 else "slack"
             )
 
@@ -172,48 +179,68 @@ def __post_init__(self):
         )
 
 
-def generate_create_branch_input(event, repo_name, commit_id):
-    return {
-        "repositoryName": repo_name,
-        "branchName": event.ResourceProperties.Version,
-        "commitId": commit_id
-    }
+def chunks(list_to_chunk, number_to_chunk_into):
+    """
+    Split the list into segments of number_to_chunk_into.
+
+    Args:
+        list_to_chunk (list(Any)): The list to split into chunks.
+        number_to_chunk_into (int): The maximum number allowed per chunk.
-def generate_delete_branch_input(event, repo_name):
-    return {
-        "repositoryName": repo_name,
-        "branchName": event.ResourceProperties.Version
-    }
+    Returns:
+        generator(list(Any)): The list of chunks of the list_to_chunk, where
+            each item in the list of chunks contains at most
+            number_to_chunk_into elements.
+    """
+    number_per_chunk = max(1, number_to_chunk_into)
+    return (
+        list_to_chunk[item:item + number_per_chunk]
+        for item in range(0, len(list_to_chunk), number_per_chunk)
+    )
 
 
-def chunks(list_to_chunk, number_to_chunk_into):
-    number_of_chunks = max(1, number_to_chunk_into)
-    return (list_to_chunk[item:item + number_of_chunks] for item in range(0, len(list_to_chunk), number_of_chunks))
-
-
-def generate_pull_request_input(event, repo_name, default_branch_name):
-    return {
-        "title": f'ADF {event.ResourceProperties.Version} Automated Update PR',
-        "description": PR_DESCRIPTION.format(event.ResourceProperties.Version),
-        "targets": [
-            {
-                'repositoryName': repo_name,
-                'sourceReference': event.ResourceProperties.Version,
-                'destinationReference': default_branch_name,
-            },
-        ]
-    }
+def generate_commit_input(
+    repo_name,
+    version,
+    index,
+    branch="main",
+    parent_commit_id=None,
+    puts=None,
+    deletes=None,
+):
+    """
+    Generate the input used to create a commit with the CodeCommit client.
+
+    Args:
+        repo_name (str): The repository name to create a commit on.
 
-def generate_commit_input(repo_name, index, branch="main", parent_commit_id=None, puts=None, deletes=None):
+        version (str): The version of ADF that is being installed or updated.
+
+        index (int): The index number of the commit.
+
+        branch (str): The branch to create a commit on, defaults to `main`.
+
+        parent_commit_id (str): The parent commit id which this commit will be
+            linked to.
+
+        puts (FileToCommit[]): The list of FileToCommit items that need to be
+            committed.
+
+        deletes (FileToDelete[]): The list of FileToDelete items that need to
+            be removed in this commit.
+
+    Returns:
+        dict(str, Any): The create_commit API call details.
+ """ commit_action = "Delete" if deletes else "Create" output = { "repositoryName": repo_name, "branchName": branch, "authorName": "AWS ADF Builders Team", "email": "adf-builders@amazon.com", - "commitMessage": f"Automated Commit - {commit_action} Part {index}", + "commitMessage": ( + f"Automated Commit - {version} {commit_action} Part {index}" + ), "putFiles": puts if puts else [], "deleteFiles": deletes if deletes else [] } @@ -222,122 +249,190 @@ def generate_commit_input(repo_name, index, branch="main", parent_commit_id=None return output -@create() -def create_(event: Mapping[str, Any], _context: Any) -> Tuple[Union[None, PhysicalResourceId], Data]: - create_event = CreateEvent(**event) - repo_name = repo_arn_to_name(create_event.ResourceProperties.RepositoryArn) - default_branch_name = create_event.ResourceProperties.DefaultBranchName - directory = create_event.ResourceProperties.DirectoryName - try: - commit_id = CC_CLIENT.get_branch( - repositoryName=repo_name, - branchName=default_branch_name, - )["branch"]["commitId"] - CC_CLIENT.create_branch( - repositoryName=repo_name, - branchName=create_event.ResourceProperties.Version, - commitId=commit_id - ) - # CodeCommit only allows 100 files per commit, so we chunk them up here - for index, files in enumerate(chunks([f.as_dict() for f in get_files_to_commit(directory)], 99)): - if index == 0: - commit_id = CC_CLIENT.create_commit( - **generate_commit_input(repo_name, index, puts=files) - )["commitId"] - else: - commit_id = CC_CLIENT.create_commit( - **generate_commit_input(repo_name, index, puts=files, parent_commit_id=commit_id) - )["commitId"] - - CC_CLIENT.create_pull_request( - **generate_pull_request_input( - create_event, - repo_name, - default_branch_name, - ) - ) - return event.get("PhysicalResourceId"), {} +def generate_commits(event, repo_name, directory, parent_commit_id=None): + """ + Generate the commits for the specified repository. - except (CC_CLIENT.exceptions.FileEntryRequiredException, CC_CLIENT.exceptions.NoChangeException): - CC_CLIENT.delete_branch(**generate_delete_branch_input(create_event, repo_name)) - return event.get("PhysicalResourceId"), {} + Args: + event (dict(str, Any)): The Create Event or Update Event details. - except CC_CLIENT.exceptions.BranchDoesNotExistException: - files_to_commit = get_files_to_commit(directory) - if directory == "bootstrap_repository": - adf_config = create_adf_config_file(create_event.ResourceProperties, "adfconfig.yml.j2", "/tmp/adfconfig.yml") - initial_sample_global_iam = create_adf_config_file(create_event.ResourceProperties, "bootstrap_repository/adf-bootstrap/example-global-iam.yml", "/tmp/global-iam.yml") - - if create_event.ResourceProperties.DeploymentAccountFullName and create_event.ResourceProperties.DeploymentAccountEmailAddress: - adf_deployment_account_yml = create_adf_config_file(create_event.ResourceProperties, "adf.yml.j2", "/tmp/adf.yml") - files_to_commit.append(adf_deployment_account_yml) - files_to_commit.append(adf_config) - files_to_commit.append(initial_sample_global_iam) - - for index, files in enumerate(chunks([f.as_dict() for f in files_to_commit], 99)): - if index == 0: - commit_id = CC_CLIENT.create_commit( - **generate_commit_input(repo_name, index, puts=files) - )["commitId"] - else: - commit_id = CC_CLIENT.create_commit( - **generate_commit_input(repo_name, index, puts=files, parent_commit_id=commit_id) - )["commitId"] - - return commit_id, {} + repo_name (str): The repository name to create the commits on. 
+ directory (str): The directory to process. -@update() -def update_(event: Mapping[str, Any], _context: Any, create_pr=False) -> Tuple[PhysicalResourceId, Data]: #pylint: disable=R0912, R0915 - update_event = UpdateEvent(**event) - repo_name = repo_arn_to_name(update_event.ResourceProperties.RepositoryArn) - default_branch_name = update_event.ResourceProperties.DefaultBranchName - files_to_delete = get_files_to_delete(repo_name) - files_to_commit = get_files_to_commit(update_event.ResourceProperties.DirectoryName) + parent_commit_id (str): The parent commit to link the commits to. - commit_id = CC_CLIENT.get_branch( - repositoryName=repo_name, - branchName=default_branch_name, - )["branch"]["commitId"] + Returns: + str[]: The commit ids of the commits that were created. + """ + directory_path = HERE / directory + version = event.ResourceProperties.Version + default_branch_name = event.ResourceProperties.DefaultBranchName + branch_name = version CC_CLIENT.create_branch( - **generate_create_branch_input(update_event, repo_name, commit_id) + repositoryName=repo_name, + branchName=branch_name, + commitId=parent_commit_id ) - if files_to_commit: + # CodeCommit only allows 100 files per commit, so we chunk them up here + files_to_commit = get_files_to_commit(directory_path) + create_first_branch = parent_commit_id is None + + if create_first_branch and directory == "bootstrap_repository": + adf_config = create_adf_config_file( + event.ResourceProperties, + "adfconfig.yml.j2", + "/tmp/adfconfig.yml", + ) + initial_sample_global_iam = create_adf_config_file( + event.ResourceProperties, + "bootstrap_repository/adf-bootstrap/example-global-iam.yml", + "/tmp/global-iam.yml", + ) + + create_deployment_account = ( + event.ResourceProperties.DeploymentAccountFullName + and event.ResourceProperties.DeploymentAccountEmailAddress + ) + if create_deployment_account: + adf_deployment_account_yml = create_adf_config_file( + event.ResourceProperties, + "adf.yml.j2", + "/tmp/adf.yml", + ) + files_to_commit.append(adf_deployment_account_yml) + files_to_commit.append(adf_config) + files_to_commit.append(initial_sample_global_iam) + + chunked_files = chunks([f.as_dict() for f in files_to_commit], 99) + commit_id = parent_commit_id + commits_created = [] + for index, files in enumerate(chunked_files): try: - for index, files in enumerate(chunks([f.as_dict() for f in files_to_commit], 99)): - commit_id = CC_CLIENT.create_commit(**generate_commit_input( + commit_id = CC_CLIENT.create_commit( + **generate_commit_input( repo_name, + version, index, + branch_name, + puts=files, parent_commit_id=commit_id, - branch=update_event.ResourceProperties.Version, - puts=files - ))["commitId"] - create_pr = True # If the commit above was able to be made, we want to create a PR afterwards - except (CC_CLIENT.exceptions.FileEntryRequiredException, CC_CLIENT.exceptions.NoChangeException): + ) + )["commitId"] + commits_created.append(commit_id) + except ( + CC_CLIENT.exceptions.FileEntryRequiredException, + CC_CLIENT.exceptions.NoChangeException + ): pass - if files_to_delete: - try: - for index, deletes in enumerate(chunks([f.as_dict() for f in files_to_delete], 99)): + + if not create_first_branch: + # If the branch exists already with files inside, we may need to + # check which of these files should be deleted: + files_to_delete = get_files_to_delete(repo_name, directory_path) + for index, deletes in enumerate( + chunks([f.as_dict() for f in files_to_delete], 99) + ): + try: commit_id = 
CC_CLIENT.create_commit(**generate_commit_input( repo_name, + version, index, parent_commit_id=commit_id, - branch=update_event.ResourceProperties.Version, + branch=branch_name, deletes=deletes ))["commitId"] - except (CC_CLIENT.exceptions.FileEntryRequiredException, CC_CLIENT.exceptions.NoChangeException): - pass - if create_pr or files_to_delete: + commits_created.append(commit_id) + except ( + CC_CLIENT.exceptions.FileEntryRequiredException, + CC_CLIENT.exceptions.NoChangeException, + ): + pass + + if commits_created: CC_CLIENT.create_pull_request( - **generate_pull_request_input( - update_event, - repo_name, - default_branch_name, - ) + title=f'ADF {version} Automated Update PR', + description=PR_DESCRIPTION.format(version), + targets=[ + { + 'repositoryName': repo_name, + 'sourceReference': branch_name, + 'destinationReference': default_branch_name, + }, + ], ) else: - CC_CLIENT.delete_branch(**generate_delete_branch_input(update_event, repo_name)) + CC_CLIENT.delete_branch( + repositoryName=repo_name, + branchName=branch_name, + ) + + return commits_created + + +def get_commit_id_from_branch(repo_name, branch_name): + try: + return CC_CLIENT.get_branch( + repositoryName=repo_name, + branchName=branch_name, + )["branch"]["commitId"] + except CC_CLIENT.exceptions.BranchDoesNotExistException: + LOGGER.info( + "Branch %s on %s does not exist. " + "Defaulting to creating the branch instead.", + branch_name, + repo_name, + ) + return None + + +@create() +def create_( + event: Mapping[str, Any], + _context: Any, +) -> Tuple[Union[None, PhysicalResourceId], Data]: + create_event = CreateEvent(**event) + repo_name = repo_arn_to_name(create_event.ResourceProperties.RepositoryArn) + default_branch_name = create_event.ResourceProperties.DefaultBranchName + directory = create_event.ResourceProperties.DirectoryName + + parent_commit_id = get_commit_id_from_branch( + repo_name, + default_branch_name, + ) + commits_created = generate_commits( + create_event, + repo_name, + directory=directory, + parent_commit_id=parent_commit_id, + ) + if parent_commit_id is None and commits_created: + # Return the last commit id that was created. 
+ return commits_created[-1], {} + + return event.get("PhysicalResourceId"), {} + + +@update() +def update_( + event: Mapping[str, Any], + _context: Any, +) -> Tuple[PhysicalResourceId, Data]: + update_event = UpdateEvent(**event) + repo_name = repo_arn_to_name(update_event.ResourceProperties.RepositoryArn) + default_branch_name = update_event.ResourceProperties.DefaultBranchName + + parent_commit_id = get_commit_id_from_branch( + repo_name, + default_branch_name, + ) + generate_commits( + update_event, + repo_name, + directory=update_event.ResourceProperties.DirectoryName, + parent_commit_id=parent_commit_id, + ) return event["PhysicalResourceId"], {} @@ -351,7 +446,10 @@ def repo_arn_to_name(repo_arn: str) -> str: return repo_arn.split(":")[-1] -def get_files_to_delete(repo_name: str) -> List[FileToDelete]: +def get_files_to_delete( + repo_name: str, + directory_path: Path, +) -> List[FileToDelete]: differences = CC_CLIENT.get_differences( repositoryName=repo_name, afterCommitSpecifier='HEAD' @@ -364,11 +462,11 @@ def get_files_to_delete(repo_name: str) -> List[FileToDelete]: if not CONFIG_FILE_REGEX.match(file['afterBlob']['path']) ] - # 31: trimming off /var/task/bootstrap_repository so - # we can compare correctly blobs = [ - str(filename)[31:] - for filename in Path('/var/task/bootstrap_repository/').rglob('*') + # Get the paths relative to the directory path so we can compare them + # correctly. + filename.relative_to(directory_path) + for filename in directory_path.rglob('*') ] return [ @@ -381,41 +479,33 @@ def get_files_to_delete(repo_name: str) -> List[FileToDelete]: ] -def determine_file_mode(entry, directoryName): - if str(get_relative_name(entry, directoryName)) in EXECUTABLE_FILES: +def determine_file_mode(entry: Path, directory_path: Path) -> FileMode: + if str(entry.relative_to(directory_path)) in EXECUTABLE_FILES: return FileMode.EXECUTABLE return FileMode.NORMAL -def get_files_to_commit(directoryName: str) -> List[FileToCommit]: - path = HERE / directoryName +def get_files_to_commit(directory_path: Path) -> List[FileToCommit]: return [ FileToCommit( - str(get_relative_name(entry, directoryName)), + str(entry.relative_to(directory_path)), determine_file_mode( entry, - directoryName, + directory_path, ), entry.read_bytes(), ) - for entry in path.glob("**/*") + for entry in directory_path.glob("**/*") if not entry.is_dir() ] -def get_relative_name(path: Path, directoryName: str) -> Path: - """ - Search for the last occurrence of in and return only the trailing part of - - >>> get_relative_name(Path('/foo/test/bar/test/xyz/abc.py') ,'test') - Path('xyz/abc.py') - """ - index = list(reversed(path.parts)).index(directoryName) - return Path(*path.parts[-index:]) - - -def create_adf_config_file(props: CustomResourceProperties, input_file_name: str, output_file_name: str) -> FileToCommit: +def create_adf_config_file( + props: CustomResourceProperties, + input_file_name: str, + output_file_name: str, +) -> FileToCommit: template = HERE / input_file_name adf_config = ( jinja2.Template(template.read_text(), undefined=jinja2.StrictUndefined) From 8ecab6738db19815bf7feb726b7ff72cfda0502c Mon Sep 17 00:00:00 2001 From: Simon Kok Date: Thu, 22 Sep 2022 18:07:39 +0200 Subject: [PATCH 2/2] Add support for paginating differences in initial commit **Why?** Before, we were only matching against the first page of differences. This could result in files being deleted erroneously if the repository went through a large change such as an update that touches almost all files. 
**What?** Added support for paginating the differences and updated the tests to cover these. --- .../initial_commit/initial_commit.py | 32 +++++---- .../tests/test_initial_commit.py | 72 +++++++++++-------- .../initial_commit/initial_commit.py | 34 +++++---- .../tests/test_initial_commit.py | 72 +++++++++++-------- 4 files changed, 129 insertions(+), 81 deletions(-) diff --git a/src/lambda_codebase/initial_commit/bootstrap_repository/adf-bootstrap/deployment/lambda_codebase/initial_commit/initial_commit.py b/src/lambda_codebase/initial_commit/bootstrap_repository/adf-bootstrap/deployment/lambda_codebase/initial_commit/initial_commit.py index 351326f09..8c21c58cb 100644 --- a/src/lambda_codebase/initial_commit/bootstrap_repository/adf-bootstrap/deployment/lambda_codebase/initial_commit/initial_commit.py +++ b/src/lambda_codebase/initial_commit/bootstrap_repository/adf-bootstrap/deployment/lambda_codebase/initial_commit/initial_commit.py @@ -26,7 +26,7 @@ NOT_YET_CREATED = "NOT_YET_CREATED" CC_CLIENT = boto3.client("codecommit") CONFIG_FILE_REGEX = re.compile(r"\A.*[.](yaml|yml|json)\Z", re.I) -EXECUTABLE_FILES = [] +EXECUTABLE_FILES: List[str] = [] ADF_LOG_LEVEL = os.environ.get("ADF_LOG_LEVEL", "INFO") logging.basicConfig(level=logging.INFO) LOGGER = logging.getLogger(__name__) @@ -410,22 +410,30 @@ def get_files_to_delete( repo_name: str, directory_path: Path, ) -> List[FileToDelete]: - differences = CC_CLIENT.get_differences( + paginator = CC_CLIENT.get_paginator('get_differences') + page_iterator = paginator.paginate( repositoryName=repo_name, - afterCommitSpecifier='HEAD' - )['differences'] - - # We never want to delete JSON or YAML files - file_paths = [ - Path(file['afterBlob']['path']) - for file in differences - if not CONFIG_FILE_REGEX.match(file['afterBlob']['path']) - ] + afterCommitSpecifier='HEAD', + ) + unfiltered_file_paths = [] + for page in page_iterator: + unfiltered_file_paths.extend(list( + map( + lambda obj: Path(obj['afterBlob']['path']), + page['differences'], + ), + )) + + file_paths = list(filter( + # We never want to delete JSON or YAML files + lambda path: not CONFIG_FILE_REGEX.match(str(path)), + unfiltered_file_paths, + )) blobs = [ # Get the paths relative to the directory path so we can compare them # correctly. 
- filename.relative_to(directory_path) + str(filename.relative_to(directory_path)) for filename in directory_path.rglob('*') ] diff --git a/src/lambda_codebase/initial_commit/bootstrap_repository/adf-bootstrap/deployment/lambda_codebase/initial_commit/tests/test_initial_commit.py b/src/lambda_codebase/initial_commit/bootstrap_repository/adf-bootstrap/deployment/lambda_codebase/initial_commit/tests/test_initial_commit.py index 4b40bac70..e92766901 100644 --- a/src/lambda_codebase/initial_commit/bootstrap_repository/adf-bootstrap/deployment/lambda_codebase/initial_commit/tests/test_initial_commit.py +++ b/src/lambda_codebase/initial_commit/bootstrap_repository/adf-bootstrap/deployment/lambda_codebase/initial_commit/tests/test_initial_commit.py @@ -3,6 +3,8 @@ # pylint: skip-file +import os +import tempfile from pathlib import Path import pytest from mock import Mock, patch @@ -33,6 +35,7 @@ SHOULD_NOT_DELETE_DIRS = [ 'deployment_maps', 'deployment', + 'samples', ] SHOULD_DELETE_PATHS = [ 'other.txt', @@ -56,9 +59,8 @@ def is_dir(self): return self.path in SHOULD_NOT_DELETE_DIRS -@patch('initial_commit.Path') @patch('initial_commit.CC_CLIENT') -def test_get_files_to_delete(cc_client, path_cls): +def test_get_files_to_delete(cc_client): repo_name = 'some-repo-name' difference_paths = ( SHOULD_NOT_DELETE_FILES + @@ -69,31 +71,34 @@ def test_get_files_to_delete(cc_client, path_cls): lambda x: {'afterBlob': {'path': x}}, difference_paths, )) - cc_client.get_differences.return_value = { - 'differences': differences, - } - path_rglob_mock = Mock() - path_rglob_mock.rglob.return_value = list(map( - lambda path: "/var/task/pipelines_repository/{}".format(path), - FILES_IN_UPSTREAM_REPO, - )) - path_cls.side_effect = lambda path: ( - path_rglob_mock if path == '/var/task/pipelines_repository/' - else GenericPathMocked(path) + paginator = Mock() + cc_client.get_paginator.return_value = paginator + paginator.paginate.return_value = [ + { + 'differences': differences[:2], + }, + { + 'differences': differences[2:], + }, + ] + with tempfile.TemporaryDirectory() as temp_dir_path: + directory_path = Path(temp_dir_path) + for dir_name in SHOULD_NOT_DELETE_DIRS: + os.mkdir(str(directory_path / dir_name)) + for file_name in SHOULD_NOT_DELETE_FILES: + with open(str(directory_path / file_name), "wb") as file_p: + file_p.write("Test".encode('utf-8')) + + result = get_files_to_delete(repo_name, directory_path) + + cc_client.get_paginator.assert_called_once_with( + 'get_differences', ) - - result = get_files_to_delete(repo_name) - - cc_client.get_differences.assert_called_once_with( + paginator.paginate.assert_called_once_with( repositoryName=repo_name, afterCommitSpecifier='HEAD', ) - path_cls.assert_called_with( - '/var/task/pipelines_repository/' - ) - path_rglob_mock.rglob.assert_called_once_with('*') - assert all(isinstance(x, FileToDelete) for x in result) # Extract paths from result FileToDelete objects to make querying easier @@ -103,6 +108,17 @@ def test_get_files_to_delete(cc_client, path_cls): assert all(x not in result_paths for x in SHOULD_NOT_DELETE_FILES) assert all(x not in result_paths for x in SHOULD_NOT_DELETE_DIRS) + # Should delete all other + assert result_paths == SHOULD_DELETE_PATHS + assert len(result_paths) == len(SHOULD_DELETE_PATHS) + + # Extract paths from result FileToDelete objects to make querying easier + result_paths = list(map(lambda x: x.filePath, result)) + + # Should not delete JSON, YAML, and directories + assert all(x not in result_paths for x in SHOULD_NOT_DELETE_FILES) + 
assert all(x not in result_paths for x in SHOULD_NOT_DELETE_DIRS) + # Should delete all other assert all(x in result_paths for x in SHOULD_DELETE_PATHS) assert len(result_paths) == len(SHOULD_DELETE_PATHS) @@ -110,19 +126,19 @@ def test_get_files_to_delete(cc_client, path_cls): @pytest.mark.parametrize("entry", SHOULD_NOT_BE_EXECUTABLE) def test_determine_file_mode_normal(entry): - base_path = "test" - new_entry = f"/some/{base_path}/{entry}" + base_path = Path("/some/test") + new_entry = base_path / entry assert determine_file_mode( - Path(new_entry), + new_entry, base_path, ) == FileMode.NORMAL @pytest.mark.parametrize("entry", EXECUTABLE_FILES) def test_determine_file_mode_executable(entry): - base_path = "test" - new_entry = f"/some/{base_path}/{entry}" + base_path = Path("/some/test") + new_entry = base_path / entry assert determine_file_mode( - Path(new_entry), + new_entry, base_path, ) == FileMode.EXECUTABLE diff --git a/src/lambda_codebase/initial_commit/initial_commit.py b/src/lambda_codebase/initial_commit/initial_commit.py index 0d987b643..cb015de3a 100644 --- a/src/lambda_codebase/initial_commit/initial_commit.py +++ b/src/lambda_codebase/initial_commit/initial_commit.py @@ -26,14 +26,14 @@ NOT_YET_CREATED = "NOT_YET_CREATED" CC_CLIENT = boto3.client("codecommit") CONFIG_FILE_REGEX = re.compile(r"\A.*[.](yaml|yml|json)\Z", re.I) -REWRITE_PATHS = { +REWRITE_PATHS: Dict[str, str] = { "bootstrap_repository/adf-bootstrap/example-global-iam.yml": ( "adf-bootstrap/global-iam.yml" ), "adf.yml.j2": "adf-accounts/adf.yml", "adfconfig.yml.j2": "adfconfig.yml", } -EXECUTABLE_FILES = [ +EXECUTABLE_FILES: List[str] = [ "adf-build/shared/helpers/package_transform.sh", "adf-build/shared/helpers/retrieve_organization_accounts.py", "adf-build/shared/helpers/sync_to_s3.py", @@ -450,22 +450,30 @@ def get_files_to_delete( repo_name: str, directory_path: Path, ) -> List[FileToDelete]: - differences = CC_CLIENT.get_differences( + paginator = CC_CLIENT.get_paginator('get_differences') + page_iterator = paginator.paginate( repositoryName=repo_name, - afterCommitSpecifier='HEAD' - )['differences'] - - # We never want to delete JSON or YAML files - file_paths = [ - Path(file['afterBlob']['path']) - for file in differences - if not CONFIG_FILE_REGEX.match(file['afterBlob']['path']) - ] + afterCommitSpecifier='HEAD', + ) + unfiltered_file_paths = [] + for page in page_iterator: + unfiltered_file_paths.extend(list( + map( + lambda obj: Path(obj['afterBlob']['path']), + page['differences'], + ), + )) + + file_paths = list(filter( + # We never want to delete JSON or YAML files + lambda path: not CONFIG_FILE_REGEX.match(str(path)), + unfiltered_file_paths, + )) blobs = [ # Get the paths relative to the directory path so we can compare them # correctly. 
- filename.relative_to(directory_path) + str(filename.relative_to(directory_path)) for filename in directory_path.rglob('*') ] diff --git a/src/lambda_codebase/initial_commit/tests/test_initial_commit.py b/src/lambda_codebase/initial_commit/tests/test_initial_commit.py index 1efe79cff..104b7fc75 100644 --- a/src/lambda_codebase/initial_commit/tests/test_initial_commit.py +++ b/src/lambda_codebase/initial_commit/tests/test_initial_commit.py @@ -3,6 +3,8 @@ # pylint: skip-file +import os +import tempfile from pathlib import Path import pytest from mock import Mock, patch @@ -33,6 +35,7 @@ SHOULD_NOT_DELETE_DIRS = [ 'deployment_maps', 'deployment', + 'samples', ] SHOULD_DELETE_PATHS = [ 'other.txt', @@ -56,9 +59,8 @@ def is_dir(self): return self.path in SHOULD_NOT_DELETE_DIRS -@patch('initial_commit.Path') @patch('initial_commit.CC_CLIENT') -def test_get_files_to_delete(cc_client, path_cls): +def test_get_files_to_delete(cc_client): repo_name = 'some-repo-name' difference_paths = ( SHOULD_NOT_DELETE_FILES + @@ -69,31 +71,34 @@ def test_get_files_to_delete(cc_client, path_cls): lambda x: {'afterBlob': {'path': x}}, difference_paths, )) - cc_client.get_differences.return_value = { - 'differences': differences, - } - path_rglob_mock = Mock() - path_rglob_mock.rglob.return_value = list(map( - lambda path: "/var/task/bootstrap_repository/{}".format(path), - FILES_IN_UPSTREAM_REPO, - )) - path_cls.side_effect = lambda path: ( - path_rglob_mock if path == '/var/task/bootstrap_repository/' - else GenericPathMocked(path) + paginator = Mock() + cc_client.get_paginator.return_value = paginator + paginator.paginate.return_value = [ + { + 'differences': differences[:2], + }, + { + 'differences': differences[2:], + }, + ] + with tempfile.TemporaryDirectory() as temp_dir_path: + directory_path = Path(temp_dir_path) + for dir_name in SHOULD_NOT_DELETE_DIRS: + os.mkdir(str(directory_path / dir_name)) + for file_name in SHOULD_NOT_DELETE_FILES: + with open(str(directory_path / file_name), "wb") as file_p: + file_p.write("Test".encode('utf-8')) + + result = get_files_to_delete(repo_name, directory_path) + + cc_client.get_paginator.assert_called_once_with( + 'get_differences', ) - - result = get_files_to_delete(repo_name) - - cc_client.get_differences.assert_called_once_with( + paginator.paginate.assert_called_once_with( repositoryName=repo_name, afterCommitSpecifier='HEAD', ) - path_cls.assert_called_with( - '/var/task/bootstrap_repository/' - ) - path_rglob_mock.rglob.assert_called_once_with('*') - assert all(isinstance(x, FileToDelete) for x in result) # Extract paths from result FileToDelete objects to make querying easier @@ -103,6 +108,17 @@ def test_get_files_to_delete(cc_client, path_cls): assert all(x not in result_paths for x in SHOULD_NOT_DELETE_FILES) assert all(x not in result_paths for x in SHOULD_NOT_DELETE_DIRS) + # Should delete all other + assert result_paths == SHOULD_DELETE_PATHS + assert len(result_paths) == len(SHOULD_DELETE_PATHS) + + # Extract paths from result FileToDelete objects to make querying easier + result_paths = list(map(lambda x: x.filePath, result)) + + # Should not delete JSON, YAML, and directories + assert all(x not in result_paths for x in SHOULD_NOT_DELETE_FILES) + assert all(x not in result_paths for x in SHOULD_NOT_DELETE_DIRS) + # Should delete all other assert all(x in result_paths for x in SHOULD_DELETE_PATHS) assert len(result_paths) == len(SHOULD_DELETE_PATHS) @@ -110,19 +126,19 @@ def test_get_files_to_delete(cc_client, path_cls): 
@pytest.mark.parametrize("entry", SHOULD_NOT_BE_EXECUTABLE) def test_determine_file_mode_normal(entry): - base_path = "test" - new_entry = f"/some/{base_path}/{entry}" + base_path = Path("/some/test") + new_entry = base_path / entry assert determine_file_mode( - Path(new_entry), + new_entry, base_path, ) == FileMode.NORMAL @pytest.mark.parametrize("entry", EXECUTABLE_FILES) def test_determine_file_mode_executable(entry): - base_path = "test" - new_entry = f"/some/{base_path}/{entry}" + base_path = Path("/some/test") + new_entry = base_path / entry assert determine_file_mode( - Path(new_entry), + new_entry, base_path, ) == FileMode.EXECUTABLE
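
The core of the fix in PATCH 1/2 is easiest to see in isolation: handle the
"empty chunk" exception per chunk, skip only that chunk, and open the pull
request only when at least one commit was actually created. The following is
a minimal, runnable sketch of that pattern, not the code from the patches
themselves: the NoChangeException class, the create_commit stub, and the
sample file list are illustrative stand-ins for
CC_CLIENT.exceptions.NoChangeException, CC_CLIENT.create_commit, and the real
repository contents. PATCH 2/2 then builds on this by replacing the single
get_differences call with the standard boto3 paginator
(CC_CLIENT.get_paginator('get_differences')), so all pages of differences are
considered before deciding which files to delete.

class NoChangeException(Exception):
    """Stand-in for CC_CLIENT.exceptions.NoChangeException."""


def chunks(list_to_chunk, number_to_chunk_into):
    # Same helper as in the patch: yield slices of at most
    # number_to_chunk_into items each.
    number_per_chunk = max(1, number_to_chunk_into)
    return (
        list_to_chunk[item:item + number_per_chunk]
        for item in range(0, len(list_to_chunk), number_per_chunk)
    )


def create_commit(files, parent_commit_id):
    # Illustrative stub: like CodeCommit, fail when the commit would be
    # empty, otherwise return a new commit id.
    if not any(file["changed"] for file in files):
        raise NoChangeException()
    return f"commit-{len(files)}-files-after-{parent_commit_id}"


def commit_in_chunks(files_to_commit, parent_commit_id):
    # CodeCommit only allows 100 files per commit, so commit in chunks of 99.
    commit_id = parent_commit_id
    commits_created = []
    for files in chunks(files_to_commit, 99):
        try:
            commit_id = create_commit(files, commit_id)
            commits_created.append(commit_id)
        except NoChangeException:
            # The fix: the "no changes" error is handled at the chunk level,
            # so an empty chunk is skipped instead of aborting the update.
            pass
    return commits_created


if __name__ == "__main__":
    files = (
        [{"changed": False}] * 99   # chunk 1: no changes, gets skipped
        + [{"changed": True}] * 5   # chunk 2: produces a commit
    )
    created = commit_in_chunks(files, "parent-commit-id")
    # Mirrors generate_commits: only open a pull request when commits were
    # created; otherwise the temporary version branch is deleted again.
    print("open pull request:", bool(created), created)

Before the fix, the equivalent of the try/except sat outside the loop, so
the first empty chunk aborted every remaining chunk and the pull request
with it; moving it inside the loop and tracking commits_created is the whole
behavioural change.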