From ebd81526ac58f0b9fb371ee1d9a4ea524eff0f87 Mon Sep 17 00:00:00 2001
From: Simon Kok
Date: Tue, 20 Sep 2022 10:16:31 +0200
Subject: [PATCH] Sync Step Function input files when content changed only
 with exec id metadata (#530)

* Sync SFn input files when content changed only with exec id metadata

**Why?**
Using `aws s3 sync/cp`, the files would be copied when they changed.
However, as the file metadata is also taken into account, a file would be
uploaded even if its content did not change.

Additionally, as described in #518, we would like to insert metadata when a
file is changed. If we relied on the `aws s3 sync/cp` logic, it would also
upload the file when only the metadata itself changed. Therefore, we cannot
add the necessary execution id to the files upon upload only.

**What?**
The `sync_to_s3.py` script is added to support syncing the files to S3.
This script will:
1. Upload a single file, or walk through a directory recursively.
2. Check each of the files it finds and determine their SHA-256 hash.
3. List the S3 bucket with an optional prefix to determine which objects
   exist.
4. If a file is missing from the bucket, it will upload the file.
5. If a file exists as an object already, it will check whether the SHA-256
   hashes match. If they do not, it will upload the new version.
6. If an object exists, but the file does not, it will optionally delete
   the object from the S3 bucket.

When it uploads a file to S3, it will add the metadata that is requested
through the `--upload-with-metadata` argument. Additionally, it will add
the `sha256_hash` metadata to determine if the content changed.

The deployment maps and account configuration processes rely on AWS Step
Functions. When these are synced, the process is updated to rely on the
`sync_to_s3.py` script. This way we can retrieve the `execution_id` and
insert that into the invocation id of the Step Function State Machine.

* Move sync_to_s3 to shared helpers

* Fix linting issues

* Add helper requirements

* Code review changes

* Add support for syncing multiple file extensions with sync_to_s3.py

**Why?**
To support matching both .yml and .yaml file extensions.

**What?**
Support added to pass multiple -e or --extension arguments.

* Add CodePipeline Execution Id to accounts & pipeline gen

* Add ADF Version to S3 file sync, such that an ADF update triggers updates

**Why?**
When files were synced to S3, they only triggered an update of the account
management or pipeline generator when the file content changed. If ADF made
changes to the pipeline structure, the pipelines and account management
should be retriggered to apply them.

**What?**
By adding the `adf_version` metadata to the files that are synced, we ensure
that an update of the file is also triggered when the ADF version changes.
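For illustration, the deployment map sync that this change introduces is
invoked roughly as follows (bucket and variable values are placeholders;
the exact commands live in the buildspec changes below):

    $ python adf-build/helpers/sync_to_s3.py \
        --extension .yml --extension .yaml \
        --metadata adf_version=$ADF_VERSION \
        --upload-with-metadata execution_id=$CODEPIPELINE_EXECUTION_ID \
        --recursive deployment_maps s3://$ADF_PIPELINES_BUCKET/deployment_maps

With this invocation, an object is only re-uploaded when its SHA-256 hash or
the `adf_version` metadata differs from what is stored on S3, and the
`execution_id` metadata is only (re)applied on those uploads.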
--- Makefile | 1 + .../adf-bootstrap/deployment/global.yml | 17 +- .../determine_default_branch/requirements.txt | 2 +- .../adf-build/shared/helpers/__init__.py | 2 + .../adf-build/shared/helpers/pytest.ini | 3 + .../adf-build/shared/helpers/sync_to_s3.py | 915 ++++++++++++++ .../shared/helpers/tests/__init__.py | 11 + .../shared/helpers/tests/test_sync_to_s3.py | 1049 +++++++++++++++++ .../adf-build/shared/pytest.ini | 2 +- src/template.yml | 11 +- tox.ini | 1 + 11 files changed, 2008 insertions(+), 6 deletions(-) create mode 100644 src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/helpers/__init__.py create mode 100644 src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/helpers/pytest.ini create mode 100644 src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/helpers/sync_to_s3.py create mode 100644 src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/helpers/tests/__init__.py create mode 100644 src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/helpers/tests/test_sync_to_s3.py diff --git a/Makefile b/Makefile index 130239fbd..b0fbfb745 100644 --- a/Makefile +++ b/Makefile @@ -10,6 +10,7 @@ test: pytest src/lambda_codebase/initial_commit/bootstrap_repository/adf-bootstrap/deployment/lambda_codebase -vvv -s -c src/lambda_codebase/initial_commit/bootstrap_repository/adf-bootstrap/deployment/lambda_codebase/pytest.ini pytest src/lambda_codebase/initial_commit/bootstrap_repository/adf-bootstrap/deployment/lambda_codebase/initial_commit -vvv -s -c src/lambda_codebase/initial_commit/bootstrap_repository/adf-bootstrap/deployment/lambda_codebase/initial_commit/pytest.ini pytest src/lambda_codebase/initial_commit/bootstrap_repository/adf-bootstrap/deployment/lambda_codebase/determine_default_branch -vvv -s -c src/lambda_codebase/initial_commit/bootstrap_repository/adf-bootstrap/deployment/lambda_codebase/determine_default_branch/pytest.ini + pytest src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/helpers -vvv -s -c src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/helpers/pytest.ini pytest src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/python -vvv -s -c src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/python/pytest.ini pytest src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/cdk -vvv -s -c src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/cdk/pytest.ini pytest src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared -vvv -s -c src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/pytest.ini diff --git a/src/lambda_codebase/initial_commit/bootstrap_repository/adf-bootstrap/deployment/global.yml b/src/lambda_codebase/initial_commit/bootstrap_repository/adf-bootstrap/deployment/global.yml index 0d6de4340..ac6a6a60f 100644 --- a/src/lambda_codebase/initial_commit/bootstrap_repository/adf-bootstrap/deployment/global.yml +++ b/src/lambda_codebase/initial_commit/bootstrap_repository/adf-bootstrap/deployment/global.yml @@ -766,11 +766,14 @@ Resources: commands: - npm install cdk@1.169 -g -y --quiet --no-progress - aws s3 cp s3://$SHARED_MODULES_BUCKET/adf-build/ ./adf-build/ --recursive --quiet - - pip install -r adf-build/requirements.txt -q -t ./adf-build + - pip install -r adf-build/requirements.txt -r adf-build/helpers/requirements.txt -q -t ./adf-build + pre_build: + commands: + - mkdir -p deployment_maps build: 
commands: - - bash -c "[[ -e deployment_map.yml ]] && echo 'Copying deployment_map.yml' && aws s3 cp deployment_map.yml s3://$ADF_PIPELINES_BUCKET/deployment_map.yml || echo 'No deployment_map.yml, skipping copy'" - - bash -c "[[ -e deployment_maps ]] && echo 'Syncing deployment_maps folder' && aws s3 sync deployment_maps s3://$ADF_PIPELINES_BUCKET/deployment_maps || echo 'No deployment_maps folder, skipping sync'" + - python adf-build/helpers/sync_to_s3.py --metadata adf_version=${!ADF_VERSION} --upload-with-metadata execution_id=${!CODEPIPELINE_EXECUTION_ID} deployment_map.yml s3://$ADF_PIPELINES_BUCKET/deployment_map.yml + - python adf-build/helpers/sync_to_s3.py --extension .yml --extension .yaml --metadata adf_version=${!ADF_VERSION} --upload-with-metadata execution_id=${!CODEBUILD_BUILD_NUMBER} --recursive deployment_maps s3://$ADF_PIPELINES_BUCKET/deployment_maps post_build: commands: - echo "Pipelines are updated in the AWS Step Functions ADFPipelineManagementStateMachine." @@ -820,6 +823,14 @@ Resources: - Name: "Source" Configuration: ProjectName: !Ref CodeBuildProject + EnvironmentVariables: >- + [ + { + "name": "CODEPIPELINE_EXECUTION_ID", + "value": "#{codepipeline.PipelineExecutionId}", + "type": "PLAINTEXT" + } + ] RunOrder: 1 PipelineSNSTopic: diff --git a/src/lambda_codebase/initial_commit/bootstrap_repository/adf-bootstrap/deployment/lambda_codebase/determine_default_branch/requirements.txt b/src/lambda_codebase/initial_commit/bootstrap_repository/adf-bootstrap/deployment/lambda_codebase/determine_default_branch/requirements.txt index da6ec56c2..2a653ddc1 100644 --- a/src/lambda_codebase/initial_commit/bootstrap_repository/adf-bootstrap/deployment/lambda_codebase/determine_default_branch/requirements.txt +++ b/src/lambda_codebase/initial_commit/bootstrap_repository/adf-bootstrap/deployment/lambda_codebase/determine_default_branch/requirements.txt @@ -1,2 +1,2 @@ -boto3==1.18.2 +boto3==1.24.59 cfn-custom-resource~=1.0.1 diff --git a/src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/helpers/__init__.py b/src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/helpers/__init__.py new file mode 100644 index 000000000..b0f3b0cc9 --- /dev/null +++ b/src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/helpers/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 diff --git a/src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/helpers/pytest.ini b/src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/helpers/pytest.ini new file mode 100644 index 000000000..015e8596c --- /dev/null +++ b/src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/helpers/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +testpaths = tests +norecursedirs = terraform diff --git a/src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/helpers/sync_to_s3.py b/src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/helpers/sync_to_s3.py new file mode 100644 index 000000000..59b46b0a4 --- /dev/null +++ b/src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/helpers/sync_to_s3.py @@ -0,0 +1,915 @@ +# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +""" +Sync files to an S3 Bucket. + +This script will only overwrite files when the content changed. 
+To determine whether a file has changed, it will calculate the SHA-256 hash
+of the file and match that against the SHA-256 hash metadata stored along
+with the S3 object.
+
+If a file that is stored inside the S3 Bucket is no longer present
+locally, it will clean it up too (when the --delete flag is set).
+
+Usage:
+    sync_to_s3.py [-v... | --verbose...] [-r | --recursive] [-d | --delete]
+                  [-e <extension> | --extension <extension>]...
+                  [--metadata <key>=<value>]...
+                  [--upload-with-metadata <key>=<value>]...
+                  [--]
+                  SOURCE_PATH DESTINATION_S3_URL
+
+    sync_to_s3.py -h | --help
+
+    sync_to_s3.py --version
+
+Options:
+    -d, --delete
+                Delete stale files that are located in the destination
+                bucket with the corresponding S3 prefix. For example, if the
+                destination is set to s3://my-bucket/my-prefix, it will sync
+                all files inside the prefix location. If a file is located
+                there that is not present in the source path locally, it
+                will get deleted. But only if the file extension of that
+                path matches (if one is specified).
+
+    -e, --extension <extension>
+                File extension filter. Files that match locally are only
+                uploaded if their extension matches. If this parameter is
+                not specified, no filter is applied and all files that are
+                found locally are matched. The same filter is also applied
+                on the destination. For example, if the destination S3
+                location contains a README.md file, while the extension is
+                configured to match '.yml', it will not delete the README.md
+                file as its extension is not a match.
+
+    -h, --help  Show this help message.
+
+    --metadata <key>=<value>
+                The key and value pairs that are passed with this argument
+                will be added to the object metadata. If the metadata set
+                using this argument does not match the metadata on the S3
+                object, it will perform an update too.
+
+    -r, --recursive
+                Indicates that the SOURCE_PATH is a directory, and it should
+                recursively walk through the source directories and sync
+                those to the S3 bucket.
+
+    --upload-with-metadata <key>=<value>
+                When a file is uploaded, the key and value pairs that are
+                passed with this argument will be added as well. It will
+                only apply these metadata properties if the file is missing,
+                or if the content of the file or any of the `--metadata`
+                properties did not match.
+
+    -v, --verbose
+                Show verbose logging information.
+
+    SOURCE_PATH
+                The source path where the original files are stored that
+                should be synced to the destination bucket. When you specify
+                a directory as the source path, it will copy the files
+                inside the directory to the S3 bucket if you also specify
+                the recursive flag. Otherwise it will treat the source path
+                as a file; when a directory is detected instead, it will
+                abort with an error.
+                If the source path is a directory, the object keys that are
+                derived from the files inside the directory will be relative
+                to the SOURCE_PATH. For example, if the SOURCE_PATH equals
+                `./adf-accounts`, which contains a file named
+                `adf-accounts/adf.yml`, it will copy the file as `adf.yml`.
+                If the prefix of the S3 bucket is set to `adf-s3-accounts`,
+                the final key of that specific file will be:
+                `adf-s3-accounts/adf.yml`.
+                If the SOURCE_PATH is a file and the recursive flag is not
+                specified, it will expect that the S3 prefix is the new
+                object name instead. In this case, if SOURCE_PATH equals
+                `./deployment_map.yml` and the S3 prefix is
+                `root_deployment_map.yml`, it will copy the file to that S3
+                prefix key.
+
+    DESTINATION_S3_URL
+                The destination bucket and its prefix where the files should
+                be copied to. The S3 bucket and its optional prefix should
+                be specified as: s3://your-bucket-name/your-optional-prefix.
+                In this case, `your-bucket-name` is the name of the bucket.
+                `your-optional-prefix` is the name of the prefix used for
+                all files that are copied to S3. If a directory is copied,
+                i.e. recursive is set, it will prepend the prefix to the
+                object keys of the files that are synced. If a file is
+                copied instead, i.e. no --recursive, it will use the S3
+                prefix as the target object key for that file.
+
+Examples:
+
+    Copy the deployment_map.yml file to an S3 bucket as
+    root_deployment_map.yml, and delete the root_deployment_map.yml object
+    if the local deployment_map.yml file is missing:
+
+    $ python sync_to_s3.py -d deployment_map.yml \\
+        s3://deploy-bucket/root_deployment_map.yml
+
+    Copy all .yml files from the deployment_maps folder to an S3 bucket
+    where the objects are prefixed with `deployment_maps/`, deleting the
+    .yml objects inside the deployment_maps prefix that no longer exist
+    locally:
+
+    $ python sync_to_s3.py -d -e .yml -r deployment_maps \\
+        s3://deploy-bucket/deployment_maps
+
+    Copy all .yml files from the source_folder folder to an S3 bucket where
+    the objects are prefixed with `object_folder/`, deleting the .yml
+    objects inside the object_folder prefix that no longer exist locally.
+    Additionally, all files will get the `adf_version` metadata set. And if
+    a file is uploaded/updated, it will also apply the `execution_id`
+    metadata:
+
+    $ python sync_to_s3.py -d -e .yml -r source_folder \\
+        --metadata "adf_version=x.y.z" \\
+        --upload-with-metadata "execution_id=$EXEC_ID" \\
+        s3://deploy-bucket/object_folder
+"""
+
+import os
+import sys
+from typing import Mapping, TypedDict
+from pathlib import Path
+from urllib.parse import urlparse
+import hashlib
+import logging
+import base64
+import boto3
+from docopt import docopt
+
+
+ADF_VERSION = os.environ.get("ADF_VERSION")
+ADF_LOG_LEVEL = os.environ.get("ADF_LOG_LEVEL", "INFO")
+NON_RECURSIVE_KEY = '%%-single-match-%%'
+
+logging.basicConfig(level=logging.INFO)
+LOGGER = logging.getLogger(__name__)
+LOGGER.setLevel(ADF_LOG_LEVEL)
+
+
+class GenericFileData(TypedDict):
+    """
+    Generic File or Object Data class.
+    """
+    key: str
+
+
+class LocalFileData(GenericFileData):
+    """
+    Local File Data class, extended from the GenericFileData.
+    """
+    file_path: str
+    sha256_hash: str
+
+
+class S3ObjectData(GenericFileData):
+    """
+    S3 Object Data class, extended from the GenericFileData.
+    """
+    metadata: dict[str, str]
+
+
+class MetadataToCheck(TypedDict):
+    always_apply: dict[str, str]
+    upon_upload_apply: dict[str, str]
+
+
+def get_local_files(
+    local_path: str,
+    file_extensions: [str],
+    recursive: bool,
+) -> Mapping[str, LocalFileData]:
+    """
+    Retrieve the files that are in the relative path local_path.
+    This will perform a search inside a directory if the local_path is a
+    directory and the recursive flag is set. Alternatively, it will determine
+    if a specific file exists and if so, it will retrieve that one only.
+
+    Args:
+        local_path (str): The local path to search in/lookup.
+
+        file_extensions ([str]): The file_extensions to search for, or empty
+            list if this filter should not be applied.
+
+        recursive (bool): Whether to search recursively or not.
+
+    Returns:
+        Mapping[str, LocalFileData]: The map of the Local File Data objects
+            representing the local file(s) that were found.
+            The keys of the map are derived from the local file path relative
+            to the local_path. With a single object, the key used is
+            a special non recursive identifier key instead.
+            The value of the map is the Local File Data.
+ """ + if recursive: + return _get_recursive_local_files( + local_path, + file_extensions, + ) + return _get_single_local_file( + local_path, + ) + + +def _get_recursive_local_files( + local_path: str, + file_extensions: [str], +) -> Mapping[str, LocalFileData]: + """ + Retrieve the files that are in the relative path local_path. + A search is performed using the specified glob if one is specified. + Do not specify the glob in case only a single file should be matched. + + Args: + local_path (str): The local files to search in. + + file_extensions ([str]): The file_extensions to search for, or empty + list if this filter should not be applied. This will be converted + to a glob search, where the extension ".yml" will match files with + the glob search "**/*.yml", returning any YAML file that ends with + .yml. Including those in subdirectories. + + Returns: + Mapping[str, LocalFileData]: The map of the Local File Data objects + representing the local files that were found. + The keys of the map are derived from the local file path relative + to the local_path. + The value of the map is the Local File Data. + """ + path = get_full_local_path(local_path) + LOGGER.debug( + "Searching for local files in %s matching %s", + str(path), + file_extensions, + ) + local_files = {} + globs_to_match = [ + f"**/*{ext}" + for ext in ( + # File extensions or a list of an empty string, so it either + # generates "**/*{ext}" for each extension in file_extensions + # or it generates "**/*" + file_extensions or [""] + ) + ] + for glob in globs_to_match: + for file_path in path.glob(glob): + local_file_data = _get_local_file_data(file_path, path) + local_files[local_file_data['key']] = local_file_data + + LOGGER.debug( + "Found %d local files: %s", + len(local_files.keys()), + local_files, + ) + return local_files + + +def _get_single_local_file( + local_path: str, +) -> Mapping[str, LocalFileData]: + """ + Retrieve the file that is at the relative path local_path, or None if that + does not exist. + + Args: + local_path (str): The local files to search in. + + Returns: + Mapping[str, LocalFileData]: The map of the Local File Data object + representing the local file if one is found. + The keys of the map are derived from the local file path relative + to the local_path. + The value of the map is the Local File Data. + """ + path = get_full_local_path(local_path) + LOGGER.debug( + "Checking if local file at %s exists", + str(path), + ) + local_files = {} + if path.exists(): + local_file_data = _get_local_file_data(path, path.parent) + local_files[NON_RECURSIVE_KEY] = local_file_data + LOGGER.debug( + "File exists: %s", + local_files, + ) + else: + LOGGER.debug( + "File does not exist at: %s", + path, + ) + + return local_files + + +def _get_local_file_data( + file_path: Path, + relative_to_path: Path, +) -> LocalFileData: + """ + Get the local file data for the given path. + + This will open the file, calculate its hash and return that + with in a LocalFileData object. + + Args: + file_path (Path): The path of the file to read. + + relative_to_path (Path): The path that should be used to determine the + relative path of the local file. If an object lives inside + `x_path/y_path`. And the relative_to_path is set to `x_path`, the + key of the local file will become: `y_path`. + + Returns: + LocalFileData: The LocalFileData instance that holds the file + information such as the sha256_hash, its relative path, etc. 
+ """ + with open(file_path, "rb", buffering=0) as file_pointer: + file_hash = hashlib.sha256() + memory_view = memoryview(bytearray(1024*1024)) + while data_read := file_pointer.readinto(memory_view): + file_hash.update(memory_view[:data_read]) + relative_path = str(file_path.relative_to(relative_to_path)) + return { + "key": relative_path, + "file_path": str(file_path), + "sha256_hash": str(base64.b64encode(file_hash.digest())), + } + + +def get_s3_objects( + s3_client: any, + s3_bucket: str, + s3_prefix: str, + file_extensions: [str], + recursive: bool, +): + """ + Retrieve the object or objects that are stored inside the S3 bucket. + When asked to search recursively, it will perform a search on the S3 bucket + using the specified prefix and file extension. + While it will perform a single object lookup otherwise. + + Args: + s3_client (Boto3.Client): The Boto3 S3 Client to interact with when + a file needs to be deleted. + s3_bucket (str): The bucket name. + s3_prefix (str): The prefix under which the objects are stored in + the bucket. + file_extensions ([str]): The file extensions of objects that would + match. + recursive (bool): Whether to search recursively or not. + + Returns: + Mapping[str, S3ObjectData]: The map of the S3 objects that were + found. + """ + if recursive: + return _get_recursive_s3_objects( + s3_client, + s3_bucket, + s3_prefix, + file_extensions, + ) + + return _get_single_s3_object( + s3_client, + s3_bucket, + s3_prefix, + ) + + +def _get_recursive_s3_objects( + s3_client: any, + s3_bucket: str, + s3_prefix: str, + file_extensions: [str], +) -> Mapping[str, S3ObjectData]: + """ + Retrieve the objects that are stored inside the S3 bucket, which keys + start with the specified s3_prefix. + + Args: + s3_client (Boto3.Client): The Boto3 S3 Client to interact with when + a file needs to be deleted. + s3_bucket (str): The bucket name. + s3_prefix (str): The prefix under which the objects are stored in + the bucket. + file_extensions ([str]): The file extension of objects that would + match. + + Returns: + Mapping[str, S3ObjectData]: The map of the S3 objects that were + found. The keys of the map are derived from the object key relative + to the s3_prefix. Unless the key is equal to the s3_prefix, in that + case the full object key is used as the key. The value of the map + is the S3 Object Data. + """ + LOGGER.debug( + "Searching for S3 objects in s3://%s/%s", + s3_bucket, + s3_prefix, + ) + s3_list_objects_paginator = s3_client.get_paginator("list_objects_v2") + s3_object_iterator = s3_list_objects_paginator.paginate( + Bucket=s3_bucket, + Prefix=f"{s3_prefix}/", + ) + s3_objects = {} + for response_data in s3_object_iterator: + for obj in response_data.get("Contents", []): + matched_extensions = list( + # The filter matches its Key against the file_extensions + # to see if it ends with that specific extension. + # This will return an empty list if it did not match or + # if the file_extensions is empty. + filter(obj.get("Key").endswith, file_extensions) + ) + if file_extensions and not matched_extensions: + # If we should filter on extensions and we did not match + # with any, we should skip this object. 
+ continue + index_key = convert_to_local_key(obj.get("Key"), s3_prefix) + s3_objects[index_key] = _get_s3_object_data( + s3_client, + s3_bucket, + obj.get("Key"), + ) + + LOGGER.debug( + "Found %d S3 objects at: s3://%s/%s: %s", + len(s3_objects.keys()), + s3_bucket, + s3_prefix, + s3_objects, + ) + return s3_objects + + +def _get_single_s3_object( + s3_client: any, + s3_bucket: str, + s3_object_key: str, +) -> Mapping[str, S3ObjectData]: + """ + Retrieve a single object that is stored inside the S3 bucket, which object + key equals the specified s3_object_key. + + Args: + s3_client (Boto3.Client): The Boto3 S3 Client to interact with when + a file needs to be deleted. + s3_bucket (str): The bucket name. + s3_object_key (str): The object key under which the object might or + should be stored in the bucket. + + Returns: + Mapping[str, S3ObjectData]: The map of the S3 objects that were + found. The keys of the map is set to the non recursive identifier. + The value of the map is the S3 Object Data. + """ + LOGGER.debug( + "Searching for S3 object in s3://%s/%s", + s3_bucket, + s3_object_key, + ) + s3_object_data = _get_s3_object_data( + s3_client, + s3_bucket, + s3_object_key, + ) + if not s3_object_data: + return {} + + s3_objects = {} + s3_objects[NON_RECURSIVE_KEY] = s3_object_data + + LOGGER.debug( + "Found S3 object at: s3://%s/%s: %s", + s3_bucket, + s3_object_key, + s3_objects, + ) + return s3_objects + + +def _get_s3_object_data(s3_client, s3_bucket, key): + try: + obj_data = s3_client.head_object( + Bucket=s3_bucket, + Key=key, + ) + return { + "key": key, + "metadata": obj_data.get("Metadata", {}), + } + except s3_client.exceptions.NoSuchKey: + LOGGER.debug( + "Could not find s3://%s/%s", + s3_bucket, + key, + ) + return None + + +def upload_changed_files( + s3_client: any, + s3_bucket: str, + s3_prefix: str, + local_files: Mapping[str, LocalFileData], + s3_objects: Mapping[str, S3ObjectData], + metadata_to_check: MetadataToCheck, +): + """ + Upload changed files, by looping over the local files found and checking + if these still exist in the S3 bucket as objects. If they do, the SHA256 + hash is compared. The file is uploaded to the bucket if the file is + missing or when the SHA256 hash does not match. + + Args: + s3_client (Boto3.Client): The Boto3 S3 Client to interact with when + a file needs to be deleted. + + s3_bucket (str): The bucket name. + + s3_prefix (str): The prefix under which the objects are stored in + the bucket. + + local_files (Mapping[str, LocalFileData]): The map of LocalFileData + objects, representing the files that were found locally. + + s3_objects (Mapping[str, S3ObjectData]): The map of S3ObjectData + objects representing the objects that were found in the S3 bucket. + + metadata_to_check (MetadataToCheck): The metadata that needs to be + applied all the time and upon upload only. 
+ """ + for key, local_file in local_files.items(): + s3_file = s3_objects.get(key) + + object_is_missing = s3_file is None + s3_metadata = {} if object_is_missing else s3_file["metadata"] + content_changed = ( + s3_metadata.get("sha256_hash") != local_file.get("sha256_hash") + ) + metadata_changed = ( + dict(filter( + lambda item: item[0] in metadata_to_check["always_apply"], + s3_metadata.items(), + )) != metadata_to_check["always_apply"] + ) + if (object_is_missing or content_changed or metadata_changed): + with open(local_file.get("file_path"), "rb") as file_pointer: + s3_key = convert_to_s3_key(key, s3_prefix) + + LOGGER.info( + "Uploading file %s to s3://%s/%s because the %s", + local_file.get("file_path"), + s3_bucket, + s3_key, + ( + "object does not exist yet" if object_is_missing + else ( + "file content changed" if content_changed + else "metadata changed" + ) + ), + ) + s3_client.put_object( + Body=file_pointer, + Bucket=s3_bucket, + Key=s3_key, + Metadata={ + **metadata_to_check['always_apply'], + **metadata_to_check['upon_upload_apply'], + "sha256_hash": local_file.get("sha256_hash"), + } + ) + + +def delete_stale_objects( + s3_client: any, + s3_bucket: str, + s3_prefix: str, + local_files: Mapping[str, LocalFileData], + s3_objects: Mapping[str, S3ObjectData], +): + """ + Delete stale files, by looping over the objects found in S3 and checking + if these still exist locally. If not, they are stale and need to be + deleted. + + Args: + s3_client (Boto3.Client): The Boto3 S3 Client to interact with when + a file needs to be deleted. + s3_bucket (str): The bucket name. + s3_prefix (str): The prefix under which the objects are stored in + the bucket. + local_files (Mapping[str, LocalFileData]): The map of LocalFileData + objects, representing the files that were found locally. + s3_objects (Mapping[str, S3ObjectData]): The map of S3ObjectData + objects representing the objects that were found in the S3 bucket. + """ + to_delete = [] + for key in s3_objects.keys(): + if local_files.get(key) is None: + s3_key = convert_to_s3_key(key, s3_prefix) + to_delete.append({ + "Key": s3_key, + }) + + if to_delete: + LOGGER.info( + "Deleting stale objects in s3://%s: %s", + s3_bucket, + to_delete, + ) + s3_client.delete_objects( + Bucket=s3_bucket, + Delete={ + "Objects": to_delete, + }, + ) + + +def clean_s3_prefix(original_prefix: str) -> str: + """ + Clean the S3 prefix, such that it does not start with a slash + and does not end with a slash. + + i.e. `/some/path/` will become `some/path` + + Args: + original_prefix (str): The original prefix that should be cleaned. + + Returns: + str: The cleaned prefix. + """ + new_prefix = ( + original_prefix[1:] if original_prefix.startswith("/") + else original_prefix + ) + + if original_prefix.endswith("/"): + return new_prefix[:-1] + + return new_prefix + + +def get_full_local_path(local_path: str) -> Path: + """ + Convert the local path str to the full Path. + + Args: + local_path (Path): The path where it should run the search from. + Can be an absolute path or a relative path to the current working + directory. Both will be translated to a full Path. + + Returns: + Path: The full Path instance pointing to the local_path + relative to the directory this command was executed from. Or the + Path instance pointing to the local_path if that is an absolute + path already. 
+ """ + path = Path(local_path) + if path.is_absolute(): + return path + + here = Path(os.getcwd()) + return here / path + + +def convert_to_s3_key(local_key, s3_prefix): + """ + Convert the local key to an S3 key. + + Args: + local_key (str): The local key of the file (relative to the directory). + s3_prefix (str): The S3 prefix that is in use. + + Returns: + str: Returns the s3_prefix if that matches the local_key. + When it did not match, it returns the `/{s3_prefix}/{local_key}` + """ + if s3_prefix and local_key == NON_RECURSIVE_KEY: + return s3_prefix + + if s3_prefix and local_key != s3_prefix: + return f"{s3_prefix}/{local_key}" + + return local_key + + +def convert_to_local_key(s3_key, s3_prefix): + """ + Convert the S3 key to a local key. + + Args: + s3_key (str): The s3 key of the object includes the s3 prefix. + s3_prefix (str): The S3 prefix that is in use. + + Returns: + str: Returns the local key if that matches the s3_prefix. + When it did not match, it removes the s3 prefix and returns + the relative local_key. + """ + if s3_prefix and s3_key != s3_prefix: + return str(Path(s3_key).relative_to(s3_prefix)) + + return s3_key + + +def ensure_valid_input( + local_path: str, + file_extensions: [str], + s3_url: str, + s3_bucket: str, + s3_prefix: str, + recursive: bool, +): + if not local_path: + LOGGER.error( + "Input error: You need to specify the source path!" + ) + sys.exit(1) + + if not s3_url: + LOGGER.error( + "Input error: You need to specify the destination S3 url!" + ) + sys.exit(2) + + if not recursive and not s3_prefix: + LOGGER.error( + "Input error: Requested to sync single object, but no S3 object " + "location was specified! " + ) + LOGGER.error( + "In case you would like to sync a single object " + "to %s, you will need to specify the full object location. " + "For example, s3://%s/this-is-the-target-object-location.yml", + s3_url, + s3_bucket, + ) + sys.exit(3) + + full_path = get_full_local_path(local_path) + if recursive and not full_path.exists(): + LOGGER.error( + "Input error: The source path %s does not exist!", + local_path, + ) + sys.exit(4) + + if not recursive and full_path.exists() and full_path.is_dir(): + LOGGER.error( + "Input error: When syncing a single file the source path %s " + "should be referencing a file not a directory!", + local_path, + ) + sys.exit(5) + + if file_extensions and not recursive: + LOGGER.warning("Input warning: Ignoring file_extension filter.") + LOGGER.warning( + "Input warning: The file_extension filter is not applied " + "when you are trying to sync a single file to S3. " + "The --extension argument is only compatible when " + "performing a --recursive directory sync." + ) + + + +def sync_files( + s3_client: any, + local_path: str, + file_extensions: [str], + s3_url: str, + recursive: bool, + delete: bool, + metadata_to_check: MetadataToCheck, +): + """ + Sync files using the S3 client from the local_path, matching the local_glob + to the specific s3_url. + + Args: + s3_client (Boto3.Client): The Boto3 S3 Client to interact with when + a file needs to be deleted. + + local_path (str): The local path where the source files are stored. + + file_extensions ([str]): The extensions to search for files inside a + specific path. For example, [".yml", ".yaml"] will return all + YAML files, including those in sub directories. + + s3_url (str): The S3 URL to use, for example + S3://bucket/specific/prefix. + + recursive (bool): Whether to search the source directory recursively + or not. 
+ + delete (bool): Whether to delete stale objects from the S3 bucket if + the source file no longer exists. + + metadata_to_check (MetadataToCheck): The metadata that needs to be + applied all the time and upon upload only. + """ + s3_url_details = urlparse(s3_url) + s3_bucket = s3_url_details.netloc + s3_prefix = clean_s3_prefix(str(s3_url_details.path)) + + ensure_valid_input( + local_path, + file_extensions, + s3_url, + s3_bucket, + s3_prefix, + recursive, + ) + + local_files = get_local_files(local_path, file_extensions, recursive) + + s3_objects = get_s3_objects( + s3_client, + s3_bucket, + s3_prefix, + file_extensions, + recursive, + ) + + upload_changed_files( + s3_client, + s3_bucket, + s3_prefix, + local_files, + s3_objects, + metadata_to_check, + ) + if delete: + delete_stale_objects( + s3_client, + s3_bucket, + s3_prefix, + local_files, + s3_objects, + ) + + +def main(): # pylint: disable=R0915 + """Main function to sync files to S3""" + + options = docopt(__doc__, version=ADF_VERSION, options_first=True) + # In case the user asked for verbose logging, increase + # the log level to debug. + if options["--verbose"] > 0: + LOGGER.setLevel(logging.DEBUG) + if options["--verbose"] > 1: + # Also enable DEBUG mode for other libraries, like boto3 + logging.basicConfig(level=logging.DEBUG) + + LOGGER.debug("Input arguments: %s", options) + + local_path = options.get('SOURCE_PATH') + # Remove duplicates from file extension list if there are any + file_extensions = list(set(options.get('--extension'))) + s3_url = options.get('DESTINATION_S3_URL') + recursive = options.get('--recursive', False) + delete = options.get('--delete', False) + + # Convert metadata key and value lists into a dictionary + metadata_to_check: MetadataToCheck = { + 'always_apply': dict(map( + lambda kv_pair: ( + kv_pair[:kv_pair.find("=")], + kv_pair[(kv_pair.find("=") + 1):] + ), + options['--metadata'], + )), + 'upon_upload_apply': dict(map( + lambda kv_pair: ( + kv_pair[:kv_pair.find("=")], + kv_pair[(kv_pair.find("=") + 1):] + ), + options['--upload-with-metadata'], + )), + } + + s3_client = boto3.client("s3") + sync_files( + s3_client, + local_path, + file_extensions, + s3_url, + recursive, + delete, + metadata_to_check, + ) + LOGGER.info("All done.") + + +if __name__ == "__main__": + main() diff --git a/src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/helpers/tests/__init__.py b/src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/helpers/tests/__init__.py new file mode 100644 index 000000000..e164948b0 --- /dev/null +++ b/src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/helpers/tests/__init__.py @@ -0,0 +1,11 @@ +# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: MIT-0 + +""" +__init__ for tests module +""" + +import sys +import os + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) diff --git a/src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/helpers/tests/test_sync_to_s3.py b/src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/helpers/tests/test_sync_to_s3.py new file mode 100644 index 000000000..80c0fe125 --- /dev/null +++ b/src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/helpers/tests/test_sync_to_s3.py @@ -0,0 +1,1049 @@ +import os +from typing import Mapping +from pathlib import Path +from copy import deepcopy +from mock import Mock, patch, call, ANY +import pytest +from base64 import b64encode +from hashlib import sha256 +import tempfile +from sync_to_s3 import * + +# pylint: skip-file + +S3_PREFIX = "s3-prefix" +CURRENT_VERSION = "This is the current version on S3".encode("utf-8") +NEW_VERSION = "This will be uploaded to S3".encode("utf-8") +CURRENT_HASH = str(b64encode(sha256(CURRENT_VERSION).digest())) +NEW_HASH = str(b64encode(sha256(NEW_VERSION).digest())) +UPLOAD_PREVIOUS_METADATA = { + "execution_id": "a-b-c-d", +} +UPLOAD_NEW_METADATA = { + "execution_id": "b-c-d-e", +} +PREVIOUS_METADATA = { + "adf_version": "x.y.z", + "another_key": "and_its_value", +} +CURRENT_METADATA = { + "adf_version": "x.y.z+1", + "another_key": "and_its_value", +} +IRRELEVANT_METADATA = { + "irrelevant_metadata": "some irrelevant value", + "another_irrelevant_key": "and-value", +} + +EXAMPLE_LOCAL_FILES: Mapping[str, LocalFileData] = { + "first-file.yml": { + "key": "first-file.yml", + "file_path": "/full/path/first-file.yml", + "sha256_hash": CURRENT_HASH, + }, + "second-file.yaml": { + "key": "second-file.yaml", + "file_path": "/full/path/second-file.yaml", + "sha256_hash": CURRENT_HASH, + }, + "second-file.yaml": { + "key": "second-file.yaml", + "file_path": "/full/path/second-file.yaml", + "sha256_hash": CURRENT_HASH, + }, + "needs-new-metadata-file.yaml": { + "key": "needs-new-metadata-file.yaml", + "file_path": "/full/path/needs-new-metadata-file.yaml", + "sha256_hash": CURRENT_HASH, + }, + "updated-file.yml": { + "key": "updated-file.yml", + "file_path": "/full/path/updated-file.yml", + "sha256_hash": NEW_HASH, + }, + "missing-file.yml": { + "key": "missing-file.yml", + "file_path": "/full/path/missing-file.yml", + "sha256_hash": NEW_HASH, + }, +} +EXAMPLE_S3_OBJECTS: Mapping[str, S3ObjectData] = { + "first-file.yml": { + "key": f"{S3_PREFIX}/first-file.yml", + "metadata": { + **CURRENT_METADATA, + **UPLOAD_PREVIOUS_METADATA, + **IRRELEVANT_METADATA, + "sha256_hash": CURRENT_HASH, + } + }, + "second-file.yaml": { + "key": f"{S3_PREFIX}/second-file.yaml", + "metadata": { + **CURRENT_METADATA, + **UPLOAD_PREVIOUS_METADATA, + **IRRELEVANT_METADATA, + "sha256_hash": CURRENT_HASH, + } + }, + "needs-new-metadata-file.yaml": { + "key": f"{S3_PREFIX}/needs-new-metadata-file.yaml", + "metadata": { + **PREVIOUS_METADATA, + **UPLOAD_PREVIOUS_METADATA, + **IRRELEVANT_METADATA, + "sha256_hash": CURRENT_HASH, + } + }, + "updated-file.yml": { + "key": f"{S3_PREFIX}/updated-file.yml", + "metadata": { + **CURRENT_METADATA, + **UPLOAD_PREVIOUS_METADATA, + **IRRELEVANT_METADATA, + "sha256_hash": CURRENT_HASH, + } + }, + "stale-file.yml": { + "key": f"{S3_PREFIX}/stale-file.yml", + "metadata": { + **PREVIOUS_METADATA, + **UPLOAD_PREVIOUS_METADATA, + **IRRELEVANT_METADATA, + "sha256_hash": CURRENT_HASH, + } + }, +} + + 
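+# Overview of the fixture scenarios exercised by the tests below:
+#   first-file.yml / second-file.yaml  - content and metadata in sync, no upload expected.
+#   needs-new-metadata-file.yaml       - same content, but outdated always-apply metadata, re-upload expected.
+#   updated-file.yml                   - content changed locally (NEW_HASH vs CURRENT_HASH), re-upload expected.
+#   missing-file.yml                   - exists locally only, upload expected.
+#   stale-file.yml                     - exists in S3 only, deletion candidate when --delete is used.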
+@patch("sync_to_s3.get_full_local_path") +def test_get_local_files_empty_directory(get_full_local_path): + file_extensions = [".yml"] + with tempfile.TemporaryDirectory() as directory_path: + get_full_local_path.return_value = Path(directory_path) + + assert get_local_files( + directory_path, + file_extensions, + recursive=True, + ) == {} + + get_full_local_path.assert_called_once_with(directory_path) + + +@patch("sync_to_s3.get_full_local_path") +def test_get_local_files_non_recursive_missing_file(get_full_local_path): + with tempfile.TemporaryDirectory() as directory_path: + local_path = Path(directory_path) / "missing-file.yml" + get_full_local_path.return_value = local_path + + assert get_local_files( + str(local_path), + file_extensions=[], + recursive=False, + ) == {} + + get_full_local_path.assert_called_once_with(str(local_path)) + + +@patch("sync_to_s3.get_full_local_path") +def test_get_local_files_recursive(get_full_local_path): + file_extensions = [".yml", ".yaml"] + example_local_files = deepcopy(EXAMPLE_LOCAL_FILES) + example_local_files["README.md"] = { + "key": "README.md", + "file_path": "/full/path/README.md", + "sha256_hash": NEW_HASH, + } + example_local_files["some-other-config.json"] = { + "key": "some-other-config.json", + "file_path": "/full/path/some-other-config.json", + "sha256_hash": CURRENT_HASH, + } + with tempfile.TemporaryDirectory() as directory_path: + get_full_local_path.return_value = Path(directory_path) + + for file in example_local_files.values(): + tmp_file_path = Path(directory_path) / file.get("key") + with open(tmp_file_path, "wb", buffering=0) as file_pointer: + file["file_path"] = str(Path(directory_path) / file.get("key")) + file_pointer.write( + NEW_VERSION if file.get("key") in [ + "updated-file.yml", + "missing-file.yml", + "README.md" + ] else CURRENT_VERSION + ) + return_local_files = deepcopy(example_local_files) + del return_local_files["README.md"] + del return_local_files["some-other-config.json"] + + assert get_local_files( + directory_path, + file_extensions, + recursive=True, + ) == return_local_files + + get_full_local_path.assert_called_once_with(directory_path) + + +@patch("sync_to_s3.get_full_local_path") +def test_get_local_files_recursive_any(get_full_local_path): + file_extensions = [] + example_local_files = deepcopy(EXAMPLE_LOCAL_FILES) + example_local_files["README.md"] = { + "key": "README.md", + "file_path": "/full/path/README.md", + "sha256_hash": NEW_HASH, + } + example_local_files["some-other-config.json"] = { + "key": "some-other-config.json", + "file_path": "/full/path/some-other-config.json", + "sha256_hash": CURRENT_HASH, + } + with tempfile.TemporaryDirectory() as directory_path: + get_full_local_path.return_value = Path(directory_path) + + for file in example_local_files.values(): + tmp_file_path = Path(directory_path) / file.get("key") + with open(tmp_file_path, "wb", buffering=0) as file_pointer: + file["file_path"] = str(Path(directory_path) / file.get("key")) + file_pointer.write( + NEW_VERSION if file.get("key") in [ + "updated-file.yml", + "missing-file.yml", + "README.md" + ] else CURRENT_VERSION + ) + + assert get_local_files( + directory_path, + file_extensions, + recursive=True, + ) == example_local_files + + get_full_local_path.assert_called_once_with(directory_path) + + +@patch("sync_to_s3.get_full_local_path") +def test_get_local_files_recursive_unrelated_only(get_full_local_path): + file_extensions = [".xml"] + example_local_files = deepcopy(EXAMPLE_LOCAL_FILES) + with 
tempfile.TemporaryDirectory() as directory_path: + get_full_local_path.return_value = Path(directory_path) + + for file in example_local_files.values(): + tmp_file_path = Path(directory_path) / file.get("key") + with open(tmp_file_path, "wb", buffering=0) as file_pointer: + file["file_path"] = str(Path(directory_path) / file.get("key")) + file_pointer.write( + NEW_VERSION if file.get("key") in [ + "updated-file.yml", + "missing-file.yml", + ] else CURRENT_VERSION + ) + + assert get_local_files( + directory_path, + file_extensions, + recursive=True, + ) == {} + + get_full_local_path.assert_called_once_with(directory_path) + + +@patch("sync_to_s3.get_full_local_path") +def test_get_local_files_recursive_no_filter(get_full_local_path): + file_extensions = [] + example_local_files = deepcopy(EXAMPLE_LOCAL_FILES) + example_local_files["README.md"] = { + "key": "README.md", + "file_path": "/full/path/README.md", + "sha256_hash": CURRENT_HASH, + } + example_local_files["some-other-config.json"] = { + "key": "some-other-config.json", + "file_path": "/full/path/some-other-config.json", + "sha256_hash": CURRENT_HASH, + } + with tempfile.TemporaryDirectory() as directory_path: + get_full_local_path.return_value = Path(directory_path) + + for file in example_local_files.values(): + tmp_file_path = Path(directory_path) / file.get("key") + with open(tmp_file_path, "wb", buffering=0) as file_pointer: + file["file_path"] = str(Path(directory_path) / file.get("key")) + file_pointer.write( + NEW_VERSION if file.get("key") in [ + "updated-file.yml", + "missing-file.yml", + ] else CURRENT_VERSION + ) + + assert get_local_files( + directory_path, + file_extensions, + recursive=True, + ) == example_local_files + + get_full_local_path.assert_called_once_with(directory_path) + + +@patch("sync_to_s3.get_full_local_path") +def test_get_local_file_non_recursive(get_full_local_path): + example_local_files = {} + file_name = "updated-file.yml" + example_local_files[NON_RECURSIVE_KEY] = ( + deepcopy(EXAMPLE_LOCAL_FILES[file_name]) + ) + with tempfile.TemporaryDirectory() as directory_path: + tmp_file_path = Path(directory_path) / file_name + get_full_local_path.return_value = tmp_file_path + + with open(tmp_file_path, mode="wb", buffering=0) as file_pointer: + example_local_files[NON_RECURSIVE_KEY]["file_path"] = str( + tmp_file_path, + ) + file_pointer.write(NEW_VERSION) + + assert get_local_files( + file_pointer.name, + file_extensions=[], + recursive=False, + ) == example_local_files + + get_full_local_path.assert_called_once_with(file_pointer.name) + + +def test_get_s3_objects_recursive_empty_bucket(): + s3_client = Mock() + s3_bucket = "your-bucket" + s3_prefix = S3_PREFIX + file_extensions = [".yml"] + + paginator = Mock() + s3_client.get_paginator.return_value = paginator + paginator.paginate.return_value = [ + {}, + ] + + assert get_s3_objects( + s3_client, + s3_bucket, + s3_prefix, + file_extensions, + recursive=True, + ) == {} + + +def test_get_s3_objects_recursive_unrelated_files_only(): + s3_client = Mock() + s3_bucket = "your-bucket" + s3_prefix = S3_PREFIX + file_extensions = [".yml"] + + paginator = Mock() + s3_client.get_paginator.return_value = paginator + paginator.paginate.return_value = [ + { + "Contents": [ + { + "Key": "README.md", + }, + { + "Key": "other-file.json", + }, + { + "Key": "another-file.yaml", + } + ], + }, + ] + + assert get_s3_objects( + s3_client, + s3_bucket, + s3_prefix, + file_extensions, + recursive=True, + ) == {} + + +def test_get_s3_objects_non_recursive_missing_object(): + 
s3_client = Mock() + s3_bucket = "your-bucket" + s3_object_key = f"{S3_PREFIX}/missing-file.yml" + file_extensions = [] + + s3_client.exceptions.NoSuchKey = Exception + s3_client.head_object.side_effect = s3_client.exceptions.NoSuchKey() + + assert get_s3_objects( + s3_client, + s3_bucket, + s3_object_key, + file_extensions, + recursive=False, + ) == {} + + +def test_get_s3_objects_recursive_success(): + s3_client = Mock() + s3_bucket = "your-bucket" + s3_prefix = S3_PREFIX + example_s3_objects = deepcopy(EXAMPLE_S3_OBJECTS) + file_extensions = [".yml", ".yaml"] + + paginator = Mock() + s3_client.get_paginator.return_value = paginator + + s3_obj_keys = list(map( + lambda obj: { + "Key": obj["key"], + }, + example_s3_objects.values(), + )) + s3_obj_data = dict(map( + lambda obj: ( + obj["key"], + { + "Key": obj["key"], + "Metadata": obj["metadata"], + } + ), + example_s3_objects.values(), + )) + paginator.paginate.return_value = [ + { + "Contents": s3_obj_keys[:2], + }, + { + "Contents": [ + { + "Key": "README.md", + }, + { + "Key": "other-file.json", + } + ], + }, + { + "Contents": s3_obj_keys[2:], + }, + ] + s3_client.head_object.side_effect = ( + lambda **kwargs: s3_obj_data[kwargs["Key"]] + ) + + assert get_s3_objects( + s3_client, + s3_bucket, + s3_prefix, + file_extensions, + recursive=True, + ) == example_s3_objects + + s3_client.get_paginator.assert_called_once_with("list_objects_v2") + paginator.paginate.assert_called_once_with( + Bucket=s3_bucket, + Prefix=f"{s3_prefix}/", + ) + s3_client.head_object.assert_has_calls( + list(map( + lambda obj: call( + Bucket=s3_bucket, + Key=obj.get("key"), + ), + example_s3_objects.values(), + )), + ) + + +def test_get_s3_objects_non_recursive_success(): + s3_client = Mock() + s3_bucket = "your-bucket" + s3_object_key = f"{S3_PREFIX}/first-file.yml" + example_s3_objects = {} + example_s3_objects[NON_RECURSIVE_KEY] = ( + deepcopy(EXAMPLE_S3_OBJECTS["first-file.yml"]) + ) + file_extensions = [] + + s3_client.head_object.return_value = { + "Key": "first-file.yml", + "Metadata": { + **CURRENT_METADATA, + **UPLOAD_PREVIOUS_METADATA, + **IRRELEVANT_METADATA, + "sha256_hash": CURRENT_HASH, + }, + } + + assert get_s3_objects( + s3_client, + s3_bucket, + s3_object_key, + file_extensions, + recursive=False, + ) == example_s3_objects + + s3_client.head_object.assert_called_once_with( + Bucket=s3_bucket, + Key=s3_object_key, + ) + + +def test_upload_changed_files_simple(): + s3_client = Mock() + s3_bucket = "your-bucket" + s3_prefix = S3_PREFIX + local_files = deepcopy(EXAMPLE_LOCAL_FILES) + s3_objects = deepcopy(EXAMPLE_S3_OBJECTS) + metadata_to_check = { + "always_apply": deepcopy(CURRENT_METADATA), + "upon_upload_apply": { + "execution_id": "example-id", + "another-key": "another-value", + } + } + + with tempfile.NamedTemporaryFile(mode="wb", buffering=0) as file_pointer: + file_pointer.write(CURRENT_VERSION) + for key in local_files.keys(): + local_files[key]["file_path"] = file_pointer.name + + upload_changed_files( + s3_client, + s3_bucket, + s3_prefix, + local_files, + s3_objects, + metadata_to_check, + ) + + local_updated = local_files["updated-file.yml"] + local_missing = local_files["missing-file.yml"] + object_outdated_metadata = local_files["needs-new-metadata-file.yaml"] + s3_client.put_object.assert_has_calls([ + call( + Body=ANY, + Bucket=s3_bucket, + Key=f"{s3_prefix}/{object_outdated_metadata['key']}", + Metadata={ + **metadata_to_check["always_apply"], + **metadata_to_check["upon_upload_apply"], + "sha256_hash": 
object_outdated_metadata["sha256_hash"], + } + ), + call( + Body=ANY, + Bucket=s3_bucket, + Key=f"{s3_prefix}/{local_updated['key']}", + Metadata={ + **metadata_to_check["always_apply"], + **metadata_to_check["upon_upload_apply"], + "sha256_hash": local_updated["sha256_hash"], + } + ), + call( + Body=ANY, + Bucket=s3_bucket, + Key=f"{s3_prefix}/{local_missing['key']}", + Metadata={ + **metadata_to_check["always_apply"], + **metadata_to_check["upon_upload_apply"], + "sha256_hash": local_missing["sha256_hash"], + } + ), + ]) + assert s3_client.put_object.call_count == 3 + + +def test_upload_changed_files_no_updates(): + s3_client = Mock() + s3_bucket = "your-bucket" + s3_prefix = S3_PREFIX + local_files = deepcopy(EXAMPLE_LOCAL_FILES) + del local_files["updated-file.yml"] + del local_files["missing-file.yml"] + del local_files["needs-new-metadata-file.yaml"] + s3_objects = deepcopy(EXAMPLE_S3_OBJECTS) + + for obj in s3_objects.values(): + for irrelevant_key in IRRELEVANT_METADATA.keys(): + obj["metadata"][irrelevant_key] = "some-different-value" + + with tempfile.NamedTemporaryFile(mode="wb", buffering=0) as file_pointer: + file_pointer.write(CURRENT_VERSION) + for key in local_files.keys(): + local_files[key]["file_path"] = file_pointer.name + + upload_changed_files( + s3_client, + s3_bucket, + s3_prefix, + local_files, + s3_objects, + metadata_to_check={ + "always_apply": {}, + "upon_upload_apply": {}, + }, + ) + + s3_client.put_object.assert_not_called() + + +def test_upload_changed_files_single_file(): + s3_client = Mock() + s3_bucket = "your-bucket" + s3_prefix = "missing-file.yml" + s3_objects = deepcopy(EXAMPLE_S3_OBJECTS) + metadata_to_check = { + "always_apply": deepcopy(CURRENT_METADATA), + "upon_upload_apply": deepcopy(UPLOAD_NEW_METADATA), + } + + with tempfile.NamedTemporaryFile(mode="wb", buffering=0) as file_pointer: + file_pointer.write(CURRENT_VERSION) + local_files = { + "missing-file.yml": { + "key": s3_prefix, + "file_path": file_pointer.name, + "sha256_hash": CURRENT_HASH, + }, + } + + upload_changed_files( + s3_client, + s3_bucket, + s3_prefix, + local_files, + s3_objects, + metadata_to_check, + ) + + local_missing = local_files["missing-file.yml"] + s3_client.put_object.assert_has_calls([ + call( + Body=ANY, + Bucket=s3_bucket, + Key=f"{local_missing['key']}", + Metadata={ + **metadata_to_check["always_apply"], + **metadata_to_check["upon_upload_apply"], + "sha256_hash": local_missing["sha256_hash"], + } + ), + ]) + assert s3_client.put_object.call_count == 1 + + +def test_upload_changed_files_single_file_no_update(): + s3_client = Mock() + s3_bucket = "your-bucket" + s3_prefix = "first-file.yml" + s3_objects = deepcopy(EXAMPLE_S3_OBJECTS) + metadata_to_check = { + "always_apply": deepcopy(CURRENT_METADATA), + "upon_upload_apply": deepcopy(UPLOAD_NEW_METADATA), + } + + for obj in s3_objects.values(): + for irrelevant_key in IRRELEVANT_METADATA.keys(): + obj["metadata"][irrelevant_key] = "some-different-value" + + with tempfile.NamedTemporaryFile(mode="wb", buffering=0) as file_pointer: + file_pointer.write(CURRENT_VERSION) + local_files = { + "first-file.yml": { + "key": s3_prefix, + "file_path": file_pointer.name, + "sha256_hash": CURRENT_HASH, + }, + } + + upload_changed_files( + s3_client, + s3_bucket, + s3_prefix, + local_files, + s3_objects, + metadata_to_check, + ) + + s3_client.put_object.assert_not_called() + + +def test_delete_stale_objects_simple(): + s3_client = Mock() + s3_bucket = "your-bucket" + s3_prefix = S3_PREFIX + local_files = 
deepcopy(EXAMPLE_LOCAL_FILES) + s3_objects = deepcopy(EXAMPLE_S3_OBJECTS) + + delete_stale_objects( + s3_client, + s3_bucket, + s3_prefix, + local_files, + s3_objects, + ) + + s3_client.delete_objects.assert_called_once_with( + Bucket=s3_bucket, + Delete={ + "Objects": [{ + "Key": s3_objects.get("stale-file.yml").get("key"), + }], + }, + ) + + +def test_delete_stale_single_object(): + s3_client = Mock() + s3_bucket = "your-bucket" + s3_prefix = "stale-file.yml" + local_files = deepcopy(EXAMPLE_LOCAL_FILES) + s3_objects = { + "stale-file.yml": { + "key": s3_prefix, + "sha256_hash": CURRENT_HASH, + }, + } + + delete_stale_objects( + s3_client, + s3_bucket, + s3_prefix, + local_files, + s3_objects, + ) + + s3_client.delete_objects.assert_called_once_with( + Bucket=s3_bucket, + Delete={ + "Objects": [{ + "Key": s3_prefix, + }], + }, + ) + + +def test_delete_stale_objects_no_stale_objects(): + s3_client = Mock() + s3_bucket = "your-bucket" + s3_prefix = S3_PREFIX + local_files = deepcopy(EXAMPLE_LOCAL_FILES) + s3_objects = deepcopy(EXAMPLE_S3_OBJECTS) + del s3_objects["stale-file.yml"] + + delete_stale_objects( + s3_client, + s3_bucket, + s3_prefix, + local_files, + s3_objects, + ) + + s3_client.delete_objects.assert_not_called() + + +def test_clean_s3_prefix(): + assert clean_s3_prefix("some-path") == "some-path" + assert clean_s3_prefix("/some-path") == "some-path" + assert clean_s3_prefix("some-path/") == "some-path" + assert clean_s3_prefix("/some-path") == "some-path" + assert clean_s3_prefix("") == "" + + +def test_full_local_path_relative_to_cwd(): + local_path = "local/path" + here = Path(os.getcwd()) + assert (here / local_path) == get_full_local_path(local_path) + + +def test_full_local_path_absolute_path(): + absolute_path = "/absolute/path" + assert Path(absolute_path) == get_full_local_path(absolute_path) + + +def test_convert_to_s3_key(): + # Local key == s3_prefix + assert convert_to_s3_key("a.yml", "a.yml") == "a.yml" + + # S3 prefix is set + assert convert_to_s3_key("some-path", "prefix") == "prefix/some-path" + + # S3 prefix is set and local key matches NON_RECURSIVE_KEY + assert convert_to_s3_key(NON_RECURSIVE_KEY, "full-s3-obj") == "full-s3-obj" + + # S3 prefix is Non + assert convert_to_s3_key("some-path", "") == "some-path" + + +def test_convert_to_local_key(): + # Local key == s3_prefix + assert convert_to_local_key("a.yml", "a.yml") == "a.yml" + + # S3 prefix is set local + assert convert_to_local_key("prefix/some-path", "prefix") == "some-path" + + # S3 prefix is Nonlocal + assert convert_to_local_key("some-path", "") == "some-path" + + +@patch("sys.exit") +def test_ensure_valid_input_no_local_path(sys_exit): + s3_bucket = "your-bucket" + s3_prefix = "" + s3_url = f"s3://{s3_bucket}/{s3_prefix}" + + test_exit_message = "Would have exited with exit code 1" + sys_exit.side_effect = Exception(test_exit_message) + + with pytest.raises(Exception) as exc_info: + ensure_valid_input( + local_path="", + file_extensions=[".yml"], + s3_url=s3_url, + s3_bucket=s3_bucket, + s3_prefix=s3_prefix, + recursive=False, + ) + error_message = str(exc_info.value) + assert error_message.find(test_exit_message) >= 0 + + sys_exit.assert_called_once_with(1) + + +@patch("sys.exit") +def test_ensure_valid_input_no_destination_s3_url(sys_exit): + test_exit_message = "Would have exited with exit code 2" + sys_exit.side_effect = Exception(test_exit_message) + + with pytest.raises(Exception) as exc_info: + ensure_valid_input( + local_path="/tmp/some-path", + file_extensions=[".yml"], + s3_url="", + 
s3_bucket="", + s3_prefix="", + recursive=False, + ) + error_message = str(exc_info.value) + assert error_message.find(test_exit_message) >= 0 + + sys_exit.assert_called_once_with(2) + + +@patch("sys.exit") +def test_ensure_valid_input_non_recursive_and_no_s3_prefix(sys_exit): + s3_bucket = "your-bucket" + s3_prefix = "" + s3_url = f"s3://{s3_bucket}/{s3_prefix}" + + test_exit_message = "Would have exited with exit code 3" + sys_exit.side_effect = Exception(test_exit_message) + + with pytest.raises(Exception) as exc_info: + ensure_valid_input( + local_path="/tmp/some-path", + file_extensions=[".yml"], + s3_url=s3_url, + s3_bucket=s3_bucket, + s3_prefix=s3_prefix, + recursive=False, + ) + error_message = str(exc_info.value) + assert error_message.find(test_exit_message) >= 0 + + sys_exit.assert_called_once_with(3) + + +@patch("sys.exit") +def test_ensure_valid_input_recursive_and_path_does_not_exist(sys_exit): + s3_bucket = "your-bucket" + s3_prefix = "" + s3_url = f"s3://{s3_bucket}/{s3_prefix}" + + test_exit_message = "Would have exited with exit code 4" + sys_exit.side_effect = Exception(test_exit_message) + + with pytest.raises(Exception) as exc_info: + ensure_valid_input( + local_path="/tmp/some-path", + file_extensions=[".yml"], + s3_url=s3_url, + s3_bucket=s3_bucket, + s3_prefix=s3_prefix, + recursive=True, + ) + error_message = str(exc_info.value) + assert error_message.find(test_exit_message) >= 0 + + sys_exit.assert_called_once_with(4) + + +@patch("sys.exit") +def test_ensure_valid_input_not_recursive_and_path_is_a_dir(sys_exit): + s3_bucket = "your-bucket" + s3_prefix = "a-prefix.yml" + s3_url = f"s3://{s3_bucket}/{s3_prefix}" + + test_exit_message = "Would have exited with exit code 5" + sys_exit.side_effect = Exception(test_exit_message) + + with tempfile.TemporaryDirectory() as directory_path: + with pytest.raises(Exception) as exc_info: + ensure_valid_input( + local_path=directory_path, + file_extensions=[".yml"], + s3_url=s3_url, + s3_bucket=s3_bucket, + s3_prefix=s3_prefix, + recursive=False, + ) + error_message = str(exc_info.value) + assert error_message.find(test_exit_message) >= 0 + + sys_exit.assert_called_once_with(5) + + +@patch("sync_to_s3.delete_stale_objects") +@patch("sync_to_s3.upload_changed_files") +@patch("sync_to_s3.get_s3_objects") +@patch("sync_to_s3.get_local_files") +@patch("sync_to_s3.ensure_valid_input") +def test_sync_files_recursive_delete( + ensure_valid_input, + get_local_files, + get_s3_objects, + upload_files, + delete_stale, +): + s3_client = Mock() + local_path = "/tmp/some-path" + file_extensions = [".yml"] + s3_bucket = "your-bucket" + s3_prefix = "your-prefix" + s3_url = f"s3://{s3_bucket}/{s3_prefix}" + recursive = True + delete = True + metadata_to_check = { + "always_apply": deepcopy(CURRENT_METADATA), + "upon_upload_apply": deepcopy(UPLOAD_PREVIOUS_METADATA), + } + + local_files = Mock() + s3_objects = Mock() + get_local_files.return_value = local_files + get_s3_objects.return_value = s3_objects + + sync_files( + s3_client, + local_path, + file_extensions, + s3_url, + recursive, + delete, + metadata_to_check, + ) + + get_local_files.assert_called_once_with( + local_path, + file_extensions, + recursive, + ) + get_s3_objects.assert_called_once_with( + s3_client, + s3_bucket, + s3_prefix, + file_extensions, + recursive, + ) + upload_files.assert_called_once_with( + s3_client, + s3_bucket, + s3_prefix, + local_files, + s3_objects, + metadata_to_check, + ) + delete_stale.assert_called_once_with( + s3_client, + s3_bucket, + s3_prefix, + 
local_files, + s3_objects, + ) + + +@patch("sync_to_s3.delete_stale_objects") +@patch("sync_to_s3.upload_changed_files") +@patch("sync_to_s3.get_s3_objects") +@patch("sync_to_s3.get_local_files") +@patch("sync_to_s3.ensure_valid_input") +def test_sync_files_recursive_no_delete( + ensure_valid_input, + get_local_files, + get_s3_objects, + upload_files, + delete_stale, +): + s3_client = Mock() + local_path = "/tmp/some-path" + file_extensions = [".yml"] + s3_bucket = "your-bucket" + s3_prefix = "your-prefix" + s3_url = f"s3://{s3_bucket}/{s3_prefix}" + recursive = True + delete = False + metadata_to_check = { + "always_apply": deepcopy(CURRENT_METADATA), + "upon_upload_apply": deepcopy(UPLOAD_PREVIOUS_METADATA), + } + + local_files = Mock() + s3_objects = Mock() + get_local_files.return_value = local_files + get_s3_objects.return_value = s3_objects + + sync_files( + s3_client, + local_path, + file_extensions, + s3_url, + recursive, + delete, + metadata_to_check, + ) + + ensure_valid_input.assert_called_once_with( + local_path, + file_extensions, + s3_url, + s3_bucket, + s3_prefix, + recursive, + ) + get_local_files.assert_called_once_with( + local_path, + file_extensions, + recursive, + ) + get_s3_objects.assert_called_once_with( + s3_client, + s3_bucket, + s3_prefix, + file_extensions, + recursive, + ) + upload_files.assert_called_once_with( + s3_client, + s3_bucket, + s3_prefix, + local_files, + s3_objects, + metadata_to_check, + ) + delete_stale.assert_not_called() diff --git a/src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/pytest.ini b/src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/pytest.ini index 8947ae49b..68298b1c8 100644 --- a/src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/pytest.ini +++ b/src/lambda_codebase/initial_commit/bootstrap_repository/adf-build/shared/pytest.ini @@ -1,3 +1,3 @@ [pytest] testpaths = tests -norecursedirs = python cdk +norecursedirs = python cdk helpers diff --git a/src/template.yml b/src/template.yml index 6763b9a23..bbfea92a7 100644 --- a/src/template.yml +++ b/src/template.yml @@ -1287,7 +1287,8 @@ Resources: - aws s3 sync ./adf-build/shared s3://$DEPLOYMENT_ACCOUNT_BUCKET/adf-build --quiet # Base templates: - aws s3 sync . s3://$S3_BUCKET --quiet --delete - - aws s3 sync ./adf-accounts s3://$ACCOUNT_BUCKET --quiet + # Upload account files to the ACCOUNT_BUCKET + - python adf-build/shared/helpers/sync_to_s3.py --extension .yml --extension .yaml --metadata adf_version=${ADF_VERSION} --upload-with-metadata execution_id=${CODEPIPELINE_EXECUTION_ID} --recursive adf-accounts s3://$ACCOUNT_BUCKET # Updates config, updates (or creates) base stacks: - python adf-build/main.py Type: CODEPIPELINE @@ -1333,6 +1334,14 @@ Resources: - Name: "TemplateSource" Configuration: ProjectName: !Ref CodeBuildProject + EnvironmentVariables: >- + [ + { + "name": "CODEPIPELINE_EXECUTION_ID", + "value": "#{codepipeline.PipelineExecutionId}", + "type": "PLAINTEXT" + } + ] RunOrder: 1 CodePipelineRole: diff --git a/tox.ini b/tox.ini index babe1cda8..bd05d0910 100644 --- a/tox.ini +++ b/tox.ini @@ -15,6 +15,7 @@ setenv= AWS_REGION=eu-central-1 AWS_DEFAULT_REGION=eu-central-1 ADF_PIPELINE_PREFIX=adf-pipeline- + CODEBUILD_BUILD_ID=abcdef S3_BUCKET=some_bucket S3_BUCKET_NAME=some_bucket DEPLOYMENT_ACCOUNT_BUCKET=some_deployment_account_bucket