From ac0d5b3dbe731605af38018ce7ce970ffded539a Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Tue, 22 Aug 2023 13:27:03 +0200 Subject: [PATCH] Improve detection of when breeze CI image needs rebuilding (#33603) * Improve detection of when breeze CI image needs rebuilding Previously we have been using provider.yaml file modification as a sign that the docker image needs rebuilding when starting image. However just modification of provider.yaml file is not a sign that the image needs rebuilding. The image needs rebuilding when provider dependencies changed, but there are many more reasons why provider.yaml file changed - especially recently provider.yaml file contains much more information and dependencies are only part of it. Provider.yaml files can also be modified by release manager wnen documentation is prepared, but none of the documentation change is a reason for rebuilding the image. This PR optimize the check for image building introducing two step process: * first we check if provider.yaml files changed * if they did, we regenerate provider dependencies by manully running the pre-commit script * then provider_dependencies.json is used instead of all providers to determine if the image needs rebuilding This has several nice side effects: * the list of files that have been modified displayed to the user is potentially much smaller (no provider.yaml files) * provider_dependencies.json is regenereated automatically when you run any breeze command, which means that you do not have to have pre-commit installed to regenerate it * the notification "image needs rebuilding" will be printed less frequently to the user - only when it is really needed * preparing provider documentation in CI will not trigger image rebuilding (which might occasionally fail in such case especially when we bring back a provider from long suspension like it happened in #33574 * Update dev/breeze/src/airflow_breeze/commands/developer_commands.py --- .../commands/ci_image_commands.py | 6 +- .../commands/developer_commands.py | 3 + .../src/airflow_breeze/global_constants.py | 2 +- .../airflow_breeze/params/build_ci_params.py | 1 + .../src/airflow_breeze/params/shell_params.py | 1 + .../airflow_breeze/utils/md5_build_check.py | 70 +++++++++++++++---- ...re_commit_update_providers_dependencies.py | 34 ++++++--- 7 files changed, 92 insertions(+), 25 deletions(-) diff --git a/dev/breeze/src/airflow_breeze/commands/ci_image_commands.py b/dev/breeze/src/airflow_breeze/commands/ci_image_commands.py index 124d6ca31809b..2eecbfff3ca81 100644 --- a/dev/breeze/src/airflow_breeze/commands/ci_image_commands.py +++ b/dev/breeze/src/airflow_breeze/commands/ci_image_commands.py @@ -455,7 +455,10 @@ def should_we_run_the_build(build_ci_params: BuildCiParams) -> bool: # We import those locally so that click autocomplete works from inputimeout import TimeoutOccurred - if not md5sum_check_if_build_is_needed(md5sum_cache_dir=build_ci_params.md5sum_cache_dir): + if not md5sum_check_if_build_is_needed( + md5sum_cache_dir=build_ci_params.md5sum_cache_dir, + skip_provider_dependencies_check=build_ci_params.skip_provider_dependencies_check, + ): return False try: answer = user_confirm( @@ -631,6 +634,7 @@ def rebuild_or_pull_ci_image_if_needed(command_params: ShellParams | BuildCiPara image_tag=command_params.image_tag, platform=command_params.platform, force_build=command_params.force_build, + skip_provider_dependencies_check=command_params.skip_provider_dependencies_check, ) if command_params.image_tag is not None and command_params.image_tag != "latest": return_code, message = run_pull_image( diff --git a/dev/breeze/src/airflow_breeze/commands/developer_commands.py b/dev/breeze/src/airflow_breeze/commands/developer_commands.py index 40fc0954780e9..671ced7d5a649 100644 --- a/dev/breeze/src/airflow_breeze/commands/developer_commands.py +++ b/dev/breeze/src/airflow_breeze/commands/developer_commands.py @@ -515,6 +515,9 @@ def static_checks( force_build=force_build, image_tag=image_tag, github_repository=github_repository, + # for static checks we do not want to regenerate dependencies before pre-commits are run + # we want the pre-commit to do it for us (and detect the case the dependencies are updated) + skip_provider_dependencies_check=True, ) if not skip_image_check: rebuild_or_pull_ci_image_if_needed(command_params=build_params) diff --git a/dev/breeze/src/airflow_breeze/global_constants.py b/dev/breeze/src/airflow_breeze/global_constants.py index c0740801d59c0..6e07343aecc77 100644 --- a/dev/breeze/src/airflow_breeze/global_constants.py +++ b/dev/breeze/src/airflow_breeze/global_constants.py @@ -323,13 +323,13 @@ def get_airflow_extras(): "setup.cfg", "Dockerfile.ci", ".dockerignore", + "generated/provider_dependencies.json", "scripts/docker/common.sh", "scripts/docker/install_additional_dependencies.sh", "scripts/docker/install_airflow.sh", "scripts/docker/install_airflow_dependencies_from_branch_tip.sh", "scripts/docker/install_from_docker_context_files.sh", "scripts/docker/install_mysql.sh", - *ALL_PROVIDER_YAML_FILES, ] ENABLED_SYSTEMS = "" diff --git a/dev/breeze/src/airflow_breeze/params/build_ci_params.py b/dev/breeze/src/airflow_breeze/params/build_ci_params.py index 1f0663ec9f95c..1f49f0597b53c 100644 --- a/dev/breeze/src/airflow_breeze/params/build_ci_params.py +++ b/dev/breeze/src/airflow_breeze/params/build_ci_params.py @@ -37,6 +37,7 @@ class BuildCiParams(CommonBuildParams): airflow_pre_cached_pip_packages: bool = True force_build: bool = False eager_upgrade_additional_requirements: str = "" + skip_provider_dependencies_check: bool = False @property def airflow_version(self): diff --git a/dev/breeze/src/airflow_breeze/params/shell_params.py b/dev/breeze/src/airflow_breeze/params/shell_params.py index a70a404d0ed64..99bc8c4d3cfd5 100644 --- a/dev/breeze/src/airflow_breeze/params/shell_params.py +++ b/dev/breeze/src/airflow_breeze/params/shell_params.py @@ -125,6 +125,7 @@ class ShellParams: celery_flower: bool = False only_min_version_update: bool = False regenerate_missing_docs: bool = False + skip_provider_dependencies_check: bool = False def clone_with_test(self, test_type: str) -> ShellParams: new_params = deepcopy(self) diff --git a/dev/breeze/src/airflow_breeze/utils/md5_build_check.py b/dev/breeze/src/airflow_breeze/utils/md5_build_check.py index 235fbf22495b4..54b46c99164c0 100644 --- a/dev/breeze/src/airflow_breeze/utils/md5_build_check.py +++ b/dev/breeze/src/airflow_breeze/utils/md5_build_check.py @@ -20,11 +20,14 @@ from __future__ import annotations import hashlib +import os +import sys from pathlib import Path -from airflow_breeze.global_constants import FILES_FOR_REBUILD_CHECK +from airflow_breeze.global_constants import ALL_PROVIDER_YAML_FILES, FILES_FOR_REBUILD_CHECK from airflow_breeze.utils.console import get_console from airflow_breeze.utils.path_utils import AIRFLOW_SOURCES_ROOT +from airflow_breeze.utils.run_utils import run_command def check_md5checksum_in_cache_modified(file_hash: str, cache_path: Path, update: bool) -> bool: @@ -59,8 +62,19 @@ def generate_md5(filename, file_size: int = 65536): return hash_md5.hexdigest() +def check_md5_sum_for_file(file_to_check: str, md5sum_cache_dir: Path, update: bool): + file_to_get_md5 = AIRFLOW_SOURCES_ROOT / file_to_check + md5_checksum = generate_md5(file_to_get_md5) + sub_dir_name = file_to_get_md5.parts[-2] + actual_file_name = file_to_get_md5.parts[-1] + cache_file_name = Path(md5sum_cache_dir, sub_dir_name + "-" + actual_file_name + ".md5sum") + file_content = md5_checksum + " " + str(file_to_get_md5) + "\n" + is_modified = check_md5checksum_in_cache_modified(file_content, cache_file_name, update=update) + return is_modified + + def calculate_md5_checksum_for_files( - md5sum_cache_dir: Path, update: bool = False + md5sum_cache_dir: Path, update: bool = False, skip_provider_dependencies_check: bool = False ) -> tuple[list[str], list[str]]: """ Calculates checksums for all interesting files and stores the hashes in the md5sum_cache_dir. @@ -68,35 +82,63 @@ def calculate_md5_checksum_for_files( :param md5sum_cache_dir: directory where to store cached information :param update: whether to update the hashes + :param skip_provider_dependencies_check: whether to skip regeneration of the provider dependencies :return: Tuple of two lists: modified and not-modified files """ not_modified_files = [] modified_files = [] - for calculate_md5_file in FILES_FOR_REBUILD_CHECK: - file_to_get_md5 = AIRFLOW_SOURCES_ROOT / calculate_md5_file - md5_checksum = generate_md5(file_to_get_md5) - sub_dir_name = file_to_get_md5.parts[-2] - actual_file_name = file_to_get_md5.parts[-1] - cache_file_name = Path(md5sum_cache_dir, sub_dir_name + "-" + actual_file_name + ".md5sum") - file_content = md5_checksum + " " + str(file_to_get_md5) + "\n" - is_modified = check_md5checksum_in_cache_modified(file_content, cache_file_name, update=update) + if not skip_provider_dependencies_check: + modified_provider_yaml_files = [] + for file in ALL_PROVIDER_YAML_FILES: + # Only check provider yaml files once and save the result immediately. + # If we need to regenerate the dependencies and they are not modified then + # all is fine and we can save checksums for the new files + if check_md5_sum_for_file(file, md5sum_cache_dir, True): + modified_provider_yaml_files.append(file) + if modified_provider_yaml_files: + get_console().print( + "[info]Attempting to generate provider dependencies. " + "Provider yaml files changed since last check:[/]" + ) + get_console().print( + [os.fspath(file.relative_to(AIRFLOW_SOURCES_ROOT)) for file in modified_provider_yaml_files] + ) + # Regenerate provider_dependencies.json + run_command( + [ + sys.executable, + os.fspath( + AIRFLOW_SOURCES_ROOT + / "scripts" + / "ci" + / "pre_commit" + / "pre_commit_update_providers_dependencies.py" + ), + ], + cwd=AIRFLOW_SOURCES_ROOT, + ) + for file in FILES_FOR_REBUILD_CHECK: + is_modified = check_md5_sum_for_file(file, md5sum_cache_dir, update) if is_modified: - modified_files.append(calculate_md5_file) + modified_files.append(file) else: - not_modified_files.append(calculate_md5_file) + not_modified_files.append(file) return modified_files, not_modified_files -def md5sum_check_if_build_is_needed(md5sum_cache_dir: Path) -> bool: +def md5sum_check_if_build_is_needed(md5sum_cache_dir: Path, skip_provider_dependencies_check: bool) -> bool: """ Checks if build is needed based on whether important files were modified. :param md5sum_cache_dir: directory where cached md5 sums are stored + :param skip_provider_dependencies_check: whether to skip regeneration of the provider dependencies :return: True if build is needed. """ build_needed = False - modified_files, not_modified_files = calculate_md5_checksum_for_files(md5sum_cache_dir, update=False) + modified_files, not_modified_files = calculate_md5_checksum_for_files( + md5sum_cache_dir, update=False, skip_provider_dependencies_check=skip_provider_dependencies_check + ) if modified_files: get_console().print( f"[warning]The following important files are modified in {AIRFLOW_SOURCES_ROOT} " diff --git a/scripts/ci/pre_commit/pre_commit_update_providers_dependencies.py b/scripts/ci/pre_commit/pre_commit_update_providers_dependencies.py index e2fc0ca713362..b63a60b14b224 100755 --- a/scripts/ci/pre_commit/pre_commit_update_providers_dependencies.py +++ b/scripts/ci/pre_commit/pre_commit_update_providers_dependencies.py @@ -209,12 +209,28 @@ def check_if_different_provider_used(file_path: Path) -> None: console.print("[red]Errors found during verification. Exiting!") console.print() sys.exit(1) - DEPENDENCIES_JSON_FILE_PATH.write_text(json.dumps(unique_sorted_dependencies, indent=2) + "\n") - console.print( - f"[yellow]If you see changes to the {DEPENDENCIES_JSON_FILE_PATH} file - " - f"do not modify the file manually. Let pre-commit do the job!" - ) - console.print() - console.print("[green]Verification complete! Success!\n") - console.print(f"Written {DEPENDENCIES_JSON_FILE_PATH}") - console.print() + old_dependencies = DEPENDENCIES_JSON_FILE_PATH.read_text() + new_dependencies = json.dumps(unique_sorted_dependencies, indent=2) + "\n" + if new_dependencies != old_dependencies: + DEPENDENCIES_JSON_FILE_PATH.write_text(json.dumps(unique_sorted_dependencies, indent=2) + "\n") + if os.environ.get("CI"): + console.print() + console.print(f"[info]Written {DEPENDENCIES_JSON_FILE_PATH}") + console.print( + f"[yellow]You will need to run breeze locally and commit " + f"{DEPENDENCIES_JSON_FILE_PATH.relative_to(AIRFLOW_SOURCES_ROOT)}!\n" + ) + console.print() + else: + console.print() + console.print( + f"[yellow]Regenerated new dependencies. Please commit " + f"{DEPENDENCIES_JSON_FILE_PATH.relative_to(AIRFLOW_SOURCES_ROOT)}!\n" + ) + console.print(f"[info]Written {DEPENDENCIES_JSON_FILE_PATH}") + console.print() + else: + console.print( + "[green]No need to regenerate dependencies!\n[/]" + f"The {DEPENDENCIES_JSON_FILE_PATH.relative_to(AIRFLOW_SOURCES_ROOT)} is up to date!\n" + )