Improve detection of when breeze CI image needs rebuilding (#33603)
* Improve detection of when breeze CI image needs rebuilding

Previously we used modification of provider.yaml files as a sign that
the Docker image needs rebuilding when starting the image. However, a
modified provider.yaml file alone is not a reliable sign that the
image needs rebuilding. The image needs rebuilding when provider
dependencies change, but there are many more reasons why a
provider.yaml file might change - especially since provider.yaml files
now contain much more information and dependencies are only part of
it. Provider.yaml files can also be modified by the release manager
when documentation is prepared, and none of those documentation
changes are a reason to rebuild the image.

This PR optimizes the check for image building by introducing a
two-step process (a minimal sketch follows the list below):

* first we check whether any provider.yaml files changed
* if they did, we regenerate provider dependencies by manually
  running the pre-commit script
* then provider_dependencies.json is used, instead of all
  provider.yaml files, to determine whether the image needs rebuilding
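
A minimal sketch of this flow (assumptions: it is run from the Airflow
sources root; the cache directory, glob pattern and helper names below
are illustrative placeholders, not the actual breeze internals):

import hashlib
import subprocess
import sys
from pathlib import Path

AIRFLOW_SOURCES_ROOT = Path(".")  # assumption: current dir is the repository root
CACHE_DIR = AIRFLOW_SOURCES_ROOT / ".build" / "md5sum-cache"  # hypothetical cache location
DEPENDENCIES_JSON = AIRFLOW_SOURCES_ROOT / "generated" / "provider_dependencies.json"

def file_changed(path: Path, update_cache: bool = True) -> bool:
    """Compare the file's md5 against the cached value, optionally updating the cache."""
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    cache_file = CACHE_DIR / f"{path.parent.name}-{path.name}.md5sum"
    current = hashlib.md5(path.read_bytes()).hexdigest()
    previous = cache_file.read_text() if cache_file.exists() else ""
    if update_cache:
        cache_file.write_text(current)
    return current != previous

def image_needs_rebuild() -> bool:
    # Step 1: did any provider.yaml file change since the last check?
    changed_yamls = [
        p
        for p in AIRFLOW_SOURCES_ROOT.glob("airflow/providers/**/provider.yaml")
        if file_changed(p)
    ]
    # Step 2: if so, regenerate provider_dependencies.json by running the
    # pre-commit script directly (no pre-commit installation required).
    if changed_yamls:
        subprocess.run(
            [
                sys.executable,
                "scripts/ci/pre_commit/pre_commit_update_providers_dependencies.py",
            ],
            cwd=AIRFLOW_SOURCES_ROOT,
            check=True,
        )
    # Step 3: only provider_dependencies.json (not every provider.yaml)
    # takes part in the "does the image need rebuilding" decision.
    return file_changed(DEPENDENCIES_JSON, update_cache=False)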

This has several nice side effects:

* the list of modified files displayed to the user is potentially
  much smaller (no provider.yaml files)
* provider_dependencies.json is regenerated automatically when you
  run any breeze command, which means that you do not have to have
  pre-commit installed to regenerate it
* the "image needs rebuilding" notification will be printed to the
  user less frequently - only when it is really needed
* preparing provider documentation in CI will not trigger image
  rebuilding (which might occasionally fail in such a case, especially
  when we bring back a provider from a long suspension, as happened
  in #33574)

* Update dev/breeze/src/airflow_breeze/commands/developer_commands.py
potiuk authored Aug 22, 2023
1 parent 7141c42 commit ac0d5b3
Showing 7 changed files with 92 additions and 25 deletions.
6 changes: 5 additions & 1 deletion dev/breeze/src/airflow_breeze/commands/ci_image_commands.py
@@ -455,7 +455,10 @@ def should_we_run_the_build(build_ci_params: BuildCiParams) -> bool:
# We import those locally so that click autocomplete works
from inputimeout import TimeoutOccurred

if not md5sum_check_if_build_is_needed(md5sum_cache_dir=build_ci_params.md5sum_cache_dir):
if not md5sum_check_if_build_is_needed(
md5sum_cache_dir=build_ci_params.md5sum_cache_dir,
skip_provider_dependencies_check=build_ci_params.skip_provider_dependencies_check,
):
return False
try:
answer = user_confirm(
@@ -631,6 +634,7 @@ def rebuild_or_pull_ci_image_if_needed(command_params: ShellParams | BuildCiPara
image_tag=command_params.image_tag,
platform=command_params.platform,
force_build=command_params.force_build,
skip_provider_dependencies_check=command_params.skip_provider_dependencies_check,
)
if command_params.image_tag is not None and command_params.image_tag != "latest":
return_code, message = run_pull_image(
3 changes: 3 additions & 0 deletions dev/breeze/src/airflow_breeze/commands/developer_commands.py
@@ -515,6 +515,9 @@ def static_checks(
force_build=force_build,
image_tag=image_tag,
github_repository=github_repository,
# for static checks we do not want to regenerate dependencies before pre-commits are run
# we want the pre-commit to do it for us (and detect the case the dependencies are updated)
skip_provider_dependencies_check=True,
)
if not skip_image_check:
rebuild_or_pull_ci_image_if_needed(command_params=build_params)
2 changes: 1 addition & 1 deletion dev/breeze/src/airflow_breeze/global_constants.py
@@ -323,13 +323,13 @@ def get_airflow_extras():
"setup.cfg",
"Dockerfile.ci",
".dockerignore",
"generated/provider_dependencies.json",
"scripts/docker/common.sh",
"scripts/docker/install_additional_dependencies.sh",
"scripts/docker/install_airflow.sh",
"scripts/docker/install_airflow_dependencies_from_branch_tip.sh",
"scripts/docker/install_from_docker_context_files.sh",
"scripts/docker/install_mysql.sh",
*ALL_PROVIDER_YAML_FILES,
]

ENABLED_SYSTEMS = ""
1 change: 1 addition & 0 deletions dev/breeze/src/airflow_breeze/params/build_ci_params.py
@@ -37,6 +37,7 @@ class BuildCiParams(CommonBuildParams):
airflow_pre_cached_pip_packages: bool = True
force_build: bool = False
eager_upgrade_additional_requirements: str = ""
skip_provider_dependencies_check: bool = False

@property
def airflow_version(self):
1 change: 1 addition & 0 deletions dev/breeze/src/airflow_breeze/params/shell_params.py
@@ -125,6 +125,7 @@ class ShellParams:
celery_flower: bool = False
only_min_version_update: bool = False
regenerate_missing_docs: bool = False
skip_provider_dependencies_check: bool = False

def clone_with_test(self, test_type: str) -> ShellParams:
new_params = deepcopy(self)
70 changes: 56 additions & 14 deletions dev/breeze/src/airflow_breeze/utils/md5_build_check.py
@@ -20,11 +20,14 @@
from __future__ import annotations

import hashlib
import os
import sys
from pathlib import Path

from airflow_breeze.global_constants import FILES_FOR_REBUILD_CHECK
from airflow_breeze.global_constants import ALL_PROVIDER_YAML_FILES, FILES_FOR_REBUILD_CHECK
from airflow_breeze.utils.console import get_console
from airflow_breeze.utils.path_utils import AIRFLOW_SOURCES_ROOT
from airflow_breeze.utils.run_utils import run_command


def check_md5checksum_in_cache_modified(file_hash: str, cache_path: Path, update: bool) -> bool:
@@ -59,44 +62,83 @@ def generate_md5(filename, file_size: int = 65536):
return hash_md5.hexdigest()


def check_md5_sum_for_file(file_to_check: str, md5sum_cache_dir: Path, update: bool):
file_to_get_md5 = AIRFLOW_SOURCES_ROOT / file_to_check
md5_checksum = generate_md5(file_to_get_md5)
sub_dir_name = file_to_get_md5.parts[-2]
actual_file_name = file_to_get_md5.parts[-1]
cache_file_name = Path(md5sum_cache_dir, sub_dir_name + "-" + actual_file_name + ".md5sum")
file_content = md5_checksum + " " + str(file_to_get_md5) + "\n"
is_modified = check_md5checksum_in_cache_modified(file_content, cache_file_name, update=update)
return is_modified


def calculate_md5_checksum_for_files(
md5sum_cache_dir: Path, update: bool = False
md5sum_cache_dir: Path, update: bool = False, skip_provider_dependencies_check: bool = False
) -> tuple[list[str], list[str]]:
"""
Calculates checksums for all interesting files and stores the hashes in the md5sum_cache_dir.
Optionally modifies the hashes.
:param md5sum_cache_dir: directory where to store cached information
:param update: whether to update the hashes
:param skip_provider_dependencies_check: whether to skip regeneration of the provider dependencies
:return: Tuple of two lists: modified and not-modified files
"""
not_modified_files = []
modified_files = []
for calculate_md5_file in FILES_FOR_REBUILD_CHECK:
file_to_get_md5 = AIRFLOW_SOURCES_ROOT / calculate_md5_file
md5_checksum = generate_md5(file_to_get_md5)
sub_dir_name = file_to_get_md5.parts[-2]
actual_file_name = file_to_get_md5.parts[-1]
cache_file_name = Path(md5sum_cache_dir, sub_dir_name + "-" + actual_file_name + ".md5sum")
file_content = md5_checksum + " " + str(file_to_get_md5) + "\n"
is_modified = check_md5checksum_in_cache_modified(file_content, cache_file_name, update=update)
if not skip_provider_dependencies_check:
modified_provider_yaml_files = []
for file in ALL_PROVIDER_YAML_FILES:
# Only check provider yaml files once and save the result immediately.
# If we need to regenerate the dependencies and they are not modified then
# all is fine and we can save checksums for the new files
if check_md5_sum_for_file(file, md5sum_cache_dir, True):
modified_provider_yaml_files.append(file)
if modified_provider_yaml_files:
get_console().print(
"[info]Attempting to generate provider dependencies. "
"Provider yaml files changed since last check:[/]"
)
get_console().print(
[os.fspath(file.relative_to(AIRFLOW_SOURCES_ROOT)) for file in modified_provider_yaml_files]
)
# Regenerate provider_dependencies.json
run_command(
[
sys.executable,
os.fspath(
AIRFLOW_SOURCES_ROOT
/ "scripts"
/ "ci"
/ "pre_commit"
/ "pre_commit_update_providers_dependencies.py"
),
],
cwd=AIRFLOW_SOURCES_ROOT,
)
for file in FILES_FOR_REBUILD_CHECK:
is_modified = check_md5_sum_for_file(file, md5sum_cache_dir, update)
if is_modified:
modified_files.append(calculate_md5_file)
modified_files.append(file)
else:
not_modified_files.append(calculate_md5_file)
not_modified_files.append(file)
return modified_files, not_modified_files


def md5sum_check_if_build_is_needed(md5sum_cache_dir: Path) -> bool:
def md5sum_check_if_build_is_needed(md5sum_cache_dir: Path, skip_provider_dependencies_check: bool) -> bool:
"""
Checks if build is needed based on whether important files were modified.
:param md5sum_cache_dir: directory where cached md5 sums are stored
:param skip_provider_dependencies_check: whether to skip regeneration of the provider dependencies
:return: True if build is needed.
"""
build_needed = False
modified_files, not_modified_files = calculate_md5_checksum_for_files(md5sum_cache_dir, update=False)
modified_files, not_modified_files = calculate_md5_checksum_for_files(
md5sum_cache_dir, update=False, skip_provider_dependencies_check=skip_provider_dependencies_check
)
if modified_files:
get_console().print(
f"[warning]The following important files are modified in {AIRFLOW_SOURCES_ROOT} "
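For readers following along, a minimal sketch of what a cache check
like check_md5checksum_in_cache_modified typically does (an
illustration inferred from how it is called in the hunks above, not
the repository's actual implementation):

from pathlib import Path

def check_md5checksum_in_cache_modified(file_hash: str, cache_path: Path, update: bool) -> bool:
    # Return True if the given hash line differs from the cached one
    # (i.e. the file was modified since the last check); optionally
    # refresh the cache entry. Sketch only.
    cached = cache_path.read_text() if cache_path.exists() else None
    modified = cached != file_hash
    if update and modified:
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        cache_path.write_text(file_hash)
    return modified
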
34 changes: 25 additions & 9 deletions scripts/ci/pre_commit/pre_commit_update_providers_dependencies.py
@@ -209,12 +209,28 @@ def check_if_different_provider_used(file_path: Path) -> None:
console.print("[red]Errors found during verification. Exiting!")
console.print()
sys.exit(1)
DEPENDENCIES_JSON_FILE_PATH.write_text(json.dumps(unique_sorted_dependencies, indent=2) + "\n")
console.print(
f"[yellow]If you see changes to the {DEPENDENCIES_JSON_FILE_PATH} file - "
f"do not modify the file manually. Let pre-commit do the job!"
)
console.print()
console.print("[green]Verification complete! Success!\n")
console.print(f"Written {DEPENDENCIES_JSON_FILE_PATH}")
console.print()
old_dependencies = DEPENDENCIES_JSON_FILE_PATH.read_text()
new_dependencies = json.dumps(unique_sorted_dependencies, indent=2) + "\n"
if new_dependencies != old_dependencies:
DEPENDENCIES_JSON_FILE_PATH.write_text(json.dumps(unique_sorted_dependencies, indent=2) + "\n")
if os.environ.get("CI"):
console.print()
console.print(f"[info]Written {DEPENDENCIES_JSON_FILE_PATH}")
console.print(
f"[yellow]You will need to run breeze locally and commit "
f"{DEPENDENCIES_JSON_FILE_PATH.relative_to(AIRFLOW_SOURCES_ROOT)}!\n"
)
console.print()
else:
console.print()
console.print(
f"[yellow]Regenerated new dependencies. Please commit "
f"{DEPENDENCIES_JSON_FILE_PATH.relative_to(AIRFLOW_SOURCES_ROOT)}!\n"
)
console.print(f"[info]Written {DEPENDENCIES_JSON_FILE_PATH}")
console.print()
else:
console.print(
"[green]No need to regenerate dependencies!\n[/]"
f"The {DEPENDENCIES_JSON_FILE_PATH.relative_to(AIRFLOW_SOURCES_ROOT)} is up to date!\n"
)
