Skip to content

Commit

Permalink
Fix pre-commit checking provider's doc consistency (apache#45803)
Browse files Browse the repository at this point in the history
The pre-commit check had not been working since providers were moved to
the `providers` top-level folder (apache#42505), but we also have
not added many new providers since then. This PR updates the check
so it works again — only for providers that follow the new structure —
and makes it more verbose about what it is doing.
  • Loading branch information
potiuk authored and dauinh committed Jan 23, 2025
1 parent d6f4b46 commit df5e7c6
Show file tree
Hide file tree
Showing 3 changed files with 138 additions and 102 deletions.
3 changes: 2 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1220,9 +1220,10 @@ repos:
name: Validate provider doc files
entry: ./scripts/ci/pre_commit/check_provider_docs.py
language: python
files: ^providers/src/airflow/providers/.*/provider\.yaml|^docs/.*
files: ^providers/.*/provider\.yaml|^docs/.*
additional_dependencies: ['rich>=12.4.4', 'pyyaml', 'jinja2']
require_serial: true
pass_filenames: false
- id: bandit
name: bandit
description: "Bandit is a tool for finding common security issues in Python code"
Expand Down
208 changes: 107 additions & 101 deletions scripts/ci/pre_commit/check_provider_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,26 +17,26 @@
# under the License.
from __future__ import annotations

import os
import sys
from collections import defaultdict
from pathlib import Path

import yaml
from jinja2 import BaseLoader, Environment
from rich.console import Console

console = Console(color_system="standard", width=200)
sys.path.insert(0, str(Path(__file__).parent.resolve())) # make sure common utils are importable

AIRFLOW_PROVIDERS_IMPORT_PREFIX = "airflow.providers."
from common_precommit_utils import (
AIRFLOW_PROVIDERS_ROOT_PATH,
AIRFLOW_SOURCES_ROOT_PATH,
get_all_new_provider_info_dicts,
)

AIRFLOW_SOURCES_ROOT = Path(__file__).parents[3].resolve()
sys.path.insert(0, str(AIRFLOW_SOURCES_ROOT_PATH)) # make sure setup is imported from Airflow

AIRFLOW_PROVIDERS_DIR = AIRFLOW_SOURCES_ROOT / "airflow" / "providers"
AIRFLOW_DOC_FILES = AIRFLOW_SOURCES_ROOT / "docs"
AIRFLOW_DOC_AIRFLOW_BASE_FOLDER = AIRFLOW_DOC_FILES / "apache-airflow"
console = Console(color_system="standard", width=200)

sys.path.insert(0, str(AIRFLOW_SOURCES_ROOT)) # make sure setup is imported from Airflow
AIRFLOW_PROVIDERS_IMPORT_PREFIX = "airflow.providers."

warnings: list[str] = []
errors: list[str] = []
Expand All @@ -45,10 +45,6 @@

ALL_DEPENDENCIES: dict[str, dict[str, list[str]]] = defaultdict(lambda: defaultdict(list))

ALL_PROVIDERS: dict[str, dict[str, dict]] = defaultdict(lambda: defaultdict())

# Allow AST to parse the files.
sys.path.append(str(AIRFLOW_SOURCES_ROOT))

LICENCE_CONTENT_RST = """
.. Licensed to the Apache Software Foundation (ASF) under one
Expand Down Expand Up @@ -89,18 +85,7 @@
.. THIS FILE IS UPDATED AUTOMATICALLY_AT_RELEASE_TIME
"""


def find_all_providers():
    """Populate the module-level ALL_PROVIDERS dict with all non-suspended providers.

    Scans AIRFLOW_PROVIDERS_DIR recursively for ``provider.yaml`` files; the
    provider id is the dotted form of the yaml file's directory path relative
    to that root (e.g. ``apache/hive`` -> ``apache.hive``).
    """
    for provider_file in AIRFLOW_PROVIDERS_DIR.rglob("provider.yaml"):
        # Convert the filesystem-relative path into a dotted provider id.
        provider_name = str(provider_file.parent.relative_to(AIRFLOW_PROVIDERS_DIR)).replace(os.sep, ".")
        provider_info = yaml.safe_load(provider_file.read_text())
        # Suspended providers are deliberately excluded from all doc checks.
        if provider_info["state"] != "suspended":
            ALL_PROVIDERS[provider_name] = provider_info


find_all_providers()

fail_pre_commit = False
failed: list[bool] = []


def process_content_to_write(content: str) -> str:
Expand All @@ -114,6 +99,10 @@ def process_content_to_write(content: str) -> str:
return content_to_write


def get_provider_doc_folder(provider_id: str) -> Path:
    """Return the ``docs`` folder of a provider in the new per-provider layout.

    The dotted provider id (e.g. ``apache.hive``) maps to the nested directory
    under AIRFLOW_PROVIDERS_ROOT_PATH, with ``docs`` appended.
    """
    id_parts = provider_id.split(".")
    return AIRFLOW_PROVIDERS_ROOT_PATH.joinpath(*id_parts, "docs")


def check_provider_doc_exists_and_in_index(
*,
provider_id: str,
Expand All @@ -122,8 +111,9 @@ def check_provider_doc_exists_and_in_index(
generated_content: str = "",
missing_ok: bool = False,
check_content: bool = True,
) -> None:
global fail_pre_commit
):
console.print(f" [bright_blue]Checking [/]: {file_name} ", end="")
fail = False
provider_docs_file = get_provider_doc_folder(provider_id)
file_path = provider_docs_file / file_name
index_file = provider_docs_file / "index.rst"
Expand All @@ -132,29 +122,34 @@ def check_provider_doc_exists_and_in_index(
if file_path.exists():
if check_content and not generated_content:
if file_path.read_text() != content_to_write:
console.print("[yellow]Generating[/]")
console.print()
console.print(f"[yellow]Content of the file will be regenerated: [/]{file_path}")
console.print()
regenerate_file = True
else:
if missing_ok:
# Fine - we do not check anything else
console.print("[green]OK[/]")
failed.append(False)
return
if not generated_content:
console.print("[yellow]Generating[/]")
console.print()
console.print(f"[yellow]Missing file: [/]{file_path}")
console.print("[bright_blue]Please create the file looking at other providers as example [/]")
console.print()
else:
regenerate_file = True
if regenerate_file:
fail_pre_commit = True
fail = True
file_path.write_text(content_to_write)
console.print()
console.print(f"[yellow]Content updated in file: [/]{file_path}")
console.print()
if index_link not in index_file.read_text():
fail_pre_commit = True
console.print("[red]NOK[/]")
fail = True
console.print()
console.print(
f"[red]ERROR! Missing index link![/]\n"
Expand All @@ -163,99 +158,110 @@ def check_provider_doc_exists_and_in_index(
f"[bright_blue]Please add the entry in the index!"
)
console.print()
if not fail:
console.print("[green]OK[/]")
failed.append(fail)


def check_documentation_link_exists(link: str, doc_file_name: str):
global fail_pre_commit
docs_file = AIRFLOW_DOC_AIRFLOW_BASE_FOLDER / doc_file_name
def check_documentation_link_exists(link: str, docs_file: Path):
    """Verify that ``docs_file`` contains ``link`` and record the outcome.

    Appends the result (True = failure) to the module-level ``failed`` list and
    prints an OK/NOK status line to the shared ``console``.
    """
    console.print(f" [bright_blue]Checking [/]: {docs_file} for link: {link} ", end="")
    if link not in docs_file.read_text():
        console.print("[red]NOK[/]")
        console.print()
        console.print(
            f"[red]ERROR! The {docs_file} does not contain:\n:[/]{link}\n[bright_blue]Please add it!"
        )
        console.print()
        failed.append(True)
        # Bug fix: without this return the function also printed "OK" and
        # recorded a False (success) entry after reporting the failure.
        return
    console.print("[green]OK[/]")
    failed.append(False)


def get_provider_doc_folder(provider_id: str) -> Path:
return AIRFLOW_DOC_FILES / f"apache-airflow-providers-{provider_id.replace('.', '-')}"


def has_executor_package_defined(provider_id: str):
provider_sources = AIRFLOW_PROVIDERS_DIR / provider_id.replace(".", "/")
for executors_folder in provider_sources.rglob("executors"):
def has_executor_package_defined(provider_id: str) -> bool:
    """Return True if the provider's source tree contains an ``executors`` package.

    Looks under ``<providers-root>/<provider dirs>/src/<provider dirs>`` for any
    ``executors`` directory that is a real Python package (has ``__init__.py``).
    """
    id_parts = provider_id.split(".")
    package_root = AIRFLOW_PROVIDERS_ROOT_PATH.joinpath(*id_parts, "src", *id_parts)
    return any(
        candidate.is_dir() and (candidate / "__init__.py").is_file()
        for candidate in package_root.rglob("executors")
    )


JINJA_LOADER = Environment(loader=BaseLoader())
def run_all_checks():
jinja_loader = Environment(loader=BaseLoader(), autoescape=True)
all_providers = get_all_new_provider_info_dicts()
status: list[bool] = []

for provider_id, provider_info in ALL_PROVIDERS.items():
provider_docs_folder = get_provider_doc_folder(provider_id)
if not provider_docs_folder.exists():
provider_docs_folder.mkdir(parents=True)
for provider_id, provider_info in all_providers.items():
console.print(f"[bright_blue]Checking provider: {provider_id}[/]")
provider_docs_folder = get_provider_doc_folder(provider_id)
if not provider_docs_folder.exists():
provider_docs_folder.mkdir(parents=True)

check_provider_doc_exists_and_in_index(
provider_id=provider_id,
index_link="Detailed list of commits <commits>",
file_name="commits.rst",
generated_content=LICENCE_CONTENT_RST + COMMIT_CONTENT_RST,
# Only create commit content if it is missing, otherwise leave what is there
check_content=False,
)

check_provider_doc_exists_and_in_index(
provider_id=provider_id,
index_link="Security <security>",
file_name="security.rst",
generated_content=LICENCE_CONTENT_RST + SECURITY_CONTENT_RST,
)

check_provider_doc_exists_and_in_index(
provider_id=provider_id,
index_link="Installing from sources <installing-providers-from-sources>",
file_name="installing-providers-from-sources.rst",
generated_content=LICENCE_CONTENT_RST + INSTALLING_PROVIDERS_FROM_SOURCES_CONTENT_RST,
)

check_provider_doc_exists_and_in_index(
provider_id=provider_id,
index_link="Changelog <changelog>",
file_name="changelog.rst",
generated_content=LICENCE_CONTENT_RST
+ JINJA_LOADER.from_string(CHANGELOG_CONTENT_RST).render(provider_id=provider_id),
)

if has_executor_package_defined(provider_id) and not provider_info.get("executors"):
provider_yaml = AIRFLOW_PROVIDERS_DIR / provider_id.replace(".", "/") / "provider.yaml"
console.print()
console.print(
f"[red]ERROR! The {provider_id} provider has executor package but "
f"does not have `executors` defined in {provider_yaml}."
)
console.print(f"\nPlease add executor class to `executors` array in {provider_yaml}")
fail_pre_commit = True

if provider_info.get("executors"):
check_provider_doc_exists_and_in_index(
provider_id=provider_id,
index_link="CLI <cli-ref>",
file_name="cli-ref.rst",
missing_ok=True,
index_link="Detailed list of commits <commits>",
file_name="commits.rst",
generated_content=LICENCE_CONTENT_RST + COMMIT_CONTENT_RST,
# Only create commit content if it is missing, otherwise leave what is there
check_content=False,
)
if (get_provider_doc_folder(provider_id) / "cli-ref.rst").exists():
check_documentation_link_exists(
link=f"and related CLI commands: :doc:`{get_provider_doc_folder(provider_id).name}:cli-ref`",
doc_file_name="cli-and-env-variables-ref.rst",
)
if provider_info.get("config"):
check_provider_doc_exists_and_in_index(
provider_id=provider_id,
index_link="Configuration <configurations-ref>",
file_name="configurations-ref.rst",
generated_content=LICENCE_CONTENT_RST + CONFIGURATION_CONTENT_RST,
index_link="Security <security>",
file_name="security.rst",
generated_content=LICENCE_CONTENT_RST + SECURITY_CONTENT_RST,
)
check_provider_doc_exists_and_in_index(
provider_id=provider_id,
index_link="Installing from sources <installing-providers-from-sources>",
file_name="installing-providers-from-sources.rst",
generated_content=LICENCE_CONTENT_RST + INSTALLING_PROVIDERS_FROM_SOURCES_CONTENT_RST,
)
if fail_pre_commit:
sys.exit(1)
check_provider_doc_exists_and_in_index(
provider_id=provider_id,
index_link="Changelog <changelog>",
file_name="changelog.rst",
generated_content=LICENCE_CONTENT_RST
+ jinja_loader.from_string(CHANGELOG_CONTENT_RST).render(provider_id=provider_id),
)
if has_executor_package_defined(provider_id) and not provider_info.get("executors"):
provider_yaml = AIRFLOW_PROVIDERS_ROOT_PATH.joinpath(*provider_id.split(".")) / "provider.yaml"
console.print()
console.print(
f"[red]ERROR! The {provider_id} provider has executor package but "
f"does not have `executors` defined in {provider_yaml}."
)
console.print(f"\nPlease add executor class to `executors` array in {provider_yaml}")
status.append(False)

if provider_info.get("executors"):
check_provider_doc_exists_and_in_index(
provider_id=provider_id,
index_link="CLI <cli-ref>",
file_name="cli-ref.rst",
missing_ok=True,
check_content=False,
)
if (get_provider_doc_folder(provider_id) / "cli-ref.rst").exists():
check_documentation_link_exists(
link=f"and related CLI commands: :doc:`apache-airflow-providers-{provider_id.replace('.', '-')}:cli-ref`",
docs_file=AIRFLOW_SOURCES_ROOT_PATH
/ "docs"
/ "apache-airflow"
/ "cli-and-env-variables-ref.rst",
)
if provider_info.get("config"):
check_provider_doc_exists_and_in_index(
provider_id=provider_id,
index_link="Configuration <configurations-ref>",
file_name="configurations-ref.rst",
generated_content=LICENCE_CONTENT_RST + CONFIGURATION_CONTENT_RST,
)
print(failed)
if any(failed):
sys.exit(1)


if __name__ == "__main__":
run_all_checks()
29 changes: 29 additions & 0 deletions scripts/ci/pre_commit/common_precommit_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,3 +276,32 @@ def get_all_new_provider_ids() -> list[str]:
if provider_id:
all_provider_ids.append(provider_id)
return all_provider_ids


# TODO(potiuk): rename this function when all providers are moved to new structure
def get_all_new_provider_yaml_files() -> list[Path]:
    """
    Get all providers from the new provider structure
    """
    # Anything under the old ``src`` tree belongs to the legacy layout and is skipped.
    legacy_src_root = AIRFLOW_PROVIDERS_ROOT_PATH / "src"
    return [
        yaml_file
        for yaml_file in AIRFLOW_PROVIDERS_ROOT_PATH.rglob("provider.yaml")
        if not yaml_file.is_relative_to(legacy_src_root)
    ]


# TODO(potiuk): rename this function when all providers are moved to new structure
def get_all_new_provider_info_dicts() -> dict[str, dict]:
    """
    Get provider yaml info for all providers from the new provider structure

    Returns a mapping of dotted provider id (e.g. ``apache.hive``) to the parsed
    ``provider.yaml`` contents, excluding providers whose state is "suspended".
    """
    # Local import so this utils module stays importable without PyYAML installed;
    # hoisted out of the loop (it previously executed on every iteration).
    import yaml

    providers: dict[str, dict] = {}
    for provider_file in get_all_new_provider_yaml_files():
        # Dotted provider id derived from the yaml file's directory path.
        provider_id = str(provider_file.parent.relative_to(AIRFLOW_PROVIDERS_ROOT_PATH)).replace(os.sep, ".")
        provider_info = yaml.safe_load(provider_file.read_text())
        if provider_info["state"] != "suspended":
            providers[provider_id] = provider_info
    return providers

0 comments on commit df5e7c6

Please sign in to comment.