Adds automated installation of dependent packages
When extras are specified while Airflow is being installed, this change triggers
installation of the dependent provider packages. Each extra has a set of provider
packages that it needs, and those packages are now installed automatically
whenever the extra is specified.

For now we do not add any version specification until we agree on the
process in apache#11425; then we should be able to implement an
automated way of getting information about cross-package
version dependencies.

Fixes: apache#11464
potiuk committed Oct 30, 2020
1 parent 912fe52 commit e8eb429
Showing 5 changed files with 173 additions and 22 deletions.
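
A rough sketch of the intended effect (the mapping and helper below are abbreviated
from this commit's setup.py; the pip command and output are illustrative):

# Sketch: how an extra now expands into provider packages at install time.
# EXTRAS_PROVIDERS_PACKAGES and get_provider_package_from_package_id are
# abbreviated versions of what this commit adds to setup.py.
EXTRAS_PROVIDERS_PACKAGES = {
    "google": ["google"],
    "devel_hadoop": ["apache.hdfs", "apache.hive", "presto"],
}

def get_provider_package_from_package_id(package_id: str) -> str:
    # "microsoft.azure" -> "apache-airflow-providers-microsoft-azure"
    return f"apache-airflow-providers-{package_id.replace('.', '-')}"

# `pip install apache-airflow[devel_hadoop]` therefore also pulls in:
print([get_provider_package_from_package_id(p) for p in EXTRAS_PROVIDERS_PACKAGES["devel_hadoop"]])
# ['apache-airflow-providers-apache-hdfs', 'apache-airflow-providers-apache-hive',
#  'apache-airflow-providers-presto']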
6 changes: 3 additions & 3 deletions CONTRIBUTING.rst
@@ -518,9 +518,9 @@ celery, cgroups, cloudant, cncf.kubernetes, dask, databricks, datadog, devel, de
docker, druid, elasticsearch, exasol, facebook, gcp, gcp_api, github_enterprise, google,
google_auth, grpc, hashicorp, hdfs, hive, jdbc, jira, kerberos, kubernetes, ldap, microsoft.azure,
microsoft.mssql, microsoft.winrm, mongo, mssql, mysql, odbc, oracle, pagerduty, papermill, password,
-pinot, plexus, postgres, presto, qds, rabbitmq, redis, salesforce, samba, segment, sendgrid, sentry,
-singularity, slack, snowflake, spark, ssh, statsd, tableau, vertica, virtualenv, webhdfs, winrm,
-yandexcloud, all, devel_ci
+pinot, plexus, postgres, presto, qds, qubole, rabbitmq, redis, salesforce, samba, segment, sendgrid,
+sentry, singularity, slack, snowflake, spark, ssh, statsd, tableau, vertica, virtualenv, webhdfs,
+winrm, yandexcloud, all, devel_ci

.. END EXTRAS HERE
6 changes: 3 additions & 3 deletions INSTALL
@@ -69,9 +69,9 @@ celery, cgroups, cloudant, cncf.kubernetes, dask, databricks, datadog, devel, de
docker, druid, elasticsearch, exasol, facebook, gcp, gcp_api, github_enterprise, google,
google_auth, grpc, hashicorp, hdfs, hive, jdbc, jira, kerberos, kubernetes, ldap, microsoft.azure,
microsoft.mssql, microsoft.winrm, mongo, mssql, mysql, odbc, oracle, pagerduty, papermill, password,
-pinot, plexus, postgres, presto, qds, rabbitmq, redis, salesforce, samba, segment, sendgrid, sentry,
-singularity, slack, snowflake, spark, ssh, statsd, tableau, vertica, virtualenv, webhdfs, winrm,
-yandexcloud, all, devel_ci
+pinot, plexus, postgres, presto, qds, qubole, rabbitmq, redis, salesforce, samba, segment, sendgrid,
+sentry, singularity, slack, snowflake, spark, ssh, statsd, tableau, vertica, virtualenv, webhdfs,
+winrm, yandexcloud, all, devel_ci

# END EXTRAS HERE

43 changes: 39 additions & 4 deletions scripts/ci/pre_commit/pre_commit_check_order_setup.py
@@ -19,7 +19,6 @@
"""
Test for an order of dependencies in setup.py
"""

import os
import re
import sys
@@ -28,6 +27,10 @@

errors = []

+MY_DIR_PATH = os.path.dirname(__file__)
+SOURCE_DIR_PATH = os.path.abspath(os.path.join(MY_DIR_PATH, os.pardir, os.pardir, os.pardir))
+sys.path.insert(0, SOURCE_DIR_PATH)
+

def _check_list_sorted(the_list: List[str], message: str) -> None:
    sorted_list = sorted(the_list)
@@ -123,7 +126,7 @@ def check_extras_require(setup_context: str) -> None:
    extras_require in setup.py
    """
    pattern_extras_requires = re.compile(
-        r'EXTRAS_REQUIREMENTS: Dict\[str, Iterable\[str\]] = {(.*?)}', re.DOTALL)
+        r'EXTRAS_REQUIREMENTS: Dict\[str, List\[str\]] = {(.*?)}', re.DOTALL)
    extras_requires = pattern_extras_requires.findall(setup_context)[0]

    pattern_dependent = re.compile('\'(.*?)\'')
@@ -136,15 +139,45 @@ def check_provider_requirements(setup_context: str) -> None:
    Test for an order of dependencies in function do_setup section
    providers_require in setup.py
    """
-    pattern_extras_requires = re.compile(
+    pattern_extras_providers_packages = re.compile(
        r'PROVIDERS_REQUIREMENTS: Dict\[str, Iterable\[str\]\] = {(.*?)}', re.DOTALL)
-    extras_requires = pattern_extras_requires.findall(setup_context)[0]
+    extras_requires = pattern_extras_providers_packages.findall(setup_context)[0]

    pattern_dependent = re.compile('"(.*?)"')
    src = pattern_dependent.findall(extras_requires)
    _check_list_sorted(src, "Order of dependencies in: providers_require")


+def check_extras_provider_packages(setup_context: str) -> None:
+    """
+    Test for an order of dependencies in function do_setup section
+    extras_provider_packages in setup.py
+    """
+    pattern_extras_requires = re.compile(
+        r'EXTRAS_PROVIDERS_PACKAGES: Dict\[str, Iterable\[str\]\] = {(.*?)}', re.DOTALL)
+    extras_requires = pattern_extras_requires.findall(setup_context)[0]
+
+    pattern_dependent = re.compile('"(.*?)":')
+    src = pattern_dependent.findall(extras_requires)
+    _check_list_sorted(src, "Order of dependencies in: extras_provider_packages")
+
+
+def checks_extra_with_providers_exist() -> None:
+    from setup import EXTRAS_REQUIREMENTS, EXTRAS_PROVIDERS_PACKAGES  # noqa # isort:skip
+    message = 'Check if all extras have providers defined in: EXTRAS_PROVIDERS_PACKAGES'
+    local_error = False
+    for key in EXTRAS_REQUIREMENTS.keys():  # noqa
+        if key not in EXTRAS_PROVIDERS_PACKAGES.keys():  # noqa
+            if not local_error:
+                local_error = True
+                print(f"Extra {key} NOK")
+            errors.append(f"ERROR in {message}. The {key} extra is missing there."
+                          " If you do not want to install any providers with this extra, set it to []")
+    if not local_error:
+        print(f"{message} is ok")


if __name__ == '__main__':
    setup_context_main = setup()
    check_main_dependent_group(setup_context_main)
@@ -153,6 +186,8 @@ def check_provider_requirements(setup_context: str) -> None:
    check_install_and_setup_requires(setup_context_main)
    check_extras_require(setup_context_main)
    check_provider_requirements(setup_context_main)
+    check_extras_provider_packages(setup_context_main)
+    checks_extra_with_providers_exist()

    print()
    print()
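
To see what the updated hook checks, here is a hedged, self-contained demo of the
extraction that check_extras_provider_packages performs (SETUP_TEXT is a toy stand-in
for the real setup.py contents):

import re
from typing import List

# Toy stand-in for the setup.py text that the pre-commit hook scans.
SETUP_TEXT = '''
EXTRAS_PROVIDERS_PACKAGES: Dict[str, Iterable[str]] = {
    "amazon": ["amazon"],
    "apache.hive": ["apache.hive"],
    "celery": ["celery"],
}
'''

pattern = re.compile(r'EXTRAS_PROVIDERS_PACKAGES: Dict\[str, Iterable\[str\]\] = {(.*?)}', re.DOTALL)
body = pattern.findall(SETUP_TEXT)[0]
keys: List[str] = re.findall('"(.*?)":', body)  # the dict keys, in file order
assert keys == sorted(keys), f"Extras not alphabetically sorted: {keys}"
print("sorted extras:", keys)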
2 changes: 1 addition & 1 deletion scripts/in_container/run_prepare_provider_packages.sh
@@ -24,7 +24,7 @@ LIST_OF_DIRS_FILE=$(mktemp)

cd "${AIRFLOW_SOURCES}/airflow/providers" || exit 1

-find . -type d | sed 's/.\///; s/\//\./g' | grep -E 'hooks|operators|sensors|secrets' \
+find . -type d | sed 's/.\///; s/\//\./g' | grep -E 'hooks|operators|sensors|secrets|utils' \
> "${LIST_OF_DIRS_FILE}"

cd "${AIRFLOW_SOURCES}/provider_packages" || exit 1
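
The extra `utils` keyword means provider utility modules (for example
airflow.providers.google.common.utils) now make it into the generated package list.
A rough Python equivalent of the amended find/sed/grep pipeline, for illustration only:

import os

# Roughly: find . -type d | sed 's/.\///; s/\//\./g' | grep -E 'hooks|operators|sensors|secrets|utils'
KEYWORDS = ("hooks", "operators", "sensors", "secrets", "utils")  # "utils" is the new addition

def provider_module_dirs(root: str = ".") -> list:
    dirs = []
    for path, _subdirs, _files in os.walk(root):
        dotted = path[2:] if path.startswith("./") else path  # ./a/b -> a/b
        dotted = dotted.replace(os.sep, ".")                  # a/b -> a.b
        if any(keyword in dotted for keyword in KEYWORDS):    # substring match, like grep -E
            dirs.append(dotted)
    return dirs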
138 changes: 127 additions & 11 deletions setup.py
@@ -33,9 +33,16 @@

# Kept manually in sync with airflow.__version__
spec = util.spec_from_file_location("airflow.version", os.path.join('airflow', 'version.py')) # noqa
-mod = util.module_from_spec(spec)
-spec.loader.exec_module(mod)  # type: ignore
-version = mod.version  # type: ignore
+try:
+    mod = util.module_from_spec(spec)
+    spec.loader.exec_module(mod)  # type: ignore
+    version = mod.version  # type: ignore
+# In case of reading setup.py from an external package, the magic to retrieve the version
+# does not work in all cases - for example, it fails when setup.py is read by the scripts
+# that verify and prepare provider package documentation. It is safe to disable it then:
+# no packaging tool will be validating an empty version in that context.
+except Exception:  # pylint: disable=broad-except
+    version = None

PY3 = sys.version_info[0] == 3

@@ -362,7 +369,7 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version
presto = [
    'presto-python-client>=0.7.0,<0.8'
]
-qds = [
+qubole = [
    'qds-sdk>=1.10.4',
]
rabbitmq = [
@@ -537,11 +544,12 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version
"plexus": plexus,
"postgres": postgres,
"presto": presto,
"qubole": qds,
"qubole": qubole,
"redis": redis,
"salesforce": salesforce,
"samba": samba,
"segment": segment,
"sendgrid": sendgrid,
"sftp": ssh,
"singularity": singularity,
"slack": slack,
@@ -553,7 +561,7 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version
"zendesk": zendesk,
}

EXTRAS_REQUIREMENTS: Dict[str, Iterable[str]] = {
EXTRAS_REQUIREMENTS: Dict[str, List[str]] = {
'all_dbs': all_dbs,
'amazon': amazon,
'apache.atlas': atlas,
@@ -614,7 +622,8 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version
    'plexus': plexus,
    'postgres': postgres,
    'presto': presto,
-    'qds': qds,
+    'qds': qubole,  # TODO: remove this in Airflow 2.1
+    'qubole': qubole,
    'rabbitmq': rabbitmq,
    'redis': redis,
    'salesforce': salesforce,
Expand All @@ -636,6 +645,99 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version
    'yandexcloud': yandexcloud,
}

+EXTRAS_PROVIDERS_PACKAGES: Dict[str, Iterable[str]] = {
+    'all': list(PROVIDERS_REQUIREMENTS.keys()),
+    # this is not 100% accurate with the devel_ci definition, but we really want to have all providers
+    # when the devel_ci extra is installed!
+    'devel_ci': list(PROVIDERS_REQUIREMENTS.keys()),
+    'all_dbs': [
+        "apache.cassandra", "apache.druid", "apache.hdfs", "apache.hive", "apache.pinot",
+        "cloudant", "exasol",
+        "mongo", "microsoft.mssql", "mysql", "postgres", "presto", "vertica"
+    ],
+    'amazon': ["amazon"],
+    'apache.atlas': [],
+    'apache.beam': [],
+    "apache.cassandra": ["apache.cassandra"],
+    "apache.druid": ["apache.druid"],
+    "apache.hdfs": ["apache.hdfs"],
+    "apache.hive": ["apache.hive"],
+    "apache.kylin": ["apache.kylin"],
+    "apache.pinot": ["apache.pinot"],
+    "apache.webhdfs": ["apache.hdfs"],
+    'async': [],
+    'atlas': [],  # TODO: remove this in Airflow 2.1
+    'aws': ["amazon"],  # TODO: remove this in Airflow 2.1
+    'azure': ["microsoft.azure"],  # TODO: remove this in Airflow 2.1
+    'cassandra': ["apache.cassandra"],  # TODO: remove this in Airflow 2.1
+    'celery': ["celery"],
+    'cgroups': [],
+    'cloudant': ["cloudant"],
+    'cncf.kubernetes': ["cncf.kubernetes"],
+    'dask': ["dask"],
+    'databricks': ["databricks"],
+    'datadog': ["datadog"],
+    'devel': ["cncf.kubernetes", "mysql"],
+    'devel_hadoop': ["apache.hdfs", "apache.hive", "presto"],
+    'doc': [],
+    'docker': ["docker"],
+    'druid': ["apache.druid"],  # TODO: remove this in Airflow 2.1
+    'elasticsearch': ["elasticsearch"],
+    'exasol': ["exasol"],
+    'facebook': ["facebook"],
+    'gcp': ["google"],  # TODO: remove this in Airflow 2.1
+    'gcp_api': ["google"],  # TODO: remove this in Airflow 2.1
+    'github_enterprise': [],
+    'google': ["google"],
+    'google_auth': [],
+    'grpc': ["grpc"],
+    'hashicorp': ["hashicorp"],
+    'hdfs': ["apache.hdfs"],  # TODO: remove this in Airflow 2.1
+    'hive': ["apache.hive"],  # TODO: remove this in Airflow 2.1
+    'jdbc': ["jdbc"],
+    'jira': ["jira"],
+    'kerberos': [],
+    'kubernetes': ["cncf.kubernetes"],  # TODO: remove this in Airflow 2.1
+    'ldap': [],
+    "microsoft.azure": ["microsoft.azure"],
+    "microsoft.mssql": ["microsoft.mssql"],
+    "microsoft.winrm": ["microsoft.winrm"],
+    'mongo': ["mongo"],
+    'mssql': ["microsoft.mssql"],  # TODO: remove this in Airflow 2.1
+    'mysql': ["mysql"],
+    'odbc': ["odbc"],
+    'oracle': ["oracle"],
+    'pagerduty': ["pagerduty"],
+    'papermill': ["papermill"],
+    'password': [],
+    'pinot': ["apache.pinot"],  # TODO: remove this in Airflow 2.1
+    'plexus': ["plexus"],
+    'postgres': ["postgres"],
+    'presto': ["presto"],
+    'qds': ["qubole"],  # TODO: remove this in Airflow 2.1
+    'qubole': ["qubole"],
+    'rabbitmq': ["rabbitmq"],
+    'redis': ["redis"],
+    'salesforce': ["salesforce"],
+    'samba': ["samba"],
+    'segment': ["segment"],
+    'sendgrid': ["sendgrid"],
+    'sentry': ["sentry"],
+    'singularity': ["singularity"],
+    'slack': ["slack"],
+    'snowflake': ["snowflake"],
+    'spark': ["spark"],
+    'ssh': ["ssh"],
+    'statsd': ["statsd"],
+    'tableau': ["tableau"],
+    'vertica': ["vertica"],
+    'virtualenv': ["virtualenv"],
+    'webhdfs': ["apache.hdfs"],  # TODO: remove this in Airflow 2.1
+    'winrm': ["microsoft.winrm"],  # TODO: remove this in Airflow 2.1
+    'yandexcloud': ["yandexcloud"],
+}


# Make devel_all contain all providers + extras + unique
devel_all = list(set(devel +
                     [req for req_list in EXTRAS_REQUIREMENTS.values() for req in req_list] +
@@ -743,13 +845,27 @@ def is_package_excluded(package: str, exclusion_list: List[str]):
]


+def get_provider_package_from_package_id(package_id: str):
+    """
+    Builds the name of the provider package out of the package id provided.
+    :param package_id: id of the package (like amazon or microsoft.azure)
+    :return: full name of the package in PyPI
+    """
+    package_suffix = package_id.replace(".", "-")
+    return f"apache-airflow-providers-{package_suffix}"


def do_setup():
"""Perform the Airflow package setup."""
install_providers_from_sources = os.getenv('INSTALL_PROVIDERS_FROM_SOURCES')
exclude_patterns = \
[] if install_providers_from_sources and install_providers_from_sources == 'true' \
else ['airflow.providers', 'airflow.providers.*']
install_providers_from_sources = os.getenv('INSTALL_PROVIDERS_FROM_SOURCES') == 'true'
exclude_patterns = [] if install_providers_from_sources else ['airflow.providers', 'airflow.providers.*']
write_version()
if not install_providers_from_sources:
for key, value in EXTRAS_PROVIDERS_PACKAGES.items():
EXTRAS_REQUIREMENTS[key].extend(
[get_provider_package_from_package_id(package_name) for package_name in value]
)
setup(
name='apache-airflow',
description='Programmatically author, schedule and monitor data pipelines',
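
Taken together, a minimal sketch of what the new block in do_setup() does to a single
extra (the requirement lists are abbreviated; the real ones live in setup.py):

# Abbreviated data; in setup.py these dicts hold every extra.
EXTRAS_REQUIREMENTS = {"google": ["google-api-python-client>=1.6.0,<2.0.0"]}
EXTRAS_PROVIDERS_PACKAGES = {"google": ["google"]}

def get_provider_package_from_package_id(package_id: str) -> str:
    return f"apache-airflow-providers-{package_id.replace('.', '-')}"

# The loop added to do_setup(), run unless providers are installed from sources:
for key, value in EXTRAS_PROVIDERS_PACKAGES.items():
    EXTRAS_REQUIREMENTS[key].extend(
        [get_provider_package_from_package_id(package_name) for package_name in value]
    )

print(EXTRAS_REQUIREMENTS["google"])
# ['google-api-python-client>=1.6.0,<2.0.0', 'apache-airflow-providers-google']
# Note: the provider packages are added without version pins for now (see apache#11425).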
