From 118913467f2893612f9131648bfcc9810c96ce41 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Tue, 13 Oct 2020 16:36:21 +0200 Subject: [PATCH] Adds automated installation of dependent packages When extras are specified when airflow is installed, this one triggers installation of dependent packages. Each extra has a set of provider packages that are needed by the extra and they will be installed automatically if this extra is specified. For now we do not add any version specification, until we agree the process in #11425 and then we should be able to implement an automated way of getting information about cross-package version dependencies. Fixes: #11464 --- CONTRIBUTING.rst | 6 +- INSTALL | 6 +- .../pre_commit_check_order_setup.py | 49 ++++++- .../run_prepare_provider_packages.sh | 2 +- setup.py | 126 +++++++++++++++++- 5 files changed, 172 insertions(+), 17 deletions(-) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index cfd1c7079f884..5a4066c9fd5a6 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -546,9 +546,9 @@ aws, azure, cassandra, celery, cgroups, cloudant, cncf.kubernetes, dask, databri devel_hadoop, doc, docker, druid, elasticsearch, exasol, facebook, gcp, gcp_api, github_enterprise, google, google_auth, grpc, hashicorp, hdfs, hive, jdbc, jira, kerberos, kubernetes, ldap, microsoft.azure, microsoft.mssql, microsoft.winrm, mongo, mssql, mysql, odbc, oracle, pagerduty, -papermill, password, pinot, plexus, postgres, presto, qds, rabbitmq, redis, salesforce, samba, -segment, sendgrid, sentry, singularity, slack, snowflake, spark, ssh, statsd, tableau, vertica, -virtualenv, webhdfs, winrm, yandexcloud, all, devel_ci +papermill, password, pinot, plexus, postgres, presto, qds, qubole, rabbitmq, redis, salesforce, +samba, segment, sendgrid, sentry, singularity, slack, snowflake, spark, ssh, statsd, tableau, +vertica, virtualenv, webhdfs, winrm, yandexcloud, all, devel_ci .. 
END EXTRAS HERE diff --git a/INSTALL b/INSTALL index da6d6ad7a2f1c..d3932200486a9 100644 --- a/INSTALL +++ b/INSTALL @@ -69,9 +69,9 @@ aws, azure, cassandra, celery, cgroups, cloudant, cncf.kubernetes, dask, databri devel_hadoop, doc, docker, druid, elasticsearch, exasol, facebook, gcp, gcp_api, github_enterprise, google, google_auth, grpc, hashicorp, hdfs, hive, jdbc, jira, kerberos, kubernetes, ldap, microsoft.azure, microsoft.mssql, microsoft.winrm, mongo, mssql, mysql, odbc, oracle, pagerduty, -papermill, password, pinot, plexus, postgres, presto, qds, rabbitmq, redis, salesforce, samba, -segment, sendgrid, sentry, singularity, slack, snowflake, spark, ssh, statsd, tableau, vertica, -virtualenv, webhdfs, winrm, yandexcloud, all, devel_ci +papermill, password, pinot, plexus, postgres, presto, qds, qubole, rabbitmq, redis, salesforce, +samba, segment, sendgrid, sentry, singularity, slack, snowflake, spark, ssh, statsd, tableau, +vertica, virtualenv, webhdfs, winrm, yandexcloud, all, devel_ci # END EXTRAS HERE diff --git a/scripts/ci/pre_commit/pre_commit_check_order_setup.py b/scripts/ci/pre_commit/pre_commit_check_order_setup.py index cfcb297e68e4b..e94109c0469bf 100755 --- a/scripts/ci/pre_commit/pre_commit_check_order_setup.py +++ b/scripts/ci/pre_commit/pre_commit_check_order_setup.py @@ -19,7 +19,6 @@ """ Test for an order of dependencies in setup.py """ - import os import re import sys @@ -28,6 +27,10 @@ errors = [] +MY_DIR_PATH = os.path.dirname(__file__) +SOURCE_DIR_PATH = os.path.abspath(os.path.join(MY_DIR_PATH, os.pardir, os.pardir, os.pardir)) +sys.path.insert(0, SOURCE_DIR_PATH) + def _check_list_sorted(the_list: List[str], message: str) -> None: sorted_list = sorted(the_list) @@ -122,9 +125,7 @@ def check_extras_require(setup_context: str) -> None: Test for an order of dependencies in function do_setup section extras_require in setup.py """ - pattern_extras_requires = re.compile( - r'EXTRAS_REQUIREMENTS: Dict\[str, Iterable\[str\]] = {(.*?)}', 
re.DOTALL - ) + pattern_extras_requires = re.compile(r'EXTRAS_REQUIREMENTS: Dict\[str, List\[str\]] = {(.*?)}', re.DOTALL) extras_requires = pattern_extras_requires.findall(setup_context)[0] pattern_dependent = re.compile('\'(.*?)\'') @@ -137,16 +138,50 @@ def check_provider_requirements(setup_context: str) -> None: Test for an order of dependencies in function do_setup section providers_require in setup.py """ - pattern_extras_requires = re.compile( + pattern_extras_providers_packages = re.compile( r'PROVIDERS_REQUIREMENTS: Dict\[str, Iterable\[str\]\] = {(.*?)}', re.DOTALL ) - extras_requires = pattern_extras_requires.findall(setup_context)[0] + extras_requires = pattern_extras_providers_packages.findall(setup_context)[0] pattern_dependent = re.compile('"(.*?)"') src = pattern_dependent.findall(extras_requires) _check_list_sorted(src, "Order of dependencies in: providers_require") +def check_extras_provider_packages(setup_context: str) -> None: + """ + Test for an order of dependencies in function do_setup section + providers_require in setup.py + """ + pattern_extras_requires = re.compile( + r'EXTRAS_PROVIDERS_PACKAGES: Dict\[str, Iterable\[str\]\] = {(.*?)}', re.DOTALL + ) + extras_requires = pattern_extras_requires.findall(setup_context)[0] + + pattern_dependent = re.compile('"(.*?)":') + src = pattern_dependent.findall(extras_requires) + _check_list_sorted(src, "Order of dependencies in: extras_provider_packages") + + +def checks_extra_with_providers_exist() -> None: + + from setup import EXTRAS_REQUIREMENTS, EXTRAS_PROVIDERS_PACKAGES # noqa # isort:skip + + message = 'Check if all extras have providers defined in: EXTRAS_PROVIDERS_PACKAGES' + local_error = False + for key in EXTRAS_REQUIREMENTS.keys(): # noqa + if key not in EXTRAS_PROVIDERS_PACKAGES.keys(): # noqa + if not local_error: + local_error = True + print(f"Extra {key} NOK") + errors.append( + f"ERROR in {message}. The {key} extras is missing there." 
+ " If you do not want to install any providers with this extra set it to []" + ) + if not local_error: + print(f"{message} is ok") + + if __name__ == '__main__': setup_context_main = setup() check_main_dependent_group(setup_context_main) @@ -155,6 +190,8 @@ def check_provider_requirements(setup_context: str) -> None: check_install_and_setup_requires(setup_context_main) check_extras_require(setup_context_main) check_provider_requirements(setup_context_main) + check_extras_provider_packages(setup_context_main) + checks_extra_with_providers_exist() print() print() diff --git a/scripts/in_container/run_prepare_provider_packages.sh b/scripts/in_container/run_prepare_provider_packages.sh index f10f29b3731ff..75c1d09182e91 100755 --- a/scripts/in_container/run_prepare_provider_packages.sh +++ b/scripts/in_container/run_prepare_provider_packages.sh @@ -24,7 +24,7 @@ LIST_OF_DIRS_FILE=$(mktemp) cd "${AIRFLOW_SOURCES}/airflow/providers" || exit 1 -find . -type d | sed 's/.\///; s/\//\./g' | grep -E 'hooks|operators|sensors|secrets' \ +find . 
-type d | sed 's/.\///; s/\//\./g' | grep -E 'hooks|operators|sensors|secrets|utils' \ > "${LIST_OF_DIRS_FILE}" cd "${AIRFLOW_SOURCES}/provider_packages" || exit 1 diff --git a/setup.py b/setup.py index b5930122b3822..dbbfcda73dba5 100644 --- a/setup.py +++ b/setup.py @@ -352,7 +352,7 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version 'psycopg2-binary>=2.7.4', ] presto = ['presto-python-client>=0.7.0,<0.8'] -qds = [ +qubole = [ 'qds-sdk>=1.10.4', ] rabbitmq = [ @@ -540,11 +540,12 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version "plexus": plexus, "postgres": postgres, "presto": presto, - "qubole": qds, + "qubole": qubole, "redis": redis, "salesforce": salesforce, "samba": samba, "segment": segment, + "sendgrid": sendgrid, "sftp": ssh, "singularity": singularity, "slack": slack, @@ -556,7 +557,7 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version "zendesk": zendesk, } -EXTRAS_REQUIREMENTS: Dict[str, Iterable[str]] = { +EXTRAS_REQUIREMENTS: Dict[str, List[str]] = { 'all_dbs': all_dbs, 'amazon': amazon, 'apache.atlas': atlas, @@ -619,7 +620,8 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version 'plexus': plexus, 'postgres': postgres, 'presto': presto, - 'qds': qds, + 'qds': qubole, # TODO: remove this in Airflow 2.1 + 'qubole': qubole, 'rabbitmq': rabbitmq, 'redis': redis, 'salesforce': salesforce, @@ -641,6 +643,111 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version 'yandexcloud': yandexcloud, } +EXTRAS_PROVIDERS_PACKAGES: Dict[str, Iterable[str]] = { + 'all': list(PROVIDERS_REQUIREMENTS.keys()), + # this is not 100% accurate with devel_ci definition, but we really want to have all providers + # when devel_ci extra is installed! 
+ 'devel_ci': list(PROVIDERS_REQUIREMENTS.keys()), + 'all_dbs': [ + "apache.cassandra", + "apache.druid", + "apache.hdfs", + "apache.hive", + "apache.pinot", + "cloudant", + "exasol", + "mongo", + "microsoft.mssql", + "mysql", + "postgres", + "presto", + "vertica", + ], + 'amazon': ["amazon"], + 'apache.atlas': [], + 'apache.beam': [], + "apache.cassandra": ["apache.cassandra"], + "apache.druid": ["apache.druid"], + "apache.hdfs": ["apache.hdfs"], + "apache.hive": ["apache.hive"], + "apache.kylin": ["apache.kylin"], + "apache.pinot": ["apache.pinot"], + "apache.presto": ["apache.presto"], + "apache.spark": ["apache.spark"], + "apache.webhdfs": ["apache.hdfs"], + 'async': [], + 'atlas': [], # TODO: remove this in Airflow 2.1 + 'aws': ["amazon"], # TODO: remove this in Airflow 2.1 + 'azure': ["microsoft.azure"], # TODO: remove this in Airflow 2.1 + 'cassandra': ["apache.cassandra"], # TODO: remove this in Airflow 2.1 + 'celery': ["celery"], + 'cgroups': [], + 'cloudant': ["cloudant"], + 'cncf.kubernetes': ["cncf.kubernetes"], + 'dask': ["dask"], + 'databricks': ["databricks"], + 'datadog': ["datadog"], + 'devel': ["cncf.kubernetes", "mysql"], + 'devel_hadoop': ["apache.hdfs", "apache.hive", "presto"], + 'doc': [], + 'docker': ["docker"], + 'druid': ["apache.druid"], # TODO: remove this in Airflow 2.1 + 'elasticsearch': ["elasticsearch"], + 'exasol': ["exasol"], + 'facebook': ["facebook"], + 'gcp': ["google"], # TODO: remove this in Airflow 2.1 + 'gcp_api': ["google"], # TODO: remove this in Airflow 2.1 + 'github_enterprise': [], + 'google': ["google"], + 'google_auth': [], + 'grpc': ["grpc"], + 'hashicorp': ["hashicorp"], + 'hdfs': ["apache.hdfs"], # TODO: remove this in Airflow 2.1 + 'hive': ["apache.hive"], # TODO: remove this in Airflow 2.1 + 'jdbc': ["jdbc"], + 'jira': ["jira"], + 'kerberos': [], + 'kubernetes': ["cncf.kubernetes"], # TODO: remove this in Airflow 2.1 + 'ldap': [], + "microsoft.azure": ["microsoft.azure"], + "microsoft.mssql": ["microsoft.mssql"], 
+ "microsoft.winrm": ["microsoft.winrm"], + 'mongo': ["mongo"], + 'mssql': ["microsoft.mssql"], # TODO: remove this in Airflow 2.1 + 'mysql': ["mysql"], + 'odbc': ["odbc"], + 'oracle': ["oracle"], + 'pagerduty': ["pagerduty"], + 'papermill': ["papermill"], + 'password': [], + 'pinot': ["apache.pinot"], # TODO: remove this in Airflow 2.1 + 'plexus': ["plexus"], + 'postgres': ["postgres"], + 'presto': ["presto"], + 'qds': ["qubole"], # TODO: remove this in Airflow 2.1 + 'qubole': ["qubole"], + 'rabbitmq': ["rabbitmq"], + 'redis': ["redis"], + 'salesforce': ["salesforce"], + 'samba': ["samba"], + 'segment': ["segment"], + 'sendgrid': ["sendgrid"], + 'sentry': ["sentry"], + 'singularity': ["singularity"], + 'slack': ["slack"], + 'snowflake': ["snowflake"], + 'spark': ["spark"], + 'ssh': ["ssh"], + 'statsd': ["statsd"], + 'tableau': ["tableau"], + 'vertica': ["vertica"], + 'virtualenv': ["virtualenv"], + 'webhdfs': ["apache.hdfs"], # TODO: remove this in Airflow 2.1 + 'winrm': ["microsoft.winrm"], # TODO: remove this in Airflow 2.1 + 'yandexcloud': ["yandexcloud"], +} + + # Make devel_all contain all providers + extras + unique devel_all = list( set( @@ -758,6 +865,17 @@ def is_package_excluded(package: str, exclusion_list: List[str]): ] + +def get_provider_package_from_package_id(package_id: str): + """ + Builds the name of provider package out of the package id provided. + + :param package_id: id of the package (like amazon or microsoft.azure) + :return: full name of package in PyPI + """ + package_suffix = package_id.replace(".", "-") + return f"apache-airflow-providers-{package_suffix}" + + def do_setup(): """Perform the Airflow package setup.""" install_providers_from_sources = os.getenv('INSTALL_PROVIDERS_FROM_SOURCES')