diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8d6e156 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +tests/cache +tests/playbook.retry +tests/__pycache__ +.molecule +.cache +.vagrant +tests/roles \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..2aa756c --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,9 @@ +# Change Log +All notable changes to this project will be documented in this file. +This project adheres to [Semantic Versioning](http://semver.org/) and [Keep a Changelog](https://github.com/olivierlacan/keep-a-changelog). + +## [Unreleased](https://github.com/idealista/airflow-role/tree/develop) + +## [1.0.0](https://github.com/idealista/airflow-role/tree/1.0.0) +### Added +- *First release* \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..fa11fa6 --- /dev/null +++ b/README.md @@ -0,0 +1,97 @@ +![Logo](https://raw.githubusercontent.com/idealista/airflow-role/master/logo.gif) + +# Apache Airflow Ansible role + +This Ansible role installs an Apache Airflow server in a Debian environment. + +- [Getting Started](#getting-started) + - [Prerequisites](#prerequisites) + - [Installing](#installing) +- [Usage](#usage) +- [Testing](#testing) +- [Built With](#built-with) +- [Versioning](#versioning) +- [Authors](#authors) +- [License](#license) +- [Contributing](#contributing) + +## Getting Started + +These instructions will get you a copy of the role for your Ansible playbook. Once launched, it will install an [Apache Airflow](https://airflow.incubator.apache.org/) server on a Debian system. + +### Prerequisites + +Ansible 2.2.1.0 installed. +The inventory destination should be a Debian environment. + +For testing purposes, [Molecule](https://molecule.readthedocs.io/) with [Vagrant](https://www.vagrantup.com/) as driver (with the [landrush](https://github.com/vagrant-landrush/landrush) plugin) and [VirtualBox](https://www.virtualbox.org/) as provider. + +### Installing + +Create or add to your roles dependency file (e.g. requirements.yml) from GitHub: + +``` +- src: http://github.com/idealista/airflow-role.git + scm: git + version: 1.0.0 + name: airflow +``` + +or using [Ansible Galaxy](https://galaxy.ansible.com/idealista/airflow-role/) as origin if you prefer: + +``` +- src: idealista.airflow-role +``` + +Install the role with the ansible-galaxy command: + +``` +ansible-galaxy install -p roles -r requirements.yml -f +``` + +Use in a playbook: + +``` +--- +- hosts: someserver + roles: + - { role: airflow } +``` + +## Usage + +Look at the defaults file ([defaults/main.yml](defaults/main.yml)) to see the possible configuration properties; a short example playbook is included further down. + +## Testing + +``` +molecule test --platform=Debian9 +``` + +See molecule.yml to check the possible testing platforms. + +## Built With + +![Ansible](https://img.shields.io/badge/ansible-2.2.1.0-green.svg) + +## Versioning + +For the versions available, see the [tags on this repository](https://github.com/idealista/airflow-role/tags). + +Additionally, you can see what changed in each version in the [CHANGELOG.md](CHANGELOG.md) file. + +## Authors + +* **Idealista** - *Work with* - [idealista](https://github.com/idealista) + +See also the list of [contributors](https://github.com/idealista/airflow-role/contributors) who participated in this project. 
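As a complement to the Usage section above, here is a minimal sketch of a playbook that applies the role while overriding a few of the variables defined in defaults/main.yml. All values are illustrative placeholders (host group, DAGs path, Fernet key, extra package), not recommendations from the role itself:

```
---
- hosts: airflow_servers          # placeholder inventory group
  become: true
  roles:
    - role: airflow
      airflow_executor: LocalExecutor
      airflow_load_examples: False
      airflow_dags_folder: /srv/airflow/dags          # placeholder path
      airflow_fernet_key: "<generate-your-own-key>"   # placeholder, set a real Fernet key
      airflow_extra_packages:
        - crypto                                      # installed by the role as apache-airflow[crypto]
```

Any variable not set in the play keeps its value from defaults/main.yml.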
+ +## License + +![Apache 2.0 Licence](https://img.shields.io/hexpm/l/plug.svg) + +This project is licensed under the [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) license - see the [LICENSE.txt](LICENSE.txt) file for details. + +## Contributing + +Please read [CONTRIBUTING.md](CONTRIBUTING.md) for details on our code of conduct, and the process for submitting pull requests to us. \ No newline at end of file diff --git a/defaults/main.yml b/defaults/main.yml new file mode 100644 index 0000000..9d7b37e --- /dev/null +++ b/defaults/main.yml @@ -0,0 +1,196 @@ +--- + +## General +airflow_version: 1.8.1 +airflow_extra_packages: + +airflow_required_libs: + - python-pip + - acl + +# Owner +airflow_user: airflow +airflow_group: airflow + +## Service options + +airflow_scheduler_runs: 5 + +airflow_services: + airflow-webserver: + enabled: yes + state: started + airflow-scheduler: + enabled: yes + state: started + airflow-worker: + enabled: no + state: started + +# Files & Paths +airflow_home: /etc/airflow +airflow_dags_folder: "{{ airflow_home }}/dags" +airflow_logs_folder: /var/log/airflow +airflow_child_process_log_folder: "{{ airflow_logs_folder }}/scheduler" +airflow_environment_file_folder: /etc/sysconfig + +# Allowing playbooks to provide external config files&templates +airflow_extra_conf_path: "{{ playbook_dir }}/files/airflow" +airflow_extra_conf_template_path: "{{ playbook_dir }}/templates/airflow" + +# AIRFLOW CONFIGURATION +# --------------------- + +airflow_load_examples: True + +# The executor class that airflow should use. Choices include +# SequentialExecutor, LocalExecutor, CeleryExecutor +airflow_executor: SequentialExecutor + +airflow_parallelism: 32 +airflow_dag_concurrency: 16 +airflow_dags_are_paused_at_creation: True +airflow_non_pooled_task_slot_count: 128 +airflow_max_active_runs_per_dag: 16 + +airflow_fernet_key: + +airflow_donot_pickle: False +airflow_dagbag_import_timeout: + +airflow_task_runner: BashTaskRunner + +airflow_default_impersonation: +airflow_unit_test_mode: False + +airflow_plugins_folder: "{{ airflow_home }}/plugins" + +# REMOTE LOGS +airflow_remote_base_log_folder: +airflow_remote_log_conn_id: +airflow_encrypt_s3_logs: False +airflow_s3_log_folder: # DEPRECATED + +# DB +airflow_database_conn: sqlite:////etc/airflow/airflow.db +airflow_database_pool_size: 5 +airflow_database_pool_recycle: 2000 + +# CLI +airflow_cli_api_client: airflow.api.client.local_client +airflow_cli_api_endpoint_url: http://localhost:8080 + +# API +airflow_auth_backend: airflow.api.auth.backend.default + +# WEBSERVER +airflow_webserver_base_url: http://localhost:8080 +airflow_webserver_host: 0.0.0.0 +airflow_webserver_port: 8080 +airflow_webserver_workers: 4 +airflow_webserver_worker_timeout: 120 +airflow_webserver_ssl_cert: +airflow_webserver_ssl_key: +airflow_webserver_worker_refresh_batch_size: 1 +airflow_webserver_worker_refresh_interval: 30 +airflow_webserver_secret_key: temporary_key +airflow_webserver_worker_class: sync +airflow_webserver_expose_config: False +airflow_webserver_filter_by_owner: False +airflow_webserver_owner_mode: user +airflow_webserver_dag_orientation: LR +airflow_webserver_demo_mode: False +airflow_webserver_log_fetch_timeout_sec: 5 +airflow_webserver_hide_paused_dags_by_default: False + +## Webserver Authentication (http://pythonhosted.org/airflow/security.html#web-authentication) + +# Choices of auth_backend include: +# - airflow.contrib.auth.backends.password_auth +# - airflow.contrib.auth.backends.ldap_auth +# - 
airflow.contrib.auth.backends.github_enterprise_auth +# - others? :) +airflow_webserver_authenticate: False +airflow_webserver_auth_backend: + +# Operators +airflow_operator_default_owner: Airflow +airflow_operator_default_cpus: 1 +airflow_operator_default_ram: 512 +airflow_operator_default_disk: 512 +airflow_operator_default_gpus: 0 + +# LDAP (only applies if airflow_webserver_auth_backend == "airflow.contrib.auth.backends.ldap_auth") +airflow_ldap_uri: +airflow_ldap_user_filter: +airflow_ldap_user_name_attr: +airflow_ldap_superuser_filter: +airflow_ldap_data_profiler_filter: +airflow_ldap_bind_user: +airflow_ldap_bind_password: +airflow_ldap_basedn: +airflow_ldap_cacert: +airflow_ldap_search_scope: + +# MAIL +airflow_email_backend: airflow.utils.email.send_email_smtp + +# SMTP +airflow_smtp_host: localhost +airflow_smtp_starttls: True +airflow_smtp_ssl: False +airflow_smtp_port: 25 +airflow_smtp_mail_from: airflow@airflow.com +airflow_smtp_user: +airflow_smtp_passwd: + +# SCHEDULER +airflow_scheduler_job_heartbeat_sec: 5 +airflow_scheduler_heartbeat_sec: 5 +airflow_scheduler_run_duration: -1 +airflow_scheduler_min_file_process_interval: 0 +airflow_scheduler_dag_dir_list_interval: 300 +airflow_scheduler_print_stats_interval: 30 +airflow_scheduler_zombie_task_threshold: 300 +airflow_scheduler_catchup_by_default: True +airflow_scheduler_max_threads: 2 +airflow_scheduler_authenticate: False + +## STASTD +airflow_statsd_on: False +airflow_statsd_host: localhost +airflow_statsd_port: 8125 +airflow_statsd_prefix: airflow + +# CELERY +airflow_celery_app_name: airflow.executors.celery_executor +airflow_celery_concurrency: 16 +airflow_celery_worker_log_server_port: 8793 +airflow_celery_broker_url: sqla+mysql://airflow:airflow@localhost:3306/airflow +airflow_celery_result_backend: db+mysql://airflow:airflow@localhost:3306/airflow +airflow_celery_default_queue: default + +# FLOWER +airflow_flower_host: 0.0.0.0 +airflow_flower_port: 5555 + +# MESOS +airflow_mesos_master_host: localhost:5050 +airflow_mesos_framework_name: Airflow +airflow_mesos_task_cpu: 1 +airflow_mesos_task_memory: 256 +airflow_mesos_checkpoint: False +airflow_mesos_authenticate: False + +# KERBEROS +airflow_kerberos_ccache: /tmp/airflow_krb5_ccache +airflow_kerberos_principal: airflow +airflow_kerberos_reinit_frequency: 3600 +airflow_kerberos_kinit_path: kinit +airflow_kerberos_keytab: airflow.keytab + +# GITHUB ENTEPRISE +airflow_github_enterprise_api_rev: v3 + +# ADMIN +airflow_admin_hide_sensitive_variable_fields: True diff --git a/handlers/main.yml b/handlers/main.yml new file mode 100644 index 0000000..864a3a2 --- /dev/null +++ b/handlers/main.yml @@ -0,0 +1,20 @@ +--- +# handlers file for airflow-role + +- name: restart airflow-webserver + systemd: + name: airflow-webserver + state: restarted + when: airflow_services['airflow-webserver'] and airflow_services['airflow-webserver']['enabled'] + +- name: restart airflow-scheduler + systemd: + name: airflow-scheduler + state: restarted + when: airflow_services['airflow-scheduler'] and airflow_services['airflow-scheduler']['enabled'] + +- name: restart airflow-worker + systemd: + name: airflow-worker + state: restarted + when: airflow_services['airflow-worker'] and airflow_services['airflow-worker']['enabled'] \ No newline at end of file diff --git a/logo.gif b/logo.gif new file mode 100644 index 0000000..d96f592 Binary files /dev/null and b/logo.gif differ diff --git a/meta/main.yml b/meta/main.yml new file mode 100644 index 0000000..315a737 --- /dev/null +++ b/meta/main.yml @@ 
-0,0 +1,10 @@ +--- +galaxy_info: + company: Idealista S.A. + description: Apache Airflow role + min_ansible_version: 2.2.1.0 + license: Apache 2.0 + platforms: + - name: Debian + versions: + - stretch \ No newline at end of file diff --git a/molecule.yml b/molecule.yml new file mode 100644 index 0000000..041d68c --- /dev/null +++ b/molecule.yml @@ -0,0 +1,66 @@ +--- +molecule: + # directory in CWD to place all temp files, etc. + molecule_dir: .molecule + + # where temporary state will be stored (lives under molecule_dir) + state_file: state + + # name of temporary vagrantfile created during runs (lives under molecule_dir) + vagrantfile_file: vagrantfile + + # directories to ignore when doing trailing whitespace checks on files during verify command + ignore_paths: + - .git + - .vagrant + - .molecule + + # directory to look for testinfra tests + testinfra_dir: tests + + # directory containing group_vars to use with ansible + group_vars: ../tests/group_vars + +dependency: + name: galaxy + +# ansible related configuration +ansible: + playbook: tests/playbook.yml + +test: + sequence: + - destroy + - syntax + - create + - converge + - verify + - idempotence + +# configuration options for the internal call to vagrant +vagrant: + # molecule's --platform option will look for these names + raw_config_args: + - "landrush.enabled = true" + - "landrush.tld = 'vm'" + - "landrush.guest_redirect_dns = true" + + platforms: + - name: Debian9 + box: debian/stretch64 + + providers: + - name: virtualbox + type: virtualbox + options: + memory: 1024 + cpus: 4 + + instances: + - name: airflow.vm + ansible_groups: + - airflow + interfaces: + - network_name: private_network + type: dhcp + auto_config: true \ No newline at end of file diff --git a/tasks/config.yml b/tasks/config.yml new file mode 100644 index 0000000..2e57199 --- /dev/null +++ b/tasks/config.yml @@ -0,0 +1,74 @@ +--- + +- name: Airflow | Ensure airflow directories structure + file: + path: "{{ item }}" + state: directory + owner: "{{ airflow_user }}" + group: "{{ airflow_group }}" + with_items: + - "{{ airflow_logs_folder }}" + - "{{ airflow_child_process_log_folder }}" + - "{{ airflow_dags_folder }}" + - "{{ airflow_plugins_folder }}" + +- name: Airflow | Copy gunicorn logrotate config + template: + src: gunicorn-logrotate.j2 + dest: /etc/logrotate.d/airflow + mode: 0644 + owner: root + group: root + +- name: Airflow | Copy basic airflow config file + template: + src: "airflow.cfg.j2" + dest: "{{ airflow_home }}/airflow.cfg" + owner: "{{ airflow_user }}" + group: "{{ airflow_group }}" + mode: 0640 + register: airflow_config + notify: + - restart airflow-webserver + - restart airflow-scheduler + - restart airflow-worker + +- name: Airflow | Initializing DB + shell: AIRFLOW_HOME={{ airflow_home }} /usr/local/bin/airflow initdb + become: true + become_user: "{{ airflow_user }}" + tags: + skip_ansible_lint + when: airflow_install.changed or airflow_config.changed + notify: + - restart airflow-webserver + - restart airflow-scheduler + - restart airflow-worker + +- name: Airflow | Copy extra airflow config files (provided by playbooks) + copy: + src: "{{ item }}" + dest: "{{ airflow_home }}/{{ item | basename }}" + owner: "{{ airflow_user }}" + group: "{{ airflow_group }}" + mode: 0640 + with_fileglob: + - "{{ airflow_extra_conf_path }}/*" + notify: + - restart airflow-webserver + - restart airflow-scheduler + - restart airflow-worker + +- name: Airflow | Copy extra airflow config templates (provided by playbooks) + template: + src: "{{ item }}" + dest: 
"{{ airflow_home }}/{{ item | basename }}" + owner: "{{ airflow_user }}" + group: "{{ airflow_group }}" + mode: 0640 + with_fileglob: + - "{{ airflow_extra_conf_template_path }}/*" + notify: + - restart airflow-webserver + - restart airflow-scheduler + - restart airflow-worker \ No newline at end of file diff --git a/tasks/install.yml b/tasks/install.yml new file mode 100644 index 0000000..6ff09d8 --- /dev/null +++ b/tasks/install.yml @@ -0,0 +1,60 @@ +--- + +- name: Airflow | Ensure Airflow group + group: + name: "{{ airflow_group }}" + +- name: Airflow | Ensure Airflow user + user: + name: "{{ airflow_user }}" + group: "{{ airflow_group }}" + shell: /bin/bash + +- name: Airflow | Installing dependencies + apt: + pkg: "{{ item }}" + state: present + with_items: "{{ airflow_required_libs }}" + +- name: Airflow | Installing Airflow + pip: + name: apache-airflow + version: "{{ airflow_version }}" + register: airflow_install + +- name: Airflow | Installing Airflow Extra Packages + pip: + name: apache-airflow[{{ item }}] + version: "{{ airflow_version }}" + with_items: "{{ airflow_extra_packages }}" + +- name: Airflow | Create path for Airflow home + file: + path: "{{ airflow_home }}" + state: directory + owner: "{{ airflow_user }}" + group: "{{ airflow_group }}" + +- name: Airflow | Create path for configuration files + file: + path: "{{ airflow_environment_file_folder }}" + state: directory + owner: "{{ airflow_user }}" + group: "{{ airflow_group }}" + +- name: Airflow | Copy Environment File + template: + src: airflow-environment-file.j2 + dest: "{{ airflow_environment_file_folder }}/airflow" + mode: 0644 + owner: "{{ airflow_user }}" + group: "{{ airflow_group }}" + +- name: Airflow | Copy Daemon scripts + template: + src: "{{ item.key }}.service.j2" + dest: /lib/systemd/system/{{ item.key }}.service + mode: 0644 + notify: restart {{ item.key }} + with_dict: "{{ airflow_services }}" + when: "{{ item.value.enabled }}" \ No newline at end of file diff --git a/tasks/main.yml b/tasks/main.yml new file mode 100644 index 0000000..38c7558 --- /dev/null +++ b/tasks/main.yml @@ -0,0 +1,16 @@ +--- + +- name: Airflow | Install + include: install.yml + tags: + - install + +- name: Airflow | Config + include: config.yml + tags: + - config + +- name: Airflow | Service + include: service.yml + tags: + - service \ No newline at end of file diff --git a/tasks/service.yml b/tasks/service.yml new file mode 100644 index 0000000..a2def81 --- /dev/null +++ b/tasks/service.yml @@ -0,0 +1,11 @@ +--- + +- name: Airflow | Configuring service + systemd: + name: "{{ item.key }}" + state: "{{ item.value.state }}" + enabled: "{{ item.value.enabled }}" + daemon_reload: yes + with_dict: "{{ airflow_services }}" + when: "{{ item.value.enabled }}" + changed_when: false \ No newline at end of file diff --git a/templates/airflow-environment-file.j2 b/templates/airflow-environment-file.j2 new file mode 100644 index 0000000..74aca45 --- /dev/null +++ b/templates/airflow-environment-file.j2 @@ -0,0 +1,24 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# This file is the environment file for Airflow. Put this file in /etc/sysconfig/airflow per default +# configuration of the systemd unit files. +# +# AIRFLOW_CONFIG= +# AIRFLOW_HOME= +# + +# required setting, 0 sets it to unlimited. Scheduler will get restart after every X runs +SCHEDULER_RUNS={{ airflow_scheduler_runs }} + +AIRFLOW_HOME={{ airflow_home }} \ No newline at end of file diff --git a/templates/airflow-scheduler.service.j2 b/templates/airflow-scheduler.service.j2 new file mode 100644 index 0000000..637872f --- /dev/null +++ b/templates/airflow-scheduler.service.j2 @@ -0,0 +1,30 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[Unit] +Description=Airflow scheduler daemon +After=network.target postgresql.service mysql.service redis.service rabbitmq-server.service +Wants=postgresql.service mysql.service redis.service rabbitmq-server.service + +[Service] +EnvironmentFile={{ airflow_environment_file_folder }}/airflow +User={{ airflow_user }} +Group={{ airflow_group }} +Type=simple +ExecStart=/usr/local/bin/airflow scheduler -n ${SCHEDULER_RUNS} --pid /run/airflow/scheduler.pid +Restart=always +RestartSec=5s +RuntimeDirectory=airflow + +[Install] +WantedBy=multi-user.target \ No newline at end of file diff --git a/templates/airflow-webserver.service.j2 b/templates/airflow-webserver.service.j2 new file mode 100644 index 0000000..6192235 --- /dev/null +++ b/templates/airflow-webserver.service.j2 @@ -0,0 +1,31 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[Unit] +Description=Airflow webserver daemon +After=network.target postgresql.service mysql.service redis.service rabbitmq-server.service +Wants=postgresql.service mysql.service redis.service rabbitmq-server.service + +[Service] +EnvironmentFile={{ airflow_environment_file_folder }}/airflow +User={{ airflow_user }} +Group={{ airflow_group }} +Type=simple +ExecStart=/usr/local/bin/airflow webserver --pid /run/airflow/webserver.pid +Restart=on-failure +RestartSec=5s +PrivateTmp=true +RuntimeDirectory=airflow + +[Install] +WantedBy=multi-user.target \ No newline at end of file diff --git a/templates/airflow-worker.service.j2 b/templates/airflow-worker.service.j2 new file mode 100644 index 0000000..b22899d --- /dev/null +++ b/templates/airflow-worker.service.j2 @@ -0,0 +1,30 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[Unit] +Description=Airflow celery worker daemon +After=network.target postgresql.service mysql.service redis.service rabbitmq-server.service +Wants=postgresql.service mysql.service redis.service rabbitmq-server.service + +[Service] +EnvironmentFile={{ airflow_environment_file_folder }}/airflow +User={{ airflow_user }} +Group={{ airflow_group }} +Type=simple +ExecStart=/usr/local/bin/airflow worker --pid /run/airflow/worker.pid +Restart=on-failure +RestartSec=10s +RuntimeDirectory=airflow + +[Install] +WantedBy=multi-user.target \ No newline at end of file diff --git a/templates/airflow.cfg.j2 b/templates/airflow.cfg.j2 new file mode 100644 index 0000000..d8a299a --- /dev/null +++ b/templates/airflow.cfg.j2 @@ -0,0 +1,374 @@ +[core] +# The home folder for airflow, default is ~/airflow +airflow_home = {{ airflow_home }} + +# The folder where your airflow pipelines live, most likely a +# subfolder in a code repository +# This path must be absolute +dags_folder = {{ airflow_dags_folder }} + +# The folder where airflow should store its log files +# This path must be absolute +base_log_folder = {{ airflow_logs_folder }} + +# Airflow can store logs remotely in AWS S3 or Google Cloud Storage. Users +# must supply a remote location URL (starting with either 's3://...' or +# 'gs://...') and an Airflow connection id that provides access to the storage +# location. +remote_base_log_folder = {{ airflow_remote_base_log_folder }} +remote_log_conn_id = {{ airflow_remote_log_conn_id }} +# Use server-side encryption for logs stored in S3 +encrypt_s3_logs = {{ airflow_encrypt_s3_logs }} +# DEPRECATED option for remote log storage, use remote_base_log_folder instead! +s3_log_folder = {{ airflow_s3_log_folder }} + +# The executor class that airflow should use. Choices include +# SequentialExecutor, LocalExecutor, CeleryExecutor +executor = {{ airflow_executor }} + +# The SqlAlchemy connection string to the metadata database. +# SqlAlchemy supports many different database engine, more information +# their website +sql_alchemy_conn = {{ airflow_database_conn }} + +# The SqlAlchemy pool size is the maximum number of database connections +# in the pool. +sql_alchemy_pool_size = {{ airflow_database_pool_size }} + +# The SqlAlchemy pool recycle is the number of seconds a connection +# can be idle in the pool before it is invalidated. This config does +# not apply to sqlite. +sql_alchemy_pool_recycle = {{ airflow_database_pool_recycle }} + +# The amount of parallelism as a setting to the executor. 
This defines +# the max number of task instances that should run simultaneously +# on this airflow installation +parallelism = {{ airflow_parallelism }} + +# The number of task instances allowed to run concurrently by the scheduler +dag_concurrency = {{ airflow_dag_concurrency }} + +# Are DAGs paused by default at creation +dags_are_paused_at_creation = {{ airflow_dags_are_paused_at_creation }} + +# When not using pools, tasks are run in the "default pool", +# whose size is guided by this config element +non_pooled_task_slot_count = {{ airflow_non_pooled_task_slot_count }} + +# The maximum number of active DAG runs per DAG +max_active_runs_per_dag = {{ airflow_max_active_runs_per_dag }} + +# Whether to load the examples that ship with Airflow. It's good to +# get started, but you probably want to set this to False in a production +# environment +load_examples = {{ airflow_load_examples }} + +# Where your Airflow plugins are stored +plugins_folder = {{ airflow_plugins_folder }} + +# Secret key to save connection passwords in the db +fernet_key = {{ airflow_fernet_key }} + +# Whether to disable pickling dags +donot_pickle = {{ airflow_donot_pickle }} + +# How long before timing out a python file import while filling the DagBag +dagbag_import_timeout = {{ airflow_dagbag_import_timeout }} + +# The class to use for running task instances in a subprocess +task_runner = {{ airflow_task_runner }} + +# If set, tasks without a `run_as_user` argument will be run with this user +# Can be used to de-elevate a sudo user running Airflow when executing tasks +default_impersonation = {{ airflow_default_impersonation }} + +# What security module to use (for example kerberos): +security = + +# Turn unit test mode on (overwrites many configuration options with test +# values at runtime) +unit_test_mode = {{ airflow_unit_test_mode }} + +[cli] +# In what way should the cli access the API. The LocalClient will use the +# database directly, while the json_client will use the api running on the +# webserver +api_client = {{ airflow_cli_api_client }} +endpoint_url = {{ airflow_cli_api_endpoint_url }} + +[api] +# How to authenticate users of the API +auth_backend = {{ airflow_auth_backend }} + +[operators] +# The default owner assigned to each new operator, unless +# provided explicitly or passed via `default_args` +default_owner = {{ airflow_operator_default_owner }} +default_cpus = {{ airflow_operator_default_cpus }} +default_ram = {{ airflow_operator_default_ram }} +default_disk = {{ airflow_operator_default_disk }} +default_gpus = {{ airflow_operator_default_gpus }} + + +[webserver] +# The base url of your website as airflow cannot guess what domain or +# cname you are using. This is used in automated emails that +# airflow sends to point links to the right web server +base_url = {{ airflow_webserver_base_url }} + +# The ip specified when starting the web server +web_server_host = {{ airflow_webserver_host }} + +# The port on which to run the web server +web_server_port = {{ airflow_webserver_port }} + +# Paths to the SSL certificate and key for the web server. When both are +# provided SSL will be enabled. This does not change the web server port. +web_server_ssl_cert = {{ airflow_webserver_ssl_cert }} +web_server_ssl_key = {{ airflow_webserver_ssl_key }} + +# Number of seconds the gunicorn webserver waits before timing out on a worker +web_server_worker_timeout = {{ airflow_webserver_worker_timeout }} + +# Number of workers to refresh at a time. When set to 0, worker refresh is +# disabled. 
When nonzero, airflow periodically refreshes webserver workers by +# bringing up new ones and killing old ones. +worker_refresh_batch_size = {{ airflow_webserver_worker_refresh_batch_size }} + +# Number of seconds to wait before refreshing a batch of workers. +worker_refresh_interval = {{ airflow_webserver_worker_refresh_interval }} + +# Secret key used to run your flask app +secret_key = {{ airflow_webserver_secret_key }} + +# Number of workers to run the Gunicorn web server +workers = {{ airflow_webserver_workers }} + +# The worker class gunicorn should use. Choices include +# sync (default), eventlet, gevent +worker_class = {{ airflow_webserver_worker_class }} + +# Log files for the gunicorn webserver. '-' means log to stderr. +access_logfile = {{ airflow_logs_folder }}/gunicorn-access.log +error_logfile = {{ airflow_logs_folder }}/gunicorn-error.log + +# Expose the configuration file in the web server +expose_config = {{ airflow_webserver_expose_config }} + +# Set to true to turn on authentication: +# http://pythonhosted.org/airflow/security.html#web-authentication +authenticate = {{ airflow_webserver_authenticate }} +{% if airflow_webserver_auth_backend %} +auth_backend = {{ airflow_webserver_auth_backend }} +{% endif %} + +# Filter the list of dags by owner name (requires authentication to be enabled) +filter_by_owner = {{ airflow_webserver_filter_by_owner }} + +# Filtering mode. Choices include user (default) and ldapgroup. +# Ldap group filtering requires using the ldap backend +# +# Note that the ldap server needs the "memberOf" overlay to be set up +# in order to user the ldapgroup mode. +owner_mode = {{ airflow_webserver_owner_mode }} + +# Default DAG orientation. Valid values are: +# LR (Left->Right), TB (Top->Bottom), RL (Right->Left), BT (Bottom->Top) +dag_orientation = {{ airflow_webserver_dag_orientation }} + +# Puts the webserver in demonstration mode; blurs the names of Operators for +# privacy. +demo_mode = {{ airflow_webserver_demo_mode }} + +# The amount of time (in secs) webserver will wait for initial handshake +# while fetching logs from other worker machine +log_fetch_timeout_sec = {{ airflow_webserver_log_fetch_timeout_sec }} + +# By default, the webserver shows paused DAGs. Flip this to hide paused +# DAGs by default +hide_paused_dags_by_default = {{ airflow_webserver_hide_paused_dags_by_default }} + + +[email] +email_backend = {{ airflow_email_backend }} + + +[smtp] +# If you want airflow to send emails on retries, failure, and you want to use +# the airflow.utils.email.send_email_smtp function, you have to configure an +# smtp server here +smtp_host = {{ airflow_smtp_host }} +smtp_starttls = {{ airflow_smtp_starttls }} +smtp_ssl = {{ airflow_smtp_ssl }} +smtp_port = {{ airflow_smtp_port }} +smtp_mail_from = {{ airflow_smtp_mail_from }} +{% if airflow_smtp_user %} +smtp_user = {{ airflow_smtp_user }} +{% endif %} +{% if airflow_smtp_passwd %} +smtp_password = {{ airflow_smtp_passwd }} +{% endif %} + +[celery] +# This section only applies if you are using the CeleryExecutor in +# [core] section above + +# The app name that will be used by celery +celery_app_name = {{ airflow_celery_app_name }} + +# The concurrency that will be used when starting workers with the +# "airflow worker" command. 
This defines the number of task instances that +# a worker will take, so size up your workers based on the resources on +# your worker box and the nature of your tasks +celeryd_concurrency = {{ airflow_celery_concurrency }} + +# When you start an airflow worker, airflow starts a tiny web server +# subprocess to serve the workers local log files to the airflow main +# web server, who then builds pages and sends them to users. This defines +# the port on which the logs are served. It needs to be unused, and open +# visible from the main web server to connect into the workers. +worker_log_server_port = {{ airflow_celery_worker_log_server_port }} + +# The Celery broker URL. Celery supports RabbitMQ, Redis and experimentally +# a sqlalchemy database. Refer to the Celery documentation for more +# information. +broker_url = {{ airflow_celery_broker_url }} + +# Another key Celery setting +celery_result_backend = {{ airflow_celery_result_backend }} + +# Celery Flower is a sweet UI for Celery. Airflow has a shortcut to start +# it `airflow flower`. This defines the IP that Celery Flower runs on +flower_host = {{ airflow_flower_host }} + +# This defines the port that Celery Flower runs on +flower_port = {{ airflow_flower_port }} + +# Default queue that tasks get assigned to and that worker listen on. +default_queue = {{ airflow_celery_default_queue }} + + +[scheduler] +# Task instances listen for external kill signal (when you clear tasks +# from the CLI or the UI), this defines the frequency at which they should +# listen (in seconds). +job_heartbeat_sec = {{ airflow_scheduler_job_heartbeat_sec }} + +# The scheduler constantly tries to trigger new tasks (look at the +# scheduler section in the docs for more information). This defines +# how often the scheduler should run (in seconds). +scheduler_heartbeat_sec = {{ airflow_scheduler_heartbeat_sec }} + +# after how much time should the scheduler terminate in seconds +# -1 indicates to run continuously (see also num_runs) +run_duration = {{ airflow_scheduler_run_duration }} + +# after how much time a new DAGs should be picked up from the filesystem +min_file_process_interval = {{ airflow_scheduler_min_file_process_interval }} + +dag_dir_list_interval = {{ airflow_scheduler_dag_dir_list_interval }} + +# How often should stats be printed to the logs +print_stats_interval = {{ airflow_scheduler_print_stats_interval }} + +child_process_log_directory = {{ airflow_child_process_log_folder }} + +# Local task jobs periodically heartbeat to the DB. If the job has +# not heartbeat in this many seconds, the scheduler will mark the +# associated task instance as failed and will re-schedule the task. +scheduler_zombie_task_threshold = {{ airflow_scheduler_zombie_task_threshold }} + +# Turn off scheduler catchup by setting this to False. +# Default behavior is unchanged and +# Command Line Backfills still work, but the scheduler +# will not do scheduler catchup if this is False, +# however it can be set on a per DAG basis in the +# DAG definition (catchup) +catchup_by_default = {{ airflow_scheduler_catchup_by_default }} + +# Statsd (https://github.com/etsy/statsd) integration settings +statsd_on = {{ airflow_statsd_on }} +statsd_host = {{ airflow_statsd_host }} +statsd_port = {{ airflow_statsd_port }} +statsd_prefix = {{ airflow_statsd_prefix }} + +# The scheduler can run multiple threads in parallel to schedule dags. +# This defines how many threads will run. However airflow will never +# use more threads than the amount of cpu cores available. 
+max_threads = {{ airflow_scheduler_max_threads }} + +authenticate = {{ airflow_scheduler_authenticate }} + + +[mesos] +# Mesos master address which MesosExecutor will connect to. +master = {{ airflow_mesos_master_host }} + +# The framework name which Airflow scheduler will register itself as on mesos +framework_name = {{ airflow_mesos_framework_name }} + +# Number of cpu cores required for running one task instance using +# 'airflow run --local -p ' +# command on a mesos slave +task_cpu = {{ airflow_mesos_task_cpu }} + +# Memory in MB required for running one task instance using +# 'airflow run --local -p ' +# command on a mesos slave +task_memory = {{ airflow_mesos_task_memory }} + +# Enable framework checkpointing for mesos +# See http://mesos.apache.org/documentation/latest/slave-recovery/ +checkpoint = {{ airflow_mesos_checkpoint }} + +# Failover timeout in milliseconds. +# When checkpointing is enabled and this option is set, Mesos waits +# until the configured timeout for +# the MesosExecutor framework to re-register after a failover. Mesos +# shuts down running tasks if the +# MesosExecutor framework fails to re-register within this timeframe. +# failover_timeout = 604800 + +# Enable framework authentication for mesos +# See http://mesos.apache.org/documentation/latest/configuration/ +authenticate = {{ airflow_mesos_authenticate }} + +# Mesos credentials, if authentication is enabled +# default_principal = admin +# default_secret = admin + + +[kerberos] +ccache = {{ airflow_kerberos_ccache }} +# gets augmented with fqdn +principal = {{ airflow_kerberos_principal }} +reinit_frequency = {{ airflow_kerberos_reinit_frequency }} +kinit_path = {{ airflow_kerberos_kinit_path }} +keytab = {{ airflow_kerberos_keytab }} + + +[github_enterprise] +api_rev = {{ airflow_github_enterprise_api_rev }} + + +[admin] +# UI to hide sensitive variable fields when set to True +hide_sensitive_variable_fields = {{ airflow_admin_hide_sensitive_variable_fields }} + +{% if airflow_webserver_auth_backend == "airflow.contrib.auth.backends.ldap_auth" %} +[ldap] +# set a connection without encryption: uri = ldap://: +uri = {{ airflow_ldap_uri }} +user_filter = {{ airflow_ldap_user_filter }} +# in case of Active Directory you would use: user_name_attr = sAMAccountName +user_name_attr = {{ airflow_ldap_user_name_attr }} +superuser_filter = {{ airflow_ldap_superuser_filter }} +data_profiler_filter = {{ airflow_ldap_data_profiler_filter }} +bind_user = {{ airflow_ldap_bind_user }} +bind_password = {{ airflow_ldap_bind_password }} +basedn = {{ airflow_ldap_basedn }} +cacert = {{ airflow_ldap_cacert }} +# Set search_scope to one of them: BASE, LEVEL , SUBTREE +# Set search_scope to SUBTREE if using Active Directory, and not specifying an Organizational Unit +search_scope = {{ airflow_ldap_search_scope }} +{% endif %} \ No newline at end of file diff --git a/templates/gunicorn-logrotate.j2 b/templates/gunicorn-logrotate.j2 new file mode 100644 index 0000000..29e579c --- /dev/null +++ b/templates/gunicorn-logrotate.j2 @@ -0,0 +1,13 @@ +{{ airflow_logs_folder }}/gunicorn-*.log { + daily + missingok + rotate 7 + size 500M + compress + notifempty + create 644 {{ airflow_user }} {{ airflow_group }} + sharedscripts + postrotate + [ -f /run/airflow/webserver.pid ] && kill -USR1 `cat /run/airflow/webserver.pid` + endscript +} \ No newline at end of file diff --git a/tests/group_vars/airflow/main.yml b/tests/group_vars/airflow/main.yml new file mode 100644 index 0000000..82c375c --- /dev/null +++ 
b/tests/group_vars/airflow/main.yml @@ -0,0 +1,3 @@ +--- + +airflow_fernet_key: xKy13nPFfDflJ0DYGVTwf_DEmbItfURHlEDxrt-bBQw= \ No newline at end of file diff --git a/tests/playbook.yml b/tests/playbook.yml new file mode 100644 index 0000000..b93e683 --- /dev/null +++ b/tests/playbook.yml @@ -0,0 +1,5 @@ +--- + +- hosts: airflow.vm + roles: + - role: airflow-role \ No newline at end of file diff --git a/tests/test_ansible.py b/tests/test_ansible.py new file mode 100644 index 0000000..a330f28 --- /dev/null +++ b/tests/test_ansible.py @@ -0,0 +1,31 @@ +import pytest + + +@pytest.fixture() +def AnsibleDefaults(Ansible): + return Ansible("include_vars", "defaults/main.yml")["ansible_facts"] + + +def test_airflow_user(User, Group, AnsibleDefaults): + user = User(AnsibleDefaults["airflow_user"]) + group = Group(AnsibleDefaults["airflow_group"]) + + assert user.exists + assert group.exists + assert user.group == AnsibleDefaults["airflow_group"] + + +def test_airflow_version(PipPackage, AnsibleDefaults): + expected_version = AnsibleDefaults["airflow_version"] + installed_version = PipPackage.get_packages()["apache-airflow"]["version"] + + assert installed_version == expected_version + + +def test_airflow_services(Service, AnsibleDefaults): + airflow_services = AnsibleDefaults["airflow_services"] + + for airflow_service in airflow_services: + if airflow_services[airflow_service]["enabled"]: + assert Service(airflow_service).is_enabled + assert Service(airflow_service).is_running
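The test playbook above exercises the role with only a Fernet key set (see tests/group_vars/airflow/main.yml), so the worker service (disabled by default) is never started. Below is a sketch of group_vars that would additionally cover the CeleryExecutor/worker path; the broker and result-backend URLs are the placeholder values from defaults/main.yml (a local MySQL that the test VM does not actually provide), and the `celery` pip extra is an assumption, not something the role requires:

```
---
airflow_executor: CeleryExecutor

airflow_extra_packages:
  - celery                 # assumed extra, installed by the role as apache-airflow[celery]

# airflow_services is a plain dict: with Ansible's default hash_behaviour the whole
# value is replaced, so the webserver and scheduler entries must be restated here.
airflow_services:
  airflow-webserver:
    enabled: yes
    state: started
  airflow-scheduler:
    enabled: yes
    state: started
  airflow-worker:
    enabled: yes
    state: started

# Placeholder connection strings copied from defaults/main.yml; point them at a real
# broker/result backend before using this outside a throwaway test environment.
airflow_celery_broker_url: sqla+mysql://airflow:airflow@localhost:3306/airflow
airflow_celery_result_backend: db+mysql://airflow:airflow@localhost:3306/airflow
```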