-
Notifications
You must be signed in to change notification settings - Fork 14.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Support DAGS folder being in different location on scheduler and runners #16860
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,6 +20,7 @@ | |
import functools | ||
import logging | ||
import os | ||
import pathlib | ||
import pickle | ||
import re | ||
import sys | ||
|
@@ -236,13 +237,21 @@ class DAG(LoggingMixin): | |
'parent_dag', | ||
'start_date', | ||
'schedule_interval', | ||
'full_filepath', | ||
'fileloc', | ||
'template_searchpath', | ||
'last_loaded', | ||
} | ||
|
||
__serialized_fields: Optional[FrozenSet[str]] = None | ||
|
||
fileloc: str | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This reminds me I was wondering whether |
||
""" | ||
File path that needs to be imported to load this DAG or subdag. | ||
|
||
This may not be an actual file on disk in the case when this DAG is loaded | ||
from a ZIP file or other DAG distribution format. | ||
""" | ||
|
||
def __init__( | ||
self, | ||
dag_id: str, | ||
|
@@ -286,10 +295,16 @@ def __init__( | |
self.params.update(self.default_args['params']) | ||
del self.default_args['params'] | ||
|
||
if full_filepath: | ||
warnings.warn( | ||
"Passing full_filepath to DAG() is deprecated and has no effect", | ||
DeprecationWarning, | ||
stacklevel=2, | ||
) | ||
|
||
validate_key(dag_id) | ||
|
||
self._dag_id = dag_id | ||
self._full_filepath = full_filepath if full_filepath else '' | ||
if concurrency and not max_active_tasks: | ||
# TODO: Remove in Airflow 3.0 | ||
warnings.warn( | ||
|
@@ -655,11 +670,22 @@ def dag_id(self, value: str) -> None: | |
|
||
@property | ||
def full_filepath(self) -> str: | ||
return self._full_filepath | ||
""":meta private:""" | ||
warnings.warn( | ||
"DAG.full_filepath is deprecated in favour of fileloc", | ||
DeprecationWarning, | ||
stacklevel=2, | ||
) | ||
return self.fileloc | ||
|
||
@full_filepath.setter | ||
def full_filepath(self, value) -> None: | ||
self._full_filepath = value | ||
warnings.warn( | ||
"DAG.full_filepath is deprecated in favour of fileloc", | ||
DeprecationWarning, | ||
stacklevel=2, | ||
) | ||
self.fileloc = value | ||
|
||
@property | ||
def concurrency(self) -> int: | ||
|
@@ -735,15 +761,26 @@ def task_group(self) -> "TaskGroup": | |
|
||
@property | ||
def filepath(self) -> str: | ||
"""File location of where the dag object is instantiated""" | ||
fn = self.full_filepath.replace(settings.DAGS_FOLDER + '/', '') | ||
fn = fn.replace(os.path.dirname(__file__) + '/', '') | ||
return fn | ||
""":meta private:""" | ||
warnings.warn( | ||
"filepath is deprecated, use relative_fileloc instead", DeprecationWarning, stacklevel=2 | ||
) | ||
return str(self.relative_fileloc) | ||
|
||
@property | ||
def relative_fileloc(self) -> pathlib.Path: | ||
"""File location of the importable dag 'file' relative to the configured DAGs folder.""" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How will this field behave for example DAGs? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In this case it would be I'll update/expand the docstring to say how it deals with DAGs outside of the dags folder. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe we are also able to handle examples DAGs by adding a suffix? This will allow us to install workers on shared machines without any problems There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this change would be better done along with something like https://cwiki.apache.org/confluence/display/AIRFLOW/AIP-20+DAG+manifest, so I've not done this for now. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe we shouldn’t call this relative fileloc, but just fileloc…? Not sure about this. |
||
path = pathlib.Path(self.fileloc) | ||
try: | ||
return path.relative_to(settings.DAGS_FOLDER) | ||
except ValueError: | ||
# Not relative to DAGS_FOLDER. | ||
return path | ||
|
||
@property | ||
def folder(self) -> str: | ||
"""Folder location of where the DAG object is instantiated.""" | ||
return os.path.dirname(self.full_filepath) | ||
return os.path.dirname(self.fileloc) | ||
|
||
@property | ||
def owner(self) -> str: | ||
|
@@ -2118,9 +2155,11 @@ def bulk_write_to_db(cls, dags: Collection["DAG"], session=None): | |
.group_by(DagRun.dag_id) | ||
.all() | ||
) | ||
filelocs = [] | ||
|
||
for orm_dag in sorted(orm_dags, key=lambda d: d.dag_id): | ||
dag = dag_by_ids[orm_dag.dag_id] | ||
filelocs.append(dag.fileloc) | ||
if dag.is_subdag: | ||
orm_dag.is_subdag = True | ||
orm_dag.fileloc = dag.parent_dag.fileloc # type: ignore | ||
|
@@ -2157,7 +2196,7 @@ def bulk_write_to_db(cls, dags: Collection["DAG"], session=None): | |
session.add(dag_tag_orm) | ||
|
||
if settings.STORE_DAG_CODE: | ||
DagCode.bulk_sync_to_db([dag.fileloc for dag in orm_dags]) | ||
DagCode.bulk_sync_to_db(filelocs) | ||
|
||
# Issue SQL/finish "Unit of Work", but let @provide_session commit (or if passed a session, let caller | ||
# decide when to commit | ||
|
@@ -2274,7 +2313,6 @@ def get_serialized_fields(cls): | |
'_old_context_manager_dags', | ||
'safe_dag_id', | ||
'last_loaded', | ||
'_full_filepath', | ||
'user_defined_filters', | ||
'user_defined_macros', | ||
'partial', | ||
|
@@ -2382,6 +2420,10 @@ class DagModel(Base): | |
Index('idx_next_dagrun_create_after', next_dagrun_create_after, unique=False), | ||
) | ||
|
||
parent_dag = relationship( | ||
"DagModel", remote_side=[dag_id], primaryjoin=root_dag_id == dag_id, foreign_keys=[root_dag_id] | ||
) | ||
|
||
NUM_DAGS_PER_DAGRUN_QUERY = conf.getint('scheduler', 'max_dagruns_to_create_per_loop', fallback=10) | ||
|
||
def __init__(self, concurrency=None, **kwargs): | ||
|
@@ -2410,7 +2452,7 @@ def timezone(self): | |
@staticmethod | ||
@provide_session | ||
def get_dagmodel(dag_id, session=None): | ||
return session.query(DagModel).filter(DagModel.dag_id == dag_id).first() | ||
return session.query(DagModel).options(joinedload(DagModel.parent_dag)).get(dag_id) | ||
|
||
@classmethod | ||
@provide_session | ||
|
@@ -2455,6 +2497,18 @@ def get_default_view(self) -> str: | |
def safe_dag_id(self): | ||
return self.dag_id.replace('.', '__dot__') | ||
|
||
@property | ||
def relative_fileloc(self) -> Optional[pathlib.Path]: | ||
"""File location of the importable dag 'file' relative to the configured DAGs folder.""" | ||
if self.fileloc is None: | ||
return None | ||
path = pathlib.Path(self.fileloc) | ||
try: | ||
return path.relative_to(settings.DAGS_FOLDER) | ||
except ValueError: | ||
# Not relative to DAGS_FOLDER. | ||
return path | ||
|
||
@provide_session | ||
def set_is_paused(self, is_paused: bool, including_subdags: bool = True, session=None) -> None: | ||
""" | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These are the defaults, we don't need to pass them again